实验环境中有一个 etcd 节点突然出现故障,研究之后,将我的处理过程记录如下:
~]# systemctl status -l etcd
● etcd.service - etcd
Loaded: loaded (/usr/lib/systemd/system/etcd.service; enabled; vendor preset: disabled)
Active: activating (auto-restart) (Result: exit-code) since Tue 2022-08-09 21:29:17 CST; 1s ago
Docs: https://github.com/coreos
Process: 28649 ExecStart=/usr/local/bin/etcd --name k8s03 --cert-file=/etc/etcd/ssl/etcd.pem --key-file=/etc/etcd/ssl/etcd-key.pem --peer-cert-file=/etc/etcd/ssl/etcd.pem --peer-key-file=/etc/etcd/ssl/etcd-key.pem --trusted-ca-file=/etc/etcd/ssl/ca.pem --peer-trusted-ca-file=/etc/etcd/ssl/ca.pem --peer-client-cert-auth --client-cert-auth --initial-advertise-peer-urls https://10.0.0.53:2380 --listen-peer-urls https://10.0.0.53:2380 --listen-client-urls https://10.0.0.53:2379,https://127.0.0.1:2379 --advertise-client-urls https://10.0.0.53:2379 --initial-cluster-token etcd-cluster --initial-cluster k8s01=https://10.0.0.51:2380,k8s02=https://10.0.0.52:2380,k8s03=https://10.0.0.53:2380 --initial-cluster-state exiting --data-dir=/var/lib/etcd (code=exited, status=2)
Main PID: 28649 (code=exited, status=2)
Aug 09 21:29:17 k8s03 systemd[1]: etcd.service: main process exited, code=exited, status=2/INVALIDARGUMENT
Aug 09 21:29:17 k8s03 systemd[1]: Unit etcd.service entered failed state.
Aug 09 21:29:17 k8s03 systemd[1]: etcd.service failed.
#尝试重启了一下,服务仍然起不来
~]# kubectl get cs
NAME STATUS MESSAGE ERROR
controller-manager Healthy ok
etcd-2 Unhealthy Get https://10.0.0.53:2379/health: dial tcp 10.0.0.53:2379: connect: connection refused
scheduler Healthy ok
etcd-0 Healthy {"health":"true"}
etcd-1 Healthy {"health":"true"}
~]# export ETCDCTL_API=3
~]# etcdctl --cacert=/etc/etcd/ssl/etcd.pem --cert=/etc/etcd/ssl/etcd.pem --key=/etc/etcd/ssl/etcd-key.pem --endpoints="https://10.0.0.51:2379" --insecure-skip-tls-verify member list
586b20e827f322a8, started, k8s03, https://10.0.0.53:2380, https://10.0.0.53:2379
9dbb0caefffd472c, started, k8s01, https://10.0.0.51:2380, https://10.0.0.51:2379
d17369d9f3d45ce2, started, k8s02, https://10.0.0.52:2380, https://10.0.0.52:2379
~]# etcdctl --cacert=/etc/etcd/ssl/etcd.pem --cert=/etc/etcd/ssl/etcd.pem --key=/etc/etcd/ssl/etcd-key.pem --endpoints="https://10.0.0.51:2379" --insecure-skip-tls-verify member remove 586b20e827f322a8
~]# rm -rf /var/lib/etcd/member/snap/
~]# rm -rf /var/lib/etcd/member/wal/
~]# etcdctl --cacert=/etc/etcd/ssl/etcd.pem --cert=/etc/etcd/ssl/etcd.pem --key=/etc/etcd/ssl/etcd-key.pem --endpoints="https://10.0.0.51:2379" --insecure-skip-tls-verify member add k8s03 --peer-urls="https://10.0.0.53:2380"
Member 374c8cab6ca896f5 added to cluster 13bec12661bf84a1
ETCD_NAME="k8s03"
ETCD_INITIAL_CLUSTER="k8s03=https://10.0.0.53:2380,k8s01=https://10.0.0.51:2380,k8s02=https://10.0.0.52:2380"
ETCD_INITIAL_ADVERTISE_PEER_URLS="https://10.0.0.53:2380"
ETCD_INITIAL_CLUSTER_STATE="existing"
~]# vi /usr/lib/systemd/system/etcd.service
[Unit]
Description=etcd
Documentation=https://github.com/coreos

[Service]
# Runs etcd member k8s03 of the three-node cluster (k8s01, k8s02, k8s03)
# with TLS client and peer authentication enabled.
#
# --initial-cluster-state was changed from "new" to "existing": this member
# is re-added to an already running cluster (via `etcdctl member add`), so it
# must join that cluster instead of bootstrapping a new one.
#
# NOTE: systemd has no inline comments, and a backslash only continues the
# line when it is the very last character. Keep comments on their own lines
# outside the ExecStart= continuation below.
ExecStart=/usr/local/bin/etcd \
  --name k8s03 \
  --cert-file=/etc/etcd/ssl/etcd.pem \
  --key-file=/etc/etcd/ssl/etcd-key.pem \
  --peer-cert-file=/etc/etcd/ssl/etcd.pem \
  --peer-key-file=/etc/etcd/ssl/etcd-key.pem \
  --trusted-ca-file=/etc/etcd/ssl/ca.pem \
  --peer-trusted-ca-file=/etc/etcd/ssl/ca.pem \
  --peer-client-cert-auth \
  --client-cert-auth \
  --initial-advertise-peer-urls https://10.0.0.53:2380 \
  --listen-peer-urls https://10.0.0.53:2380 \
  --listen-client-urls https://10.0.0.53:2379,https://127.0.0.1:2379 \
  --advertise-client-urls https://10.0.0.53:2379 \
  --initial-cluster-token etcd-cluster \
  --initial-cluster k8s01=https://10.0.0.51:2380,k8s02=https://10.0.0.52:2380,k8s03=https://10.0.0.53:2380 \
  --initial-cluster-state existing \
  --data-dir=/var/lib/etcd
# Restart the service 5 seconds after a non-clean exit.
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
#重新启动etcd服务(修改unit文件后需先执行 systemctl daemon-reload 使配置生效)
~]# systemctl start etcd
#查看状态已经正常
~]# systemctl status -l etcd
● etcd.service - etcd
Loaded: loaded (/usr/lib/systemd/system/etcd.service; enabled; vendor preset: disabled)
Active: active (running) since Tue 2022-08-09 21:32:37 CST; 15min ago
Docs: https://github.com/coreos
Main PID: 30179 (etcd)
Tasks: 12
Memory: 67.3M
CGroup: /system.slice/etcd.service
└─30179 /usr/local/bin/etcd --name k8s03 --cert-file=/etc/etcd/ssl/etcd.pem --key-file=/etc/etcd/ssl/etcd-key.pem --peer-cert-file=/etc/etcd/ssl/etcd.pem --peer-key-file=/etc/etcd/ssl/etcd-key.pem --trusted-ca-file=/etc/etcd/ssl/ca.pem --peer-trusted-ca-file=/etc/etcd/ssl/ca.pem --peer-client-cert-auth --client-cert-auth --initial-advertise-peer-urls https://10.0.0.53:2380 --listen-peer-urls https://10.0.0.53:2380 --listen-client-urls https://10.0.0.53:2379,https://127.0.0.1:2379 --advertise-client-urls https://10.0.0.53:2379 --initial-cluster-token etcd-cluster --initial-cluster k8s01=https://10.0.0.51:2380,k8s02=https://10.0.0.52:2380,k8s03=https://10.0.0.53:2380 --initial-cluster-state existing --data-dir=/var/lib/etcd
~]# etcdctl --cacert=/etc/etcd/ssl/etcd.pem --cert=/etc/etcd/ssl/etcd.pem --key=/etc/etcd/ssl/etcd-key.pem --endpoints="https://10.0.0.51:2379,https://10.0.0.52:2379,https://10.0.0.53:2379" --insecure-skip-tls-verify member list
374c8cab6ca896f5, started, k8s03, https://10.0.0.53:2380, https://10.0.0.53:2379
9dbb0caefffd472c, started, k8s01, https://10.0.0.51:2380, https://10.0.0.51:2379
d17369d9f3d45ce2, started, k8s02, https://10.0.0.52:2380, https://10.0.0.52:2379
~]# kubectl get cs
NAME STATUS MESSAGE ERROR
scheduler Healthy ok
controller-manager Healthy ok
etcd-1 Healthy {"health":"true"}
etcd-2 Healthy {"health":"true"}
etcd-0 Healthy {"health":"true"}