参考文档
官方文档: 使用 kubeadm 创建集群
containerd镜像配置: containerd官方的HOST配置
镜像设置: containerd k3s 镜像加速
版本列表
- 操作系统: CentOS Linux release 7.9.2009 (Core)
- Kubernetes 版本: v1.29.12
- 无需科学上网,使用镜像源,而非代理模式
- 如无特别指出,本文所有的命令均以root执行
1. 运行环境检测
# 1.1 检验所有节点之中不可以有重复的主机名、MAC 地址或 product_uuid。
ifconfig -a
sudo cat /sys/class/dmi/id/product_uuid
# 1.2 关闭防火墙
# 查询状态
systemctl status firewalld
# 关闭防火墙
systemctl stop firewalld
# 禁用服务(重启后依旧关闭)
systemctl disable firewalld
# (附)启用防火墙的指令
# systemctl enable firewalld
# systemctl start firewalld
# 1.3 禁用交换分区
swapoff -a
# 需要永久关闭请修改/etc/fstab
# 重启
reboot
# 检查是否关闭如果输入为0则已关闭
free -m
# 1.4 由于centos7停止服务,yum拉取不到,设置镜像源
sudo cp /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak
sudo curl -o /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
2.安装容器运行时CRI(containerd)
# 2.1 先决条件
# 转发 IPv4 并让 iptables 看到桥接流量
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF
sudo modprobe overlay
sudo modprobe br_netfilter
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF
# 应用 sysctl 参数而不重新启动
sudo sysctl --system
# 验证配置,运行以下指令确认 br_netfilter 和 overlay 模块被加载:
lsmod | grep br_netfilter
lsmod | grep overlay
# 验证配置,确认 net.bridge.bridge-nf-call-iptables、net.bridge.bridge-nf-call-ip6tables 和 net.ipv4.ip_forward 系统变量在你的 sysctl 配置中被设置为 1
sysctl net.bridge.bridge-nf-call-iptables net.bridge.bridge-nf-call-ip6tables net.ipv4.ip_forward
# 配置yum的源,如果出现网络错误:Network file descriptor is not. 反复尝试十几次,一般都能设置上
yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
# 安装containerd.io,如果报错: 获取 GPG 密钥失败 依旧是重试几遍即可,不行加上--nogpgcheck
yum install containerd.io -y --nogpgcheck
# 检查运行状态,如果是dead很正常,缺少配置文件
systemctl status containerd
# 设置默认的配置文件,重启程序,设置开机启动,再查看状态应该是running了
containerd config default > /etc/containerd/config.toml
systemctl start containerd
systemctl enable containerd
systemctl status containerd
3.安装 kubeadm、kubelet 和 kubectl
# 3.1 将 SELinux 设置为 permissive 模式(相当于将其禁用)
sudo setenforce 0
sudo sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config
# 3.2 添加 Kubernetes 的 yum 仓库。(这里用的阿里云的)
# 此操作会覆盖 /etc/yum.repos.d/kubernetes.repo 中现存的所有配置
cat <<EOF | tee /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.29/rpm/
enabled=1
gpgcheck=1
gpgkey=https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.29/rpm/repodata/repomd.xml.key
EOF
# 3.3 安装 kubelet、kubeadm 和 kubectl,并启用 kubelet 以确保它在启动时自动启动:
yum install -y kubelet kubeadm kubectl --disableexcludes=kubernetes
systemctl enable --now kubelet
# 3.4 验证安装
# kubeadm 如果显示版本则安装ok
# 输出应该如下: kubeadm version: &version.Info{Major:"1", Minor:"29", GitVersion:"v1.29.12", GitCommit:"9253c9bda3d8bd76848bb4a21b309c28c0aab2f7", GitTreeState:"clean", BuildDate:"2024-12-10T11:34:15Z", GoVersion:"go1.22.9", Compiler:"gc", Platform:"linux/amd64"}
kubeadm version
# kubectl 如果显示版本则安装ok
# 输出应该如下: Client Version: v1.29.12 Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3 The connection to the server localhost:8080 was refused - did you specify the right host or port?
kubectl version
# systemctl 检查运行状态,应该是不能正常运行的,这样是正常的: activating (auto-restart) kubelet.service - kubelet: The Kubernetes Node Agent.
systemctl status kubelet
4.kubeadm 初始化
# 4.1 前置条件,非常重要
# 设置containerd的镜像源,否则无法拉取镜像的
# 编辑 /etc/containerd/config.toml
# 找到这一行,修改config_path 为
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = "/etc/containerd/certs.d"
# 重启服务
systemctl restart containerd
# 配置镜像源
cd /etc/containerd/
mkdir -p certs.d && cd certs.d/
# docker hub镜像加速
mkdir -p /etc/containerd/certs.d/docker.io
cat > /etc/containerd/certs.d/docker.io/hosts.toml << EOF
server = "https://docker.io"
[host."https://dockerproxy.net"]
capabilities = ["pull", "resolve"]
[host."https://docker.m.daocloud.io"]
capabilities = ["pull", "resolve"]
[host."https://reg-mirror.qiniu.com"]
capabilities = ["pull", "resolve"]
[host."https://registry.docker-cn.com"]
capabilities = ["pull", "resolve"]
[host."http://hub-mirror.c.163.com"]
capabilities = ["pull", "resolve"]
EOF
# registry.k8s.io镜像加速
mkdir -p /etc/containerd/certs.d/registry.k8s.io
tee /etc/containerd/certs.d/registry.k8s.io/hosts.toml << 'EOF'
server = "https://registry.k8s.io"
[host."https://k8s.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# docker.elastic.co镜像加速
mkdir -p /etc/containerd/certs.d/docker.elastic.co
tee /etc/containerd/certs.d/docker.elastic.co/hosts.toml << 'EOF'
server = "https://docker.elastic.co"
[host."https://elastic.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# gcr.io镜像加速
mkdir -p /etc/containerd/certs.d/gcr.io
tee /etc/containerd/certs.d/gcr.io/hosts.toml << 'EOF'
server = "https://gcr.io"
[host."https://gcr.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# ghcr.io镜像加速
mkdir -p /etc/containerd/certs.d/ghcr.io
tee /etc/containerd/certs.d/ghcr.io/hosts.toml << 'EOF'
server = "https://ghcr.io"
[host."https://ghcr.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# k8s.gcr.io镜像加速
mkdir -p /etc/containerd/certs.d/k8s.gcr.io
tee /etc/containerd/certs.d/k8s.gcr.io/hosts.toml << 'EOF'
server = "https://k8s.gcr.io"
[host."https://k8s-gcr.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# mcr.m.daocloud.io镜像加速
mkdir -p /etc/containerd/certs.d/mcr.microsoft.com
tee /etc/containerd/certs.d/mcr.microsoft.com/hosts.toml << 'EOF'
server = "https://mcr.microsoft.com"
[host."https://mcr.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# nvcr.io镜像加速
mkdir -p /etc/containerd/certs.d/nvcr.io
tee /etc/containerd/certs.d/nvcr.io/hosts.toml << 'EOF'
server = "https://nvcr.io"
[host."https://nvcr.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# quay.io镜像加速
mkdir -p /etc/containerd/certs.d/quay.io
tee /etc/containerd/certs.d/quay.io/hosts.toml << 'EOF'
server = "https://quay.io"
[host."https://quay.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# registry.jujucharms.com镜像加速
mkdir -p /etc/containerd/certs.d/registry.jujucharms.com
tee /etc/containerd/certs.d/registry.jujucharms.com/hosts.toml << 'EOF'
server = "https://registry.jujucharms.com"
[host."https://jujucharms.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# rocks.canonical.com镜像加速
mkdir -p /etc/containerd/certs.d/rocks.canonical.com
tee /etc/containerd/certs.d/rocks.canonical.com/hosts.toml << 'EOF'
server = "https://rocks.canonical.com"
[host."https://rocks-canonical.m.daocloud.io"]
capabilities = ["pull", "resolve", "push"]
EOF
# 4.2 (可选)测试拉取镜像,这样可以看到是否使用了镜像源,流量限制,不会很快的,稍等几分钟
ctr --debug images pull --hosts-dir=/etc/containerd/certs.d registry.k8s.io/kube-apiserver:v1.29.12
# 4.3 拉取k8sd的镜像,这步耗时有点长,我这里实测大约7分钟
kubeadm config images pull
# 4.4 选择合适的ip段,查找以 "default via" 开头的行,查询可用的ip段,别和列出的重复即可
ip route show
# 4.5 初始化,我这里使用的IP段为:10.244.0.0/16
# 显示Your Kubernetes control-plane has initialized successfully!即初始化成功
kubeadm init --pod-network-cidr=10.244.0.0/16
5. 安装容器网络插件(CNI)
# 5.1 配置 kubeconfig 文件。
#直接执行以下命令.会报错E1219 16:46:59.873640 8689 memcache.go:265] couldn't get current server API group list
kubectl get pods --all-namespaces
# 为了能够使用 kubectl 与 Kubernetes 集群进行交互,您需要配置 kubeconfig 文件。
export KUBECONFIG=/etc/kubernetes/admin.conf
# 这个export 只能设置在当前会话,这个配置到root上,永久生效
cp /etc/kubernetes/admin.conf ~/.kube/config
# 重新执行应该就可以了 输出形如
#[root@etronprobe1 certs.d]# kubectl get pods --all-namespaces
#NAMESPACE NAME READY STATUS RESTARTS AGE
#kube-system coredns-76f75df574-7grrb 0/1 Pending 0 4m19s
#kube-system coredns-76f75df574-jxp4m 0/1 Pending 0 4m19s
#kube-system etcd-etronprobe1 1/1 Running 0 4m34s
#kube-system kube-apiserver-etronprobe1 1/1 Running 0 4m34s
#kube-system kube-controller-manager-etronprobe1 1/1 Running 0 4m34s
#kube-system kube-proxy-l64gg 1/1 Running 0 4m19s
#kube-system kube-scheduler-etronprobe1 1/1 Running 0 4m34s
kubectl get pods --all-namespaces
# 5.2 安装网络插件,从网络上
kubectl apply -f https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
# github不可达时,下载文件到本地执行(本文提供了附件)
kubectl apply -f ./kube-flannel.yml
# 稍等一会,等flannelinitok,如果执行命令,显示全部running,特别是kube-flannel,即配置成功
kubectl get pods --all-namespaces
# 5.3 检查是否安装成功,是否ready
kubectl get nodes
# 测试部署pod
kubectl run nginx --image=nginx --restart=Never
# 查看是否正常运行
kubectl get pods --all-namespaces -o wide
# 查看记录,如果失败了
kubectl describe pod nginx
# 只有单节点会报错
# Warning FailedScheduling 61s default-scheduler 0/1 nodes are available: 1 node(s) had untolerated taint {node-role.kubernetes.io/control-plane: }. preemption: 0/1 nodes are available: 1 Preemption is not helpful for scheduling.
# 移除污点,重试
kubectl taint nodes etronprobe node-role.kubernetes.io/control-plane-
# 再次查看
kubectl get pods --all-namespaces -o wide
6. 多Master集群部署
# 集群部署至少需要3台主机
# 执行到第三步为止都和单机一样,初始化时变更一下命令,如果已经初始化完成,执行重置k8s集群的命令(见下文),然后重新初始化.
kubeadm init --pod-network-cidr=10.244.0.0/16 --control-plane-endpoint=192.168.30.40:6443 --apiserver-advertise-address=192.168.30.148 --upload-certs
# 注意一下config需要重新写
cp /etc/kubernetes/admin.conf ~/.kube/config
# 如果需要所有的节点都能使用kubectl,则复制文件到所有节点
scp /root/.kube/config root@192.168.30.53:/root/.kube/config
scp /root/.kube/config root@192.168.30.43:/root/.kube/config
# 注意网络插件记得装
kubectl apply -f ./kube-flannel.yml
# 执行完成后会有加入命令.执行有control-plan,命令类似
kubeadm join 192.168.30.40:6443 --token q679s4.pc6ngel53iyzxnos --discovery-token-ca-cert-hash sha256:bd970ad5d12fbc8beed02a8ca292ac4ae14ea93fffa5854444bd97a1510ca335 --control-plane --certificate-key 65c7f411ac47cfa9c29a9a39ffa46b0cd3ee75af04fe6943c4993322b41bd17f
## 注意,如果出现证书问题,重置k8s集群,在所有的节点上,然后重新初始化,加入,注意init的参数--upload-certs,除非你手动设置证书
额外命令
# 重置k8s集群
kubeadm reset
rm -rf /etc/kubernetes/pki
rm -rf /etc/kubernetes/manifests/*
rm -rf /etc/kubernetes/kubelet.conf
# 其他命令
openssl genpkey -algorithm RSA -out /etc/kubernetes/pki/etcd/server.key
openssl req -new -key /etc/kubernetes/pki/etcd/server.key -out /etc/kubernetes/pki/etcd/server.csr -config /root/etcd-server-openssl.cnf
easyrsa import-req /etc/kubernetes/pki/etcd/server.csr server
easyrsa --batch sign-req server server
openssl genpkey -algorithm RSA -out /etc/kubernetes/pki/etcd/peer.key
openssl req -new -key /etc/kubernetes/pki/etcd/peer.key -out /etc/kubernetes/pki/etcd/peer.csr -config /root/etcd-peer-openssl.cnf
easyrsa import-req /etc/kubernetes/pki/etcd/peer.csr peer
easyrsa --batch sign-req server peer
openssl genpkey -algorithm RSA -out /etc/kubernetes/pki/etcd/healthcheck-client.key
openssl req -new -key /etc/kubernetes/pki/etcd/healthcheck-client.key -out /etc/kubernetes/pki/etcd/healthcheck-client.csr -config /root/etcd-healthcheck-client.cnf
easyrsa import-req /etc/kubernetes/pki/etcd/healthcheck-client.csr healthcheck-client
easyrsa --batch sign-req server healthcheck-client
cp /root/pki/issued/* /etc/kubernetes/pki/etcd/
openssl req -in /etc/kubernetes/pki/etcd/server.csr -noout -text
openssl x509 -in /etc/kubernetes/pki/etcd/server.crt -text -noout
scp -r /etc/kubernetes/pki/ root@192.168.30.53:/etc/kubernetes
scp -r /etc/kubernetes/pki/ root@192.168.30.43:/etc/kubernetes
scp /root/.kube/config root@192.168.30.53:/root/.kube/config
scp /root/.kube/config root@192.168.30.43:/root/.kube/config
附件:kube-flannel.yml
apiVersion: v1
kind: Namespace
metadata:
labels:
k8s-app: flannel
pod-security.kubernetes.io/enforce: privileged
name: kube-flannel
---
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
k8s-app: flannel
name: flannel
namespace: kube-flannel
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
k8s-app: flannel
name: flannel
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes/status
verbs:
- patch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
k8s-app: flannel
name: flannel
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: flannel
subjects:
- kind: ServiceAccount
name: flannel
namespace: kube-flannel
---
apiVersion: v1
data:
cni-conf.json: |
{
"name": "cbr0",
"cniVersion": "0.3.1",
"plugins": [
{
"type": "flannel",
"delegate": {
"hairpinMode": true,
"isDefaultGateway": true
}
},
{
"type": "portmap",
"capabilities": {
"portMappings": true
}
}
]
}
net-conf.json: |
{
"Network": "10.244.0.0/16",
"EnableNFTables": false,
"Backend": {
"Type": "vxlan"
}
}
kind: ConfigMap
metadata:
labels:
app: flannel
k8s-app: flannel
tier: node
name: kube-flannel-cfg
namespace: kube-flannel
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app: flannel
k8s-app: flannel
tier: node
name: kube-flannel-ds
namespace: kube-flannel
spec:
selector:
matchLabels:
app: flannel
k8s-app: flannel
template:
metadata:
labels:
app: flannel
k8s-app: flannel
tier: node
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/os
operator: In
values:
- linux
containers:
- args:
- --ip-masq
- --kube-subnet-mgr
command:
- /opt/bin/flanneld
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: EVENT_QUEUE_DEPTH
value: "5000"
image: docker.io/flannel/flannel:v0.26.2
name: kube-flannel
resources:
requests:
cpu: 100m
memory: 50Mi
securityContext:
capabilities:
add:
- NET_ADMIN
- NET_RAW
privileged: false
volumeMounts:
- mountPath: /run/flannel
name: run
- mountPath: /etc/kube-flannel/
name: flannel-cfg
- mountPath: /run/xtables.lock
name: xtables-lock
hostNetwork: true
initContainers:
- args:
- -f
- /flannel
- /opt/cni/bin/flannel
command:
- cp
image: docker.io/flannel/flannel-cni-plugin:v1.6.0-flannel1
name: install-cni-plugin
volumeMounts:
- mountPath: /opt/cni/bin
name: cni-plugin
- args:
- -f
- /etc/kube-flannel/cni-conf.json
- /etc/cni/net.d/10-flannel.conflist
command:
- cp
image: docker.io/flannel/flannel:v0.26.2
name: install-cni
volumeMounts:
- mountPath: /etc/cni/net.d
name: cni
- mountPath: /etc/kube-flannel/
name: flannel-cfg
priorityClassName: system-node-critical
serviceAccountName: flannel
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- hostPath:
path: /run/flannel
name: run
- hostPath:
path: /opt/cni/bin
name: cni-plugin
- hostPath:
path: /etc/cni/net.d
name: cni
- configMap:
name: kube-flannel-cfg
name: flannel-cfg
- hostPath:
path: /run/xtables.lock
type: FileOrCreate
name: xtables-lock
安装helm和Dashboard
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
./get_helm.sh
helm repo add kubernetes-dashboard https://kubernetes.github.io/dashboard/
helm upgrade --install kubernetes-dashboard kubernetes-dashboard/kubernetes-dashboard \
--create-namespace --namespace kubernetes-dashboard \
--version 7.5.0
# 如果只有主节点,没有工作节点,默认是不然dashboard负载到主节点上的,因为所有的主节点都有一个不让调度的污点:可以通过这个查看
kubectl get nodes -o=jsonpath='{.items[*].spec.taints}'
# 移除污点,dashboard应该可以正常运行了
kubectl taint nodes etronprobe0 node-role.kubernetes.io/control-plane:NoSchedule-
# 改变ClusterIP为NodePort,以外部访问
kubectl -n kubernetes-dashboard patch svc kubernetes-dashboard-kong-proxy -p '{"spec": {"type": "NodePort"}}'
# 创建用户和token
kubectl apply -f admin-service-account.yml
kubectl create token admin-user -n kube-system --duration=24h