背景
角色 | IP | K8S 版本 | 容器运行时 |
---|---|---|---|
k8s-master-1 | 172.16.16.108 | v1.24.1 | containerd://1.6.8 |
k8s-node-1 | 172.16.16.109 | v1.24.1 | containerd://1.6.8 |
k8s-node-2 | 172.16.16.110 | v1.24.1 | containerd://1.6.8 |
安装 kube-prometheus
# Install the bitnami/kube-prometheus chart (v8.3.0) into the kube-prometheus
# namespace, exposing Prometheus and Alertmanager on NodePorts and routing
# alerts to a DingTalk webhook receiver.

# Work in a dedicated directory so the pulled chart and the values file sit side by side.
mkdir -p /data/yaml/kube-prometheus/prometheus && cd /data/yaml/kube-prometheus/prometheus
# Add the bitnami charts repository
helm repo add bitnami https://charts.bitnami.com/bitnami
helm search repo kube-prometheus
helm pull bitnami/kube-prometheus --version 8.3.0
tar -zxvf kube-prometheus-8.3.0.tgz
# Write the value overrides. The heredoc delimiter is quoted so the shell
# copies the YAML verbatim (no variable or command substitution), and the
# nesting below is significant: every key must sit under its parent section.
cat > my-values.yaml << 'EOF'
global:
  storageClass: "nfs-client"  # default storageClass
prometheus:
  service:
    type: NodePort  # expose Prometheus through a NodePort
    nodePorts:
      http: 30090  # NodePort port number
  persistence:
    enabled: true  # enable persistent storage
    size: 9Gi  # volume size
alertmanager:
  service:
    type: NodePort  # expose Alertmanager through a NodePort
    nodePorts:
      http: 30093  # NodePort port number
  persistence:
    enabled: true  # enable persistent storage
    size: 9Gi  # volume size
  config:
    route:
      receiver: 'devops'  # default alert receiver
      routes:
        # Empty matcher: this child route applies to every alert.
        - match: {}
          receiver: 'devops'
    receivers:
      - name: 'devops'  # alert receiver
        webhook_configs:
          # NOTE: the "devops" path segment must equal the --ding.profile
          # value configured in prometheus-webhook-dingtalk.
          - url: 'http://prometheus-webhook-dingtalk.kube-prometheus:8060/dingtalk/devops/send'
            send_resolved: true
EOF
# Create the namespace
kubectl create ns kube-prometheus
# Dry run: render the release without installing it.
# The trailing "kube-prometheus" argument is a chart path relative to the
# current directory (presumably the directory unpacked by tar above).
helm install --namespace kube-prometheus prometheus -f my-values.yaml --dry-run kube-prometheus
# Install for real
helm install --namespace kube-prometheus prometheus -f my-values.yaml kube-prometheus
# Inspect the release and its pods
helm -n kube-prometheus ls
kubectl -n kube-prometheus get pod
访问 Prometheus
http://172.16.16.108:30090/
配置 Pod 告警策略
# Create and apply a PrometheusRule holding Pod-level alerts (memory, CPU,
# network rx/tx, ImagePullBackOff, Pending/Unknown phase, CrashLoopBackOff).
mkdir -p /data/yaml/kube-prometheus/prometheus/rules && cd /data/yaml/kube-prometheus/prometheus/rules
# Overwrite (">", not ">>") so re-running this step never duplicates the
# manifest. The delimiter is quoted so the shell leaves the "$labels" and
# "$value" template variables untouched.
cat > k8s-pod-rules.yaml << 'EOF'
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus-name: kube-prometheus-prometheus
    managed-by: prometheus-operator
  name: prometheus-k8s-pod-rules
  namespace: kube-prometheus
spec:
  groups:
    - name: PodMemUsage
      rules:
        - alert: Pod内存使用率告警
          # Working-set memory as a percentage of the memory limit; the
          # "!= +Inf" filter drops series whose limit is 0 (no limit set).
          expr: sum by (pod, namespace, job, container) (container_memory_working_set_bytes{pod!="",container !=""}) / sum by (pod, namespace, job, container) (container_spec_memory_limit_bytes{pod!="",container !=""}) * 100 != +Inf > 95
          for: 1m
          labels:
            severity: 紧急告警
            service: pods
          annotations:
            # The aggregation above keeps only pod/namespace/job/container,
            # so the "instance" label is not available here; use "pod".
            description: "{{ $labels.pod }}: 当前Pod内存使用率大于95% ,使用率为: {{ $value }}"
            summary: "Pod:{{ $labels.pod }} 检测到内存使用率超过limit值95%"
    - name: Pod_cpu
      rules:
        - alert: Pod_CPU使用率告警
          # CPU usage rate divided by the CPU quota (quota / 100000), in percent.
          expr: sum(irate(container_cpu_usage_seconds_total{pod!="",container !=""}[1m])) by (container, pod) / (sum(container_spec_cpu_quota{pod!="",container !=""}/100000) by (container, pod)) * 100 > 130
          for: 1m
          labels:
            severity: 严重告警
            service: pods
          annotations:
            description: "{{$labels.pod}}: 一分钟内Pod的cpu使用率大于130%,当前的使用率为: {{ $value }}"
    - name: Pod_Network_rx
      rules:
        - alert: Pod网络IO(rx方向)告警
          # bytes/s divided by 1024 twice, i.e. megabytes per second per pod.
          expr: (sum (rate (container_network_receive_bytes_total{pod!=""}[1m])) by (pod)) / 1024 / 1024 > 200
          for: 1m
          labels:
            severity: 严重告警
            service: pods
          annotations:
            # Only the "pod" label survives "sum ... by (pod)".
            description: "{{ $labels.pod }}: 一分钟内Pod的Pod网络IO(rx方向)大于200MB/s,当前的值为: {{ $value }} MB/s"
            summary: "Pod:{{ $labels.pod }} 检测到一分钟内网络IO(rx方向)过高"
    - name: Pod_Network_tx
      rules:
        - alert: Pod网络IO(tx方向)告警
          # bytes/s divided by 1024 twice, i.e. megabytes per second per pod.
          expr: (sum (rate (container_network_transmit_bytes_total{pod!=""}[1m])) by (pod)) / 1024 / 1024 > 200
          for: 1m
          labels:
            severity: 严重告警
            service: pods
          annotations:
            # Only the "pod" label survives "sum ... by (pod)".
            description: "{{ $labels.pod }}: 一分钟内Pod的Pod网络IO(tx方向)大于200MB/s,当前的值为: {{ $value }} MB/s"
            summary: "检测到一分钟内Pod网络IO(tx方向)过高"
    - name: imagepullbackoff
      rules:
        - alert: 拉取镜像失败
          expr: kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"} == 1
          for: 1m
          labels:
            severity: 紧急告警
          annotations:
            summary: "POD:{{ $labels.pod }} 拉取镜像失败,无法创建容器"
            description: "请确认镜像是否存在"
    - name: Pod_Start_Exception
      rules:
        - alert: POD 资源配置不正确
          expr: sum by (namespace, pod) (kube_pod_status_phase{ phase=~"Pending|Unknown"}) == 1
          for: 15s
          labels:
            severity: 紧急告警
          annotations:
            summary: "POD:{{ $labels.pod }} 启动失败,请及时查看"
            description: "POD 无法正常启动,请查看资源是否配置正确"
    - name: crashloopbackoff
      rules:
        - alert: POD启动失败
          expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} == 1
          for: 1m
          labels:
            severity: 紧急告警
          annotations:
            summary: "POD:{{ $labels.pod }} 启动失败,请查看程序日志"
            description: "确认配置参数是否正确"
EOF
kubectl apply -f k8s-pod-rules.yaml
# Verify: confirm the PrometheusRule object exists, then list ConfigMaps
# (presumably to see the rule-file ConfigMap generated from it — confirm).
kubectl -n kube-prometheus get prometheusrule prometheus-k8s-pod-rules
kubectl -n kube-prometheus get cm