一、需求
对入库到clickhouse的业务日志进行告警,达到阈值后发送企业微信告警。
方法一、
fluent-bit–>clickhouse(http)<–shell脚本,每隔一分钟获取分析结果 --> 把结果保存到/dev/shm/目录下 <-- node_exporter读取指标入库到prometheus<-- rules根据告警规则生成告警–>alertmanager–>webhook --> 企业微信。
方法二、
fluent-bit–>clickhouse(http)<–python,每隔一分钟获取分析结果 --> pushgateway–>指标入库到prometheus<-- rules根据告警规则生成告警–>alertmanager–>webhook --> 企业微信。
二、告警组件
clickhouse
prometheus
alertmanager
node_exporter+查询脚本或者(python脚本+pushgateway)
webhook
三、clickhouse搭建和建表
业务日志库
四、node_exporter
启动参数添加 --collector.textfile.directory=/dev/shm/
# systemd unit for node_exporter.  The textfile collector is pointed at
# /dev/shm/ (--collector.textfile.directory), which is where the cron
# script below drops its *.prom metric files.
[Unit]
Description=node_exporter Service
After=network.target
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
WorkingDirectory=/data/node_exporter
# Note: no comment lines may be placed between the backslash-continued
# ExecStart lines — keep the continuation contiguous.
ExecStart=/data/node_exporter/node_exporter \
--web.config.file=/data/node_exporter/etc/config.yml \
--collector.filesystem.mount-points-exclude="^/(sys|proc|dev|host|etc|var/lib/docker/.+|var/lib/kubelet/.+)($|/)" \
--collector.systemd \
--collector.systemd.unit-include="(docker|sshd|isg|sgadmin).service" \
--web.listen-address=:19100 \
--collector.textfile.directory=/dev/shm/ \
--web.telemetry-path=/metrics
# Restart automatically on any exit, after a 5 second back-off.
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target
五、shell脚本
使用crontab定时,一分钟执行一次
#!/usr/bin/env bash
#
# Query per-site and per-API success rates from ClickHouse and publish
# them as node_exporter textfile-collector metrics (site_rate, api_rate)
# under /dev/shm/.  Intended to run from crontab once per minute.
# Files are written to a .tmp file and then moved into place so
# node_exporter never scrapes a half-written file.
set -e

# ClickHouse connection parameters
ch_host=xx.xx.xx.xx                 # ClickHouse host IP
ch_port=9000                        # native-protocol port
ch_user=xxxx                        # user
ch_password=xxxxxxxxxxxxxxxxxxxx    # password
ch_database=xxxxxxxxxxxxxx          # database
ch_table=xxxxxxxxxxxxx              # table

# Ingestion lags behind real time, so query the one-minute window that
# ended ${query_delay} seconds ago instead of the most recent minute.
query_delay=60

# Per-site success rate, aggregated on the first path segment (paasid).
# Each result row has 4 whitespace-separated fields: paasid suc total val.
site_sql="SELECT splitByChar('/',req_path)[2] as paasid , round(sum(if((toInt64(res_statuscode) >= 200) AND (toInt64(res_statuscode) < 400), 1, 0))) as suc, count(1) as total , round(sum(if((toInt64(res_statuscode) >= 200) AND (toInt64(res_statuscode) < 400), 1, 0)) / count(1)*100, 5) AS val FROM ${ch_database}.${ch_table} PREWHERE (create_time >= toDateTime(now() - 60 - ${query_delay})) AND (create_time < toDateTime(now() - ${query_delay})) GROUP BY paasid HAVING total >= 5 ORDER BY val DESC"
SITE_ARRAY=($(docker exec -i ch clickhouse-client --user="${ch_user}" --password="${ch_password}" --host "${ch_host}" --port "${ch_port}" -n -m -q "${site_sql}" | tr -d '\r'))
site_num=${#SITE_ARRAY[@]}

# Start the tmp file fresh (">" not ">>") so a previously failed run
# cannot leave stale rows behind.
cat <<EOS > /dev/shm/site_rate.prom.tmp
# HELP site_rate
# TYPE site_rate gauge
EOS
# Walk the flattened result array 4 fields at a time.
for ((i=0;i<site_num;i=i+4)); do
REQ_PATH="${SITE_ARRAY[i]}"
SUC="${SITE_ARRAY[i+1]}"
TOL="${SITE_ARRAY[i+2]}"
VAL="${SITE_ARRAY[i+3]}"
cat <<EOS >> /dev/shm/site_rate.prom.tmp
site_rate{site_path="${REQ_PATH}",suc="${SUC}",total="${TOL}"} ${VAL}
EOS
done
# Atomic publish for the textfile collector.
\mv /dev/shm/site_rate.prom.tmp /dev/shm/site_rate.prom

#------------------------------------
# Per-API success rate (only /ebus/ endpoints), same 4-field row layout.
api_sql="SELECT req_path , round(sum(if((toInt64(res_statuscode) >= 200) AND (toInt64(res_statuscode) < 400), 1, 0))) as suc, count(1) as total , round(sum(if((toInt64(res_statuscode) >= 200) AND (toInt64(res_statuscode) < 400), 1, 0)) / count(1)*100, 5) AS val FROM ${ch_database}.${ch_table} PREWHERE req_path like '/ebus/%' and (create_time >= toDateTime(now() - 60 - ${query_delay})) AND (create_time < toDateTime(now() - ${query_delay})) GROUP BY req_path HAVING total >= 3 ORDER BY val DESC"
API_ARRAY=($(docker exec -i ch clickhouse-client --user="${ch_user}" --password="${ch_password}" --host "${ch_host}" --port "${ch_port}" -n -m -q "${api_sql}" | tr -d '\r'))
api_num=${#API_ARRAY[@]}

cat <<EOS > /dev/shm/api_rate.prom.tmp
# HELP api_rate
# TYPE api_rate gauge
EOS
for ((i=0;i<api_num;i=i+4)); do
REQ_PATH="${API_ARRAY[i]}"
SUC="${API_ARRAY[i+1]}"
TOL="${API_ARRAY[i+2]}"
VAL="${API_ARRAY[i+3]}"
# BUGFIX: rows were previously appended to interface_rate.prom.tmp, so
# the published api_rate.prom contained only the HELP/TYPE header and
# interface_rate.prom.tmp grew without bound.  Write to the same tmp
# file that is moved into place below.
cat <<EOS >> /dev/shm/api_rate.prom.tmp
api_rate{api_path="${REQ_PATH}",suc="${SUC}",total="${TOL}"} ${VAL}
EOS
done
\mv /dev/shm/api_rate.prom.tmp /dev/shm/api_rate.prom
#脚本生成结果1
cat /dev/shm/site_rate.prom
# HELP site_rate
# TYPE site_rate gauge
site_rate{site_path="/metrics/",suc="49",total="49"} 100
site_rate{site_path="/grafana/",suc="9",total="9"} 100
site_rate{site_path="/dail_healthcheck/",suc="16",total="16"} 100
site_rate{site_path="/abcyhzx5/",suc="64",total="64"} 100
site_rate{site_path="/abcapm/",suc="30",total="32"} 93.75
site_rate{site_path="/abc/",suc="333",total="370"} 90
site_rate{site_path="/ebus/",suc="2",total="14"} 14.28571
六、prometheus告警规则
# Prometheus alerting rules for the success-rate metrics published by the
# textfile-collector script (site_rate / api_rate).
# NOTE: indentation restored — the previous paste was flush-left, which is
# not valid YAML for this structure.
groups:
  - name: 接口成功率-监控告警
    rules:
      - alert: 接口成功率低于85%
        # avg by (...) keeps suc/total as labels so they can be shown in
        # the notification text below.
        expr: avg by (api_path,suc,total) (api_rate) <= 85
        # Fire immediately; the metric itself is already a 1-minute window.
        for: 0m
        labels:
          severity: 一般
          # Routed by the Alertmanager child route that matches alert=api.
          alert: api
        annotations:
          description: "接口成功率低于85%\n(suc:{{$labels.suc}} total:{{$labels.total}})\n成功率:{{printf \"%.0f\" $value}}%"
      - alert: 站点成功率低于85%
        expr: avg by (site_path,suc,total) (site_rate) <= 85
        for: 0m
        labels:
          severity: 一般
          alert: api
        annotations:
          description: "站点成功率低于85%\n(suc:{{$labels.suc}} total:{{$labels.total}})\n成功率:{{printf \"%.0f\" $value}}%"
七、alertmanager
# Alertmanager configuration.
# NOTE: indentation restored — the previous paste was flush-left, which is
# not valid YAML for this structure.
global:
  resolve_timeout: 1m
  smtp_from: 'xxxxxxxx@qq.com'
  # NOTE(review): port 465 is implicit TLS; with smtp_require_tls: false
  # delivery may fail — confirm against the SMTP provider, or use the
  # STARTTLS port 587 with smtp_require_tls: true.
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: 'xxxxxx@qqq.com'
  smtp_auth_password: 'XXXXXX'
  smtp_require_tls: false
  smtp_hello: 'qq.com'
templates:
  # Email template file; path as seen inside the container.
  - '/etc/alertmanager/email.tmpl'
route:
  # Default receiver when no child route matches.
  receiver: 'ding2wechat'
  # Group alerts that share the same alertname into one notification.
  group_by: ['alertname']
  # Wait 1m after the first alert of a new group before sending,
  # so alerts arriving in the same window are batched together.
  group_wait: 1m
  # Minimum interval between notifications for the same group.
  group_interval: 10m
  # Re-send an unchanged (still-firing) alert after 30m; effectively
  # about (group_interval + repeat_interval) between repeats.
  repeat_interval: 30m
  routes:
    # match_re can be used instead of match for regex matching.
    - match:
        severity: 严重
      # Matching alerts go to the 'ding2wechat' receiver below.
      receiver: ding2wechat
    - match:
        alert: api
      # Matching alerts go to the 'api_ding2wechat' receiver below.
      receiver: api_ding2wechat
      repeat_interval: 24h
      group_interval: 1m
receivers:
  # WeChat Work robot: delivered via prometheus-webhook-dingtalk, which
  # forwards to the ding2wechat bridge.
  - name: 'ding2wechat'
    webhook_configs:
      - url: 'http://172.xxx.xxx.xxx:8060/dingtalk/ding2wechat/send'
        send_resolved: true
  - name: 'api_ding2wechat'
    webhook_configs:
      # Resolved ("recovered") notifications are not wanted here.
      - url: 'http://172.xxx.xxx.xxx:8060/dingtalk/ding2wechat/send'
        send_resolved: false
  - name: 'email'
    email_configs:
      - to: 'xxxxxxxx@qq.com'
        html: '{{ template "email.jwolf.html" . }}'
        send_resolved: true
# Inhibition: while a 'critical' alert fires, suppress 'warning' alerts
# with the same alertname and instance.
# NOTE(review): the rules above use severity 一般/严重, not
# critical/warning — verify these values match your rule labels.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']