Prometheus监控非K8S环境的docker

环境说明

CentOS 7.4
Docker version 18.09.0

环境部署

1
2
3
4
[root@1-206 ~]# yum install -y yum-utils device-mapper-persistent-data lvm2
[root@1-206 ~]# yum-config-manager --add-repo http://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
[root@1-206 ~]# yum install -y docker-ce
[root@1-206 ~]# systemctl start docker && systemctl enable docker

拉取容器镜像

1
2
3
4
5
6
7
8
[root@1-206 ~]# docker pull nginx
[root@1-206 ~]# docker pull google/cadvisor
[root@1-206 ~]# docker pull yfshare/node-exporter:0.17.0
[root@1-206 ~]# docker pull yfshare/prometheus:2.5.0
[root@1-206 ~]# docker pull prom/alertmanager
[root@1-206 ~]# docker pull prom/blackbox-exporter:v0.13.0
[root@1-206 ~]# docker pull prom/snmp-exporter
[root@1-206 ~]# docker pull grafana/grafana

启动容器

1
2
3
4
5
6
7
8
[root@1-206 ~]# docker run -it -d -p 80:80 -v /data1/tmp:/tmp/:rw --name nginx nginx
[root@1-206 ~]# docker run -d -p 8080:8080 -v /data1/tmp:/tmp:rw --name cadvisor google/cadvisor
[root@1-206 ~]# docker run -d -p 9100:9100 -v /data1/tmp:/tmp:rw --name node-exporter yfshare/node-exporter:0.17.0
[root@1-206 ~]# docker run -d -p 9090:9090 -v /data1/tmp:/tmp:rw --name prometheus yfshare/prometheus:2.5.0
[root@1-206 ~]# docker run -d -p 9093:9093 -v /data1/tmp:/tmp:rw --name alertmanager prom/alertmanager
[root@1-206 ~]# docker run -d -p 9115:9115 -v /data1/tmp:/tmp:rw --name blackbox-exporter prom/blackbox-exporter:v0.13.0
[root@1-206 ~]# docker run -d -p 9116:9116 -v /data1/tmp:/tmp:rw --name snmp-exporter prom/snmp-exporter
[root@1-206 ~]# docker run -d -p 3000:3000 -v /data1/tmp:/tmp:rw --name grafana grafana/grafana

查看容器状态

1
2
3
4
5
6
7
8
9
10
11
[root@1-206 ~]# docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
c66f56e6f287 yfshare/grafana:5.4.2 "/docker-entrypoint.…" 6 days ago Up 8 seconds 22/tcp, 0.0.0.0:3000->3000/tcp grafana
ba97caebb358 prom/snmp-exporter "/bin/snmp_exporter …" 6 days ago Up 7 seconds 0.0.0.0:9116->9116/tcp snmp-exporter
7dc28a01a329 nginx "nginx -g 'daemon of…" 6 days ago Up 6 seconds 0.0.0.0:80->80/tcp nginx
3e8ff1dc1d23 prom/blackbox-exporter:v0.13.0 "/bin/blackbox_expor…" 6 days ago Up 6 seconds 0.0.0.0:9115->9115/tcp blackbox-exporter
4af8de4eb4ef prom/alertmanager "/bin/alertmanager -…" 6 days ago Up 5 seconds 0.0.0.0:9093->9093/tcp alertmanager
3ad0edf76f32 yfshare/prometheus:2.5.0 "./prometheus --conf…" 6 days ago Up 5 seconds 22/tcp, 0.0.0.0:9090->9090/tcp prometheus
c7f8365c3c05 yfshare/node-exporter:0.17.0 "./node_exporter --w…" 18 minutes ago Up 18 minutes 22/tcp, 0.0.0.0:9100->9100/tcp node-exporter
b7ea0dc8ade6 google/cadvisor "/usr/bin/cadvisor -…" 6 days ago Up 2 seconds 0.0.0.0:8080->8080/tcp cadvisor
[root@1-206 ~]#

好干净哇…!
nginx_docker
我们先准备一个测试页面
nginx_page
如果遇到这个问题,原因是Prometheus所在宿主机的时间不正确,需要先同步时间:

1
2
[root@1-206 ~]# ntpdate -u time.windows.com
[root@1-206 ~]# /sbin/hwclock --systohc

prometheus_dashboard
Prometheus默认监控了自己的9090端口
prometheus_targets
编写prometheus.yml配置文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["192.168.1.206:9093"]
      # - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
    - targets: ['192.168.1.206:9090','192.168.1.206:9100','192.168.1.206:8080']
  - job_name: 'snmp'
    static_configs:
    - targets:
      - 192.168.1.206
    metrics_path: /snmp
    params:
      module: [if_mib]
    relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 192.168.1.206:9116
  #网站监控
  - job_name: 'instance-web-monitor'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
    - targets:
      - http://192.168.1.206:9090/metrics
      - http://192.168.1.206/yfshare
      labels:
        city: '上海'
        env: 'test'
        inhibit: 'on'
    relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 192.168.1.206:9115
  ####业务监控
  - job_name: 'yfshare-web-node'
    static_configs:
    - targets: ['192.168.1.206:9100']
      labels:
        city: '上海'
        env: 'DEV'
    metrics_path: '/metrics'
  - job_name: 'yfshare-web-cadvisor'
    static_configs:
    - targets: ['192.168.1.206:8080']
      labels:
        city: '上海'
        env: 'DEV'
    metrics_path: '/metrics'

这里使用的是自己封装的镜像,因为官方镜像默认没有开启 --web.enable-lifecycle 参数,无法通过API来重新加载Prometheus配置文件。试想一下,如果每次修改了配置文件,都需要重启Prometheus,是不是有点麻烦。

1
[root@3ad0edf76f32 prometheus]# curl -X POST http://127.0.0.1:9090/-/reload

正确添加Prometheus配置文件后,我们可以查看到监控的key.
prometheus_dashboard
prometheus_dashboard

Grafana这里也使用的是自己封装的镜像(yfshare/grafana:5.4.2),原因同上。
添加Prometheus的DataSource
grafana_datasource
grafana_datasource

Grafana dashboard

安装Grafana常用插件,绘制图表。

1
2
3
4
5
6
[root@c66f56e6f287 grafana]# grafana-cli plugins install btplc-status-dot-panel
[root@c66f56e6f287 grafana]# grafana-cli plugins install grafana-clock-panel
[root@c66f56e6f287 grafana]# grafana-cli plugins install grafana-piechart-panel
[root@c66f56e6f287 grafana]# grafana-cli plugins install grafana-worldmap-panel
[root@c66f56e6f287 grafana]# grafana-cli plugins install michaeldmoore-annunciator-panel
[root@c66f56e6f287 grafana]# grafana-cli plugins install vonage-status-panel

grafana插件安装完成后需要重启服务

1
2
3
4
5
6
7
8
9
10
11
12
13
[root@1-206 ~]# docker exec -it grafana /bin/bash
[root@c66f56e6f287 grafana]# grafana-cli plugins ls
installed plugins:
btplc-status-dot-panel @ 0.2.3
grafana-clock-panel @ 1.0.2
grafana-piechart-panel @ 1.3.3
grafana-worldmap-panel @ 0.1.2
michaeldmoore-annunciator-panel @ 1.0.0
vonage-status-panel @ 1.0.9
Restart grafana after installing plugins . <service grafana-server restart>
[root@c66f56e6f287 grafana]#

绘制完成后的Prometheus监控面板
grafana_monitor
grafana_disk

Prometheus alert

这里配置2个监控项,分别是密码文件修改监控和网站探测
url测试

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
  #网站监控
  - job_name: 'instance-web-monitor'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
    - targets:
      - http://192.168.1.206:9090/metrics
      - http://192.168.1.206/yfshare
      - http://192.168.1.206/yfshare/aaa.html
      labels:
        city: '上海'
        env: 'test'
        inhibit: 'on'
    relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: instance
    - target_label: __address__
      replacement: 192.168.1.206:9115

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
[root@c7f8365c3c05 keys]# pwd
/usr/local/node_exporter/keys
[root@c7f8365c3c05 keys]# chmod +x check_file_md5.sh
[root@c7f8365c3c05 keys]# cat check_md5_file.txt
/etc/passwd
/etc/shadow
[root@c7f8365c3c05 keys]# sh check_file_md5.sh
[root@c7f8365c3c05 keys]# ls
check_file_md5.sh check_md5_file.txt check_md5.prom
[root@c7f8365c3c05 keys]# cat check_md5.prom
check_md5 {check_file="/etc/passwd",md5="93dfcbcaf36cddd4fa8a162bda2c98e3"} 0
check_md5 {check_file="/etc/shadow",md5="c48673bf1c829d4979bcd090649c3cbf"} 0
[root@c7f8365c3c05 keys]#
[root@c7f8365c3c05 keys]# useradd test
[root@c7f8365c3c05 keys]# sh check_file_md5.sh
[root@c7f8365c3c05 keys]# cat check_md5.prom
check_md5 {check_file="/etc/passwd",md5="93dfcbcaf36cddd4fa8a162bda2c98e3"} 1
check_md5 {check_file="/etc/shadow",md5="c48673bf1c829d4979bcd090649c3cbf"} 1
[root@c7f8365c3c05 keys]# crontab -l
* * * * * /bin/bash /usr/local/node_exporter/keys/check_file_md5.sh
[root@c7f8365c3c05 keys]#

定义value为0,正常;value为1,触发告警

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
[root@1-206 ~]# docker exec -it prometheus /bin/bash
[root@3ad0edf76f32 prometheus]# pwd
/usr/local/prometheus
[root@3ad0edf76f32 prometheus]# cat check.rules
groups:
- name: base
  rules:
  - alert: 密码文件变更告警
    expr: check_md5 == 1
    for: 1m
    labels:
      CITY: ALL
      info: 密码文件变更告警
      severity: Warning
      resolved: OK
    annotations:
      summary: "{{ $labels.instance }} 服务器 {{ $labels.check_file }} 文件MD5发生变更,请检查."
      description: "主机名: {{ $labels.hostname }} ;文件名: {{ $labels.check_file }}"
  - alert: 网站状态码告警
    expr: count_code{request="201"} >= 100 or count_code{request="403"} >= 100 or count_code{request="409"} >= 100 or count_code{request="404"} >= 100 or count_code{request="500"} >= 100 or count_code{request="502"} >= 100 or count_code{request="503"} >= 100
    for: 1m
    labels:
      CITY: ALL
      info: 网站状态码告警
      severity: Warning
      resolved: OK
    annotations:
      summary: "{{ $labels.instance }} 服务器网站状态码{{ $labels.request }}告警"
      description: "主机名: {{ $labels.hostname }} ;状态码来源:{{ $labels.source }} ;状态码:{{ $labels.request }}"
[root@3ad0edf76f32 prometheus]#

prometheus_alter_key

这里出现重复告警,是因为测试环境只有一台主机,从JOB可以看出Prometheus对同一个目标重复监控了(例如 192.168.1.206:9100 同时出现在 prometheus 和 yfshare-web-node 两个JOB中)
prometheus_alter_key
prometheus_alter_key
prometheus_alter

附件:
Grafana_templates.tar.gz
check_md5.zip
check_http.zip
check_code.zip


本文出自“Jack Wang Blog”:http://www.yfshare.vip/2022/04/22/Prometheus监控docker/