本案例需要使用 docker-compose。若环境中尚未安装,请先下载 docker-compose,下载地址:https://github.com/docker/compose/releases
本例使用v2.23.3,如果网络环境不好建议从github上下载好再上传到服务器
# Download the docker-compose v2.23.3 binary for Linux x86_64 from the GitHub release page.
[root@node ~]# wget https://github.com/docker/compose/releases/download/v2.23.3/docker-compose-linux-x86_64
# Copy the binary to /usr/local/bin under the name docker-compose.
[root@node ~]# cp docker-compose-linux-x86_64 /usr/local/bin/docker-compose
# Mark the copied binary as executable.
[root@node ~]# chmod +x /usr/local/bin/docker-compose
# Print the version to confirm the binary runs.
[root@node ~]# docker-compose --version
按照以下路径创建目录和文件
/home/docker-prometheus/
├── alertmanager
│   ├── alertmanager.yml
│   └── template
│       └── wechat.tmpl
├── docker-compose.yml
├── grafana
└── prometheus
    ├── prometheus.yml
    ├── rules
    │   └── alerts.yml
    └── sd_config
        ├── linux.yml
        ├── snmp.yml
        └── windows.yml
# Alertmanager configuration (alertmanager/alertmanager.yml in the directory tree).
# All alerts are routed to a single webhook receiver named 'wechat'.
---
global:
  resolve_timeout: 15s

route:
  group_by: ['env', 'instance', 'type', 'group', 'job', 'alertname']
  # After an alert arrives, wait 15s to see whether more alerts follow,
  # and send them together in one notification.
  group_wait: 15s
  # Interval between notifications for a group.
  group_interval: 15s
  # Interval after which an alert is notified again.
  repeat_interval: 30s
  receiver: 'wechat'

receivers:
  - name: 'wechat'
    webhook_configs:
      - url: 'http://192.168.32.146:8089/adapter/wx'
        send_resolved: true

inhibit_rules:
  # A 'critical' alert mutes 'warning' alerts that carry identical values for
  # every label listed under `equal`.
  # NOTE(review): the alert rules shown in this document use the severity
  # values 'error' and 'emergency', so this rule never matches them - confirm
  # which severity values are intended.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['env', 'instance', 'type', 'group', 'job', 'alertname']
{{/*
  wechat.default.message: notification text for the WeChat receiver.
  Only the first firing alert and the first resolved alert of a notification
  are rendered (index 0 of each list).
  Timestamps are shifted by 28800e9 ns (8 hours) before being formatted.
*/}}
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{/* Iterate over firing alerts only, so a resolved alert is never shown under the fault header. */}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 }}
========= 监控报警 =========
告警状态:{{ $alert.Status }}
告警级别:{{ $alert.Labels.severity }}
告警类型:{{ $alert.Labels.alertname }}
故障主机: {{ $alert.Labels.instance }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
触发阀值:{{ $alert.Annotations.value }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{/* Iterate over resolved alerts only, so a still-firing alert is never shown as recovered. */}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 }}
========= 异常恢复 =========
告警类型:{{ $alert.Labels.alertname }}
告警状态:{{ $alert.Status }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
{{- end }}
# Docker Compose stack (docker-compose.yml in the directory tree):
# Prometheus, Grafana, Alertmanager and a webhook adapter, all attached to
# the prometheus_net network.
---
version: '3.3'

services:
  prometheus:
    container_name: prometheus
    image: prom/prometheus
    ports:
      - "9090:9090"
    volumes:
      - /home/docker-prometheus/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - /home/docker-prometheus/prometheus/rules/alerts.yml:/etc/prometheus/rules/alerts.yml
      - /home/docker-prometheus/prometheus/sd_config/:/etc/prometheus/sd_config/
      # Use the host's Asia/Shanghai zone file as the container's local time.
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
    restart: always
    networks:
      - prometheus_net

  grafana:
    image: grafana/grafana
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      # NOTE(review): sample admin password in plain text - change it before real use.
      - "GF_SECURITY_ADMIN_PASSWORD=123456Aa"
      - "GF_INSTALL_PLUGINS=alexanderzobnin-zabbix-app"
    privileged: true  # added when "Permission denied" is reported
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
      - /home/docker-prometheus/grafana/data:/var/lib/grafana
      # NOTE(review): grafana.ini is not listed in the directory tree; make
      # sure this file exists on the host before starting the stack.
      - /home/docker-prometheus/grafana/grafana.ini:/etc/grafana/grafana.ini
    restart: always
    networks:
      - prometheus_net

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - /home/docker-prometheus/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
      - /home/docker-prometheus/alertmanager/template/wechat.tmpl:/etc/alertmanager/wechat.tmpl
    restart: always
    networks:
      - prometheus_net

  webhook-adapter:
    image: guyongquan/webhook-adapter:latest
    container_name: webhook-adapter
    hostname: webhook-adapter
    ports:
      - "8089:80"
    restart: always
    command:
      # Webhook address. The original line had no URL and no closing quote;
      # the value after "/wx=" is a placeholder.
      # NOTE(review): presumably a WeChat Work robot webhook - replace
      # <YOUR_KEY> with the real key and confirm the URL.
      - "--adapter=/app/prometheusalert/wx.js=/wx=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<YOUR_KEY>"
    networks:
      - prometheus_net

networks:
  prometheus_net: {}
# Prometheus server configuration (prometheus/prometheus.yml in the directory tree).
---
global:
  scrape_interval: 15s  # Scrape every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. Default is every 1 minute.

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.32.146:9093'

rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  - job_name: 'linux'
    file_sd_configs:
      - files:
          - '/etc/prometheus/sd_config/linux.yml'
        refresh_interval: 30m  # re-check the file every 30 minutes

  - job_name: 'windows'
    file_sd_configs:
      - files:
          - '/etc/prometheus/sd_config/windows.yml'
        refresh_interval: 30m

  # Switches monitored through SNMP.
  - job_name: 'snmp'
    file_sd_configs:
      - files:
          - '/etc/prometheus/sd_config/snmp.yml'
        refresh_interval: 30m
    metrics_path: /snmp
    params:
      module: [HUAWEI]
    relabel_configs:
      # Copy the discovered address into the `target` URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Use that same value as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to 192.168.32.146:9116.
      - target_label: __address__
        replacement: '192.168.32.146:9116'
# Prometheus alerting rules (prometheus/rules/alerts.yml in the directory tree).
---
groups:
  - name: general.rules
    rules:
      # Fires when a scrape target has reported up == 0 for 15 seconds.
      - alert: 主机宕机
        expr: up == 0
        for: 15s
        labels:
          # Was misspelled "serverity", which left the severity label unset.
          severity: error
        annotations:
          summary: "主机 {{ $labels.instance }} 主机宕机"
          description: "{{ $labels.instance }} job {{ $labels.job }} 实例未在线!"

  - name: alters.rules
    rules:
      # Fires when memory usage (100 - available/total * 100) stays above 80 for 1 minute.
      - alert: 内存使用率
        expr: 100 - round(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100) > 80
        for: 1m
        labels:
          # severity: the alert level. There are three levels - warning,
          # critical and emergency - in increasing order of severity.
          severity: emergency
        annotations:
          # summary gives the short description of the alert;
          # description gives the detailed information.
          summary: "主机 {{ $labels.instance }} 内存使用率过高"
          description: "{{ $labels.instance }} 内存使用大于 80% (当前值: {{ $value }})"

  - name: CPU.rules
    rules:
      # Fires when the per-instance non-idle CPU percentage stays above 80 for 1 minute.
      - alert: CPU Usage
        expr: 100 - round(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "实例 {{ $labels.instance }} CPU使用率过高"
          description: "实例CPU使用率超过 80% (当前值为: {{ $value }}%)"
          ip: "{{ $labels.ip }}"

  - name: Disk.rules
    rules:
      # Fires when filesystem usage stays above 80 for 1 minute.
      # Both sides of the division use the same fstype matcher; the original
      # denominator omitted ext3, so ext3 series had nothing to divide by.
      - alert: Disk Usage
        expr: 100 - round(node_filesystem_free_bytes{fstype=~"ext3|ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs"} * 100) > 80
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "实例 {{ $labels.instance }} 磁盘使用率过高"
          description: "实例磁盘使用率超过 80% (当前值为: {{ $value }}%)"
          ip: "{{ $labels.ip }}"
# File-based service discovery target list.
# NOTE(review): presumably prometheus/sd_config/linux.yml - confirm.
# The addresses below are placeholders and must be replaced with real host:port values.
---
- targets:
    - "192.168.xx.xx:9100"
    - "...:9100"
    - "...:9100"
    - "...:9100"
    - "..:9100"
    - "...:9100"
# Run this form from the directory that contains docker-compose.yml.
docker-compose up --detach
# Run this form from any other directory.
docker-compose --file /home/docker-prometheus/docker-compose.yml up --detach
[root@gt-32 docker-prometheus]# docker-compose ps
若发现grafana状态没有up可通过docker logs -f grafana追踪日志,一般情况为目录没有权限,提示如下:
mkdir: can't create directory '/var/lib/grafana/plugins': Permission denied
GF_PATHS_DATA='/var/lib/grafana' is not writable.
解决办法:向grafana目录下的data目录加权限可以解决
# Hand the data directory to the Grafana container user so it becomes writable
# (`chmod +x` adds execute permission only, which does not make it writable).
# NOTE(review): UID 472 / GID 0 is the default user of the official grafana/grafana image - confirm for the image version in use.
[root@gt-32 docker-prometheus]# chown -R 472:0 grafana/data/