Prometheus 监控告警 + 飞书 Webhook 一键部署
运行前请先按需修改脚本开头的变量(安装路径、版本号、Webhook 端口、飞书 Webhook 地址)。安装完成后,如需调整监控项,只需追加或修改 Prometheus 配置并重新加载即可。
安装完成后,脚本会输出如下详细提示:
[✓] Webhook服务安装完成
[*] 创建测试脚本...
[✓] 测试脚本已创建: /data/webhook/test-webhook.sh
===== 安装完成 =====
Prometheus访问地址: http://10.234.210.88:9090
Alertmanager访问地址: http://10.234.210.88:9093
Webhook服务地址: http://10.234.210.88:58888
重要提示:
1. 使用以下命令测试webhook是否正常工作:
/data/webhook/test-webhook.sh
2. 使用以下命令重新加载Prometheus配置:
curl -X POST http://localhost:9090/-/reload
3. 查看服务状态:
systemctl status prometheus
systemctl status alertmanager
systemctl status prometheus-webhook
#!/bin/bash
#
# One-shot deployment of a Prometheus monitoring stack (slim edition).
# Installs and configures Prometheus, Alertmanager and a webhook relay.
#
# Must be run as root. Every setting below keeps its previous default but
# may be overridden from the environment, e.g.
#   INSTALL_DIR=/opt/monitor WEBHOOK_PORT=9000 bash <this-script>
set -euo pipefail

# ANSI colour codes, interpreted by `echo -e`.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m' # No Color

# Installation root and pinned release versions.
INSTALL_DIR="${INSTALL_DIR:-/data}"
PROMETHEUS_VERSION="${PROMETHEUS_VERSION:-2.49.1}"
ALERTMANAGER_VERSION="${ALERTMANAGER_VERSION:-0.26.0}"

# Webhook relay: local listening port and the Lark bot hook it forwards to.
# Supplying LARK_WEBHOOK_URL via the environment avoids keeping the bot
# token in this file.
WEBHOOK_PORT="${WEBHOOK_PORT:-58888}"
LARK_WEBHOOK_URL="${LARK_WEBHOOK_URL:-https://open.larksuite.com/open-apis/bot/v2/hook/-0627-4bc8--}"

# The installer writes systemd units and installs packages, so insist on root.
if [[ "$(id -u)" != "0" ]]; then
  echo -e "${RED}此脚本必须以root用户身份运行${NC}" 1>&2
  exit 1
fi

echo -e "${GREEN}===== 开始部署Prometheus监控系统 =====${NC}"

# Create the install root and work from it: the install functions download
# and unpack release tarballs relative to the current directory.
mkdir -p "${INSTALL_DIR}"
cd "${INSTALL_DIR}"

# Lay out the directory tree used by the services.
echo -e "${YELLOW}[*] 创建目录结构...${NC}"
mkdir -p "${INSTALL_DIR}/prometheus"/{config,rules,data}
mkdir -p "${INSTALL_DIR}/alertmanager"
#######################################
# Download, configure and start Prometheus as a systemd service.
# Globals:
#   INSTALL_DIR, PROMETHEUS_VERSION, GREEN, YELLOW, NC (read)
# Outputs:
#   Progress messages on stdout.
# Side effects:
#   Downloads and unpacks the release tarball in the current directory,
#   writes prometheus.yml and a sample alert rule under
#   ${INSTALL_DIR}/prometheus, writes
#   /etc/systemd/system/prometheus.service, then enables and (re)starts
#   the service.
#######################################
install_prometheus() {
  local release="prometheus-${PROMETHEUS_VERSION}.linux-amd64"
  local tarball="${release}.tar.gz"
  local prom_dir="${INSTALL_DIR}/prometheus"
  local unit_file="/etc/systemd/system/prometheus.service"

  # Skip when Prometheus is already present. The copy installed below lives
  # outside PATH, so it has to be detected explicitly: otherwise a re-run
  # would try to copy over the running binary (which typically fails with
  # "Text file busy") and would overwrite a prometheus.yml the operator has
  # since edited. Requiring the unit file as well lets an install that was
  # interrupted half-way be completed on the next run.
  if command -v prometheus &>/dev/null \
    || [[ -x "${prom_dir}/prometheus" && -f "${unit_file}" ]]; then
    echo -e "${GREEN}[✓] Prometheus已安装,跳过安装步骤${NC}"
    return
  fi

  echo -e "${YELLOW}[*] 下载并安装Prometheus...${NC}"

  # Download to a temporary name and rename on success, so an interrupted
  # download is never mistaken for a complete tarball on the next run.
  if [[ ! -f "${tarball}" ]]; then
    wget -O "${tarball}.part" \
      "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${tarball}"
    mv -- "${tarball}.part" "${tarball}"
  fi

  # Unpack and copy the binaries and console assets into place.
  tar -xzf "${tarball}"
  mkdir -p "${prom_dir}"
  cp "${release}/prometheus" "${release}/promtool" "${prom_dir}/"
  cp -r "${release}/consoles" "${release}/console_libraries" "${prom_dir}/"

  # Main configuration. The heredoc is unquoted so ${INSTALL_DIR} expands;
  # backticks are escaped to keep them literal. YAML nesting is significant.
  cat >"${prom_dir}/prometheus.yml" <<EOF
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093
rule_files:
  - "${INSTALL_DIR}/prometheus/rules/*.yaml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label \`job=<job_name>\` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
EOF

  # Sample alert rule; \$labels is escaped so the template reaches Prometheus
  # verbatim instead of being expanded by the shell.
  mkdir -p "${prom_dir}/rules"
  cat >"${prom_dir}/rules/basic-alert.yaml" <<EOF
groups:
  - name: basic-alerts
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          title: 'Instance down'
          description: "实例 {{ \$labels.instance }} 已经宕机"
EOF

  # systemd unit. --web.enable-lifecycle enables the HTTP /-/reload endpoint.
  cat >"${unit_file}" <<EOF
[Unit]
Description=Prometheus Service
Wants=network-online.target
After=network-online.target
[Service]
User=root
Group=root
Type=simple
ExecStart=${INSTALL_DIR}/prometheus/prometheus --config.file=${INSTALL_DIR}/prometheus/prometheus.yml --web.enable-lifecycle --storage.tsdb.path=${INSTALL_DIR}/prometheus/data
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

  # restart (not start) so a previously running instance picks up the files
  # that were just written.
  systemctl daemon-reload
  systemctl enable prometheus
  systemctl restart prometheus

  echo -e "${GREEN}[✓] Prometheus安装完成${NC}"
}
#######################################
# Download, configure and start Alertmanager as a systemd service.
# Globals:
#   INSTALL_DIR, ALERTMANAGER_VERSION, WEBHOOK_PORT, GREEN, YELLOW, NC (read)
# Outputs:
#   Progress messages on stdout.
# Side effects:
#   Downloads and unpacks the release tarball in the current directory,
#   writes ${INSTALL_DIR}/alertmanager/alertmanager.yml and
#   /etc/systemd/system/alertmanager.service, then enables and (re)starts
#   the service.
#######################################
install_alertmanager() {
  local release="alertmanager-${ALERTMANAGER_VERSION}.linux-amd64"
  local tarball="${release}.tar.gz"
  local am_dir="${INSTALL_DIR}/alertmanager"
  local unit_file="/etc/systemd/system/alertmanager.service"

  # Skip when Alertmanager is already present. The copy installed below is
  # not on PATH, so it is detected explicitly; without this a re-run would
  # try to copy over the running binary (which typically fails with
  # "Text file busy"). Requiring the unit file as well lets an interrupted
  # install be completed on the next run.
  if command -v alertmanager &>/dev/null \
    || [[ -x "${am_dir}/alertmanager" && -f "${unit_file}" ]]; then
    echo -e "${GREEN}[✓] Alertmanager已安装,跳过安装步骤${NC}"
    return
  fi

  echo -e "${YELLOW}[*] 下载并安装Alertmanager...${NC}"

  # Download to a temporary name and rename on success, so an interrupted
  # download is never mistaken for a complete tarball on the next run.
  if [[ ! -f "${tarball}" ]]; then
    wget -O "${tarball}.part" \
      "https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/${tarball}"
    mv -- "${tarball}.part" "${tarball}"
  fi

  # Unpack and copy the binaries into place.
  tar -xzf "${tarball}"
  mkdir -p "${am_dir}"
  cp "${release}/alertmanager" "${release}/amtool" "${am_dir}/"

  # Routing configuration: every alert goes to the local webhook relay.
  # The heredoc is unquoted so ${WEBHOOK_PORT} expands. YAML nesting is
  # significant.
  cat >"${am_dir}/alertmanager.yml" <<EOF
global:
  resolve_timeout: 1m
route:
  receiver: 'webhook'
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 30m
receivers:
  - name: 'webhook'
    webhook_configs:
      - url: 'http://127.0.0.1:${WEBHOOK_PORT}/send'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

  # systemd unit.
  cat >"${unit_file}" <<EOF
[Unit]
Description=Alertmanager Service
Wants=network-online.target
After=network-online.target
[Service]
User=root
Group=root
Type=simple
ExecStart=${INSTALL_DIR}/alertmanager/alertmanager --config.file=${INSTALL_DIR}/alertmanager/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

  # restart (not start) so a previously running instance picks up the files
  # that were just written.
  systemctl daemon-reload
  systemctl enable alertmanager
  systemctl restart alertmanager

  echo -e "${GREEN}[✓] Alertmanager安装完成${NC}"
}
#######################################
# Install the Flask webhook relay that forwards Alertmanager notifications
# to a Lark bot, and run it as a systemd service.
# Globals:
#   INSTALL_DIR, WEBHOOK_PORT, LARK_WEBHOOK_URL, GREEN, YELLOW, NC (read)
# Outputs:
#   Progress messages on stdout.
# Side effects:
#   May install python3 / pip / flask / requests via apt-get and pip3,
#   writes ${INSTALL_DIR}/webhook/app.py and
#   /etc/systemd/system/prometheus-webhook.service, then enables and
#   restarts the service.
#######################################
install_webhook() {
  local webhook_dir="${INSTALL_DIR}/webhook"

  echo -e "${YELLOW}[*] 安装简单版Webhook服务...${NC}"

  # Make sure a Python 3 interpreter is available.
  if ! command -v python3 &>/dev/null; then
    echo -e "${YELLOW}[*] Python3未安装,正在安装...${NC}"
    apt-get update
    apt-get install -y python3
  fi

  # pip is packaged separately from the interpreter, so check it on its own.
  if ! command -v pip3 &>/dev/null; then
    echo -e "${YELLOW}[*] pip3未安装,正在安装...${NC}"
    apt-get update
    apt-get install -y python3-pip
  fi

  # Install the Python dependencies. On distributions that mark the system
  # interpreter as externally managed (PEP 668) pip refuses to install
  # system-wide, so fall back to the distribution packages.
  pip3 install flask requests \
    || apt-get install -y python3-flask python3-requests

  mkdir -p "${webhook_dir}"

  # Generate the Flask application. The heredoc is unquoted so that
  # ${LARK_WEBHOOK_URL} and ${WEBHOOK_PORT} are baked into the file; the
  # Python source must therefore not contain '$', backticks or backslashes.
  cat >"${webhook_dir}/app.py" <<EOF
from flask import Flask, request, jsonify
import requests
import json
from datetime import datetime, timezone
app = Flask(__name__)
def format_time(raw):
    """Render an Alertmanager UTC timestamp as local 'YYYY-MM-DD HH:MM:SS'.
    Returns an empty string when the timestamp is unset, and the raw text
    when it cannot be parsed."""
    # Alertmanager reports the zero time (0001-01-01T00:00:00Z) as endsAt while an alert is still firing.
    if not raw or raw.startswith("0001-01-01"):
        return ""
    if not raw.endswith("Z"):
        return raw
    try:
        # Drop the fractional seconds: they may be missing or carry nine digits, and %f accepts neither.
        parsed = datetime.strptime(raw[:-1].split(".", 1)[0], "%Y-%m-%dT%H:%M:%S")
    except ValueError:
        return raw
    return parsed.replace(tzinfo=timezone.utc).astimezone().strftime("%Y-%m-%d %H:%M:%S")
def send_to_lark(status, title, description, start_time, end_time="", severity="Unknown", instance="Unknown", alertname="Unknown"):
    """发送消息到飞书"""
    # 设置飞书webhook URL
    url = "${LARK_WEBHOOK_URL}"
    # 格式化时间
    start_time_fmt = format_time(start_time) or start_time
    end_time_fmt = format_time(end_time) or "未结束"
    # 设置消息颜色
    color = "red" if status == "firing" else "green"
    status_text = "🔥告警触发" if status == "firing" else "✅告警恢复"
    # 构建简单卡片消息
    card = {
        "msg_type": "interactive",
        "card": {
            "config": {"wide_screen_mode": True},
            "header": {
                "template": color,
                "title": {"content": f"{status_text}: {title}", "tag": "plain_text"}
            },
            "elements": [
                {
                    "tag": "div",
                    "text": {"tag": "lark_md", "content": f"**告警名称**: {alertname}"}
                },
                {"tag": "hr"},
                {
                    "tag": "div",
                    "fields": [
                        {
                            "is_short": True,
                            "text": {"tag": "lark_md", "content": f"**状态**: {status}"}
                        },
                        {
                            "is_short": True,
                            "text": {"tag": "lark_md", "content": f"**级别**: {severity}"}
                        }
                    ]
                },
                {
                    "tag": "div",
                    "fields": [
                        {
                            "is_short": True,
                            "text": {"tag": "lark_md", "content": f"**开始时间**: {start_time_fmt}"}
                        },
                        {
                            "is_short": True,
                            "text": {"tag": "lark_md", "content": f"**结束时间**: {end_time_fmt}"}
                        }
                    ]
                },
                {
                    "tag": "div",
                    "fields": [
                        {
                            "is_short": True,
                            "text": {"tag": "lark_md", "content": f"**实例**: {instance}"}
                        }
                    ]
                },
                {"tag": "hr"},
                {
                    "tag": "div",
                    "text": {"tag": "lark_md", "content": f"**详细信息**: {description}"}
                }
            ]
        }
    }
    # 发送请求
    headers = {'Content-Type': 'application/json'}
    try:
        # A timeout keeps a stalled Lark endpoint from blocking the handler forever.
        response = requests.post(url, json=card, headers=headers, timeout=10)
        return response.json()
    except Exception as e:
        print(f"发送消息失败: {e}")
        return {"error": str(e)}
@app.route("/")
def hello_world():
    return "<p>Prometheus Alert Webhook Server</p>"
@app.route("/send", methods=['POST', 'GET'])
def send_msg():
    if request.method == 'GET':
        return "<p>请使用POST请求发送告警信息!</p>"
    try:
        # 获取请求数据
        # silent=True yields None instead of raising on a malformed or non-JSON body.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"status": "error", "message": "无效的JSON数据"}), 400
        print("接收到的告警数据:")
        print(data)
        # 处理告警
        responses = []
        if "alerts" in data:
            for alert in data["alerts"]:
                # 提取基本信息
                status = alert.get("status", "Unknown")
                start_time = alert.get("startsAt", "Unknown")
                end_time = alert.get("endsAt", "")
                # 提取标签
                labels = alert.get("labels", {})
                alertname = labels.get("alertname", "Unknown")
                severity = labels.get("severity", "Unknown")
                instance = labels.get("instance", "Unknown")
                # 提取注释
                annotations = alert.get("annotations", {})
                title = annotations.get("title", alertname)
                description = annotations.get("description", "无详细信息")
                # 发送到飞书
                response = send_to_lark(
                    status,
                    title,
                    description,
                    start_time,
                    end_time,
                    severity,
                    instance,
                    alertname
                )
                responses.append(response)
        return jsonify({"status": "success", "responses": responses})
    except Exception as e:
        print(f"处理请求时出错: {e}")
        return jsonify({"status": "error", "message": str(e)}), 500
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=${WEBHOOK_PORT})
EOF

  # systemd unit. PYTHONUNBUFFERED makes the app's print() output reach the
  # journal immediately instead of sitting in a stdio buffer.
  cat >/etc/systemd/system/prometheus-webhook.service <<EOF
[Unit]
Description=Prometheus Alert Webhook Service
Wants=network-online.target
After=network-online.target
[Service]
User=root
Group=root
Type=simple
Environment=PYTHONUNBUFFERED=1
ExecStart=/usr/bin/python3 ${INSTALL_DIR}/webhook/app.py
Restart=on-failure
WorkingDirectory=${INSTALL_DIR}/webhook
[Install]
WantedBy=multi-user.target
EOF

  # This function always regenerates app.py, so restart (not start): an
  # already running service would otherwise keep serving the old code/URL.
  systemctl daemon-reload
  systemctl enable prometheus-webhook
  systemctl restart prometheus-webhook

  echo -e "${GREEN}[✓] Webhook服务安装完成${NC}"
}
#######################################
# Generate an executable helper that posts one sample "firing" alert to the
# local webhook relay.
# Globals:
#   INSTALL_DIR, WEBHOOK_PORT, GREEN, YELLOW, NC (read)
# Outputs:
#   Progress messages on stdout.
# Side effects:
#   Creates ${INSTALL_DIR}/webhook (if missing) and writes
#   ${INSTALL_DIR}/webhook/test-webhook.sh with mode +x.
#######################################
create_test_script() {
  local script_dir="${INSTALL_DIR}/webhook"
  local script_path="${script_dir}/test-webhook.sh"

  echo -e "${YELLOW}[*] 创建测试脚本...${NC}"

  # Do not rely on another step having created the directory.
  mkdir -p "${script_dir}"

  # Unquoted heredoc: ${WEBHOOK_PORT} is expanded now, while \$-escaped
  # expressions and the doubled backslashes are written literally so they
  # are evaluated when the generated script runs. The generated script
  # checks curl's exit status so an unreachable webhook is reported as a
  # failure instead of "sent".
  cat >"${script_path}" <<EOF
#!/bin/bash
# 设置webhook地址
WEBHOOK_URL="http://localhost:${WEBHOOK_PORT}/send"
# 当前时间(UTC格式)
CURRENT_TIME=\$(date -u +"%Y-%m-%dT%H:%M:%S.000Z")
# 模拟一个测试告警的JSON数据
if curl -X POST "\$WEBHOOK_URL" \\
  -H "Content-Type: application/json" \\
  -d '{
  "alerts": [
    {
      "status": "firing",
      "labels": {
        "alertname": "测试告警",
        "severity": "critical",
        "instance": "test-server-01"
      },
      "annotations": {
        "title": "测试告警标题",
        "description": "这是一条测试告警,用于验证webhook是否正常工作"
      },
      "startsAt": "'"\$CURRENT_TIME"'",
      "endsAt": "",
      "generatorURL": "http://prometheus.example.com/graph",
      "fingerprint": "c1bb9a35f9844428"
    }
  ]
}'; then
  echo "测试告警已发送,请检查飞书是否收到消息"
else
  echo "测试告警发送失败,请检查prometheus-webhook服务是否正常运行" >&2
  exit 1
fi
EOF

  chmod +x "${script_path}"
  echo -e "${GREEN}[✓] 测试脚本已创建: ${INSTALL_DIR}/webhook/test-webhook.sh${NC}"
}
#######################################
# Print the post-install summary: service URLs followed by the commands for
# testing the webhook, reloading Prometheus and checking service status.
# Globals:
#   INSTALL_DIR, WEBHOOK_PORT, GREEN, YELLOW, NC (read)
# Outputs:
#   The summary on stdout. The host part of each URL is the first address
#   printed by `hostname -I`.
#######################################
show_info() {
  local -a summary=(
    "\n${GREEN}===== 安装完成 =====${NC}"
    "${YELLOW}Prometheus访问地址: http://$(hostname -I | awk '{print $1}'):9090${NC}"
    "${YELLOW}Alertmanager访问地址: http://$(hostname -I | awk '{print $1}'):9093${NC}"
    "${YELLOW}Webhook服务地址: http://$(hostname -I | awk '{print $1}'):${WEBHOOK_PORT}${NC}"
    "\n${GREEN}重要提示:${NC}"
    "1. 使用以下命令测试webhook是否正常工作:"
    " ${YELLOW}${INSTALL_DIR}/webhook/test-webhook.sh${NC}"
    "2. 使用以下命令重新加载Prometheus配置:"
    " ${YELLOW}curl -X POST http://localhost:9090/-/reload${NC}"
    "3. 查看服务状态:"
    " ${YELLOW}systemctl status prometheus${NC}"
    " ${YELLOW}systemctl status alertmanager${NC}"
    " ${YELLOW}systemctl status prometheus-webhook${NC}"
  )

  # %b expands backslash escapes the same way `echo -e` does; the format is
  # reused for every array element, yielding one line per entry.
  printf '%b\n' "${summary[@]}"
}
#######################################
# Entry point: run each deployment stage in order, then print the summary.
# Stages run sequentially; under the script's errexit setting a failing
# stage stops the run.
#######################################
main() {
  local stage
  for stage in \
    install_prometheus \
    install_alertmanager \
    install_webhook \
    create_test_script \
    show_info; do
    "${stage}"
  done
}

# Run the deployment.
main