victorialogs配置关键字告警
# VictoriaLogs 日志关键字监控告警
# 架构概览
```text
fluent-bit (tools) ──→ VictoriaLogs (tools)
                              │
                              │  LogsQL hits API
                              ▼
            victorialogs-keyword-exporter (tools)
            将日志关键字统计暴露为 Prometheus 指标
                              │
                              │  VMServiceScrape (跨命名空间)
                              ▼
                     VMAgent (monitoring)
                              │
                              ▼
               VMInsert → VMStorage (monitoring)
                              │
                              ▼
                VMAlert 评估 VMRule 告警规则
                              │
                              ▼
                       VMAlertmanager
                          /      \
                       飞书      邮件
```
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 文件清单
| 文件 | 用途 |
|---|---|
| configmap-code.yaml | Python exporter 代码(无需构建镜像) |
| deployment.yaml | Exporter Deployment + Service |
| vmservicescrape.yaml | VMAgent 跨命名空间采集配置 |
| vmrule-log-alerts.yaml | VMAlert 告警规则 |
configmap-code.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: victorialogs-keyword-exporter-code
namespace: tools
data:
app.py: |
#!/usr/bin/env python3
"""
victorialogs-keyword-exporter v2 - 正确解析 hits API 响应
"""
import time, os, logging, json
from urllib.parse import urlencode
import urllib.request, urllib.error
from http.server import HTTPServer, BaseHTTPRequestHandler
from threading import Thread, Lock
# Process-wide logging: INFO and above, timestamped.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)
# Base URL that post_query() prefixes to every endpoint path.
VLSELECT_URL = os.environ.get("VLSELECT_URL", "http://vlc-victoria-logs-cluster-vlselect.tools.svc.cluster.local:9471")
# Target number of seconds between two collection cycles of the background thread.
SCRAPE_INTERVAL = int(os.environ.get("SCRAPE_INTERVAL", "60"))
# Look-back window for every keyword query; parsed by window_to_seconds().
QUERY_WINDOW = os.environ.get("QUERY_WINDOW", "5m")
# TCP port the metrics HTTP server listens on.
METRICS_PORT = int(os.environ.get("METRICS_PORT", "9800"))
# Keyword rules - matched with full-text search.
# Per rule:
#   name        -> value of the `keyword` label
#   logsql      -> filter expression; build_logsql() wraps it in parentheses
#   namespace   -> when non-empty, restricts the query to that namespace;
#                  empty is reported as namespace="all"
#   severity    -> copied verbatim into the `severity` label
#   description -> copied verbatim into the `description` label
KEYWORD_RULES = [
{"name": "error_exception", "logsql": '"error" OR "exception" OR "fatal" OR "ERR"', "namespace": "", "severity": "warning", "description": "错误/异常/致命日志"},
{"name": "oom_killed", "logsql": '"OOM" OR "Out of memory" OR "Killed process"', "namespace": "", "severity": "critical", "description": "OOM内存溢出"},
{"name": "connection_refused", "logsql": '"connection refused"', "namespace": "", "severity": "warning", "description": "连接拒绝"},
{"name": "image_pull_error", "logsql": '"ImagePullBackOff" OR "ErrImagePull"', "namespace": "", "severity": "critical", "description": "镜像拉取失败"},
{"name": "crashloopbackoff", "logsql": '"CrashLoopBackOff"', "namespace": "", "severity": "critical", "description": "容器反复崩溃"},
{"name": "probe_failure", "logsql": '"Readiness probe failed" OR "Liveness probe failed"', "namespace": "", "severity": "warning", "description": "探针失败"},
{"name": "http_5xx", "logsql": '"5xx" OR "500" OR "502" OR "503"', "namespace": "", "severity": "warning", "description": "HTTP5xx错误"},
{"name": "db_error", "logsql": '"dial tcp" OR "bad connection" OR "connection pool exhausted"', "namespace": "", "severity": "critical", "description": "数据库连接错误"},
{"name": "auth_failure", "logsql": '"authentication failed" OR "Unauthorized" OR "403 Forbidden"', "namespace": "", "severity": "warning", "description": "认证授权失败"},
{"name": "panic", "logsql": '"panic" NOT "panic: test"', "namespace": "", "severity": "critical", "description": "应用panic"},
]
# Most recently rendered metrics text, served as-is by the HTTP handler.
_metrics_cache = ""
# Guards reads and writes of _metrics_cache across the collector and HTTP threads.
_cache_lock = Lock()
def window_to_seconds(window: str) -> int:
    """Convert a duration such as ``"5m"`` into a whole number of seconds.

    Accepted forms are an integer followed by one of ``s``, ``m``, ``h`` or
    ``d`` (case-insensitive, surrounding whitespace ignored), or a bare
    integer, which is taken as seconds.

    Args:
        window: Duration text, e.g. ``"90s"``, ``"5m"``, ``"2h"``, ``"1d"``, ``"300"``.

    Returns:
        The duration in seconds.

    Raises:
        ValueError: If the text is empty, has an unknown unit suffix, or its
            numeric part is not an integer.
    """
    multipliers = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    text = window.strip().lower()
    # A unit-less number used to lose its last digit and be scaled as minutes.
    if text.isdecimal():
        return int(text)
    if not text or text[-1] not in multipliers:
        # Unknown suffixes used to be treated silently as minutes.
        raise ValueError(f"unsupported duration {window!r}; expected e.g. '30s', '5m', '2h', '1d'")
    return int(text[:-1]) * multipliers[text[-1]]
def build_logsql(rule: dict) -> str:
    """Render the LogsQL query text for one keyword rule.

    The rule's ``logsql`` expression is always wrapped in parentheses. When
    the rule carries a non-empty ``namespace``, an exact-match filter on
    ``kubernetes.namespace_name`` is AND-ed in front of it.
    """
    keyword_filter = f'({rule["logsql"]})'
    if not rule.get("namespace"):
        return keyword_filter
    namespace_filter = f'kubernetes.namespace_name:="{rule["namespace"]}"'
    return f"{namespace_filter} AND {keyword_filter}"
def post_query(endpoint: str, params: dict) -> tuple[int, bytes]:
    """POST form-encoded ``params`` to ``VLSELECT_URL + endpoint``.

    Returns:
        ``(status, body)``. For an HTTP error response this is the error code
        and the error body. For any other failure while sending the request
        the status is ``0`` and the body is the exception text.
    """
    target = f"{VLSELECT_URL}{endpoint}"
    payload = urlencode(params).encode("utf-8")
    form_headers = {"Content-Type": "application/x-www-form-urlencoded"}
    request = urllib.request.Request(target, data=payload, headers=form_headers, method="POST")
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            return response.status, response.read()
    except urllib.error.HTTPError as http_err:
        # The server answered, just not with a success status.
        return http_err.code, http_err.read()
    except Exception as err:
        # No usable HTTP response at all; status 0 marks that case.
        return 0, str(err).encode()
def query_count(logsql: str, window_sec: int) -> int | None:
    """Count log entries matching ``logsql`` over the last ``window_sec`` seconds.

    Sends the query to the ``/select/logsql/hits`` endpoint with a step equal
    to the whole window and sums every returned bucket value.

    Args:
        logsql: LogsQL filter expression.
        window_sec: Length of the look-back window in seconds.

    Returns:
        The total number of hits, or ``None`` when the request did not return
        HTTP 200 or the response body could not be parsed.
    """
    end = int(time.time())
    status, body = post_query("/select/logsql/hits", {
        "query": logsql,
        "start": str(end - window_sec),
        "end": str(end),
        "step": f"{window_sec}s",
    })
    if status != 200:
        # Logged at WARNING: the logger runs at INFO, so a DEBUG record would
        # hide the only place where the failure cause is available.
        logger.warning("hits API status=%d query=%s body=%s", status, logsql[:50], body[:120])
        return None
    try:
        data = json.loads(body)
        if "hits" in data:
            # Expected shape: {"hits": [{"values": [1, 2, ...]}, ...]}
            total = sum(sum(hit.get("values", [])) for hit in data["hits"])
        else:
            # Fallback for a body that carries "values" at the top level.
            total = sum(data.get("values", []))
    except (ValueError, TypeError, AttributeError) as exc:
        # ValueError: invalid JSON or encoding; TypeError/AttributeError:
        # valid JSON whose structure is not the one handled above.
        logger.warning("Parse response error: %s | body=%s", exc, body[:200])
        return None
    logger.debug("Query: %s, total: %d", logsql[:50], total)
    return total
def _escape_label_value(value) -> str:
    """Escape backslash, double quote and newline for a quoted label value."""
    return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")


def collect_metrics() -> str:
    """Run every keyword rule once and render the result as exposition text.

    For each entry of ``KEYWORD_RULES`` one ``log_keyword_count`` and one
    ``log_keyword_scrape_error`` sample is produced. A failed query is
    reported as count ``0`` with scrape_error ``1``. All samples of a metric
    are emitted together, directly after that metric's HELP/TYPE lines.

    Returns:
        The complete metrics page, newline-terminated.
    """
    window_sec = window_to_seconds(QUERY_WINDOW)
    ts = int(time.time() * 1000)  # one millisecond timestamp shared by this cycle
    count_samples = []
    error_samples = []
    for rule in KEYWORD_RULES:
        count = query_count(build_logsql(rule), window_sec)
        label_pairs = (
            ("keyword", rule["name"]),
            ("namespace", rule.get("namespace") or "all"),
            ("severity", rule["severity"]),
            ("description", rule["description"]),
        )
        labels = ",".join(f'{key}="{_escape_label_value(val)}"' for key, val in label_pairs)
        failed = count is None
        if failed:
            logger.warning("FAILED rule=%s", rule["name"])
        else:
            logger.info("OK rule=%-25s count=%d", rule["name"], count)
        count_samples.append(f'log_keyword_count{{{labels}}} {0 if failed else count} {ts}')
        error_samples.append(f'log_keyword_scrape_error{{{labels}}} {int(failed)} {ts}')
    lines = [
        "# HELP log_keyword_count 在指定时间窗口内匹配关键字的日志条数",
        "# TYPE log_keyword_count gauge",
        *count_samples,
        "# HELP log_keyword_scrape_error 采集是否出错 (1=出错 0=正常)",
        "# TYPE log_keyword_scrape_error gauge",
        *error_samples,
        "# HELP log_exporter_up Exporter存活",
        "# TYPE log_exporter_up gauge",
        f"log_exporter_up 1 {ts}",
        "# HELP log_exporter_last_scrape_timestamp_seconds 上次采集时间戳",
        "# TYPE log_exporter_last_scrape_timestamp_seconds gauge",
        f"log_exporter_last_scrape_timestamp_seconds {time.time():.3f}",
    ]
    return "\n".join(lines) + "\n"
def background_collector():
    """Refresh the metrics cache forever, one cycle per ``SCRAPE_INTERVAL`` seconds.

    Each cycle calls ``collect_metrics()`` and, on success, swaps the result
    into ``_metrics_cache`` under ``_cache_lock``. A failing cycle is logged
    with its traceback and leaves the previous cache content in place. The
    time spent collecting is subtracted from the sleep. Never returns.
    """
    global _metrics_cache
    while True:
        # Monotonic clock: a wall-clock adjustment must not stretch or skip the sleep.
        cycle_start = time.monotonic()
        try:
            rendered = collect_metrics()
        except Exception as exc:
            # Top-level boundary of the thread: keep it alive, but keep the traceback.
            logger.exception("Collector error: %s", exc)
        else:
            with _cache_lock:
                _metrics_cache = rendered
        time.sleep(max(0, SCRAPE_INTERVAL - (time.monotonic() - cycle_start)))
class MetricsHandler(BaseHTTPRequestHandler):
    """HTTP handler serving the cached metrics page and a health endpoint.

    Routes (any query string is ignored):
        ``/metrics`` and ``/``  -> current ``_metrics_cache`` content, or a
                                   placeholder comment while it is still empty
        ``/healthz``            -> ``ok``
        anything else           -> 404 with an empty body
    """

    def log_message(self, *a):
        """Drop the default per-request access log line."""

    def _send(self, status, body=b"", content_type="text/plain; charset=utf-8"):
        """Write a complete response with explicit Content-Type and Content-Length."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Dispatch a GET request on its path component."""
        # Match on the path only, so "/metrics?x=1" is not answered with 404.
        route = self.path.split("?", 1)[0]
        if route in ("/metrics", "/"):
            with _cache_lock:
                payload = (_metrics_cache or "# waiting for first scrape\n").encode()
            # The page holds non-ASCII HELP text and label values, so name the charset.
            self._send(200, payload, "text/plain; version=0.0.4; charset=utf-8")
        elif route == "/healthz":
            self._send(200, b"ok")
        else:
            self._send(404)
if __name__ == "__main__":
    logger.info("Starting victorialogs-keyword-exporter v2 (fixed hits parser)")
    logger.info(" VLSELECT_URL=%s INTERVAL=%ds WINDOW=%s PORT=%d RULES=%d",
                VLSELECT_URL, SCRAPE_INTERVAL, QUERY_WINDOW, METRICS_PORT, len(KEYWORD_RULES))
    # Fail fast on a malformed QUERY_WINDOW instead of erroring on every cycle.
    window_to_seconds(QUERY_WINDOW)
    # No blocking collection before the server starts: the collector thread
    # runs its first cycle immediately, and /healthz must be reachable even
    # while the log backend is slow or down (each rule may wait for its full
    # request timeout). Until the first cycle completes, /metrics serves a
    # placeholder comment.
    Thread(target=background_collector, daemon=True).start()
    HTTPServer(("0.0.0.0", METRICS_PORT), MetricsHandler).serve_forever()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
deployment.yaml
---
# ─────────────────────────────────────────────────────────────────
# victorialogs-keyword-exporter
# 部署在 tools 命名空间,查询 VictoriaLogs 关键字统计并暴露为指标
# ─────────────────────────────────────────────────────────────────
apiVersion: apps/v1
kind: Deployment
metadata:
name: victorialogs-keyword-exporter
namespace: tools
labels:
app: victorialogs-keyword-exporter
spec:
replicas: 1
selector:
matchLabels:
app: victorialogs-keyword-exporter
template:
metadata:
labels:
app: victorialogs-keyword-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9800"
prometheus.io/path: "/metrics"
spec:
containers:
- name: exporter
# ⚠️ 替换为你的镜像仓库地址,或使用下方 ConfigMap 方式直接挂载 app.py
image: docker.cnb.cool/zzppjj/docker-images/python:3.11-slim
command: ["python", "/app/app.py"]
ports:
- name: metrics
containerPort: 9800
env:
- name: VLSELECT_URL
value: "http://vlc-victoria-logs-cluster-vlselect.tools.svc.cluster.local:9471"
- name: SCRAPE_INTERVAL
value: "60" # 每 60 秒查询一次 VictoriaLogs
- name: QUERY_WINDOW
value: "5m" # 统计最近 5 分钟的日志条数
- name: METRICS_PORT
value: "9800"
volumeMounts:
- name: app-code
mountPath: /app
resources:
requests:
cpu: 10m
memory: 32Mi
limits:
cpu: 100m
memory: 64Mi
livenessProbe:
httpGet:
path: /healthz
port: 9800
initialDelaySeconds: 10
periodSeconds: 30
readinessProbe:
httpGet:
path: /healthz
port: 9800
initialDelaySeconds: 5
periodSeconds: 10
volumes:
- name: app-code
configMap:
name: victorialogs-keyword-exporter-code
---
apiVersion: v1
kind: Service
metadata:
name: victorialogs-keyword-exporter
namespace: tools
labels:
app: victorialogs-keyword-exporter
spec:
selector:
app: victorialogs-keyword-exporter
ports:
- name: metrics
port: 9800
targetPort: 9800
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
vmservicescrape.yaml
---
# ─────────────────────────────────────────────────────────────────
# VMServiceScrape:让 monitoring 命名空间的 VMAgent
# 跨命名空间采集 tools 命名空间的 exporter 指标
# ─────────────────────────────────────────────────────────────────
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: victorialogs-keyword-exporter
namespace: monitoring # 必须在 monitoring 命名空间,VMAgent 才能发现
spec:
namespaceSelector:
matchNames:
- tools # 目标 Service 在 tools 命名空间
selector:
matchLabels:
app: victorialogs-keyword-exporter
endpoints:
- port: metrics
path: /metrics
interval: 60s # 与 exporter SCRAPE_INTERVAL 保持一致
scrapeTimeout: 30s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
vmrule-log-alerts.yaml
---
# ─────────────────────────────────────────────────────────────────
# VMServiceScrape:让 monitoring 命名空间的 VMAgent
# 跨命名空间采集 tools 命名空间的 exporter 指标
# ─────────────────────────────────────────────────────────────────
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: victorialogs-keyword-exporter
namespace: monitoring # 必须在 monitoring 命名空间,VMAgent 才能发现
spec:
namespaceSelector:
matchNames:
- tools # 目标 Service 在 tools 命名空间
selector:
matchLabels:
app: victorialogs-keyword-exporter
endpoints:
- port: metrics
path: /metrics
interval: 60s # 与 exporter SCRAPE_INTERVAL 保持一致
scrapeTimeout: 30s
[root@k8s-master01 keywords]# cat vmrule-log-alerts.yaml
---
# ─────────────────────────────────────────────────────────────────
# VMRule:基于日志关键字指标的告警规则
# 依赖:victorialogs-keyword-exporter 将日志统计转为 Prometheus 指标
# ─────────────────────────────────────────────────────────────────
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: log-keyword-alerts
namespace: monitoring
labels:
app: production
spec:
groups:
# ── 严重级别:立即告警 ──────────────────────────────────────
- name: log-keyword-critical
interval: 60s
rules:
- alert: LogOOMKilled
expr: log_keyword_count{keyword="oom_killed"} > 0
for: 0s # 出现即告警
labels:
severity: critical
team: ops
annotations:
summary: "🔴 检测到 OOM 内存溢出日志"
description: |
最近 5 分钟内检测到 {{ $value }} 条 OOM 相关日志。
关键字: OOM / Out of memory / Killed process
请立即检查节点和 Pod 的内存使用情况。
- alert: LogCrashLoopBackOff
expr: log_keyword_count{keyword="crashloopbackoff"} > 0
for: 0s
labels:
severity: critical
team: ops
annotations:
summary: "🔴 检测到容器 CrashLoopBackOff"
description: |
最近 5 分钟内检测到 {{ $value }} 条 CrashLoopBackOff 日志。
请执行: kubectl get pods -A | grep CrashLoop
- alert: LogImagePullError
expr: log_keyword_count{keyword="image_pull_error"} > 0
for: 0s
labels:
severity: critical
team: ops
annotations:
summary: "🔴 镜像拉取失败"
description: |
最近 5 分钟内检测到 {{ $value }} 条镜像拉取失败日志。
请检查镜像地址、仓库凭据及网络连通性。
- alert: LogPanic
expr: log_keyword_count{keyword="panic"} > 0
for: 0s
labels:
severity: critical
team: dev
annotations:
summary: "🔴 应用发生 Panic"
description: |
最近 5 分钟内检测到 {{ $value }} 条 panic 日志。
请立即检查应用日志排查根因。
- alert: LogDatabaseError
expr: log_keyword_count{keyword="db_error"} > 5
for: 2m
labels:
severity: critical
team: dev
annotations:
summary: "🔴 数据库连接异常频繁"
description: |
最近 5 分钟内检测到 {{ $value }} 条数据库连接错误日志。
关键字: dial tcp / driver: bad connection / connection pool exhausted
# ── 警告级别:持续一段时间才告警 ──────────────────────────
- name: log-keyword-warning
interval: 60s
rules:
- alert: LogErrorSpike
  # Fire only when more than 100 error/exception/fatal lines are seen within
  # 5 minutes, to avoid noise. The threshold must match the description below.
  expr: log_keyword_count{keyword="error_exception"} > 100
  for: 3m
  labels:
    severity: warning
    team: dev
  annotations:
    summary: "⚠️ 日志错误数量激增"
    description: |
      最近 5 分钟内检测到 {{ $value }} 条错误/异常日志(阈值: 100)。
      关键字: error / exception / fatal
- alert: LogConnectionRefused
expr: log_keyword_count{keyword="connection_refused"} > 10
for: 2m
labels:
severity: warning
team: dev
annotations:
summary: "⚠️ 连接拒绝错误频繁"
description: |
最近 5 分钟内检测到 {{ $value }} 条 "connection refused" 日志。
请检查依赖服务是否正常。
- alert: LogProbeFailure
expr: log_keyword_count{keyword="probe_failure"} > 5
for: 2m
labels:
severity: warning
team: ops
annotations:
summary: "⚠️ 健康探针持续失败"
description: |
最近 5 分钟内检测到 {{ $value }} 条探针失败日志。
请检查对应 Pod 的 Readiness/Liveness 配置。
- alert: LogHTTP5xxSpike
expr: log_keyword_count{keyword="http_5xx"} > 50
for: 2m
labels:
severity: warning
team: dev
annotations:
summary: "⚠️ HTTP 5xx 错误激增"
description: |
最近 5 分钟内检测到 {{ $value }} 条 HTTP 5xx 响应日志。
- alert: LogAuthFailure
expr: log_keyword_count{keyword="auth_failure"} > 20
for: 3m
labels:
severity: warning
team: security
annotations:
summary: "⚠️ 认证/授权失败频繁"
description: |
最近 5 分钟内检测到 {{ $value }} 条认证失败日志。
可能存在暴力破解或配置错误,请关注安全事件。
# ── Exporter 自身健康 ────────────────────────────────────
- name: log-exporter-health
interval: 60s
rules:
- alert: LogExporterDown
expr: log_exporter_up == 0 or absent(log_exporter_up)
for: 2m
labels:
severity: critical
team: ops
annotations:
summary: "🔴 日志关键字 Exporter 不可用"
description: |
victorialogs-keyword-exporter 已停止上报指标超过 2 分钟。
日志关键字告警功能已失效,请检查 tools 命名空间中的 Pod 状态。
- alert: LogExporterScrapeError
expr: log_keyword_scrape_error == 1
for: 5m
labels:
severity: warning
team: ops
annotations:
summary: "⚠️ 日志关键字采集出错"
description: |
关键字 {{ $labels.keyword }} 的采集查询持续失败超过 5 分钟。
请检查 VictoriaLogs vlselect 服务是否正常。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# 部署步骤
# 1. 部署 Exporter
# 创建代码 ConfigMap
kubectl apply -f configmap-code.yaml
# 部署 Deployment 和 Service
kubectl apply -f deployment.yaml
# 验证启动
kubectl get pod -n tools -l app=victorialogs-keyword-exporter
kubectl logs -n tools -l app=victorialogs-keyword-exporter --tail=30
1
2
3
4
5
6
7
8
9
2
3
4
5
6
7
8
9
# 2. 验证指标暴露
# 端口转发测试
kubectl port-forward -n tools svc/victorialogs-keyword-exporter 9800:9800 &
# 在另一终端查看指标
curl http://localhost:9800/metrics | grep log_keyword
1
2
3
4
5
2
3
4
5
预期输出示例:
log_keyword_count{keyword="error_exception",namespace="all",severity="warning",...} 42
log_keyword_count{keyword="oom_killed",namespace="all",severity="critical",...} 0
log_keyword_count{keyword="crashloopbackoff",namespace="all",severity="critical",...} 0
log_exporter_up 1
1
2
3
4
2
3
4
# 3. 配置 VMAgent 采集
# 让 VMAgent 发现并采集 exporter
kubectl apply -f vmservicescrape.yaml
# 验证 VMAgent 已识别目标(约 30s 后生效)
kubectl port-forward -n monitoring svc/vmagent-vmagent 8429:8429 &
# 浏览器访问 http://localhost:8429/targets 搜索 victorialogs
1
2
3
4
5
6
2
3
4
5
6
# 4. 应用告警规则
kubectl apply -f vmrule-log-alerts.yaml
# 验证规则加载
kubectl get vmrule log-keyword-alerts -n monitoring
kubectl logs -n monitoring -l app.kubernetes.io/name=vmalert --tail=50 | grep -i "log-keyword"
1
2
3
4
5
2
3
4
5
# 自定义关键字规则
修改 configmap-code.yaml 中 KEYWORD_RULES 列表,添加业务自定义规则:
# 示例:监控特定命名空间的支付服务错误
{
"name": "payment_error",
"logsql": '"payment failed" OR "transaction error" OR "insufficient balance"',
"namespace": "production", # 只监控 production 命名空间
"severity": "critical",
"description": "支付服务错误",
},
# 示例:监控慢查询
{
"name": "slow_query",
"logsql": '"slow query" OR "query time exceeded" OR ~"took [0-9]{4,}ms"',
"namespace": "",
"severity": "warning",
"description": "数据库慢查询",
},
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
修改后重新应用:
kubectl apply -f configmap-code.yaml
kubectl rollout restart deployment/victorialogs-keyword-exporter -n tools
1
2
2
# 调整告警阈值
修改 vmrule-log-alerts.yaml 中的 expr 阈值:
# 当前配置:5分钟内超过 100 条 error 日志才告警
expr: log_keyword_count{keyword="error_exception"} > 100
# 调整为:超过 50 条
expr: log_keyword_count{keyword="error_exception"} > 50
1
2
3
4
5
2
3
4
5
应用后无需重启(Operator 会自动 reload):
kubectl apply -f vmrule-log-alerts.yaml
1
# 告警指标说明
| 指标名 | 类型 | 说明 |
|---|---|---|
| log_keyword_count | Gauge | 最近 N 分钟内关键字匹配的日志条数 |
| log_keyword_scrape_error | Gauge | 查询是否失败(1=失败,0=正常) |
| log_exporter_up | Gauge | Exporter 存活状态 |
| log_exporter_last_scrape_timestamp_seconds | Gauge | 上次采集时间戳 |

Labels:
- keyword: 规则名称
- namespace: 监控的命名空间(all=全部)
- severity: 严重程度(critical/warning)
- description: 规则描述
# 常见问题排查
# Exporter 无法连接 VictoriaLogs
# 检查 vlselect service
kubectl get svc -n tools | grep vlselect
# 从 exporter pod 内测试连通性
# The exporter image is a slim Python image without wget/curl, so use the
# interpreter itself, and hit the same hits endpoint the exporter queries.
kubectl exec -n tools deployment/victorialogs-keyword-exporter -- \
  python -c 'import urllib.request as r; print(r.urlopen("http://vlc-victoria-logs-cluster-vlselect.tools.svc.cluster.local:9471/select/logsql/hits?query=*&start=5m&step=5m", timeout=10).read().decode())'
1
2
3
4
5
6
2
3
4
5
6
# VMAgent 未采集到指标
# 检查 VMServiceScrape
kubectl describe vmservicescrape victorialogs-keyword-exporter -n monitoring
# 检查 VMAgent 是否有跨命名空间权限(通常 selectAllByDefault: true 已包含)
kubectl get vmagent vmagent -n monitoring -o yaml | grep -E -A5 'selectAllByDefault|serviceScrapeNamespaceSelector'
1
2
3
4
5
2
3
4
5
# 告警规则未生效
# 检查 VMAlert 是否加载了规则
kubectl port-forward -n monitoring svc/vmalert-vmalert 8880:8880
curl http://localhost:8880/api/v1/rules | python3 -m json.tool | grep log-keyword
# 查看 VMAlert 错误日志
kubectl logs -n monitoring -l app.kubernetes.io/name=vmalert --tail=100 | grep -i error
1
2
3
4
5
6
2
3
4
5
6
上次更新: 2026/06/03, 18:21:09
|