apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kafka-exporte
labels:
prometheus: kube-prometheus
app: kafka-exporte
namespace: monitoring
spec:
endpoints:
- port: metrics
interval: 30s
scrapeTimeout: 30s
path: /metrics
namespaceSelector:
any: true
#namespaceSelector:
# matchNames:
# - kafka
selector:
matchLabels:
app: kafka-exporte
targetLabels:
- ns
kafka-rule.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: kafka-rules
namespace: monitoring
labels:
prometheus: k8s
role: alert-rules
spec:
groups:
- name: kafka.rules
rules:
- alert: KafkaTopicsReplicas
expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 1
for: 5m
labels:
severity: critical
annotations:
summary: Kafka topics replicas (instance {{ $labels.instance }})
description: "Kafka topic in-sync partitionn VALUE = {{ $value }}n LABELS = {{ $labels }}n NS = {{ $labels.ns }}"
- alert: KafkaConsumersGroup
expr: sum(kafka_consumergroup_lag) by (consumergroup) > 100000
for: 1m
labels:
severity: critical
annotations:
summary: Kafka consumers group (instance {{ $labels.instance }})
description: "Kafka consumers groupn VALUE = {{ $value }}n LABELS = {{ $labels }}n NS = {{ $labels.ns }}"
- alert: Kafka InstanceDown
expr: sum(kafka_brokers) < 2
for: 15s
labels:
severity: critical
annotations:
summary: "Instance {{ $labels.instance }}"
description: "kafka of cluster has been down for more than 15 secondn VALUE = {{ $value }}n LABELS = {{ $labels }}n NS = {{ $labels.ns }}"
告警信息如下:但告警无法区分是哪个NS的kafa,请问如何解决?
告警类型: KafkaConsumersGroup
告警级别: critical
告警详情: Kafka consumers group
VALUE = 6969
LABELS = map[consumergroup:logstash]
NS = ;Kafka consumers group (instance )