在上一篇文章中,对使用 Prometheus 监控Flink进行了阐述(传送门),这里就不再赘述了。
尽管 Prometheus 自我标榜是监控解决方案,
From metrics to insight: Power your metrics and alerting with a leading open-source monitoring solution.
但是在我们日常使用中,Prometheus 更多担任的是数据采集平台和任务调度的职责,对于监控数据的可视化,我们更多是交给 Grafana 来完成。
The open observability platform: Grafana is the open source analytics & monitoring solution for every database.
从 Grafana 的 slogan 可以看出来,其在分析领域的野心。其主要特性可以归纳如下:
- 可视化:快速和灵活的客户端图形具有多种选项。面板插件提供了许多不同的方式来可视化指标和日志。
- 报警:可视化地为最重要的指标定义警报规则。Grafana将持续评估它们,并发送通知。
- 通知:警报状态发生变化时,Grafana 会发出通知,例如可以接收电子邮件通知。
- 动态仪表盘:使用模板变量创建动态和可重用的仪表板,这些模板变量作为下拉菜单出现在仪表板顶部。
- 混合数据源:在同一个图中混合不同的数据源!可以根据每个查询指定数据源。这甚至适用于自定义数据源。
- 注释:使用来自不同数据源的事件为图表添加注释。将鼠标悬停在事件上可以显示完整的事件元数据和标记。
- 过滤器:过滤器允许您动态创建新的键/值过滤器,这些过滤器将自动应用于使用该数据源的所有查询。
给我触动最深的,还是其整体的架构设计。这里指的并非代码结构,而是其内部对使用逻辑,系统动作,行为抽象等的架构设计。在最近的使用过程中,它给予了我很深的触动。对于一直做BI产品架构师和产品经理的我来说, Grafana 的整体设计,沿用到一般BI的可视化产品中,都是可行的。这个结构真的太美了,太妙了,今天居然看着屏幕笑了起来......
好了,上面是我夹带的一些私货,下面来说一说如何使用吧。我并不想在这篇文章里手把手做一个仪表盘,而是通过之前文章的案例,迅速导入一个现成的仪表盘。(想入门的童靴,可以翻阅参考链接里的文章)
引用之前案例的结构,设置好 Prometheus 对 Flink主要指标的监控
启动 grafana-server
然后设置Prometheus数据源
打开 Create --> Import 页面,将仪表盘配置的json导入(json全文在文章末尾可以找到)。
保存后就可以直观地监控 Flink 的主要指标了。
参考链接:
https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/metrics.html#prometheus-orgapacheflinkmetricsprometheusprometheusreporter
https://www.jianshu.com/p/0d82c7ccc85a
参考配置文件 flink-dashboard_rev1.json
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "4.2.0"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "singlestat",
"name": "Singlestat",
"version": ""
}
],
"annotations": {
"list": []
},
"editable": true,
"gnetId": 10369,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [],
"refresh": "5s",
"rows": [
{
"collapse": false,
"height": 337,
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "flink_jobmanager_Status_JVM_CPU_Load",
"intervalFactor": 10,
"legendFormat": "{{instance}}",
"metric": "flink_jobmanager_Status_JVM_CPU_Load",
"refId": "A",
"step": 20
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "JobManager CPU Load",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "flink_taskmanager_Status_JVM_CPU_Load",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "flink_taskmanager_Status_JVM_CPU_Load",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "TaskManager CPU Load",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 5,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "flink_jobmanager_Status_JVM_Memory_Direct_MemoryUsed",
"intervalFactor": 10,
"legendFormat": "{{instance}}",
"metric": "flink_jobmanager_Status_JVM_Memory_Direct_MemoryUsed",
"refId": "A",
"step": 20
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "JobManager Memory Used",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "flink_taskmanager_Status_JVM_Memory_Direct_MemoryUsed",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "flink_taskmanager_Status_JVM_Memory_Direct_MemoryUsed",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "TaskManager Memory Used",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6"
},
{
"collapse": false,
"height": 276,
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(255, 255, 255, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
"maxValue": null,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"hideTimeOverride": false,
"id": 3,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [
{
"expr": "flink_jobmanager_taskSlotsAvailable",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
"metric": "flink_jobmanager_taskSlotsAvailable",
"refId": "A",
"step": 20
}
],
"thresholds": "",
"title": "Taskslots available",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(255, 255, 255, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
"maxValue": null,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"hideTimeOverride": false,
"id": 4,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"targets": [
{
"expr": "flink_jobmanager_taskSlotsTotal",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
"metric": "flink_jobmanager_taskSlotsTotal",
"refId": "A",
"step": 20
}
],
"thresholds": "",
"title": "Taskslots total",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(255, 255, 255, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
"maxValue": null,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"hideTimeOverride": false,
"id": 7,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(251, 129, 76, 0.18)",
"full": false,
"lineColor": "rgb(193, 31, 31)",
"show": true
},
"targets": [
{
"expr": "flink_jobmanager_numRegisteredTaskManagers",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
"metric": "flink_jobmanager_numRegisteredTaskManagers",
"refId": "A",
"step": 20
}
],
"thresholds": "",
"title": "# of TaskManagers",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(255, 255, 255, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": "${DS_PROMETHEUS}",
"format": "none",
"gauge": {
"maxValue": null,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"hideTimeOverride": false,
"id": 8,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(251, 129, 76, 0.18)",
"full": false,
"lineColor": "rgb(193, 31, 31)",
"show": true
},
"targets": [
{
"expr": "flink_jobmanager_numRunningJobs",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
"metric": "flink_jobmanager_numRunningJobs",
"refId": "A",
"step": 20
}
],
"thresholds": "",
"title": "# of Running Jobs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6"
},
{
"collapse": false,
"height": 255,
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 9,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "flink_taskmanager_Status_JVM_GarbageCollector_G1_Young_Generation_Time",
"intervalFactor": 2,
"legendFormat": "{{instance}} Young Gen Time",
"metric": "flink_taskmanager_Status_JVM_GarbageCollector_G1_Young_Generation_Time",
"refId": "A",
"step": 2
},
{
"expr": "flink_taskmanager_Status_JVM_GarbageCollector_G1_Old_Generation_Time",
"intervalFactor": 2,
"legendFormat": "{{instance}} Old Gen Time",
"metric": "flink_taskmanager_Status_JVM_GarbageCollector_G1_Old_Generation_Time",
"refId": "B",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "TaskManagers Garbage Collection",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"id": 10,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "flink_jobmanager_Status_JVM_GarbageCollector_Copy_Time",
"intervalFactor": 2,
"legendFormat": "{{instance}} GC Copy Time",
"metric": "flink_jobmanager_Status_JVM_GarbageCollector_Copy_Time",
"refId": "A",
"step": 2
},
{
"expr": "flink_jobmanager_Status_JVM_GarbageCollector_MarkSweepCompact_Time",
"intervalFactor": 2,
"legendFormat": "{{instance}} GC MarkSweep Time",
"metric": "flink_jobmanager_Status_JVM_GarbageCollector_MarkSweepCompact_Time",
"refId": "B",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "JobManager Garbage Collection",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
"flink"
],
"templating": {
"list": []
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Flink Dashboard",
"version": 19,
"description": "Flink dashboard using the Prometheus exporter. https://ci.apache.org/projects/flink/flink-docs-stable/monitoring/metrics.html#prometheus-orgapacheflinkmetricsprometheusprometheusreporter "
}