建议采集下Linux服务器上内存占用Top的进程信息,在内存抖动的时候便于排查问题。
下面是一个python版的DEMO,待修改完善。 生产上建议使用golang来编写。
代码语言:python
# -*- coding: utf-8 -*-
# Collect the top memory-consuming processes on this host and push them to a
# Prometheus Pushgateway as gauges labelled by pid and process name.
import socket

import psutil
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

# Snapshot every process with only the attributes needed below. psutil stores
# None for an attribute it could not read (e.g. access denied, zombie process).
processes = psutil.process_iter(['pid', 'name', 'memory_percent', 'memory_info'])

# One (pid, name, memory_percent, memory_info) tuple per process.
proc_info = [
    (proc.info['pid'], proc.info['name'], proc.info['memory_percent'], proc.info['memory_info'])
    for proc in processes
]

# Drop processes whose memory figures are unavailable (None) or zero.
# Leaving None in the list would make the sort below raise TypeError.
proc_info = [proc for proc in proc_info if proc[2] and proc[3] is not None]

# Highest memory percentage first.
sorted_proc_info = sorted(proc_info, key=lambda x: x[2], reverse=True)

# Print and collect the 10 processes with the highest memory usage.
data = []
for pid, name, memory_percent, memory_info in sorted_proc_info[:10]:
    rss = memory_info.rss
    print(f"PID: {pid}, Name: {name}, Memory Usage: {memory_percent}% , RSS_Mem: {rss}")
    data.append({"pid": pid, "name": name, "mem_usage": memory_percent, "rss_mem": rss})

# Address of the Pushgateway.
pushgateway_url = 'http://localhost:9091'

# Dedicated registry so that only the metrics defined here are pushed.
registry = CollectorRegistry()

# One gauge per reported value; each process becomes one labelled series.
metrics = {
    'mem_usage': Gauge('memory_usage', 'Percentage of usage', registry=registry, labelnames=['pid', 'name']),
    'rss_mem': Gauge('rss_mem', 'Resident Set Size in bytes', registry=registry, labelnames=['pid', 'name']),
}

# Set both gauges for every collected process.
for item in data:
    labels = {'pid': item['pid'], 'name': item['name']}
    metrics['mem_usage'].labels(**labels).set(item['mem_usage'])
    metrics['rss_mem'].labels(**labels).set(item['rss_mem'])

# Push to the Pushgateway, grouped by this machine's hostname so that series
# coming from different servers do not overwrite each other.
push_to_gateway(
    pushgateway_url,
    job='process_metrics',
    registry=registry,
    grouping_key={"instance": socket.gethostname()},
)
print("Metrics successfully pushed to Pushgateway")
执行如下:
最终grafana的效果如下(建议根据instance绘图,在左上角配置个下拉列表):
UPDATE 20240627 用golang重写了下,便于各处拷贝运行,代码如下:
代码语言:go
package main
import (
"fmt"
"flag"
"log"
"net"
"sort"
"os"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/push"
"github.com/shirou/gopsutil/v3/process"
)
// topN is the number of processes reported to the Pushgateway.
const topN = 10

// processInfo is the memory snapshot of a single process.
type processInfo struct {
	PID  int32
	Name string
	RSS  uint64
}

// topByRSS sorts infos in place by RSS, largest first, and returns at most
// the first n entries. It never panics when fewer than n entries exist.
func topByRSS(infos []processInfo, n int) []processInfo {
	sort.Slice(infos, func(i, j int) bool {
		return infos[i].RSS > infos[j].RSS
	})
	if n < 0 {
		n = 0
	}
	if n > len(infos) {
		n = len(infos)
	}
	return infos[:n]
}

// resolveInstanceID returns the value used for the "instance" grouping label:
// the first non-loopback IPv4 address of this host, or the hostname when no
// such address can be found.
func resolveInstanceID() string {
	addrs, err := net.InterfaceAddrs()
	if err != nil {
		log.Printf("Failed to get IP addresses: %v, falling back to hostname.", err)
	}
	// addrs is nil when the lookup failed, so the loop is simply skipped.
	for _, addr := range addrs {
		ipnet, ok := addr.(*net.IPNet)
		if !ok || ipnet.IP.IsLoopback() || ipnet.IP.To4() == nil {
			continue
		}
		return ipnet.IP.String()
	}

	// Ask the OS for the hostname. The HOSTNAME environment variable is often
	// not exported to child processes (e.g. under cron or systemd), so it is
	// only used as a last resort.
	host, err := os.Hostname()
	if err != nil {
		log.Printf("Failed to get hostname: %v, falling back to $HOSTNAME.", err)
		return os.Getenv("HOSTNAME")
	}
	return host
}

// main collects the RSS of every readable process, keeps the largest topN and
// pushes them to the Pushgateway given by the -url flag.
func main() {
	// Command-line flags.
	pushgatewayURL := flag.String("url", "http://localhost:9091", "The URL of the Pushgateway.")
	flag.Parse()

	// List all processes.
	processes, err := process.Processes()
	if err != nil {
		log.Fatalf("Failed to get processes: %v", err)
	}

	// Collect name and RSS per process. Processes that cannot be read (for
	// example because they already exited) are logged and skipped.
	processInfos := make([]processInfo, 0, len(processes))
	for _, p := range processes {
		memInfo, err := p.MemoryInfo()
		if err != nil {
			log.Printf("Failed to get memory info for PID %d: %v", p.Pid, err)
			continue
		}
		name, err := p.Name()
		if err != nil {
			log.Printf("Failed to get name for PID %d: %v", p.Pid, err)
			continue
		}
		processInfos = append(processInfos, processInfo{
			PID:  p.Pid,
			Name: name,
			RSS:  memInfo.RSS,
		})
	}

	topProcesses := topByRSS(processInfos, topN)
	instanceID := resolveInstanceID()

	// Gauge vector holding the RSS of each reported process.
	rssGauge := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "top_process_rss",
			Help: "Top 10 process Resident Set Size (RSS)",
		},
		[]string{"pid", "process_name"},
	)
	for _, proc := range topProcesses {
		rssGauge.WithLabelValues(fmt.Sprintf("%d", proc.PID), proc.Name).Set(float64(proc.RSS))
	}

	// Push replaces every metric previously pushed for this job/instance
	// group, so processes that dropped out of the top list disappear.
	if err := push.New(*pushgatewayURL, "top_memory_processes").
		Collector(rssGauge).
		Grouping("instance", instanceID).
		Push(); err != nil {
		log.Fatalf("Failed to push to Pushgateway: %v", err)
	}
	fmt.Println("Top 10 memory consuming processes data pushed to Pushgateway.")
}
执行方法:
./main -url=http://192.168.31.181:9091
其中 -url 填写你的 Pushgateway 地址;不加该参数时,默认上报到本机的 9091 端口(http://localhost:9091)。