DCGM_FI_DEV_SM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-08496763-b93f-8f6a-4fc0-d93e1357bb10", cluster="k8s-cto-gpu-pro", device="nvidia4", gpu="4", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 1980
DCGM_FI_DEV_SM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-1df8afeb-04ab-4cb6-7d00-ab7dbcc4eed9", cluster="k8s-cto-gpu-pro", device="nvidia5", gpu="5", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 1980
DCGM_FI_DEV_SM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-4a14cb97-c86f-7e77-a74b-05575eccc227", cluster="k8s-cto-gpu-pro", device="nvidia1", gpu="1", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 1980
DCGM_FI_DEV_SM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-82a64530-b090-e638-c523-5f3eb4eea01a", cluster="k8s-cto-gpu-pro", device="nvidia0", gpu="0", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 1980
DCGM_FI_DEV_SM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-b05339cb-5930-dadd-480e-dde12630e0b1", cluster="k8s-cto-gpu-pro", device="nvidia3", gpu="3", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 1980
DCGM_FI_DEV_SM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-b8b369cc-8369-5891-ce29-21ca979abb00", cluster="k8s-cto-gpu-pro", device="nvidia7", gpu="7", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 1980
DCGM_FI_DEV_SM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-d9d28678-a25a-0d6b-7f80-d2234aa4dc0d", cluster="k8s-cto-gpu-pro", device="nvidia2", gpu="2", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 1980
DCGM_FI_DEV_SM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-de655b8a-1101-25e8-ef43-6bde6c315612", cluster="k8s-cto-gpu-pro", container="fssc-ocr-qa-online", device="nvidia6", gpu="6", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20", namespace="fisam", pod="a0703240723011-fssc-ocr-qa-online-67bbc98589-mqskl"} 1980
作用:表示 GPU 显存的时钟频率,单位为 MHz。
示例
DCGM_FI_DEV_MEM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-08496763-b93f-8f6a-4fc0-d93e1357bb10", cluster="k8s-cto-gpu-pro", device="nvidia4", gpu="4", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 2619
DCGM_FI_DEV_MEM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-1df8afeb-04ab-4cb6-7d00-ab7dbcc4eed9", cluster="k8s-cto-gpu-pro", device="nvidia5", gpu="5", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 2619
DCGM_FI_DEV_MEM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-4a14cb97-c86f-7e77-a74b-05575eccc227", cluster="k8s-cto-gpu-pro", device="nvidia1", gpu="1", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 2619
DCGM_FI_DEV_MEM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-82a64530-b090-e638-c523-5f3eb4eea01a", cluster="k8s-cto-gpu-pro", device="nvidia0", gpu="0", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 2619
DCGM_FI_DEV_MEM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-b05339cb-5930-dadd-480e-dde12630e0b1", cluster="k8s-cto-gpu-pro", device="nvidia3", gpu="3", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 2619
DCGM_FI_DEV_MEM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-b8b369cc-8369-5891-ce29-21ca979abb00", cluster="k8s-cto-gpu-pro", device="nvidia7", gpu="7", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 2619
DCGM_FI_DEV_MEM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-d9d28678-a25a-0d6b-7f80-d2234aa4dc0d", cluster="k8s-cto-gpu-pro", device="nvidia2", gpu="2", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 2619
DCGM_FI_DEV_MEM_CLOCK{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-de655b8a-1101-25e8-ef43-6bde6c315612", cluster="k8s-cto-gpu-pro", container="fssc-ocr-qa-online", device="nvidia6", gpu="6", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20", namespace="fisam", pod="a0703240723011-fssc-ocr-qa-online-67bbc98589-mqskl"} 2619
DCGM_FI_DEV_MEMORY_TEMP{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-08496763-b93f-8f6a-4fc0-d93e1357bb10", cluster="k8s-cto-gpu-pro", device="nvidia4", gpu="4", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 36
DCGM_FI_DEV_MEMORY_TEMP{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-1df8afeb-04ab-4cb6-7d00-ab7dbcc4eed9", cluster="k8s-cto-gpu-pro", device="nvidia5", gpu="5", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 40
DCGM_FI_DEV_MEMORY_TEMP{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-4a14cb97-c86f-7e77-a74b-05575eccc227", cluster="k8s-cto-gpu-pro", device="nvidia1", gpu="1", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 41
DCGM_FI_DEV_MEMORY_TEMP{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-82a64530-b090-e638-c523-5f3eb4eea01a", cluster="k8s-cto-gpu-pro", device="nvidia0", gpu="0", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 35
DCGM_FI_DEV_MEMORY_TEMP{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-b05339cb-5930-dadd-480e-dde12630e0b1", cluster="k8s-cto-gpu-pro", device="nvidia3", gpu="3", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 41
DCGM_FI_DEV_MEMORY_TEMP{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-b8b369cc-8369-5891-ce29-21ca979abb00", cluster="k8s-cto-gpu-pro", device="nvidia7", gpu="7", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 40
DCGM_FI_DEV_MEMORY_TEMP{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-d9d28678-a25a-0d6b-7f80-d2234aa4dc0d", cluster="k8s-cto-gpu-pro", device="nvidia2", gpu="2", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20"} 36
DCGM_FI_DEV_MEMORY_TEMP{Hostname="gpu-metrics-exporter-r2dch", UUID="GPU-de655b8a-1101-25e8-ef43-6bde6c315612", cluster="k8s-cto-gpu-pro", container="fssc-ocr-qa-online", device="nvidia6", gpu="6", instance="10.10.182.63:9400", job="k8s-cto-gpu-pro-gpu", modelName="NVIDIA H20", namespace="fisam", pod="a0703240723011-fssc-ocr-qa-online-67bbc98589-mqskl"} 34
DCGM_FI_DEV_GPU_TEMP{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-02deb8e4-d4fc-a6c9-c189-8e776203d7c0", cluster="k8s-test", container="a0703240710024-qa", device="nvidia7", gpu="7", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20", namespace="qa", pod="a0703240710024-qa-7c576c869b-vtn4x"} 33
DCGM_FI_DEV_GPU_TEMP{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-1bba11cf-d9a9-f77c-9c66-6b99d116faff", cluster="k8s-test", device="nvidia0", gpu="0", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 26
DCGM_FI_DEV_GPU_TEMP{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-1ccf2570-2815-3ced-8898-95648cd876d9", cluster="k8s-test", device="nvidia4", gpu="4", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 27
DCGM_FI_DEV_GPU_TEMP{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-419d9aa5-aa41-5e8c-830c-2217fb5d155f", cluster="k8s-test", device="nvidia3", gpu="3", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 47
DCGM_FI_DEV_GPU_TEMP{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-82160b93-6cae-dfe0-3665-e5a8c22c6897", cluster="k8s-test", device="nvidia2", gpu="2", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 34
DCGM_FI_DEV_GPU_TEMP{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-b45c6ec4-2b71-f68e-bd68-95df336318b6", cluster="k8s-test", device="nvidia1", gpu="1", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 34
DCGM_FI_DEV_GPU_TEMP{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-b7833685-11da-1f34-0f07-5756b22f781c", cluster="k8s-test", device="nvidia5", gpu="5", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 33
DCGM_FI_DEV_GPU_TEMP{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-ff64b19f-1bc9-2652-2fbb-e2df22d2e2e5", cluster="k8s-test", device="nvidia6", gpu="6", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 28
DCGM_FI_DEV_POWER_USAGE{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-02deb8e4-d4fc-a6c9-c189-8e776203d7c0", cluster="k8s-test", container="a0703240710024-qa", device="nvidia7", gpu="7", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20", namespace="qa", pod="a0703240710024-qa-7c576c869b-vtn4x"} 118.396
DCGM_FI_DEV_POWER_USAGE{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-1bba11cf-d9a9-f77c-9c66-6b99d116faff", cluster="k8s-test", device="nvidia0", gpu="0", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 117.088
DCGM_FI_DEV_POWER_USAGE{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-1ccf2570-2815-3ced-8898-95648cd876d9", cluster="k8s-test", device="nvidia4", gpu="4", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 115.64
DCGM_FI_DEV_POWER_USAGE{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-419d9aa5-aa41-5e8c-830c-2217fb5d155f", cluster="k8s-test", device="nvidia3", gpu="3", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 118.583
DCGM_FI_DEV_POWER_USAGE{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-82160b93-6cae-dfe0-3665-e5a8c22c6897", cluster="k8s-test", device="nvidia2", gpu="2", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 115.182
DCGM_FI_DEV_POWER_USAGE{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-b45c6ec4-2b71-f68e-bd68-95df336318b6", cluster="k8s-test", device="nvidia1", gpu="1", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 121.918
DCGM_FI_DEV_POWER_USAGE{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-b7833685-11da-1f34-0f07-5756b22f781c", cluster="k8s-test", device="nvidia5", gpu="5", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 120.652
DCGM_FI_DEV_POWER_USAGE{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-ff64b19f-1bc9-2652-2fbb-e2df22d2e2e5", cluster="k8s-test", device="nvidia6", gpu="6", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 119.484
rate(DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{cluster="k8s-test"}[5m])
查询结果:
{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-02deb8e4-d4fc-a6c9-c189-8e776203d7c0", cluster="k8s-test", container="a0703240710024-qa", device="nvidia7", gpu="7", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20", namespace="qa", pod="a0703240710024-qa-7c576c869b-vtn4x"} 95843.91749251151
{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-1bba11cf-d9a9-f77c-9c66-6b99d116faff", cluster="k8s-test", device="nvidia0", gpu="0", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 95613.14993275795
{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-1ccf2570-2815-3ced-8898-95648cd876d9", cluster="k8s-test", device="nvidia4", gpu="4", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 93766.0105002778
{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-419d9aa5-aa41-5e8c-830c-2217fb5d155f", cluster="k8s-test", device="nvidia3", gpu="3", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 94915.19862014297
{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-82160b93-6cae-dfe0-3665-e5a8c22c6897", cluster="k8s-test", device="nvidia2", gpu="2", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 93046.31376552948
{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-b45c6ec4-2b71-f68e-bd68-95df336318b6", cluster="k8s-test", device="nvidia1", gpu="1", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 99465.33625232431
{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-b7833685-11da-1f34-0f07-5756b22f781c", cluster="k8s-test", device="nvidia5", gpu="5", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 97453.83702762033
{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-ff64b19f-1bc9-2652-2fbb-e2df22d2e2e5", cluster="k8s-test", device="nvidia6", gpu="6", instance="10.10.177.64:9401", job="kubernetes-gpu-exporter", modelName="NVIDIA H20"} 97032.63242375078
PCIe(Peripheral Component Interconnect Express)是一种高速串行计算机扩展总线标准,用于连接计算机主板与各种硬件设备(如显卡、固态硬盘、网卡等)。
编码器(NVENC)利用率(%)
解码器(NVDEC)利用率(%)
常见的 XID 错误类型包括:
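- XID 31 - GPU 内存页错误(MMU Fault),通常由应用程序访问非法地址引起,也可能是驱动或硬件问题
- XID 43 - GPU 停止处理(GPU stopped processing),通常由用户应用程序自身的错误触发
- XID 48 - 发生不可纠正的双比特 ECC 错误(Double Bit ECC Error)
- XID 79 - GPU 从总线上掉线(GPU has fallen off the bus),即常说的"掉卡"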
NVLink 是 NVIDIA 开发的一种高速互联技术,用于在多个 GPU 之间或 GPU 与 CPU 之间传输数据。
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-02deb8e4-d4fc-a6c9-c189-8e776203d7c0", cluster="k8s-test", device="nvidia7", gpu="7", instance="10.10.177.64:9401", job="kubernetes-cto-gpu-test-gpu", modelName="NVIDIA H20"} 0
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-1bba11cf-d9a9-f77c-9c66-6b99d116faff", cluster="k8s-test", device="nvidia0", gpu="0", instance="10.10.177.64:9401", job="kubernetes-cto-gpu-test-gpu", modelName="NVIDIA H20"} 0
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-1ccf2570-2815-3ced-8898-95648cd876d9", cluster="k8s-test", device="nvidia4", gpu="4", instance="10.10.177.64:9401", job="kubernetes-cto-gpu-test-gpu", modelName="NVIDIA H20"} 0
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-419d9aa5-aa41-5e8c-830c-2217fb5d155f", cluster="k8s-test", device="nvidia3", gpu="3", instance="10.10.177.64:9401", job="kubernetes-cto-gpu-test-gpu", modelName="NVIDIA H20"} 342
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-82160b93-6cae-dfe0-3665-e5a8c22c6897", cluster="k8s-test", container="a0703240710024-qa", device="nvidia2", gpu="2", instance="10.10.177.64:9401", job="kubernetes-cto-gpu-test-gpu", modelName="NVIDIA H20", namespace="qa", pod="a0703240710024-qa-7c576c869b-lzclw"} 343
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-b45c6ec4-2b71-f68e-bd68-95df336318b6", cluster="k8s-test", device="nvidia1", gpu="1", instance="10.10.177.64:9401", job="kubernetes-cto-gpu-test-gpu", modelName="NVIDIA H20"} 0
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-b7833685-11da-1f34-0f07-5756b22f781c", cluster="k8s-test", device="nvidia5", gpu="5", instance="10.10.177.64:9401", job="kubernetes-cto-gpu-test-gpu", modelName="NVIDIA H20"} 0
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{Hostname="gpu-metrics-exporter-fplml", UUID="GPU-ff64b19f-1bc9-2652-2fbb-e2df22d2e2e5", cluster="k8s-test", device="nvidia6", gpu="6", instance="10.10.177.64:9401", job="kubernetes-cto-gpu-test-gpu", modelName="NVIDIA H20"} 0
root@liubei:~# nvidia-smi topo -m
GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity GPU NUMA ID
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 0-31,64-95 0 N/A
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 0-31,64-95 0 N/A
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 0-31,64-95 0 N/A
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 0-31,64-95 0 N/A
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 32-63,96-127 1 N/A
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 32-63,96-127 1 N/A
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 32-63,96-127 1 N/A
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X 32-63,96-127 1 N/A
Legend:
X = Self
SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
PIX = Connection traversing at most a single PCIe bridge
NV# = Connection traversing a bonded set of # NVLinks
如上可知: