Skip to content

Instantly share code, notes, and snippets.

@leuc
Last active April 22, 2025 21:32
Show Gist options
  • Save leuc/e45f4dc64dc1db870e4bad1c436228bb to your computer and use it in GitHub Desktop.
Save leuc/e45f4dc64dc1db870e4bad1c436228bb to your computer and use it in GitHub Desktop.
Decode AMD GPU Metrics from SysFS
#!/usr/bin/env python3
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# amdgpu_metrics.py decode amdgpu metrics from sysfs
# Copyright (C) 2021 leuc
#
# This program is free software: you can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
import ctypes
from json import dumps
from enum import IntFlag
COMMON_HEADER_SIZE = 4
class ThrottleStatus(IntFlag):
    """Bit flags used to decode a throttler status bitmask into flag names.

    Iterating an instance, or converting it to ``str``, yields the *names*
    of the flags that are set (in definition order) rather than enum members.
    """

    # linux/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
    PPT0 = 1 << 0
    PPT1 = 1 << 1
    PPT2 = 1 << 2
    PPT3 = 1 << 3
    SPL = 1 << 4
    FPPT = 1 << 5
    SPPT = 1 << 6
    SPPT_APU = 1 << 7
    TDC_GFX = 1 << 16
    TDC_SOC = 1 << 17
    TDC_MEM = 1 << 18
    TDC_VDD = 1 << 19
    TDC_CVIP = 1 << 20
    EDC_CPU = 1 << 21
    EDC_GFX = 1 << 22
    APCC = 1 << 23
    TEMP_GPU = 1 << 32
    TEMP_CORE = 1 << 33
    TEMP_MEM = 1 << 34
    TEMP_EDGE = 1 << 35
    TEMP_HOTSPOT = 1 << 36
    TEMP_SOC = 1 << 37
    TEMP_VR_GFX = 1 << 38
    TEMP_VR_SOC = 1 << 39
    TEMP_VR_MEM0 = 1 << 40
    TEMP_VR_MEM1 = 1 << 41
    TEMP_LIQUID0 = 1 << 42
    TEMP_LIQUID1 = 1 << 43
    VRHOT0 = 1 << 44
    VRHOT1 = 1 << 45
    PROCHOT_CPU = 1 << 46
    PROCHOT_GFX = 1 << 47
    PPM = 1 << 56
    FIT = 1 << 57

    def active(self):
        """Return a generator over the names of all flags set in this value."""
        value = self.value
        # Walk the class-level member map instead of looking members up
        # through the instance: member-from-member access emits a
        # DeprecationWarning on Python 3.11.
        return (
            name
            for name, member in type(self).__members__.items()
            if member.value & value
        )

    def __iter__(self):
        """Iterate over the names of the set flags (see ``active``)."""
        return self.active()

    def __str__(self):
        """Return the set flag names joined by ``', '`` (empty if none set)."""
        return u', '.join(self.active())
class GpuMetrics(ctypes.Structure):
    """Base class for metrics structures that are built from raw bytes.

    Calling a subclass with a bytes-like object returns an instance whose
    fields are copied from that buffer.  Instances iterate as
    ``(field_name, value)`` pairs and render as a multi-line listing.
    """

    def __new__(cls, buf):
        # Construction is delegated entirely to ctypes' buffer-copy factory.
        instance = cls.from_buffer_copy(buf)
        return instance

    def __init__(self, data):
        # Deliberately a no-op: the raw buffer handed to the constructor is
        # not forwarded to ctypes.Structure.__init__.
        return None

    def __iter__(self):
        """Yield ``(name, value)`` for each entry of ``_fields_``, in order."""
        for field in self._fields_:
            field_name = field[0]
            yield field_name, getattr(self, field_name)

    def __str__(self):
        """Return ``'> ClassName'`` followed by one ``name: value`` line per field."""
        title = u'> {}\n'.format(type(self).__name__)
        rows = []
        for field in self._fields_:
            field_name = field[0]
            rows.append(u'{}: {}'.format(field_name, getattr(self, field_name)))
        return title + u'\n'.join(rows)
class MetricsTableHeader(GpuMetrics):
    """Common header that starts every metrics table.

    It carries the size of the whole table and the two revision numbers
    that the script uses to choose the structure for the remaining bytes.
    """

    _fields_ = [
        # 16-bit size field followed by two 8-bit revision fields
        ('structure_size', ctypes.c_uint16),
        ('format_revision', ctypes.c_uint8),
        ('content_revision', ctypes.c_uint8),
    ]
# AMD GPU metrics defined in
# linux/drivers/gpu/drm/amd/include/kgd_pp_interface.h
class GpuMetrics_v1_0(GpuMetrics):
    """Payload of a format 1 / content 0 metrics table (header excluded).

    The structure is decoded from the bytes that follow the 4-byte table
    header, so the layout is packed and all padding is spelled out.

    NOTE(review): in the kernel structure the 64-bit
    ``system_clock_counter`` directly follows the 4-byte header, so a
    64-bit ABI inserts 4 bytes of implicit padding in front of it.  That
    gap is modelled by ``header_padding``.  This assumes a 64-bit kernel;
    confirm against real v1.0 output.
    """

    # Packed layout: offsets are relative to the end of the header, so
    # ctypes must not add alignment padding of its own.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is
    # set; older versions ignore this attribute.
    _layout_ = 'ms'
    _fields_ = [
        ('header_padding', ctypes.c_uint32),
        ('system_clock_counter', ctypes.c_uint64),
        ('temperature_edge', ctypes.c_uint16),
        ('temperature_hotspot', ctypes.c_uint16),
        ('temperature_mem', ctypes.c_uint16),
        ('temperature_vrgfx', ctypes.c_uint16),
        ('temperature_vrsoc', ctypes.c_uint16),
        ('temperature_vrmem', ctypes.c_uint16),
        ('average_gfx_activity', ctypes.c_uint16),
        ('average_umc_activity', ctypes.c_uint16),
        ('average_mm_activity', ctypes.c_uint16),
        ('average_socket_power', ctypes.c_uint16),
        ('energy_accumulator', ctypes.c_uint32),
        ('average_gfxclk_frequency', ctypes.c_uint16),
        ('average_socclk_frequency', ctypes.c_uint16),
        ('average_uclk_frequency', ctypes.c_uint16),
        ('average_vclk0_frequency', ctypes.c_uint16),
        ('average_dclk0_frequency', ctypes.c_uint16),
        ('average_vclk1_frequency', ctypes.c_uint16),
        ('average_dclk1_frequency', ctypes.c_uint16),
        ('current_gfxclk', ctypes.c_uint16),
        ('current_socclk', ctypes.c_uint16),
        ('current_uclk', ctypes.c_uint16),
        ('current_vclk0', ctypes.c_uint16),
        ('current_dclk0', ctypes.c_uint16),
        ('current_vclk1', ctypes.c_uint16),
        ('current_dclk1', ctypes.c_uint16),
        ('throttle_status', ctypes.c_uint32),
        ('current_fan_speed', ctypes.c_uint16),
        ('pcie_link_width', ctypes.c_uint8),
        ('pcie_link_speed', ctypes.c_uint8),
    ]
class GpuMetrics_v1_1(GpuMetrics):
    """Payload of a format 1 / content 1 metrics table (header excluded).

    ``temperature_hbm`` is an array in the kernel structure
    (kgd_pp_interface.h, 4 HBM instances).  It is exposed here as indexed
    scalar fields ``temperature_hbm_0`` .. ``temperature_hbm_3`` so every
    value stays a plain integer.
    """

    # Packed layout: the structure is decoded from the bytes after the
    # 4-byte header, so default alignment would shift the 64-bit fields.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is
    # set; older versions ignore this attribute.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_edge', ctypes.c_uint16),
            ('temperature_hotspot', ctypes.c_uint16),
            ('temperature_mem', ctypes.c_uint16),
            ('temperature_vrgfx', ctypes.c_uint16),
            ('temperature_vrsoc', ctypes.c_uint16),
            ('temperature_vrmem', ctypes.c_uint16),
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_umc_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('average_socket_power', ctypes.c_uint16),
            ('energy_accumulator', ctypes.c_uint64),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_vclk0_frequency', ctypes.c_uint16),
            ('average_dclk0_frequency', ctypes.c_uint16),
            ('average_vclk1_frequency', ctypes.c_uint16),
            ('average_dclk1_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_vclk0', ctypes.c_uint16),
            ('current_dclk0', ctypes.c_uint16),
            ('current_vclk1', ctypes.c_uint16),
            ('current_dclk1', ctypes.c_uint16),
            ('throttle_status', ctypes.c_uint32),
            ('current_fan_speed', ctypes.c_uint16),
            ('pcie_link_width', ctypes.c_uint16),
            ('pcie_link_speed', ctypes.c_uint16),
            ('padding', ctypes.c_uint16),
            ('gfx_activity_acc', ctypes.c_uint32),
            ('mem_activity_acc', ctypes.c_uint32),
        ]
        + [('temperature_hbm_{}'.format(i), ctypes.c_uint16) for i in range(4)]
    )
class GpuMetrics_v1_2(GpuMetrics):
    """Payload of a format 1 / content 2 metrics table (header excluded).

    Same fields as revision 1.1 followed by ``firmware_timestamp``.
    ``temperature_hbm`` is an array in the kernel structure
    (kgd_pp_interface.h, 4 HBM instances) and is exposed as
    ``temperature_hbm_0`` .. ``temperature_hbm_3``.
    """

    # Packed layout: the structure is decoded from the bytes after the
    # 4-byte header, so default alignment would shift the 64-bit fields.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is
    # set; older versions ignore this attribute.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_edge', ctypes.c_uint16),
            ('temperature_hotspot', ctypes.c_uint16),
            ('temperature_mem', ctypes.c_uint16),
            ('temperature_vrgfx', ctypes.c_uint16),
            ('temperature_vrsoc', ctypes.c_uint16),
            ('temperature_vrmem', ctypes.c_uint16),
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_umc_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('average_socket_power', ctypes.c_uint16),
            ('energy_accumulator', ctypes.c_uint64),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_vclk0_frequency', ctypes.c_uint16),
            ('average_dclk0_frequency', ctypes.c_uint16),
            ('average_vclk1_frequency', ctypes.c_uint16),
            ('average_dclk1_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_vclk0', ctypes.c_uint16),
            ('current_dclk0', ctypes.c_uint16),
            ('current_vclk1', ctypes.c_uint16),
            ('current_dclk1', ctypes.c_uint16),
            ('throttle_status', ctypes.c_uint32),
            ('current_fan_speed', ctypes.c_uint16),
            ('pcie_link_width', ctypes.c_uint16),
            ('pcie_link_speed', ctypes.c_uint16),
            ('padding', ctypes.c_uint16),
            ('gfx_activity_acc', ctypes.c_uint32),
            ('mem_activity_acc', ctypes.c_uint32),
        ]
        + [('temperature_hbm_{}'.format(i), ctypes.c_uint16) for i in range(4)]
        + [
            ('firmware_timestamp', ctypes.c_uint64),
        ]
    )
class GpuMetrics_v1_3(GpuMetrics):
    """Payload of a format 1 / content 3 metrics table (header excluded).

    Same fields as revision 1.2 followed by the three voltage fields,
    ``padding1`` and ``indep_throttle_status``.  ``temperature_hbm`` is
    an array in the kernel structure (kgd_pp_interface.h, 4 HBM
    instances) and is exposed as ``temperature_hbm_0`` ..
    ``temperature_hbm_3``.

    NOTE(review): ``indep_throttle_status`` used to be commented out
    because the decoded size did not match real output.  With the packed
    layout, the 4-entry HBM array and a 16-bit ``padding1`` the payload is
    116 bytes (120 with the header).  Confirm on hardware.
    """

    # Packed layout: the structure is decoded from the bytes after the
    # 4-byte header, so default alignment would shift the 64-bit fields.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is
    # set; older versions ignore this attribute.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_edge', ctypes.c_uint16),
            ('temperature_hotspot', ctypes.c_uint16),
            ('temperature_mem', ctypes.c_uint16),
            ('temperature_vrgfx', ctypes.c_uint16),
            ('temperature_vrsoc', ctypes.c_uint16),
            ('temperature_vrmem', ctypes.c_uint16),
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_umc_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('average_socket_power', ctypes.c_uint16),
            ('energy_accumulator', ctypes.c_uint64),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_vclk0_frequency', ctypes.c_uint16),
            ('average_dclk0_frequency', ctypes.c_uint16),
            ('average_vclk1_frequency', ctypes.c_uint16),
            ('average_dclk1_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_vclk0', ctypes.c_uint16),
            ('current_dclk0', ctypes.c_uint16),
            ('current_vclk1', ctypes.c_uint16),
            ('current_dclk1', ctypes.c_uint16),
            ('throttle_status', ctypes.c_uint32),
            ('current_fan_speed', ctypes.c_uint16),
            ('pcie_link_width', ctypes.c_uint16),
            ('pcie_link_speed', ctypes.c_uint16),
            ('padding', ctypes.c_uint16),
            ('gfx_activity_acc', ctypes.c_uint32),
            ('mem_activity_acc', ctypes.c_uint32),
        ]
        + [('temperature_hbm_{}'.format(i), ctypes.c_uint16) for i in range(4)]
        + [
            ('firmware_timestamp', ctypes.c_uint64),
            ('voltage_soc', ctypes.c_uint16),
            ('voltage_gfx', ctypes.c_uint16),
            ('voltage_mem', ctypes.c_uint16),
            # 16 bits wide so the following 64-bit field is naturally aligned
            ('padding1', ctypes.c_uint16),
            ('indep_throttle_status', ctypes.c_uint64),
        ]
    )
class GpuMetrics_v2_0(GpuMetrics):
    """Payload of a format 2 / content 0 metrics table (header excluded).

    The per-core and per-L3 values are arrays in the kernel structure
    (kgd_pp_interface.h: 8 cores, 2 L3 instances).  They are exposed here
    as indexed scalar fields such as ``temperature_core_0`` ..
    ``temperature_core_7`` so every value stays a plain integer.

    NOTE(review): in the kernel structure the 64-bit
    ``system_clock_counter`` directly follows the 4-byte header, so a
    64-bit ABI inserts 4 bytes of implicit padding in front of it.  That
    gap is modelled by ``header_padding``.  This assumes a 64-bit kernel;
    confirm against real v2.0 output.
    """

    # Packed layout: offsets are relative to the end of the header, so
    # ctypes must not add alignment padding of its own.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is
    # set; older versions ignore this attribute.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('header_padding', ctypes.c_uint32),
            ('system_clock_counter', ctypes.c_uint64),
            ('temperature_gfx', ctypes.c_uint16),
            ('temperature_soc', ctypes.c_uint16),
        ]
        + [('temperature_core_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [('temperature_l3_{}'.format(i), ctypes.c_uint16) for i in range(2)]
        + [
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('average_socket_power', ctypes.c_uint16),
            ('average_cpu_power', ctypes.c_uint16),
            ('average_soc_power', ctypes.c_uint16),
            ('average_gfx_power', ctypes.c_uint16),
        ]
        + [('average_core_power_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_fclk_frequency', ctypes.c_uint16),
            ('average_vclk_frequency', ctypes.c_uint16),
            ('average_dclk_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_fclk', ctypes.c_uint16),
            ('current_vclk', ctypes.c_uint16),
            ('current_dclk', ctypes.c_uint16),
        ]
        + [('current_coreclk_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [('current_l3clk_{}'.format(i), ctypes.c_uint16) for i in range(2)]
        + [
            ('throttle_status', ctypes.c_uint32),
            ('fan_pwm', ctypes.c_uint16),
            ('padding', ctypes.c_uint16),
        ]
    )
class GpuMetrics_v2_1(GpuMetrics):
    """Payload of a format 2 / content 1 metrics table (header excluded).

    The per-core and per-L3 values and the trailing padding are arrays in
    the kernel structure (kgd_pp_interface.h: 8 cores, 2 L3 instances,
    3 padding words).  They are exposed here as indexed scalar fields such
    as ``temperature_core_0`` .. ``temperature_core_7`` so every value
    stays a plain integer.
    """

    # Packed layout: the structure is decoded from the bytes after the
    # 4-byte header, so default alignment would shift the 64-bit field.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is
    # set; older versions ignore this attribute.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_gfx', ctypes.c_uint16),
            ('temperature_soc', ctypes.c_uint16),
        ]
        + [('temperature_core_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [('temperature_l3_{}'.format(i), ctypes.c_uint16) for i in range(2)]
        + [
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('system_clock_counter', ctypes.c_uint64),
            ('average_socket_power', ctypes.c_uint16),
            ('average_cpu_power', ctypes.c_uint16),
            ('average_soc_power', ctypes.c_uint16),
            ('average_gfx_power', ctypes.c_uint16),
        ]
        + [('average_core_power_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_fclk_frequency', ctypes.c_uint16),
            ('average_vclk_frequency', ctypes.c_uint16),
            ('average_dclk_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_fclk', ctypes.c_uint16),
            ('current_vclk', ctypes.c_uint16),
            ('current_dclk', ctypes.c_uint16),
        ]
        + [('current_coreclk_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [('current_l3clk_{}'.format(i), ctypes.c_uint16) for i in range(2)]
        + [
            ('throttle_status', ctypes.c_uint32),
            ('fan_pwm', ctypes.c_uint16),
        ]
        + [('padding_{}'.format(i), ctypes.c_uint16) for i in range(3)]
    )
class GpuMetrics_v2_2(GpuMetrics):
    """Payload of a format 2 / content 2 metrics table (header excluded).

    Same fields as revision 2.1 followed by ``indep_throttle_status``.
    The per-core and per-L3 values and the padding are arrays in the
    kernel structure (kgd_pp_interface.h: 8 cores, 2 L3 instances,
    3 padding words).  They are exposed here as indexed scalar fields such
    as ``temperature_core_0`` .. ``temperature_core_7`` so every value
    stays a plain integer.

    The payload is 124 bytes, i.e. 128 with the 4-byte header.
    """

    # Packed layout: the structure is decoded from the bytes after the
    # 4-byte header, so default alignment would shift the 64-bit fields.
    _pack_ = 1
    # Python 3.14+ wants the layout named explicitly whenever _pack_ is
    # set; older versions ignore this attribute.
    _layout_ = 'ms'
    _fields_ = (
        [
            ('temperature_gfx', ctypes.c_uint16),
            ('temperature_soc', ctypes.c_uint16),
        ]
        + [('temperature_core_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [('temperature_l3_{}'.format(i), ctypes.c_uint16) for i in range(2)]
        + [
            ('average_gfx_activity', ctypes.c_uint16),
            ('average_mm_activity', ctypes.c_uint16),
            ('system_clock_counter', ctypes.c_uint64),
            # listed once: a duplicate entry would shift every later field
            ('average_socket_power', ctypes.c_uint16),
            ('average_cpu_power', ctypes.c_uint16),
            ('average_soc_power', ctypes.c_uint16),
            ('average_gfx_power', ctypes.c_uint16),
        ]
        + [('average_core_power_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [
            ('average_gfxclk_frequency', ctypes.c_uint16),
            ('average_socclk_frequency', ctypes.c_uint16),
            ('average_uclk_frequency', ctypes.c_uint16),
            ('average_fclk_frequency', ctypes.c_uint16),
            ('average_vclk_frequency', ctypes.c_uint16),
            ('average_dclk_frequency', ctypes.c_uint16),
            ('current_gfxclk', ctypes.c_uint16),
            ('current_socclk', ctypes.c_uint16),
            ('current_uclk', ctypes.c_uint16),
            ('current_fclk', ctypes.c_uint16),
            ('current_vclk', ctypes.c_uint16),
            ('current_dclk', ctypes.c_uint16),
        ]
        + [('current_coreclk_{}'.format(i), ctypes.c_uint16) for i in range(8)]
        + [('current_l3clk_{}'.format(i), ctypes.c_uint16) for i in range(2)]
        + [
            ('throttle_status', ctypes.c_uint32),
            ('fan_pwm', ctypes.c_uint16),
        ]
        + [('padding_{}'.format(i), ctypes.c_uint16) for i in range(3)]
        + [
            ('indep_throttle_status', ctypes.c_uint64),
        ]
    )
# Maps (format_revision, content_revision) from the table header to the
# structure that decodes the bytes following the header.
_METRICS_BY_REVISION = {
    (1, 0): GpuMetrics_v1_0,
    (1, 1): GpuMetrics_v1_1,
    (1, 2): GpuMetrics_v1_2,
    (1, 3): GpuMetrics_v1_3,
    (2, 0): GpuMetrics_v2_0,
    (2, 1): GpuMetrics_v2_1,
    (2, 2): GpuMetrics_v2_2,
}


def _load_metrics(filename):
    """Read one gpu_metrics file and return ``(header, metrics)``.

    Raises:
        OSError: the file cannot be opened or read.
        ValueError: the file is truncated, carries trailing data, declares
            an implausible size, or uses an unsupported revision.
    """
    with open(filename, mode='rb') as fh:
        header_bytes = fh.read(COMMON_HEADER_SIZE)
        if len(header_bytes) != COMMON_HEADER_SIZE:
            raise ValueError(
                "{}: truncated metrics table header".format(filename))
        header = MetricsTableHeader(header_bytes)

        # structure_size counts the header too; a non-positive payload
        # size must be rejected because read(-n) would read everything.
        payload_size = header.structure_size - COMMON_HEADER_SIZE
        if payload_size <= 0:
            raise ValueError("{}: invalid structure_size {}".format(
                filename, header.structure_size))
        buf = fh.read(payload_size)
        trailing = fh.read()

    # Explicit checks instead of assert, which is stripped under -O.
    if len(buf) != payload_size:
        raise ValueError("{}: expected {} payload bytes, got {}".format(
            filename, payload_size, len(buf)))
    if trailing:
        raise ValueError("{}: {} unexpected trailing bytes".format(
            filename, len(trailing)))

    revision = (header.format_revision, header.content_revision)
    try:
        metrics_type = _METRICS_BY_REVISION[revision]
    except KeyError:
        raise ValueError(
            "Unsupported metrics v{}.{}".format(*revision)) from None
    return header, metrics_type(buf)


def main():
    """Decode every gpu_metrics file named on the command line and print it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('files', nargs='+',
                        help='Path to gpu_metrics file under /sys')
    parser.add_argument('-j', '--json',
                        help='Format output as JSON', action="store_true")
    args = parser.parse_args()

    for filename in args.files:
        header, metrics = _load_metrics(filename)

        # ThrottleStatus defines bits up to 57, which only fit the 64-bit
        # indep_throttle_status field, so prefer it when the structure has
        # one.  Older revisions only provide the 32-bit throttle_status.
        # NOTE(review): per kgd_pp_interface.h that 32-bit field is ASIC
        # dependent, so its decoded flag names may not be meaningful.
        raw_status = getattr(
            metrics, 'indep_throttle_status', metrics.throttle_status)
        ts = ThrottleStatus(raw_status)

        if args.json:
            record = {"path": filename}
            record.update(iter(header))
            record.update(iter(metrics))
            record['throttle_status_flags'] = list(ts)
            print(dumps(record))
        else:
            print(filename)
            print(header)
            print(metrics)
            print("throttle_status_flags:", ts)


if __name__ == "__main__":
    main()
@leuc
Copy link
Author

leuc commented Jan 5, 2023

try manually setting the power profile to 3D_FULL_SCREEN

as root

echo manual > /sys/class/drm/card0/device/power_dpm_force_performance_level
echo "1" > /sys/class/drm/card0/device/pp_power_profile_mode

performance difference might only show with a real benchmark...

@dlm21
Copy link

dlm21 commented Jan 5, 2023

Thanks, I used the only benchmark I have installed at the moment, FFXIV Endwalker, and it seemed to bump the score a little, though it's mostly CPU-bottlenecked. The main thing I noticed is that the GPU stayed at higher clocks and pulled maximum power most of the time rather than dropping off as soon as possible. It still goes back to normal at idle / on the desktop, so I'll set this as the default for a while. Thanks!
I still see the ever-changing "throttle_status_flags", like PPT0, PPT1, PPT2, FPPT, TDC_CVIP, but I'm pretty sure those are red herrings and not really indicative of a real problem or any actual throttling.

@Umio-Yasuno
Copy link

temperature_hbm, temperature_core, temperature_l3, average_core_power, current_coreclk, current_l3clk, padding are arrays.
Therefore, it looks like the data is misaligned.

> MetricsTableHeader
structure_size: 128
format_revision: 2
content_revision: 2
> GpuMetrics_v2_2
temperature_gfx: 3850
temperature_soc: 3925
temperature_core: 3850
temperature_l3: 3975
average_gfx_activity: 3875
average_mm_activity: 5250
system_clock_counter: 17287498960675
average_socket_power: 0
average_socket_power: 0
average_cpu_power: 62303
average_soc_power: 17492
average_gfx_power: 19098
average_core_power: 0
average_gfxclk_frequency: 11
average_socclk_frequency: 6978
average_uclk_frequency: 1744
average_fclk_frequency: 65535
average_vclk_frequency: 0
average_dclk_frequency: 353
current_gfxclk: 0
current_socclk: 8886
current_uclk: 351
current_fclk: 350
current_vclk: 341
current_dclk: 343
current_coreclk: 400
current_l3clk: 400
throttle_status: 104857599
fan_pwm: 400
padding: 65535
indep_throttle_status: 450359988533068176
throttle_status_flags: PPT0, PPT1, PPT2, PPT3, SPL, FPPT, SPPT, SPPT_APU, TDC_GFX, TDC_SOC, TDC_MEM, TDC_VDD, TDC_CVIP, EDC_CPU
V2_2(
    gpu_metrics_v2_2 {
        common_header: metrics_table_header {
            structure_size: 128,
            format_revision: 2,
            content_revision: 2,
        },
        temperature_gfx: 4050,
        temperature_soc: 3950,
        temperature_core: [
            3875,
            3925,
            3900,
            4000,
            4200,
            4000,
            5225,
            4050,
        ],
        temperature_l3: [
            4125,
            0,
        ],
        average_gfx_activity: 2,
        average_mm_activity: 0,
        system_clock_counter: 82267147835201,
        average_socket_power: 14,
        average_cpu_power: 8300,
        average_soc_power: 2587,
        average_gfx_power: 65535,
        average_core_power: [
            0,
            396,
            0,
            403,
            380,
            339,
            4438,
            471,
        ],
        average_gfxclk_frequency: 401,
        average_socclk_frequency: 401,
        average_uclk_frequency: 65535,
        average_fclk_frequency: 1599,
        average_vclk_frequency: 400,
        average_dclk_frequency: 65535,
        current_gfxclk: 1900,
        current_socclk: 975,
        current_uclk: 6,
        current_fclk: 1600,
        current_vclk: 400,
        current_dclk: 400,
        current_coreclk: [
            0,
            3560,
            0,
            3560,
            3560,
            3560,
            4450,
            3560,
        ],
        current_l3clk: [
            4450,
            0,
        ],
        throttle_status: 0,
        fan_pwm: 0,
        padding: [
            65535,
            65535,
            65535,
        ],
        indep_throttle_status: 0,
    },
)

https://github.com/Umio-Yasuno/libdrm-amdgpu-sys-rs/blob/main/examples/gpu_metrics.rs

@Umio-Yasuno
Copy link

@leuc amdgpu_metrics.py will give partially incorrect results because _pack_ = 1 is not set.

@shmerl
Copy link

shmerl commented Mar 18, 2025

Does AMD document the layout of this metrics file anywhere, or is reading the driver code the only way to analyze it?

@Umio-Yasuno
Copy link

@shmerl
The gpu_metrics structure is defined in drivers/gpu/drm/amd/include/kgd_pp_interface.h.
However, the units of the fields may vary depending on the device and firmware.

https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/amd/include/kgd_pp_interface.h

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment