-
-
Notifications
You must be signed in to change notification settings - Fork 402
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimized implementation to use IOKit directly. (#897)
* Optimized implementation. * Only test ScaleneAppleGPU on Darwin platforms. * Only test ScaleneAppleGPU on Darwin platforms (revised). * Only test ScaleneAppleGPU on Darwin platforms (revised redux). * Only test ScaleneAppleGPU on Darwin platforms (revised redux). * Only test ScaleneAppleGPU on Darwin platforms (revised redux). * Only test ScaleneAppleGPU on Darwin platforms (revised redux). * Fixed memory reporting to MB. * Disabled useless Apple GPU tests.
- Loading branch information
1 parent
060d490
commit 9a2e499
Showing
5 changed files
with
206 additions
and
179 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,73 +1,227 @@ | ||
import platform | ||
import random | ||
import re | ||
import subprocess | ||
import ctypes | ||
import time | ||
from typing import Tuple | ||
|
||
from scalene.scalene_accelerator import ScaleneAccelerator | ||
# --------------------------------------------------------------------------- | ||
# 1. Define the needed IOKit / CoreFoundation constants and function signatures | ||
# --------------------------------------------------------------------------- | ||
iokit = ctypes.cdll.LoadLibrary("/System/Library/Frameworks/IOKit.framework/IOKit") | ||
corefoundation = ctypes.cdll.LoadLibrary("/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation") | ||
|
||
CFTypeRef = ctypes.c_void_p | ||
CFAllocatorRef = ctypes.c_void_p | ||
IOOptionBits = ctypes.c_uint32 | ||
io_iterator_t = ctypes.c_void_p | ||
io_registry_entry_t = ctypes.c_void_p | ||
mach_port_t = ctypes.c_void_p | ||
|
||
class ScaleneAppleGPU(ScaleneAccelerator): | ||
"""Wrapper class for Apple integrated GPU statistics.""" | ||
try: | ||
# On Intel Macs, kIOMasterPortDefault might be defined; on Apple Silicon, it may just be 0. | ||
kIOMasterPortDefault = ctypes.c_void_p.in_dll(iokit, 'kIOMasterPortDefault') | ||
except ValueError: | ||
kIOMasterPortDefault = mach_port_t(0) | ||
|
||
IOServiceMatching = iokit.IOServiceMatching | ||
IOServiceMatching.argtypes = [ctypes.c_char_p] | ||
IOServiceMatching.restype = CFTypeRef | ||
|
||
IOServiceGetMatchingServices = iokit.IOServiceGetMatchingServices | ||
IOServiceGetMatchingServices.argtypes = [ | ||
mach_port_t, | ||
CFTypeRef, | ||
ctypes.POINTER(io_iterator_t), | ||
] | ||
IOServiceGetMatchingServices.restype = ctypes.c_int # kern_return_t | ||
|
||
IOIteratorNext = iokit.IOIteratorNext | ||
IOIteratorNext.argtypes = [io_iterator_t] | ||
IOIteratorNext.restype = io_registry_entry_t | ||
|
||
IOObjectRelease = iokit.IOObjectRelease | ||
IOObjectRelease.argtypes = [io_registry_entry_t] | ||
IOObjectRelease.restype = ctypes.c_int # kern_return_t | ||
|
||
IORegistryEntryCreateCFProperties = iokit.IORegistryEntryCreateCFProperties | ||
IORegistryEntryCreateCFProperties.argtypes = [ | ||
io_registry_entry_t, | ||
ctypes.POINTER(CFTypeRef), | ||
CFAllocatorRef, | ||
IOOptionBits, | ||
] | ||
IORegistryEntryCreateCFProperties.restype = CFTypeRef | ||
|
||
CFGetTypeID = corefoundation.CFGetTypeID | ||
CFGetTypeID.argtypes = [CFTypeRef] | ||
CFGetTypeID.restype = ctypes.c_long | ||
|
||
CFDictionaryGetTypeID = corefoundation.CFDictionaryGetTypeID | ||
CFDictionaryGetTypeID.argtypes = [] | ||
CFDictionaryGetTypeID.restype = ctypes.c_long | ||
|
||
CFStringCreateWithCString = corefoundation.CFStringCreateWithCString | ||
CFStringCreateWithCString.argtypes = [CFAllocatorRef, ctypes.c_char_p, ctypes.c_uint32] | ||
CFStringCreateWithCString.restype = CFTypeRef | ||
|
||
CFDictionaryGetValue = corefoundation.CFDictionaryGetValue | ||
CFDictionaryGetValue.argtypes = [CFTypeRef, CFTypeRef] | ||
CFDictionaryGetValue.restype = CFTypeRef | ||
|
||
CFNumberGetTypeID = corefoundation.CFNumberGetTypeID | ||
CFNumberGetTypeID.argtypes = [] | ||
CFNumberGetTypeID.restype = ctypes.c_long | ||
|
||
CFNumberGetValue = corefoundation.CFNumberGetValue | ||
CFNumberGetValue.argtypes = [CFTypeRef, ctypes.c_int, ctypes.c_void_p] | ||
CFNumberGetValue.restype = ctypes.c_bool | ||
|
||
CFNumberGetType = corefoundation.CFNumberGetType | ||
CFNumberGetType.argtypes = [CFTypeRef] | ||
CFNumberGetType.restype = ctypes.c_int | ||
|
||
CFShow = corefoundation.CFShow | ||
CFShow.argtypes = [CFTypeRef] | ||
|
||
kCFNumberSInt64Type = 4 # 64-bit integers | ||
|
||
def cfstr(py_str: str) -> CFTypeRef: | ||
"""Helper to create a CFString from a Python string.""" | ||
return CFStringCreateWithCString(None, py_str.encode('utf-8'), 0) | ||
|
||
def _read_apple_gpu_stats_and_cores() -> Tuple[float, float, int]: | ||
""" | ||
Reads from IOService class "IOAccelerator" and returns: | ||
(device_util, in_use_mem, gpu_core_count) | ||
where: | ||
- device_util is a fraction [0..1]. | ||
- in_use_mem is in megabytes. | ||
- gpu_core_count is an integer from top-level "gpu-core-count". | ||
""" | ||
matching_dict = IOServiceMatching(b"IOAccelerator") | ||
if not matching_dict: | ||
# debug_print("[DEBUG] Could not create matching dictionary.") | ||
return (0.0, 0.0, 0) | ||
|
||
service_iterator = io_iterator_t() | ||
kr = IOServiceGetMatchingServices(kIOMasterPortDefault, matching_dict, ctypes.byref(service_iterator)) | ||
if kr != 0: | ||
# debug_print(f"[DEBUG] IOServiceGetMatchingServices returned kr={kr}. Possibly no services found.") | ||
return (0.0, 0.0, 0) | ||
|
||
device_util = 0.0 | ||
in_use_mem = 0.0 | ||
gpu_core_count = 0 | ||
|
||
while True: | ||
service_object = IOIteratorNext(service_iterator) | ||
if not service_object: | ||
# No more services | ||
break | ||
|
||
props_ref = CFTypeRef() | ||
IORegistryEntryCreateCFProperties(service_object, ctypes.byref(props_ref), None, 0) | ||
|
||
# The top-level dictionary: | ||
if props_ref and CFGetTypeID(props_ref) == CFDictionaryGetTypeID(): | ||
# 1. Grab "gpu-core-count" at the top level | ||
top_key_cores = cfstr("gpu-core-count") | ||
core_val_ref = CFDictionaryGetValue(props_ref, top_key_cores) | ||
if core_val_ref and (CFGetTypeID(core_val_ref) == CFNumberGetTypeID()): | ||
val_container_64 = ctypes.c_longlong(0) | ||
success = CFNumberGetValue(core_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64)) | ||
if success: | ||
gpu_core_count = val_container_64.value | ||
IOObjectRelease(top_key_cores) | ||
|
||
# 2. Check for sub-dictionary "PerformanceStatistics" | ||
performance_key = cfstr("PerformanceStatistics") | ||
performance_dict_ref = CFDictionaryGetValue(props_ref, performance_key) | ||
IOObjectRelease(performance_key) | ||
|
||
if performance_dict_ref and (CFGetTypeID(performance_dict_ref) == CFDictionaryGetTypeID()): | ||
cf_key_util = cfstr("Device Utilization %") | ||
cf_key_mem = cfstr("In use system memory") | ||
|
||
# Device Utilization | ||
util_val_ref = CFDictionaryGetValue(performance_dict_ref, cf_key_util) | ||
if util_val_ref and (CFGetTypeID(util_val_ref) == CFNumberGetTypeID()): | ||
val_container_64 = ctypes.c_longlong(0) | ||
success = CFNumberGetValue(util_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64)) | ||
if success: | ||
device_util = val_container_64.value / 100.0 | ||
|
||
# In use system memory | ||
mem_val_ref = CFDictionaryGetValue(performance_dict_ref, cf_key_mem) | ||
if mem_val_ref and (CFGetTypeID(mem_val_ref) == CFNumberGetTypeID()): | ||
val_container_64 = ctypes.c_longlong(0) | ||
success = CFNumberGetValue(mem_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64)) | ||
if success: | ||
in_use_mem = float(val_container_64.value) / 1048576.0 | ||
|
||
IOObjectRelease(cf_key_util) | ||
IOObjectRelease(cf_key_mem) | ||
|
||
IOObjectRelease(props_ref) | ||
|
||
IOObjectRelease(service_object) | ||
|
||
if (device_util > 0.0 or in_use_mem > 0.0) and gpu_core_count > 0: | ||
# Success, break | ||
break | ||
|
||
IOObjectRelease(service_iterator) | ||
return (device_util, in_use_mem, gpu_core_count) | ||
|
||
|
||
class ScaleneAppleGPU: | ||
"""Wrapper class for Apple integrated GPU statistics, using direct IOKit calls.""" | ||
|
||
def __init__(self, sampling_frequency: int = 100) -> None: | ||
assert platform.system() == "Darwin" | ||
self.cmd = ( | ||
'DYLD_INSERT_LIBRARIES="" ioreg -r -d 1 -w 0 -c "IOAccelerator"' | ||
) | ||
self.regex_util = re.compile(r'"Device Utilization %"=(\d+)') | ||
self.regex_inuse = re.compile(r'"In use system memory"=(\d+)') | ||
# Only actually get stats some fraction of the time, since it is costly. | ||
# Used in get_stats(). | ||
self.gpu_sampling_frequency = sampling_frequency | ||
self.core_count = self._get_num_cores() | ||
|
||
def gpu_device(self) -> str: | ||
return "GPU" | ||
|
||
def has_gpu(self) -> bool: | ||
"""True iff there is a GPU""" | ||
# Disabling Apple GPU, since it does not collect per-process statistics. | ||
return False | ||
"""Return True if the system likely has an Apple integrated GPU.""" | ||
return True | ||
|
||
def reinit(self) -> None: | ||
"""A NOP, here for compatibility with the nvidia wrapper.""" | ||
return | ||
"""No-op for compatibility with other GPU wrappers.""" | ||
pass | ||
|
||
def get_num_cores(self) -> int: | ||
# FIXME: not yet implemented | ||
return 1 | ||
return self.core_count | ||
|
||
def get_stats(self) -> Tuple[float, float]: | ||
"""Returns a tuple of (utilization%, memory in use)""" | ||
"""Returns a tuple of (utilization%, memory in use in megabytes).""" | ||
if not self.has_gpu(): | ||
return (0.0, 0.0) | ||
try: | ||
# Only periodically query the statistics for real (at a | ||
# rate of 1/self.gpu_sampling_frequency). We do this to | ||
# amortize its cost, as it is shockingly expensive. | ||
if random.randint(0, self.gpu_sampling_frequency - 1) != 0: | ||
return (0.0, 0.0) | ||
in_use = 0.0 | ||
util = 0.0 | ||
read_process = subprocess.Popen( | ||
self.cmd, shell=True, stdout=subprocess.PIPE | ||
) | ||
if read_process.stdout is not None: | ||
read_process_return = read_process.stdout.readlines() | ||
for line in read_process_return: | ||
decoded_line = line.decode("utf-8") | ||
# print(decoded_line) | ||
if "In use system memory" in decoded_line: | ||
in_use_re = self.regex_inuse.search(decoded_line) | ||
if in_use_re: | ||
in_use = float(in_use_re.group(1)) | ||
if "Device Utilization %" in decoded_line: | ||
util_re = self.regex_util.search(decoded_line) | ||
if util_re: | ||
util = int(util_re.group(1)) / 1000 | ||
if util and in_use: | ||
break | ||
return (util, in_use) | ||
except Exception: | ||
pass | ||
return (0.0, 0.0) | ||
util, in_use, _ = _read_apple_gpu_stats_and_cores() | ||
return (util, in_use) | ||
except Exception as ex: | ||
return (0.0, 0.0) | ||
|
||
def _get_num_cores(self) -> int: | ||
""" | ||
Retrieves the 'gpu-core-count' property from the top-level dictionary. | ||
Returns 0 if not found. | ||
""" | ||
# We reuse the same function that gathers utilization & memory | ||
_, _, core_count = _read_apple_gpu_stats_and_cores() | ||
return core_count | ||
|
||
if __name__ == "__main__": | ||
gpu = ScaleneAppleGPU() | ||
while True: | ||
util, mem = gpu.get_stats() | ||
cores = gpu.get_num_cores() | ||
print( | ||
f"GPU Utilization: {util*100:.1f}%, " | ||
f"In-Use GPU Memory: {mem} megabytes, " | ||
f"GPU Core Count: {cores}" | ||
) | ||
time.sleep(2) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.