Skip to content

Commit

Permalink
Optimized implementation to use IOKit directly. (#897)
Browse files Browse the repository at this point in the history
* Optimized implementation.

* Only test ScaleneAppleGPU on Darwin platforms.

* Only test ScaleneAppleGPU on Darwin platforms (revised).

* Only test ScaleneAppleGPU on Darwin platforms (revised redux).

* Only test ScaleneAppleGPU on Darwin platforms (revised redux).

* Only test ScaleneAppleGPU on Darwin platforms (revised redux).

* Only test ScaleneAppleGPU on Darwin platforms (revised redux).

* Fixed memory reporting to MB.

* Disabled useless Apple GPU tests.
  • Loading branch information
emeryberger authored Dec 31, 2024
1 parent 060d490 commit 9a2e499
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 179 deletions.
254 changes: 204 additions & 50 deletions scalene/scalene_apple_gpu.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,227 @@
import platform
import random
import re
import subprocess
import ctypes
import time
from typing import Tuple

from scalene.scalene_accelerator import ScaleneAccelerator
# ---------------------------------------------------------------------------
# 1. Define the needed IOKit / CoreFoundation constants and function signatures
# ---------------------------------------------------------------------------
iokit = ctypes.cdll.LoadLibrary("/System/Library/Frameworks/IOKit.framework/IOKit")
corefoundation = ctypes.cdll.LoadLibrary("/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation")

CFTypeRef = ctypes.c_void_p
CFAllocatorRef = ctypes.c_void_p
IOOptionBits = ctypes.c_uint32
io_iterator_t = ctypes.c_void_p
io_registry_entry_t = ctypes.c_void_p
mach_port_t = ctypes.c_void_p

class ScaleneAppleGPU(ScaleneAccelerator):
"""Wrapper class for Apple integrated GPU statistics."""
try:
# On Intel Macs, kIOMasterPortDefault might be defined; on Apple Silicon, it may just be 0.
kIOMasterPortDefault = ctypes.c_void_p.in_dll(iokit, 'kIOMasterPortDefault')
except ValueError:
kIOMasterPortDefault = mach_port_t(0)

IOServiceMatching = iokit.IOServiceMatching
IOServiceMatching.argtypes = [ctypes.c_char_p]
IOServiceMatching.restype = CFTypeRef

IOServiceGetMatchingServices = iokit.IOServiceGetMatchingServices
IOServiceGetMatchingServices.argtypes = [
mach_port_t,
CFTypeRef,
ctypes.POINTER(io_iterator_t),
]
IOServiceGetMatchingServices.restype = ctypes.c_int # kern_return_t

IOIteratorNext = iokit.IOIteratorNext
IOIteratorNext.argtypes = [io_iterator_t]
IOIteratorNext.restype = io_registry_entry_t

IOObjectRelease = iokit.IOObjectRelease
IOObjectRelease.argtypes = [io_registry_entry_t]
IOObjectRelease.restype = ctypes.c_int # kern_return_t

IORegistryEntryCreateCFProperties = iokit.IORegistryEntryCreateCFProperties
IORegistryEntryCreateCFProperties.argtypes = [
io_registry_entry_t,
ctypes.POINTER(CFTypeRef),
CFAllocatorRef,
IOOptionBits,
]
IORegistryEntryCreateCFProperties.restype = CFTypeRef

CFGetTypeID = corefoundation.CFGetTypeID
CFGetTypeID.argtypes = [CFTypeRef]
CFGetTypeID.restype = ctypes.c_long

CFDictionaryGetTypeID = corefoundation.CFDictionaryGetTypeID
CFDictionaryGetTypeID.argtypes = []
CFDictionaryGetTypeID.restype = ctypes.c_long

CFStringCreateWithCString = corefoundation.CFStringCreateWithCString
CFStringCreateWithCString.argtypes = [CFAllocatorRef, ctypes.c_char_p, ctypes.c_uint32]
CFStringCreateWithCString.restype = CFTypeRef

CFDictionaryGetValue = corefoundation.CFDictionaryGetValue
CFDictionaryGetValue.argtypes = [CFTypeRef, CFTypeRef]
CFDictionaryGetValue.restype = CFTypeRef

CFNumberGetTypeID = corefoundation.CFNumberGetTypeID
CFNumberGetTypeID.argtypes = []
CFNumberGetTypeID.restype = ctypes.c_long

CFNumberGetValue = corefoundation.CFNumberGetValue
CFNumberGetValue.argtypes = [CFTypeRef, ctypes.c_int, ctypes.c_void_p]
CFNumberGetValue.restype = ctypes.c_bool

CFNumberGetType = corefoundation.CFNumberGetType
CFNumberGetType.argtypes = [CFTypeRef]
CFNumberGetType.restype = ctypes.c_int

CFShow = corefoundation.CFShow
CFShow.argtypes = [CFTypeRef]

kCFNumberSInt64Type = 4 # 64-bit integers

def cfstr(py_str: str) -> CFTypeRef:
"""Helper to create a CFString from a Python string."""
return CFStringCreateWithCString(None, py_str.encode('utf-8'), 0)

def _read_apple_gpu_stats_and_cores() -> Tuple[float, float, int]:
"""
Reads from IOService class "IOAccelerator" and returns:
(device_util, in_use_mem, gpu_core_count)
where:
- device_util is a fraction [0..1].
- in_use_mem is in megabytes.
- gpu_core_count is an integer from top-level "gpu-core-count".
"""
matching_dict = IOServiceMatching(b"IOAccelerator")
if not matching_dict:
# debug_print("[DEBUG] Could not create matching dictionary.")
return (0.0, 0.0, 0)

service_iterator = io_iterator_t()
kr = IOServiceGetMatchingServices(kIOMasterPortDefault, matching_dict, ctypes.byref(service_iterator))
if kr != 0:
# debug_print(f"[DEBUG] IOServiceGetMatchingServices returned kr={kr}. Possibly no services found.")
return (0.0, 0.0, 0)

device_util = 0.0
in_use_mem = 0.0
gpu_core_count = 0

while True:
service_object = IOIteratorNext(service_iterator)
if not service_object:
# No more services
break

props_ref = CFTypeRef()
IORegistryEntryCreateCFProperties(service_object, ctypes.byref(props_ref), None, 0)

# The top-level dictionary:
if props_ref and CFGetTypeID(props_ref) == CFDictionaryGetTypeID():
# 1. Grab "gpu-core-count" at the top level
top_key_cores = cfstr("gpu-core-count")
core_val_ref = CFDictionaryGetValue(props_ref, top_key_cores)
if core_val_ref and (CFGetTypeID(core_val_ref) == CFNumberGetTypeID()):
val_container_64 = ctypes.c_longlong(0)
success = CFNumberGetValue(core_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64))
if success:
gpu_core_count = val_container_64.value
IOObjectRelease(top_key_cores)

# 2. Check for sub-dictionary "PerformanceStatistics"
performance_key = cfstr("PerformanceStatistics")
performance_dict_ref = CFDictionaryGetValue(props_ref, performance_key)
IOObjectRelease(performance_key)

if performance_dict_ref and (CFGetTypeID(performance_dict_ref) == CFDictionaryGetTypeID()):
cf_key_util = cfstr("Device Utilization %")
cf_key_mem = cfstr("In use system memory")

# Device Utilization
util_val_ref = CFDictionaryGetValue(performance_dict_ref, cf_key_util)
if util_val_ref and (CFGetTypeID(util_val_ref) == CFNumberGetTypeID()):
val_container_64 = ctypes.c_longlong(0)
success = CFNumberGetValue(util_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64))
if success:
device_util = val_container_64.value / 100.0

# In use system memory
mem_val_ref = CFDictionaryGetValue(performance_dict_ref, cf_key_mem)
if mem_val_ref and (CFGetTypeID(mem_val_ref) == CFNumberGetTypeID()):
val_container_64 = ctypes.c_longlong(0)
success = CFNumberGetValue(mem_val_ref, kCFNumberSInt64Type, ctypes.byref(val_container_64))
if success:
in_use_mem = float(val_container_64.value) / 1048576.0

IOObjectRelease(cf_key_util)
IOObjectRelease(cf_key_mem)

IOObjectRelease(props_ref)

IOObjectRelease(service_object)

if (device_util > 0.0 or in_use_mem > 0.0) and gpu_core_count > 0:
# Success, break
break

IOObjectRelease(service_iterator)
return (device_util, in_use_mem, gpu_core_count)


class ScaleneAppleGPU:
"""Wrapper class for Apple integrated GPU statistics, using direct IOKit calls."""

def __init__(self, sampling_frequency: int = 100) -> None:
assert platform.system() == "Darwin"
self.cmd = (
'DYLD_INSERT_LIBRARIES="" ioreg -r -d 1 -w 0 -c "IOAccelerator"'
)
self.regex_util = re.compile(r'"Device Utilization %"=(\d+)')
self.regex_inuse = re.compile(r'"In use system memory"=(\d+)')
# Only actually get stats some fraction of the time, since it is costly.
# Used in get_stats().
self.gpu_sampling_frequency = sampling_frequency
self.core_count = self._get_num_cores()

def gpu_device(self) -> str:
return "GPU"

def has_gpu(self) -> bool:
"""True iff there is a GPU"""
# Disabling Apple GPU, since it does not collect per-process statistics.
return False
"""Return True if the system likely has an Apple integrated GPU."""
return True

def reinit(self) -> None:
"""A NOP, here for compatibility with the nvidia wrapper."""
return
"""No-op for compatibility with other GPU wrappers."""
pass

def get_num_cores(self) -> int:
# FIXME: not yet implemented
return 1
return self.core_count

def get_stats(self) -> Tuple[float, float]:
"""Returns a tuple of (utilization%, memory in use)"""
"""Returns a tuple of (utilization%, memory in use in megabytes)."""
if not self.has_gpu():
return (0.0, 0.0)
try:
# Only periodically query the statistics for real (at a
# rate of 1/self.gpu_sampling_frequency). We do this to
# amortize its cost, as it is shockingly expensive.
if random.randint(0, self.gpu_sampling_frequency - 1) != 0:
return (0.0, 0.0)
in_use = 0.0
util = 0.0
read_process = subprocess.Popen(
self.cmd, shell=True, stdout=subprocess.PIPE
)
if read_process.stdout is not None:
read_process_return = read_process.stdout.readlines()
for line in read_process_return:
decoded_line = line.decode("utf-8")
# print(decoded_line)
if "In use system memory" in decoded_line:
in_use_re = self.regex_inuse.search(decoded_line)
if in_use_re:
in_use = float(in_use_re.group(1))
if "Device Utilization %" in decoded_line:
util_re = self.regex_util.search(decoded_line)
if util_re:
util = int(util_re.group(1)) / 1000
if util and in_use:
break
return (util, in_use)
except Exception:
pass
return (0.0, 0.0)
util, in_use, _ = _read_apple_gpu_stats_and_cores()
return (util, in_use)
except Exception as ex:
return (0.0, 0.0)

def _get_num_cores(self) -> int:
"""
Retrieves the 'gpu-core-count' property from the top-level dictionary.
Returns 0 if not found.
"""
# We reuse the same function that gathers utilization & memory
_, _, core_count = _read_apple_gpu_stats_and_cores()
return core_count

if __name__ == "__main__":
gpu = ScaleneAppleGPU()
while True:
util, mem = gpu.get_stats()
cores = gpu.get_num_cores()
print(
f"GPU Utilization: {util*100:.1f}%, "
f"In-Use GPU Memory: {mem} megabytes, "
f"GPU Core Count: {cores}"
)
time.sleep(2)
3 changes: 2 additions & 1 deletion test/torchtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
def torchtest():
dtype = torch.float
#device = torch.device("cpu")
device = torch.device("cuda:0") # Uncomment this to run on GPU
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# device = torch.device("cuda") # Uncomment this to run on GPU
device = torch.device("mps")

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
Expand Down
39 changes: 0 additions & 39 deletions tests/test_coverup_18.py

This file was deleted.

30 changes: 0 additions & 30 deletions tests/test_coverup_27.py

This file was deleted.

Loading

0 comments on commit 9a2e499

Please sign in to comment.