Skip to content

Commit

Permalink
[shortfin] Adds C++ tracing scopes. (#480)
Browse files Browse the repository at this point in the history
* Manually chose a number of things that would be interesting to see in
a trace and added annotations.
* QC'd with some unit test traces.
  • Loading branch information
stellaraccident authored Nov 12, 2024
1 parent fdf5dd9 commit 35bc60d
Show file tree
Hide file tree
Showing 13 changed files with 60 additions and 0 deletions.
9 changes: 9 additions & 0 deletions shortfin/python/array_binding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "./lib_ext.h"
#include "./utils.h"
#include "shortfin/array/api.h"
#include "shortfin/support/logging.h"

using namespace shortfin::array;

Expand Down Expand Up @@ -223,6 +224,7 @@ class PyMapping {
}

void FillFromScalar(Refs *refs, py::handle value) {
SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::FillFromScalar");
if (!dtype()) {
throw std::invalid_argument(
"The `fill` method is only valid for typed mappings but "
Expand All @@ -242,6 +244,7 @@ class PyMapping {
}

void FillFromBuffer(py::handle buffer) {
SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::FillFromBuffer");
Py_buffer py_view;
int flags = PyBUF_FORMAT | PyBUF_ND; // C-Contiguous ND.
if (PyObject_GetBuffer(buffer.ptr(), &py_view, flags) != 0) {
Expand Down Expand Up @@ -286,6 +289,7 @@ class PyMapping {
}

py::object GetItems(py::handle self_obj, Refs *refs) {
SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::GetItems");
if (!dtype()) {
throw std::invalid_argument(
"The `items` property is only valid for typed mappings but "
Expand All @@ -306,6 +310,7 @@ class PyMapping {
}

void SetItems(Refs *refs, py::handle initializer) {
SHORTFIN_TRACE_SCOPE_NAMED("PyMapping::SetItems");
if (!dtype()) {
throw std::invalid_argument(
"The `items` property is only valid for typed mappings but "
Expand Down Expand Up @@ -410,6 +415,7 @@ void BindArray(py::module_ &m) {
.def(
"map",
[](storage &self, bool read, bool write, bool discard) {
SHORTFIN_TRACE_SCOPE_NAMED("PyStorage::map");
int access = 0;
if (read) access |= IREE_HAL_MEMORY_ACCESS_READ;
if (write || discard) access |= IREE_HAL_MEMORY_ACCESS_WRITE;
Expand Down Expand Up @@ -565,6 +571,7 @@ void BindArray(py::module_ &m) {
.def(
"map",
[](device_array &self, bool read, bool write, bool discard) {
SHORTFIN_TRACE_SCOPE_NAMED("PyArray::map");
int access = 0;
if (read) access |= IREE_HAL_MEMORY_ACCESS_READ;
if (write || discard) access |= IREE_HAL_MEMORY_ACCESS_WRITE;
Expand All @@ -586,6 +593,7 @@ void BindArray(py::module_ &m) {
.def_prop_rw(
"items",
[refs](device_array &self) {
SHORTFIN_TRACE_SCOPE_NAMED("PyArray::items");
PyMapping *mapping;
py::object mapping_obj = CreateMappingObject(&mapping);
mapping->set_dtype(self.dtype());
Expand All @@ -606,6 +614,7 @@ void BindArray(py::module_ &m) {
.def_prop_ro(
"__array_interface__",
[refs](device_array &self) {
SHORTFIN_TRACE_SCOPE_NAMED("PyArray::__array_interface__");
py::dict interface;
interface["version"] = 3;
interface["strides"] = py::none();
Expand Down
2 changes: 2 additions & 0 deletions shortfin/python/array_host_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ void BindArrayHostOps(py::module_ &m) {
"argmax",
[](device_array &input, int axis, std::optional<device_array> out,
bool keepdims, bool device_visible) {
SHORTFIN_TRACE_SCOPE_NAMED("PyHostOp::argmax");
if (axis < 0) axis += input.shape().size();
if (axis < 0 || axis >= input.shape().size()) {
throw std::invalid_argument(
Expand Down Expand Up @@ -139,6 +140,7 @@ void BindArrayHostOps(py::module_ &m) {
m.def(
"fill_randn",
[](device_array out, std::optional<PyRandomGenerator *> gen) {
SHORTFIN_TRACE_SCOPE_NAMED("PyHostOp::fill_randn");
if (!gen) gen = &PyRandomGenerator::get_default();
auto compute = [&]<typename EltTy>() {
auto result = xt::random::randn(out.shape_container(), /*mean=*/0.0,
Expand Down
10 changes: 10 additions & 0 deletions shortfin/python/lib_ext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ class PyWorkerExtension : public local::Worker::Extension {
py::handle loop() { return loop_; }

void OnThreadStart() noexcept override {
SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::OnThreadStart");
// Python threading initialization.
// If our own thread, teach Python about it. Not done for donated.
if (worker().options().owned_thread) {
Expand All @@ -187,6 +188,7 @@ class PyWorkerExtension : public local::Worker::Extension {
}

void OnThreadStop() noexcept override {
SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::OnThreadStop");
{
// Do Python level thread cleanup.
py::gil_scoped_acquire g;
Expand Down Expand Up @@ -253,6 +255,7 @@ class PyProcess : public local::detail::BaseProcess {
std::bind(&PyProcess::RunOnWorker, self_object));
}
static void RunOnWorker(py::handle self_handle) {
SHORTFIN_TRACE_SCOPE_NAMED("PyProcess:RunOnWorker");
py::gil_scoped_acquire g;
// Steal the reference back from ScheduleOnWorker. Important: this is
// very likely the last reference to the process. So self must not be
Expand Down Expand Up @@ -342,6 +345,7 @@ py::object PyRehydrateRef(local::ProgramInvocation *inv,

py::object RunInForeground(std::shared_ptr<Refs> refs, local::System &self,
py::object coro) {
SHORTFIN_TRACE_SCOPE_NAMED("CoroRunInForeground");
bool is_main_thread =
refs->threading_current_thread().is(refs->threading_main_thread());

Expand Down Expand Up @@ -936,6 +940,7 @@ void BindLocal(py::module_ &m) {
callable.inc_ref(); // Stolen within the callback.
auto thunk = +[](void *user_data, iree_loop_t loop,
iree_status_t status) noexcept -> iree_status_t {
SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::Callback");
py::gil_scoped_acquire g;
py::object user_callable =
py::steal(static_cast<PyObject *>(user_data));
Expand All @@ -955,6 +960,7 @@ void BindLocal(py::module_ &m) {
callable.inc_ref(); // Stolen within the callback.
auto thunk = +[](void *user_data, iree_loop_t loop,
iree_status_t status) noexcept -> iree_status_t {
SHORTFIN_TRACE_SCOPE_NAMED("PyWorker::DelayCallback");
py::gil_scoped_acquire g;
py::object user_callable =
py::steal(static_cast<PyObject *>(user_data));
Expand Down Expand Up @@ -1030,6 +1036,7 @@ void BindLocal(py::module_ &m) {
py::class_<local::CompletionEvent>(m, "CompletionEvent")
.def(py::init<>())
.def("__await__", [](py::handle self_obj) {
SHORTFIN_TRACE_SCOPE_NAMED("PyCompletionEvent::__await__");
auto &worker_ext = PyWorkerExtension::GetCurrent();
auto &self = py::cast<local::CompletionEvent &>(self_obj);
py::object future = worker_ext.loop().attr("create_future")();
Expand All @@ -1051,6 +1058,7 @@ void BindLocal(py::module_ &m) {
self, iree_infinite_timeout(),
+[](void *future_vp, iree_loop_t loop,
iree_status_t status) noexcept -> iree_status_t {
SHORTFIN_TRACE_SCOPE_NAMED("PyCompletionEvent::OnComplete");
py::gil_scoped_acquire g;
py::object future = py::steal(static_cast<PyObject *>(future_vp));
try {
Expand Down Expand Up @@ -1145,6 +1153,7 @@ void BindLocal(py::module_ &m) {
return py::none();
})
.def("__await__", [](py::handle self_obj) {
SHORTFIN_TRACE_SCOPE_NAMED("PyFuture::__await__");
// TODO: We should make our C++ future able to be used directly
// vs needing to bridge it like this.
auto &worker_ext = PyWorkerExtension::GetCurrent();
Expand All @@ -1166,6 +1175,7 @@ void BindLocal(py::module_ &m) {
self.AddCallback(
[py_future_vp = static_cast<void *>(future.release().ptr())](
local::Future &sf_future) {
SHORTFIN_TRACE_SCOPE_NAMED("PyFuture::OnComplete");
py::gil_scoped_acquire g;
py::object py_future =
py::steal(static_cast<PyObject *>(py_future_vp));
Expand Down
3 changes: 3 additions & 0 deletions shortfin/src/shortfin/array/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ mapping device_array::data_rw() { return storage_.map_read_write(); }
mapping device_array::data_w() { return storage_.map_write_discard(); }

std::optional<mapping> device_array::map_memory_for_xtensor() {
SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::map_memory_for_xtensor");
if (storage_.is_mappable_for_read_write()) {
return storage_.map_read_write();
} else if (storage_.is_mappable_for_read()) {
Expand Down Expand Up @@ -97,6 +98,7 @@ std::string device_array::to_s() const {

void device_array::AddAsInvocationArgument(
local::ProgramInvocation *inv, local::ProgramResourceBarrier barrier) {
SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::AddAsInvocationArgument");
auto dims_span = shape();
iree_hal_buffer_view_t *buffer_view;
SHORTFIN_THROW_IF_ERROR(iree_hal_buffer_view_create(
Expand All @@ -117,6 +119,7 @@ iree_vm_ref_type_t device_array::invocation_marshalable_type() {

device_array device_array::CreateFromInvocationResultRef(
local::ProgramInvocation *inv, iree::vm_opaque_ref ref) {
SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::CreateFromInvocationResultRef");
// We don't retain the buffer view in the device array, so just deref it
// vs stealing the ref.
iree_hal_buffer_view_t *bv = iree_hal_buffer_view_deref(*ref.get());
Expand Down
6 changes: 6 additions & 0 deletions shortfin/src/shortfin/array/storage.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ storage storage::import_buffer(local::ScopedDevice &device,

storage storage::allocate_device(ScopedDevice &device,
iree_device_size_t allocation_size) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::allocate_device");
if (!device.raw_device()) {
throw std::invalid_argument("Cannot allocate with a null device affinity");
}
Expand All @@ -63,6 +64,7 @@ storage storage::allocate_device(ScopedDevice &device,
storage storage::allocate_host(ScopedDevice &device,
iree_device_size_t allocation_size,
bool device_visible) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::allocate_host");
if (!device.raw_device()) {
throw std::invalid_argument("Cannot allocate with a null device affinity");
}
Expand Down Expand Up @@ -207,6 +209,7 @@ std::string storage::formatted_buffer_usage() const {

void storage::AddAsInvocationArgument(local::ProgramInvocation *inv,
local::ProgramResourceBarrier barrier) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::AddAsInvocationArgument");
iree::vm_opaque_ref ref;
*(&ref) = iree_hal_buffer_retain_ref(buffer_);
inv->AddArg(std::move(ref));
Expand All @@ -220,6 +223,7 @@ iree_vm_ref_type_t storage::invocation_marshalable_type() {

storage storage::CreateFromInvocationResultRef(local::ProgramInvocation *inv,
iree::vm_opaque_ref ref) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::CreateFromInvocationResultRef");
// Steal the ref to one of our smart pointers.
// TODO: Should have an opaque_ref::release().
iree::hal_buffer_ptr buffer =
Expand All @@ -230,6 +234,7 @@ storage storage::CreateFromInvocationResultRef(local::ProgramInvocation *inv,

storage storage::ImportInvocationResultStorage(local::ProgramInvocation *inv,
iree::hal_buffer_ptr buffer) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::ImportInvocationResultStorage");
local::ScopedDevice device =
local::ScopedDevice(*inv->fiber(), inv->device_selection());
auto imported_storage = storage::import_buffer(device, std::move(buffer));
Expand All @@ -251,6 +256,7 @@ storage storage::ImportInvocationResultStorage(local::ProgramInvocation *inv,

void storage::AddInvocationArgBarrier(local::ProgramInvocation *inv,
local::ProgramResourceBarrier barrier) {
SHORTFIN_TRACE_SCOPE_NAMED("storage::AddInvocationArgBarrier");
switch (barrier) {
case ProgramResourceBarrier::DEFAULT:
case ProgramResourceBarrier::READ:
Expand Down
2 changes: 2 additions & 0 deletions shortfin/src/shortfin/array/xtensor_bridge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <sstream>

#include "shortfin/support/logging.h"
#include "xtl/xhalf_float.hpp"

namespace shortfin::array {
Expand Down Expand Up @@ -56,6 +57,7 @@ class typed_xt_methods final : public poly_xt_methods {
bool poly_xt_methods::inplace_new(uint8_t *inst_storage, DType dtype,
void *array_memory, size_t array_memory_size,
Dims &dims) {
SHORTFIN_TRACE_SCOPE_NAMED("array_xtensor_cast");
#define POLY_XT_CASE(et, cpp_type) \
case et: \
typed_xt_methods<cpp_type>::concrete_inplace_new( \
Expand Down
7 changes: 7 additions & 0 deletions shortfin/src/shortfin/local/program.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ std::string_view ProgramFunction::calling_convention() const {

ProgramInvocation::Ptr ProgramFunction::CreateInvocation(
std::shared_ptr<Fiber> fiber, std::optional<ProgramIsolation> isolation) {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramFunction::CreateInvocation");
ProgramIsolation actual_isolation = isolation ? *isolation : isolation_;
// Low-overhead NONE isolation handling (saves some ref-count twiddling).
if (actual_isolation == ProgramIsolation::NONE) {
Expand All @@ -101,6 +102,7 @@ std::string ProgramFunction::to_s() const {
ProgramModule ProgramModule::Load(System &system,
const std::filesystem::path &path,
bool mmap) {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramModule::Load");
iree::file_contents_ptr contents;
iree_file_read_flags_t flags =
mmap ? IREE_FILE_READ_FLAG_MMAP : IREE_FILE_READ_FLAG_PRELOAD;
Expand Down Expand Up @@ -171,6 +173,7 @@ std::vector<std::string> ProgramModule::exports() const {

Program Program::Load(std::span<const ProgramModule> modules,
Options &&options) {
SHORTFIN_TRACE_SCOPE_NAMED("Program::Load");
std::vector<iree_vm_module_t *> all_modules;
std::vector<iree_hal_device_t *> raw_devices;

Expand Down Expand Up @@ -451,6 +454,7 @@ iree_status_t ProgramInvocation::FinalizeCallingConvention(

ProgramInvocation::Future ProgramInvocation::Invoke(
ProgramInvocation::Ptr invocation) {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::Invoke");
invocation->CheckNotScheduled();

Worker &worker = invocation->fiber_->worker();
Expand All @@ -462,9 +466,11 @@ ProgramInvocation::Future ProgramInvocation::Invoke(
iree_vm_function_t function,
ProgramInvocationModel invocation_model,
std::optional<ProgramInvocation::Future> failure_future) {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::InvokeAsync");
auto complete_callback =
[](void *user_data, iree_loop_t loop, iree_status_t status,
iree_vm_list_t *outputs) noexcept -> iree_status_t {
SHORTFIN_TRACE_SCOPE_NAMED("ProgramInvocation::Complete");
// Async invocation helpfully gives us a retained reference to the
// outputs, but we already have one statically on the
// ProgramInvocation. So release this one, which makes it safe to
Expand Down Expand Up @@ -620,6 +626,7 @@ StaticProgramParameters::StaticProgramParameters(

void StaticProgramParameters::Load(std::filesystem::path file_path,
LoadOptions options) {
SHORTFIN_TRACE_SCOPE_NAMED("StaticProgramParameters::Load");
// Default format from extension.
if (options.format.empty()) {
options.format = file_path.extension().string();
Expand Down
4 changes: 4 additions & 0 deletions shortfin/src/shortfin/local/scheduler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ void Account::active_deps_extend(iree_hal_semaphore_list_t sem_list) {
}

VoidFuture Account::OnSync() {
SHORTFIN_TRACE_SCOPE_NAMED("Account::OnSync");
// TODO: Burn this path with fire! No attempt has been made to make this
// particularly good: the backend is being implemented now to export
// HAL semaphores via iree_hal_semaphore_await, and that should be used
Expand Down Expand Up @@ -133,6 +134,7 @@ Scheduler::~Scheduler() {

void Scheduler::Initialize(
std::span<const std::pair<std::string_view, Device *>> devices) {
SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::Initialize");
for (auto &it : devices) {
accounts_.emplace_back(*this, it.second);
}
Expand Down Expand Up @@ -165,6 +167,7 @@ Account &Scheduler::GetDefaultAccount(ScopedDevice &device) {
void Scheduler::AppendCommandBuffer(ScopedDevice &device,
TransactionType tx_type,
std::function<void(Account &)> callback) {
SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::AppendCommandBuffer");
Account &account = GetDefaultAccount(device);
auto needed_affinity_bits = device.affinity().queue_affinity();
SHORTFIN_SCHED_LOG(
Expand Down Expand Up @@ -242,6 +245,7 @@ void Scheduler::AppendCommandBuffer(ScopedDevice &device,
}

iree_status_t Scheduler::FlushWithStatus() noexcept {
SHORTFIN_TRACE_SCOPE_NAMED("Scheduler::FlushWithStatus");
// This loop is optimized for a small number of accounts, where it is
// fine to just linearly probe. If this ever becomes cumbersome, we can
// maintain a dirty list which is appended to when an account transitions
Expand Down
3 changes: 3 additions & 0 deletions shortfin/src/shortfin/local/system.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ namespace shortfin::local {

System::System(iree_allocator_t host_allocator)
: host_allocator_(host_allocator) {
SHORTFIN_TRACE_SCOPE_NAMED("System::System");
logging::construct("System", this);
SHORTFIN_THROW_IF_ERROR(iree_vm_instance_create(IREE_VM_TYPE_CAPACITY_DEFAULT,
host_allocator_,
Expand All @@ -29,6 +30,7 @@ System::System(iree_allocator_t host_allocator)
}

System::~System() {
SHORTFIN_TRACE_SCOPE_NAMED("System::~System");
logging::destruct("System", this);
bool needs_shutdown = false;
{
Expand Down Expand Up @@ -61,6 +63,7 @@ System::~System() {
}

void System::Shutdown() {
SHORTFIN_TRACE_SCOPE_NAMED("System::Shutdown");
// Stop workers.
std::vector<Worker *> local_workers;
{
Expand Down
2 changes: 2 additions & 0 deletions shortfin/src/shortfin/local/systems/amdgpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ void AMDGPUSystemBuilder::InitializeDefaultSettings() {

void AMDGPUSystemBuilder::Enumerate() {
if (hip_hal_driver_) return;
SHORTFIN_TRACE_SCOPE_NAMED("AMDGPUSystemBuilder::Enumerate");

iree_hal_hip_driver_options_t driver_options;
iree_hal_hip_driver_options_initialize(&driver_options);
Expand Down Expand Up @@ -127,6 +128,7 @@ std::vector<std::string> AMDGPUSystemBuilder::GetAvailableDeviceIds() {
}

SystemPtr AMDGPUSystemBuilder::CreateSystem() {
SHORTFIN_TRACE_SCOPE_NAMED("AMDGPUSystemBuilder::CreateSystem");
auto lsys = std::make_shared<System>(host_allocator());
Enumerate();

Expand Down
Loading

0 comments on commit 35bc60d

Please sign in to comment.