Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Module Caching #943

Merged
merged 12 commits into from
Oct 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/presubmit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ jobs:
run: ${{ env.OPENCL_ENV }} CHIP_DEVICE_TYPE=cpu cmake --build . --parallel 4 --target build_tests
working-directory: build
- name: Test OpenCL
run: ${{ env.OPENCL_ENV }} CHIP_DEVICE_TYPE=cpu ctest --timeout 180 --output-on-failure -E ${{ env.EXCLUDE }}
run: ${{ env.OPENCL_ENV }} POCL_KERNEL_CACHE=0 CHIP_DEVICE_TYPE=cpu ctest --timeout 180 --output-on-failure -E ${{ env.EXCLUDE }}
working-directory: build
- uses: actions/upload-artifact@v3
if: failure()
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ CHIP_JIT_FLAGS_OVERRIDE=<flags> # String to override the default
CHIP_L0_COLLECT_EVENTS_TIMEOUT=<N(30s default)> # Timeout in seconds for collecting Level Zero events
CHIP_L0_EVENT_TIMEOUT=<N(0 default)> # Timeout in seconds for how long Level Zero should wait on an event before timing out
CHIP_SKIP_UNINIT=<ON/OFF(default)> # If enabled, skips the uninitialization of chipStar's backend objects at program termination
CHIP_MODULE_CACHE_DIR=/path/to/desired/dir # Module/program cache directory. Defaults to $HOME/.cache/chipStar. To disable caching, set it to an empty string, i.e. export CHIP_MODULE_CACHE_DIR=
```

Example:
Expand Down
14 changes: 13 additions & 1 deletion src/CHIPDriver.hh
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ private:
int L0CollectEventsTimeout_ = 0;
bool OCLDisableQueueProfiling_ = false;
std::optional<std::string> OclUseAllocStrategy_;
std::optional<std::string> ModuleCacheDir_;

public:
EnvVars() {
Expand Down Expand Up @@ -264,6 +265,9 @@ public:
const std::optional<std::string> &getOclUseAllocStrategy() const noexcept {
return OclUseAllocStrategy_;
}
/// Returns the configured module cache directory.
/// An empty optional means module caching is turned off.
const std::optional<std::string> &getModuleCacheDir() const noexcept {
  return ModuleCacheDir_;
}

private:
void parseEnvironmentVariables() {
Expand Down Expand Up @@ -299,6 +303,12 @@ private:
readEnvVar("CHIP_OCL_USE_ALLOC_STRATEGY", value, true)
? value
: OclUseAllocStrategy_;
// CHIP_MODULE_CACHE_DIR selects the module cache directory. When readEnvVar
// reports the variable as read but the value is empty, ModuleCacheDir_ is
// left unset, which disables caching. When the variable is not read at all,
// the default is $HOME/.cache/chipStar.
if (readEnvVar("CHIP_MODULE_CACHE_DIR", value, true)) {
  if (!value.empty())
    ModuleCacheDir_ = value;
} else if (const char *Home = std::getenv("HOME")) {
  // std::getenv returns nullptr for an unset variable; building a
  // std::string from nullptr is undefined behaviour, hence the guard.
  ModuleCacheDir_ = std::string(Home) + "/.cache/chipStar";
}
// With HOME unset as well there is no sensible default location, so
// ModuleCacheDir_ stays empty and caching remains disabled.
}

int parseInt(const std::string &value) {
Expand Down Expand Up @@ -343,9 +353,11 @@ private:
logInfo("CHIP_OCL_USE_ALLOC_STRATEGY={}", OclUseAllocStrategy_.has_value()
? OclUseAllocStrategy_.value()
: "off");
logInfo("CHIP_MODULE_CACHE_DIR={}",
ModuleCacheDir_.has_value() ? ModuleCacheDir_.value() : "off");
}
};

extern EnvVars ChipEnvVars;

#endif
#endif
148 changes: 129 additions & 19 deletions src/backend/Level0/CHIPBackendLevel0.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

#include "CHIPBackendLevel0.hh"
#include "Utils.hh"

#include <fstream>
#include <chrono>

// Auto-generated header that lives in <build-dir>/bitcode.
Expand Down Expand Up @@ -2378,27 +2380,141 @@
CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeModuleBuildLogDestroy);
}

/// Writes the native binary of a freshly built module into the on-disk
/// module cache so that load() can reuse it later.
///
/// The cache key is a std::hash over every IL input, its build flags and its
/// size, followed by the device name. The file is stored as
/// "<cache dir>/chipstar_module_cache_<hash>", which is the name load() looks
/// up; the key derivation here must stay identical to the one in load().
///
/// Caching is best effort: problems creating the directory or writing the
/// file are logged and the function returns without storing anything.
/// Failures of zeModuleGetNativeBinary are still reported through
/// CHIPERR_CHECK_LOG_AND_THROW_TABLE.
///
/// \param desc    Descriptor the module was created from; its pNext is read
///                as a ze_module_program_exp_desc_t.
/// \param module  Module whose native binary is stored.
/// \param device  Device the module was built for; its name is part of the
///                cache key.
void save(const ze_module_desc_t &desc, const ze_module_handle_t &module,
          CHIPDeviceLevel0 *device) {
  // Check this first so that no hashing work is done when caching is off.
  const std::optional<std::string> &CacheDirOpt =
      ChipEnvVars.getModuleCacheDir();
  if (!CacheDirOpt.has_value()) {
    logTrace("Module caching is disabled");
    return;
  }
  const std::string &CacheDir = CacheDirOpt.value();

  const auto *ProgramDesc =
      static_cast<const ze_module_program_exp_desc_t *>(desc.pNext);
  if (!ProgramDesc) {
    // Without the program descriptor there is nothing to derive a key from.
    logTrace("Module descriptor has no program description, not caching");
    return;
  }

  std::string CombinedInput;
  for (uint32_t I = 0; I < ProgramDesc->count; ++I) {
    CombinedInput.append(
        reinterpret_cast<const char *>(ProgramDesc->pInputModules[I]),
        ProgramDesc->inputSizes[I]);
    // Appending a null C string is undefined behaviour, so skip absent flags.
    if (ProgramDesc->pBuildFlags && ProgramDesc->pBuildFlags[I])
      CombinedInput.append(ProgramDesc->pBuildFlags[I]);
    CombinedInput.append(std::to_string(ProgramDesc->inputSizes[I]));
  }

  // Add device name to the hash input
  CombinedInput.append(device->getName());

  const size_t Hash = std::hash<std::string>{}(CombinedInput);

  // Create the cache directory if it doesn't exist
  std::error_code Ec;
  std::filesystem::create_directories(CacheDir, Ec);
  if (Ec) {
    logError("Failed to create module cache directory {}: {}", CacheDir,
             Ec.message());
    return;
  }

  // Must match the file name load() opens, otherwise the entry is never hit.
  const std::string FullPath =
      CacheDir + "/chipstar_module_cache_" + std::to_string(Hash);

  // First call queries the size, second call fills the buffer.
  size_t BinarySize = 0;
  zeStatus = zeModuleGetNativeBinary(module, &BinarySize, nullptr);
  CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeModuleGetNativeBinary);

  std::vector<uint8_t> Binary(BinarySize);
  zeStatus = zeModuleGetNativeBinary(module, &BinarySize, Binary.data());
  CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeModuleGetNativeBinary);

  std::ofstream OutFile(FullPath, std::ios::out | std::ios::binary);
  if (!OutFile) {
    // The module itself was built successfully, so a cache that cannot be
    // written must not take the process down.
    logError("Failed to open {} for writing module binary", FullPath);
    return;
  }

  OutFile.write(reinterpret_cast<const char *>(Binary.data()), Binary.size());
  OutFile.close();
  if (!OutFile) {
    // Do not leave a partial file behind; load() would pick it up later.
    logError("Failed to write module binary to {}", FullPath);
    std::filesystem::remove(FullPath, Ec);
    return;
  }
  logTrace("Module binary cached as {}", FullPath);
}

/// Tries to replace the IL inputs of \p desc with a cached native binary.
///
/// The cache key is a std::hash over every IL input, its build flags and its
/// size, followed by the device name; the entry is looked up as
/// "<cache dir>/chipstar_module_cache_<hash>". The key derivation must stay
/// identical to the one in save().
///
/// On a hit, \p desc is rewritten in place: format becomes
/// ZE_MODULE_FORMAT_NATIVE, pInputModule/inputSize point at the cached
/// binary, and pNext, pBuildFlags and pConstants are cleared. On a miss
/// \p desc is left untouched.
///
/// \param desc    Descriptor to look up; its pNext is read as a
///                ze_module_program_exp_desc_t.
/// \param device  Device the module is built for; its name is part of the
///                cache key.
/// \return true when \p desc now refers to a cached binary, false when
///         caching is disabled or no usable entry exists.
bool load(ze_module_desc_t &desc, CHIPDeviceLevel0 *device) {
  // Check this first so that no hashing work is done when caching is off.
  const std::optional<std::string> &CacheDirOpt =
      ChipEnvVars.getModuleCacheDir();
  if (!CacheDirOpt.has_value())
    return false;

  const auto *ProgramDesc =
      static_cast<const ze_module_program_exp_desc_t *>(desc.pNext);
  if (!ProgramDesc)
    return false; // No program description, so no key can be derived.

  std::string CombinedInput;
  for (uint32_t I = 0; I < ProgramDesc->count; ++I) {
    CombinedInput.append(
        reinterpret_cast<const char *>(ProgramDesc->pInputModules[I]),
        ProgramDesc->inputSizes[I]);
    // Appending a null C string is undefined behaviour, so skip absent flags.
    if (ProgramDesc->pBuildFlags && ProgramDesc->pBuildFlags[I])
      CombinedInput.append(ProgramDesc->pBuildFlags[I]);
    CombinedInput.append(std::to_string(ProgramDesc->inputSizes[I]));
  }

  // Add device name to the hash input
  CombinedInput.append(device->getName());

  const size_t Hash = std::hash<std::string>{}(CombinedInput);

  const std::string FullPath =
      CacheDirOpt.value() + "/chipstar_module_cache_" + std::to_string(Hash);

  // Open the binary file; a missing file is an ordinary cache miss.
  std::ifstream InFile(FullPath, std::ios::in | std::ios::binary);
  if (!InFile)
    return false;

  // Read the binary data
  auto Binary = std::make_unique<std::vector<uint8_t>>(
      std::istreambuf_iterator<char>(InFile), std::istreambuf_iterator<char>());
  InFile.close();

  if (Binary->empty()) {
    // A zero-length entry cannot be a valid module; treat it as a miss
    // instead of handing an empty native binary to the driver.
    logTrace("Ignoring empty module cache entry {}", FullPath);
    return false;
  }

  desc.format = ZE_MODULE_FORMAT_NATIVE;
  desc.pNext = nullptr;
  desc.inputSize = Binary->size();
  desc.pInputModule = Binary->data();
  desc.pBuildFlags = nullptr;
  desc.pConstants = nullptr;

  // desc only holds a raw pointer into the buffer, so the buffer is parked in
  // a function-local static to keep it alive after this function returns.
  // Entries are never removed.
  // NOTE(review): access to this vector is not synchronized here - confirm
  // that callers serialize module compilation.
  static std::vector<std::unique_ptr<std::vector<uint8_t>>> BinaryStorage;
  BinaryStorage.push_back(std::move(Binary));

  logTrace("Module binary loaded from cache as {}", FullPath);
  return true;
}

/// Creates a Level Zero module from \p ModuleDesc, going through the on-disk
/// module cache.
///
/// load() is consulted first; on a hit it rewrites \p ModuleDesc to point at
/// the cached native binary. If the driver rejects that binary, the original
/// descriptor is restored and the module is built from its IL inputs instead.
/// A module that was not served from the cache is handed to save() after a
/// successful build.
///
/// \param ZeCtx       Context to create the module in.
/// \param ZeDev       Device to create the module for.
/// \param ModuleDesc  Module descriptor; may be rewritten in place on a
///                    cache hit.
/// \param device      Device object forwarded to load() and save().
/// \return Handle of the created module. A failed build is reported through
///         CHIPERR_CHECK_LOG_AND_THROW_TABLE.
static ze_module_handle_t compileIL(ze_context_handle_t ZeCtx,
                                    ze_device_handle_t ZeDev,
                                    ze_module_desc_t &ModuleDesc,
                                    CHIPDeviceLevel0 *device) {

  ze_module_build_log_handle_t Log;
  ze_module_handle_t Object;

  // load() overwrites ModuleDesc on a cache hit, so keep the IL descriptor
  // around for the fallback below.
  const ze_module_desc_t IlDesc = ModuleDesc;

  bool Cached = load(ModuleDesc, device);
  auto Start = std::chrono::high_resolution_clock::now();
  zeStatus = zeModuleCreate(ZeCtx, ZeDev, &ModuleDesc, &Object, &Log);
  auto End = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double> Elapsed = End - Start;

  if (Cached && zeStatus != ZE_RESULT_SUCCESS) {
    // The cache key does not cover everything a native binary depends on, so
    // an entry can be rejected by the driver. Rebuild from the IL rather than
    // failing the whole compilation.
    logTrace("Cached module binary was rejected ({}), rebuilding from IL",
             resultToString(zeStatus));
    // Report the failed attempt's build log the same way a failed IL build
    // is reported.
    dumpBuildLog(std::move(Log));

    ModuleDesc = IlDesc;
    Cached = false;
    Start = std::chrono::high_resolution_clock::now();
    zeStatus = zeModuleCreate(ZeCtx, ZeDev, &ModuleDesc, &Object, &Log);
    End = std::chrono::high_resolution_clock::now();
    Elapsed = End - Start;
  }

  if (Cached)
    logTrace("Loaded from cache, zeModuleCreate took {} seconds",
             Elapsed.count());
  else
    logTrace("zeModuleCreate took {} seconds", Elapsed.count());

  if (zeStatus != ZE_RESULT_SUCCESS)
    dumpBuildLog(std::move(Log));

  CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeModuleCreate);
  logTrace("LZ CREATE MODULE via calling zeModuleCreate {} ",
           resultToString(zeStatus));

  // Only freshly built modules need storing. After the fallback above this
  // also replaces the rejected cache entry.
  if (!Cached)
    save(ModuleDesc, Object, device);

  return Object;
}

Expand Down Expand Up @@ -2470,7 +2586,7 @@

auto *ChipCtxLz = static_cast<CHIPContextLevel0 *>(ChipDev->getContext());
auto start = std::chrono::high_resolution_clock::now();
ZeModule_ = compileIL(ChipCtxLz->get(), LzDev->get(), ModuleDesc);
ZeModule_ = compileIL(ChipCtxLz->get(), LzDev->get(), ModuleDesc, LzDev);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;

Expand All @@ -2481,8 +2597,9 @@
CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeModuleGetKernelNames);
logTrace("Found {} kernels in this module.", KernelCount);

const char *KernelNames[KernelCount];
zeStatus = zeModuleGetKernelNames(ZeModule_, &KernelCount, KernelNames);
std::vector<const char *> KernelNames(KernelCount);
zeStatus =
zeModuleGetKernelNames(ZeModule_, &KernelCount, KernelNames.data());
CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeModuleGetKernelNames);
for (auto &Kernel : KernelNames)
logTrace("Kernel {}", Kernel);
Expand All @@ -2494,13 +2611,7 @@

auto *FuncInfo = findFunctionInfo(HostFName);
if (!FuncInfo) {
// TODO: __syncthreads() gets turned into
// Intel_Symbol_Table_Void_Program This is a call to OCML so it
// shouldn't be turned into a Kernel
continue;
// CHIPERR_LOG_AND_THROW("Failed to find kernel in
// OpenCLFunctionInfoMap",
// hipErrorInitializationError);
}

// Create kernel
Expand All @@ -2510,8 +2621,6 @@
HostFName.c_str()};

if (!LzDev->hasOnDemandPaging())
// TODO: This is not needed if the kernel does not access allocations
// indirectly. This requires kernel code inspection.
KernelDesc.flags |= ZE_KERNEL_FLAG_FORCE_RESIDENCY;

auto kernelStart = std::chrono::high_resolution_clock::now();
Expand All @@ -2530,10 +2639,11 @@
addKernel(ChipZeKernel);
}
auto kernelCreationEnd = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> kernelCreationDuration =
std::chrono::duration<double, std::micro> kernelCreationDuration =
kernelCreationEnd - kernelCreationStart;

logTrace("Total kernel creation took {} ms", kernelCreationDuration.count());
logTrace("zeKernelCreate for {} kernels took {} microseconds", KernelCount,
kernelCreationDuration.count());
}

void CHIPExecItemLevel0::setupAllArgs() {
Expand Down
Loading
Loading