diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 4e14d3c..84f3b0d 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -2,13 +2,15 @@ name: CMake on: push: - branches: [ "main" ] + branches: [ "main", "asplos-dev" ] pull_request: - branches: [ "main" ] + branches: [ "main", "asplos-dev" ] env: # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) - BUILD_TYPE: Release + BUILD_TYPE: Debug + + jobs: build: @@ -21,20 +23,40 @@ jobs: - uses: actions/checkout@v3 - name: Install dependencies - run: sudo apt install llvm-dev clang libbpf-dev libclang-dev libcxxopts-dev libfmt-dev librange-v3-dev - - - name: Install Custom dependencies - run: wget http://launchpadlibrarian.net/605552811/libbpf0_0.8.0-1_amd64.deb && wget http://launchpadlibrarian.net/605552807/libbpf-dev_0.8.0-1_amd64.deb && sudo dpkg -i ./libbpf0_0.8.0-1_amd64.deb && sudo dpkg -i ./libbpf-dev_0.8.0-1_amd64.deb + run: sudo apt install llvm-dev clang libbpf-dev libclang-dev python3-pip gcc-13 g++-13 ninja-build && git submodule update --init --recursive - - name: Sed Current uncompiled include file - run: sudo sed -i 's/NL_SET_ERR_MSG_MOD/\/\/NL_SET_ERR_MSG_MOD/g' /usr/src/linux-headers-`uname -r`/include/net/flow_offload.h + - name: Install conan + working-directory: ${{github.workspace}} + run: pip3 install conan && conan profile detect && mkdir build && cd build && cp ../conanfile.txt . && CC=gcc-13 CXX=g++-13 conan install . -s compiler.cppstd=gnu23 - name: Configure CMake # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type - run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} + run: CC=gcc-13 CXX=g++-13 cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_TOOLCHAIN_FILE=${{github.workspace}}/build/build/${{env.BUILD_TYPE}}/generators/conan_toolchain.cmake -DCMAKE_POLICY_DEFAULT_CMP0091=NEW -GNinja -DCMAKE_MAKE_PROGRAM=ninja - name: Build # Build your program with the given configuration - run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} + run: CC=gcc-13 CXX=g++-13 cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} + + - name: Create Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token + with: + tag_name: main + release_name: Release main + body: | + Changes in this Release + - First Change + - Second Change + draft: false + prerelease: false + + - name: Upload Assets to Release with a wildcard + uses: csexton/release-asset-action@v2 + with: + pattern: "build/CXLMemSim" + github-token: ${{ secrets.GITHUB_TOKEN }} + release-url: ${{ steps.create_release.outputs.upload_url }} diff --git a/.gitignore b/.gitignore index 561b9f0..7ad5b53 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,11 @@ benchmarks* Makefile *.o py_smdk_pkg -lib \ No newline at end of file +lib +CMakePresets.json +.cmake +CMakeUserPresets.json +*~ +voltdb +foo +CMakeFiles diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..6544518 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,15 @@ +[submodule "workloads/memcached-ycsb"] + path = workloads/memcached-ycsb + url = https://github.com/SlugLab/YCSB/ +[submodule "workloads/memcached"] + path 
= workloads/memcached + url = https://github.com/memcached/memcached +[submodule "workloads/gapbs"] + path = workloads/gapbs + url = https://github.com/victoryang00/gapbs +[submodule "script/perfmon"] + path = script/perfmon + url = https://github.com/intel/perfmon +[submodule "workloads/vectordb"] + path = workloads/vectordb + url = https://github.com/jina-ai/vectordb diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b6fc86..bdac736 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,29 +1,30 @@ cmake_minimum_required(VERSION 3.11.0) -project(CXL-MEM-Simulator VERSION 0.1.0) +project(CXLMemSim VERSION 0.1.0) +set(CMAKE_CXX_STANDARD 23) + +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + add_compile_options (-fdiagnostics-color=always) +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + add_compile_options (-fcolor-diagnostics) +endif () + +add_subdirectory(microbench) +add_subdirectory(workloads) + +list(APPEND CMAKE_PREFIX_PATH ${CMAKE_BINARY_DIR}) find_package(cxxopts REQUIRED) find_package(fmt REQUIRED) -find_package(range-v3 REQUIRED) file(GLOB_RECURSE SOURCE_FILES src/*.cpp) execute_process(COMMAND uname -r OUTPUT_VARIABLE arch OUTPUT_STRIP_TRAILING_WHITESPACE) -set(LINUX_SOURCE /lib/modules/${arch}/build/) -set(CMAKE_CXX_FLAGS "-Wall -g -pthread -lrt -rdynamic") -set(CMAKE_CXX_STANDARD 23) - -add_executable(CXL-MEM-Simulator ${SOURCE_FILES}) - -include_directories(CXL-MEM-Simulator include) -target_link_libraries(CXL-MEM-Simulator cxxopts::cxxopts fmt::fmt range-v3::range-v3 elf bpf) +set(CMAKE_CXX_FLAGS "-Wall -fPIC -pthread -ldl -lrt -mavx512f -mpreferred-stack-boundary=4 -g") -function(bpf prefix) - add_custom_target(${prefix}_bpf ALL - COMMAND clang --target=bpf -nostdinc -S -I/usr/include/linux -I${CMAKE_SOURCE_DIR}/include -I${LINUX_SOURCE}/arch/x86/include -I/usr/include -I${LINUX_SOURCE}/arch/x86/include/uapi -I${LINUX_SOURCE}/arch/x86/include/generated -I${LINUX_SOURCE}/arch/x86/include/generated/uapi -I${LINUX_SOURCE}/include -I${LINUX_SOURCE}/include/uapi -I${LINUX_SOURCE}/include/generated/uapi -I${LINUX_SOURCE}/tools/testing/selftests/bpf -include ${LINUX_SOURCE}/include/linux/kconfig.h -D__KERNEL__ -D__ASM_SYSREG_H -D__BPF_TRACING__ -D__TARGET_ARCH_x86 -Wno-implicit-function-declaration -O3 -emit-llvm -g -c ${CMAKE_SOURCE_DIR}/src/${prefix}.c -o ${CMAKE_BINARY_DIR}/${prefix}.ll - COMMAND llc -march=bpf -filetype=obj -o ${CMAKE_BINARY_DIR}/${prefix}.o ${CMAKE_BINARY_DIR}/${prefix}.ll - ) - add_dependencies(CXL-MEM-Simulator ${prefix}_bpf) -endfunction() +add_executable(CXLMemSim ${SOURCE_FILES} src/main.cc) -bpf(collectmmap) +include_directories(CXLMemSim include ${cxxopts_INCLUDE_DIR} ${fmt_INCLUDE_DIR}) +target_link_libraries(CXLMemSim fmt::fmt cxxopts::cxxopts) -add_subdirectory(microbench) \ No newline at end of file +add_library(CXLMemSimHook SHARED src/module.cc) +add_executable(CXLMemSimSock ${SOURCE_FILES} src/sock.cc) +target_link_libraries(CXLMemSimSock fmt::fmt cxxopts::cxxopts) diff --git a/README.md b/README.md index 2d2c453..4f54b24 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,20 @@ # CXL.mem Simulator -The epoch design of this project is mostly refering to [mes](https://github.com/takahiro-hirofuchi/mesmeric-emulator), the novelty is use pebs to construct the topology and calculate the hierachy latency based on this. 
See the [talk](https://docs.google.com/file/d/1bZi2rbB-u5xMw_YET726gb2s9QuxMZJE/edit?usp=docslist_api&filetype=mspresentation)
+The CXL.mem simulator uses the target latency to simulate the CPU's perspective at the application level, turning ROB behavior and the different cacheline states into penalties.
 ## Prerequisite
 ```bash
 $ uname -a
-Linux gpu01 5.19.0-29-generic #30-Ubuntu SMP PREEMPT_DYNAMIC Wed Jan 4 12:14:09 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
-$ sudo apt install llvm-dev clang libbpf-dev libclang-dev libcxxopts-dev libfmt-dev librange-v3-dev
+Linux banana 6.4.0+ #86 SMP PREEMPT_DYNAMIC Fri Jul 28 23:49:33 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
+$ echo 0 | sudo tee /sys/devices/system/node/node1/cpu*/online >/dev/null 2>&1
 ```
 ## User input
 ```bash
-LOGV=1 ./CXL-MEM-Simulator -t ./microbench/many_calloc -i 5 -c 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+LOGV=1 ./CXL-MEM-Simulator -t ./microbench/ld -i 5 -c 0,2 -d 85 -b 10,10 -l 100,100 -c 100,100 -w 85.5,86.5,87.5,85.5,86.5,87.5,88. -o "(1,(2,3))"
 ```
 1. -t Target: The path to the executable
 2. -i Interval: The epoch of the simulator, the parameter is in milisecond
 3. -c CPUSet: The core id to run the executable and the rest will be `setaffinity` to one other core
-4. -d Dram Latency: The current platform's DRAM latency, default is 85ns
+4. -d Dram Latency: The current platform's DRAM latency, default is 85ns # mark that bw in the remote
 5. -b, -l Bandwidth, Latency: Both use 2 input in the vector, first for read, second for write
 6. -c Capacity: The capacity of the memory with first be local, remaining accordingly to the input vector.
 7. -w Weight: Use the heuristic to calculate the bandwidth
@@ -30,16 +30,3 @@ LOGV=1 ./CXL-MEM-Simulator -t ./microbench/many_calloc -i 5 -c 0,1,2,3,4,5,6,7,8
 3
 ```
 9. env LOGV stands for logs level that you can see.
-## Limitation
-The pebs requires no larger than 5 `perf_open_event` attached to certain PID, so I limit the bpf program to munmap(kprobe) and sbrk(kprobe/kretprobe), you can configure them. For multiple process application, I need to first SIGSTOP the process and `send/recv` back the PID information. For client and server application, I need to SIGSTOP/SIGCONT on both client and server simultaneously, which is not implemented yet.
-
-## Cite
-```bash
-@article{yangyarch23,
-  title={CXLMemSim: A pure software simulated CXL.mem for performance characterization},
-  author={Yiwei Yang, Pooneh Safayenikoo, Jiacheng Ma, Tanvir Ahmed Khan, Andrew Quinn},
-  journal={arXiv preprint arXiv:2303.06153},
-  booktitle={The fifth Young Architect Workshop (YArch'23)},
-  year={2023}
-}
-```
\ No newline at end of file
diff --git a/artifact/build_and_run_all.sh b/artifact/build_and_run_all.sh
deleted file mode 100755
index 29eeac1..0000000
--- a/artifact/build_and_run_all.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-sudo apt install llvm-dev clang libbpf-dev libclang-dev libcxxopts-dev libfmt-dev librange-v3-dev ninja-build
-
-mkdir build
-cd build
-cmake -GNinja ..
-ninja
-
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_calloc'" > many_calloc_cxlmemsim.txt
-time ./microbench/many_calloc > many_calloc_time.txt
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_mmap_write'" > many_mmap_write_cxlmemsim.txt
-time ./microbench/many_mmap_write > many_mmap_write_time.txt
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_mmap_read'" > many_mmap_read_cxlmemsim.txt
-time ./microbench/many_mmap_read > many_mmap_read_time.txt
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_malloc'" > many_malloc_cxlmemsim.txt
-time ./microbench/many_malloc > many_malloc_time.txt
-sudo bash -c "LOGV=0 ./CXL-MEM-Simulator -t './microbench/many_sbrk'" > many_sbrk_cxlmemsim.txt
-time ./microbench/many_sbrk > many_sbrk_time.txt
\ No newline at end of file
diff --git a/artifact/compare_with_gem5.sh b/artifact/compare_with_gem5.sh
deleted file mode 100644
index aa17ac5..0000000
--- a/artifact/compare_with_gem5.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-cd build
-git clone https://github.com/fadedzipper/gem5-cxl -b cxl.mem-dev
-cd gem5-cxl
-scons build/ARM/gem5.opt -j 16
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_calloc > many_calloc_gem5.txt
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_mmap_write > many_mmap_write_gem5.txt
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_mmap_read > many_mmap_read_gem5.txt
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_malloc > many_malloc_gem5.txt
-time build/X86/gem5.opt configs/example/se.py -c ../microbench/many_sbrk > many_sbrk_gem5.txt
\ No newline at end of file
diff --git a/artifact/gen_workloads.sh b/artifact/gen_workloads.sh
new file mode 100644
index 0000000..8330531
--- /dev/null
+++ b/artifact/gen_workloads.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# anns
+# monetdb
+# pointer_chasing lmbench3
+# wrf stream
+# mlc
+# gromacs
+# smdk's
+
+git clone https://github.com/scott-beamer/gapbs.git
+cd gapbs
+make benchmark
+
+wget https://files.rcsb.org/download/4i4f.pdb
\ No newline at end of file
diff --git a/artifact/mlc.txt b/artifact/mlc-alderlake.txt
similarity index 100%
rename from artifact/mlc.txt
rename to artifact/mlc-alderlake.txt
diff --git a/artifact/mlc-sapphirerapids.txt b/artifact/mlc-sapphirerapids.txt
new file mode 100644
index 0000000..f3d8da5
--- /dev/null
+++ b/artifact/mlc-sapphirerapids.txt
@@ -0,0 +1,55 @@
+Intel(R) Memory Latency Checker - v3.10
+*** Unable to modify prefetchers (try executing 'modprobe msr')
+*** So, enabling random access for latency measurements
+Measuring idle latencies for random access (in ns)...
+          Numa node  Numa node
+Numa node         0          1
+       0      106.3      437.5
+
+Measuring Peak Injection Memory Bandwidths for the system
+Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
+Using all the threads from each core if Hyper-threading is enabled
+Using traffic with the following read-write ratios
+ALL Reads        : 28611.9
+3:1 Reads-Writes : 25057.1
+2:1 Reads-Writes : 25078.0
+1:1 Reads-Writes : 23965.9
+Stream-triad like: 24943.3
+
+Measuring Memory Bandwidths between nodes within system
+Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
+Using all the threads from each core if Hyper-threading is enabled
+Using Read-only traffic type
+          Numa node  Numa node
+Numa node         0          1
+       0    28612.0    19216.8
+
+Measuring Loaded Latencies for the system
+Using all the threads from each core if Hyper-threading is enabled
+Using Read-only traffic type
+Inject  Latency Bandwidth
+Delay   (ns)    MB/sec
+==========================
+00000   370.12  28393.2
+00002   369.29  28435.4
+00008   378.41  28490.5
+00015   354.32  28414.2
+00050   313.07  28323.0
+00100   238.51  28010.5
+00200   125.13  14566.0
+00300   119.69  10232.0
+00400   116.76   7905.3
+00500   115.33   6500.4
+00700   113.89   4858.3
+01000   113.03   3594.6
+01300   112.57   2906.9
+01700   112.09   2363.9
+02500   111.51   1798.9
+03500   111.21   1520.8
+05000   110.77   1193.2
+09000   110.38    922.3
+20000   110.14    735.6
+
+Measuring cache-to-cache transfer latency (in ns)...
+Local Socket L2->L2 HIT latency 67.3
+Local Socket L2->L2 HITM latency 67.5
\ No newline at end of file
diff --git a/conanfile.txt b/conanfile.txt
new file mode 100644
index 0000000..1b4945e
--- /dev/null
+++ b/conanfile.txt
@@ -0,0 +1,9 @@
+[requires]
+cxxopts/3.0.0
+fmt/9.0.0
+nlohmann_json/3.11.2
+[generators]
+CMakeDeps
+CMakeToolchain
+[layout]
+cmake_layout
\ No newline at end of file
diff --git a/include/alloc.h b/include/alloc.h
deleted file mode 100644
index e9a4cfb..0000000
--- a/include/alloc.h
+++ /dev/null
@@ -1,12 +0,0 @@
-//
-// Created by victoryang00 on 2/2/23.
-//
-
-#ifndef CXL_MEM_SIMULATOR_ALLOC_H
-#define CXL_MEM_SIMULATOR_ALLOC_H
-
-class Allocator {
-
-};
-
-#endif // CXL_MEM_SIMULATOR_ALLOC_H
diff --git a/include/cxlcontroller.h b/include/cxlcontroller.h
index f4d1246..351d2d2 100644
--- a/include/cxlcontroller.h
+++ b/include/cxlcontroller.h
@@ -2,43 +2,64 @@
 // Created by victoryang00 on 1/14/23.
// -#ifndef CXL_MEM_SIMULATOR_CXLCONTROLLER_H -#define CXL_MEM_SIMULATOR_CXLCONTROLLER_H +#ifndef CXLMEMSIM_CXLCONTROLLER_H +#define CXLMEMSIM_CXLCONTROLLER_H #include "cxlcounter.h" #include "cxlendpoint.h" #include #include +#include #include +enum page_type { CACHELINE, PAGE, HUGEPAGE_2M, HUGEPAGE_1G }; + class CXLController; -class Policy { +class AllocationPolicy { public: - Policy(); + AllocationPolicy(); virtual int compute_once(CXLController *) = 0; + // No write problem +}; +class MigrationPolicy { +public: + MigrationPolicy(); + virtual int compute_once(CXLController *) = 0; // reader writer + // paging related + // switching related +}; + +// need to give a timeout and will be added latency later, +class PagingPolicy { +public: + PagingPolicy(); + virtual int compute_once(CXLController *) = 0; // reader writer + // paging related }; + class CXLController : CXLSwitch { public: std::vector cur_expanders{}; int capacity; // GB - Policy *policy; + AllocationPolicy *policy; CXLCounter counter; std::map occupation; std::map va_pa_map; - bool is_page; + enum page_type page_type_; // percentage int num_switches = 0; - CXLController(Policy *policy, int capacity, bool is_page, int epoch); + + CXLController(AllocationPolicy *p, int capacity, enum page_type page_type_, int epoch); void construct_topo(std::string_view newick_tree); void insert_end_point(CXLMemExpander *end_point); std::vector tokenize(const std::string_view &s); - std::tuple> calculate_congestion() override; + std::tuple> calculate_congestion() override; void set_epoch(int epoch) override; std::tuple get_all_access() override; - double calculate_latency(LatencyPass elem); // traverse the tree to calculate the latency - double calculate_bandwidth(BandwidthPass elem); + double calculate_latency(LatencyPass elem) override; // traverse the tree to calculate the latency + double calculate_bandwidth(BandwidthPass elem) override; int insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr, int index) override; void delete_entry(uint64_t addr, uint64_t length) override; std::string output() override; }; -#endif // CXL_MEM_SIMULATOR_CXLCONTROLLER_H +#endif // CXLMEMSIM_CXLCONTROLLER_H diff --git a/include/cxlcounter.h b/include/cxlcounter.h index d4b0d09..116ecd3 100644 --- a/include/cxlcounter.h +++ b/include/cxlcounter.h @@ -2,16 +2,17 @@ // Created by victoryang00 on 1/12/23. // -#ifndef CXL_MEM_SIMULATOR_CXLCOUNTER_H -#define CXL_MEM_SIMULATOR_CXLCOUNTER_H +#ifndef CXLMEMSIM_CXLCOUNTER_H +#define CXLMEMSIM_CXLCOUNTER_H #include -#include -#include #include +#include #include +#include +#include -/** TODO: Whether to using the pebs to record the state. add back invalidation */ +/** TODO: Whether to using the pebs to record the state. add back invalidation migrate huge/ page and prefetch*/ class CXLSwitchEvent { public: uint64_t load = 0; @@ -51,4 +52,4 @@ class CXLCounter { void inc_hitm(); }; -#endif // CXL_MEM_SIMULATOR_CXLCOUNTER_H +#endif // CXLMEMSIM_CXLCOUNTER_H diff --git a/include/cxlendpoint.h b/include/cxlendpoint.h index ed94861..39c1f63 100644 --- a/include/cxlendpoint.h +++ b/include/cxlendpoint.h @@ -2,11 +2,55 @@ // Created by victoryang00 on 1/13/23. 
// -#ifndef CXL_MEM_SIMULATOR_CXLENDPOINT_H -#define CXL_MEM_SIMULATOR_CXLENDPOINT_H +#ifndef CXLMEMSIM_CXLENDPOINT_H +#define CXLMEMSIM_CXLENDPOINT_H #include "cxlcounter.h" #include "helper.h" +class LRUCache { + std::list lru_list; + std::unordered_map::iterator> lru_map; + std::unordered_map wb_map; + size_t capacity; + +public: + LRUCache(size_t cap) : capacity(cap) {} + + void insert(uint64_t key, uint64_t value) { + // Check if the item is already in the cache + if (lru_map.find(key) != lru_map.end()) { + // Move the element to the front of the list + lru_list.erase(lru_map[key]); + lru_list.push_front(key); + lru_map[key] = lru_list.begin(); + wb_map[key] = value; + } else { + // If the cache is full, remove the least recently used item + if (lru_list.size() == capacity) { + uint64_t old_key = lru_list.back(); + lru_list.pop_back(); + lru_map.erase(old_key); + wb_map.erase(old_key); + } + // Insert the new item + lru_list.push_front(key); + lru_map[key] = lru_list.begin(); + wb_map[key] = value; + } + } + + uint64_t get(uint64_t key) { + if (lru_map.find(key) == lru_map.end()) { + throw std::runtime_error("Key not found"); + } + // Move the accessed item to the front of the list + lru_list.erase(lru_map[key]); + lru_list.push_front(key); + lru_map[key] = lru_list.begin(); + return wb_map[key]; + } +}; + class CXLEndPoint { virtual void set_epoch(int epoch) = 0; virtual std::string output() = 0; @@ -27,6 +71,9 @@ class CXLMemExpander : public CXLEndPoint { std::map va_pa_map; // va, pa CXLMemExpanderEvent counter{}; CXLMemExpanderEvent last_counter{}; + + LRUCache lru_cache; + // tlb map and paging map -> invalidate int last_read = 0; int last_write = 0; double last_latency = 0.; @@ -50,6 +97,9 @@ class CXLSwitch : public CXLEndPoint { int id = -1; int epoch = 0; uint64_t last_timestamp = 0; + // get the approximate congestion and target done time + std::unordered_map timeseries_map; + double congestion_latency = 0.02; explicit CXLSwitch(int id); std::tuple get_all_access() override; @@ -62,4 +112,4 @@ class CXLSwitch : public CXLEndPoint { void set_epoch(int epoch) override; }; -#endif // CXL_MEM_SIMULATOR_CXLENDPOINT_H +#endif // CXLMEMSIM_CXLENDPOINT_H diff --git a/include/helper.h b/include/helper.h index 6330ef0..07d4810 100644 --- a/include/helper.h +++ b/include/helper.h @@ -2,12 +2,13 @@ // Created by victoryang00 on 1/12/23. 
// -#ifndef CXL_MEM_SIMULATOR_HELPER_H -#define CXL_MEM_SIMULATOR_HELPER_H +#ifndef CXLMEMSIM_HELPER_H +#define CXLMEMSIM_HELPER_H #include "incore.h" #include "logging.h" #include "uncore.h" +#include #include #include #include @@ -16,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -28,6 +29,16 @@ class Incore; class Uncore; class Helper; +struct PerfConfig { + std::string path_format_cha_type{}; + std::array, 4> cha{}; + std::array, 4> cpu{}; +}; +struct ModelContext { + uint32_t model{}; + struct PerfConfig perf_conf; +}; + struct EmuCXLLatency { double read; double write; @@ -47,22 +58,16 @@ struct BandwidthPass { struct LatencyPass { std::tuple all_access; double dramlatency; - double ma_ro; - double ma_wb; + uint64_t readonly; + uint64_t writeback; }; -struct CBOElem { - uint64_t llc_wb; +struct CHAElem { + std::array cha; }; struct CPUElem { - uint64_t all_dram_rds; - uint64_t cpu_l2stall_t; - uint64_t cpu_llcl_hits; - uint64_t cpu_llcl_miss; - uint64_t cpu_bandwidth_read; - uint64_t cpu_bandwidth_write; - std::map cpu_munmap_address_length; + std::array cpu; }; struct PEBSElem { @@ -79,54 +84,39 @@ struct CPUInfo { struct Elem { struct CPUInfo cpuinfo; - struct CBOElem *cbos; - struct CPUElem *cpus; + std::vector chas; + std::vector cpus; struct PEBSElem pebs; }; class PMUInfo { public: - std::vector cbos; + std::vector chas; std::vector cpus; Helper *helper; PMUInfo(pid_t pid, Helper *h, struct PerfConfig *perf_config); ~PMUInfo(); int start_all_pmcs(); int stop_all_pmcs(); - int freeze_counters_cbo_all(); - int unfreeze_counters_cbo_all(); -}; - -struct PerfConfig { - const char *path_format_cbo_type; - uint64_t cbo_config; - uint64_t all_dram_rds_config; - uint64_t all_dram_rds_config1; - uint64_t cpu_l2stall_config; - uint64_t cpu_llcl_hits_config; - uint64_t cpu_llcl_miss_config; - uint64_t cpu_bandwidth_read_config; - uint64_t cpu_bandwidth_write_config; -}; - -struct ModelContext { - uint32_t model; - struct PerfConfig perf_conf; + int freeze_counters_cha_all(); + int unfreeze_counters_cha_all(); }; class Helper { public: - int cpu; - int cbo; - double cpu_freq; - PerfConfig perf_conf; + PerfConfig perf_conf{}; Helper(); - static int num_of_cpu(); - static int num_of_cbo(); + int cpu; + int cha; + std::vector used_cpu; + std::vector used_cha; + int num_of_cpu(); + int num_of_cha(); static void detach_children(); static void noop_handler(int signum); - double cpu_frequency() const; - PerfConfig detect_model(uint32_t); + double cpu_frequency(); + PerfConfig detect_model(uint32_t model, const std::vector &perf_name, + const std::vector &perf_conf1, const std::vector &perf_conf2); }; -#endif // CXL_MEM_SIMULATOR_HELPER_H +#endif // CXLMEMSIM_HELPER_H diff --git a/include/incore.h b/include/incore.h index 6e34116..8338b22 100644 --- a/include/incore.h +++ b/include/incore.h @@ -1,38 +1,34 @@ // Created by victoryang00 on 1/14/23. 
// -#ifndef CXL_MEM_SIMULATOR_INCORE_H -#define CXL_MEM_SIMULATOR_INCORE_H +#ifndef CXLMEMSIM_INCORE_H +#define CXLMEMSIM_INCORE_H #include "helper.h" #include "perf.h" -#include -class CXLController; +#include +#include + +class CXLController; // TODO: need to be shm gotten union CPUID_INFO { int array[4]; struct { unsigned int eax, ebx, ecx, edx; } reg; }; +/** This is a per cha metrics*/ class Incore { public: - PerfInfo *perf[5]; + std::array perf{nullptr}; // should only be 4 counters struct PerfConfig *perf_config; - Incore(const pid_t pid, const int cpu, struct PerfConfig *perf_config); + Incore(pid_t pid, int cpu, struct PerfConfig *perf_config); ~Incore() = default; int start(); int stop(); - void init_all_dram_rds(const pid_t pid, const int cpu); - void init_cpu_l2stall(const pid_t pid, const int cpu); - void init_cpu_llcl_hits(const pid_t pid, const int cpu); - void init_cpu_llcl_miss(const pid_t pid, const int cpu); - void init_cpu_mem_read(const pid_t pid, const int cpu); - void init_cpu_mem_write(const pid_t pid, const int cpu); - void init_cpu_ebpf(const pid_t pid, const int cpu); - int read_cpu_elems(struct CPUElem *cpu_elem); + ssize_t read_cpu_elems(struct CPUElem *cpu_elem); }; void pcm_cpuid(unsigned leaf, CPUID_INFO *info); bool get_cpu_info(struct CPUInfo *); -#endif // CXL_MEM_SIMULATOR_INCORE_H +#endif // CXLMEMSIM_INCORE_H diff --git a/include/logging.h b/include/logging.h index 94fd004..14942a8 100644 --- a/include/logging.h +++ b/include/logging.h @@ -2,8 +2,8 @@ // Created by victoryang00 on 1/13/23. // -#ifndef CXL_MEM_SIMULATOR_LOGGING_H -#define CXL_MEM_SIMULATOR_LOGGING_H +#ifndef CXLMEMSIM_LOGGING_H +#define CXLMEMSIM_LOGGING_H #include #include @@ -15,11 +15,22 @@ #include #include #include +#include #include #include #include +#include "sock.h" -enum LogLevel { DEBUG = 0, INFO, WARNING, ERROR }; +/** Barry's work*/ +struct Enumerate : std::ranges::range_adaptor_closure { + template constexpr auto operator()(R &&r) const { + return std::views::zip(std::views::iota(0), (R &&) r); + } +}; + +inline constexpr Enumerate enumerate; + +enum LogLevel { DEBUG = 0, INFO, WARNING, ERROR, TRACE }; class LogStream; class LogWriter; @@ -30,15 +41,17 @@ class LogWriter { char *logv = std::getenv("LOGV"); if (logv) { env_log_level = std::stoi(logv); + file_ = std::fstream(OUTPUT_PMU_PATH, std::ios::out | std::ios::app); } else { env_log_level = 4; } }; - + ~LogWriter() = default; void operator<(const LogStream &stream); private: void output_log(const std::ostringstream &g); + std::fstream file_; std::source_location location_; LogLevel log_level_; int env_log_level; @@ -66,8 +79,9 @@ fmt::color level2color(LogLevel level); #define LOG_IF(level) LogWriter(std::source_location::current(), level) < LogStream() #define LOG(level) LOG_##level #define LOG_DEBUG LOG_IF(DEBUG) +#define LOG_TRACE LOG_IF(TRACE) #define LOG_INFO LOG_IF(INFO) #define LOG_WARNING LOG_IF(WARNING) #define LOG_ERROR LOG_IF(ERROR) -#endif // CXL_MEM_SIMULATOR_LOGGING_H +#endif // CXLMEMSIM_LOGGING_H diff --git a/include/monitor.h b/include/monitor.h index 8763d2c..9655476 100644 --- a/include/monitor.h +++ b/include/monitor.h @@ -2,8 +2,8 @@ // Created by victoryang00 on 1/11/23. 
// -#ifndef SLUGALLOCATOR_MONITOR_H -#define SLUGALLOCATOR_MONITOR_H +#ifndef CXLMEMSIM_MONITOR_H +#define CXLMEMSIM_MONITOR_H #include "cxlcontroller.h" #include "helper.h" @@ -22,46 +22,112 @@ enum MONITOR_STATUS { MONITOR_TERMINATED = 2, MONITOR_NOPERMISSION = 3, MONITOR_DISABLE = 4, + MONITOR_SUSPEND = 5, MONITOR_UNKNOWN = 0xff }; +extern Helper helper; + class Monitor; class Monitors { public: std::vector mon; - Monitors(int tnum, cpu_set_t *use_cpuset, int nmem, Helper h); + bool print_flag; + Monitors(int tnum, cpu_set_t *use_cpuset); ~Monitors() = default; - void stop_all(const int); - void run_all(const int); - int enable(const uint32_t, const uint32_t, bool, uint64_t, const int32_t, bool is_page); - void disable(const uint32_t target); - int terminate(const uint32_t, const uint32_t, const int32_t); - bool check_all_terminated(const uint32_t); - bool check_continue(const uint32_t, const struct timespec); + void stop_all(int); + void run_all(int); + Monitor get_mon(int, int); + int enable(const uint32_t, const uint32_t, bool, uint64_t, const int32_t); + void disable(uint32_t target); + int terminate(uint32_t, uint32_t, int32_t); + bool check_all_terminated(uint32_t); + bool check_continue(uint32_t, struct timespec); }; class Monitor { public: - pid_t tgid; + pid_t tgid; // process id pid_t tid; uint32_t cpu_core; char status; struct timespec injected_delay; // recorded time for injected struct timespec wasted_delay; // recorded time for calling between continue and calculation struct timespec squabble_delay; // inj-was - struct Elem elem[2]; + struct Elem elem[2]; // before & after struct Elem *before, *after; double total_delay; struct timespec start_exec_ts, end_exec_ts; bool is_process; struct PEBS *pebs_ctx; - Monitor(const int nmem, Helper h); + explicit Monitor(); void stop(); void run(); - void clear_time(struct timespec *); + static void clear_time(struct timespec *); +}; + +template <> struct fmt::formatter { + fmt::formatter f; + + constexpr auto parse(auto &ctx) { return f.parse(ctx); } + + auto format(Monitors const &p, auto &ctx) const { + auto out = fmt::format_to(ctx.out(), ""); + if (p.print_flag) { + for (auto const &[mon_id, mon] : p.mon | enumerate) { + for (auto core_idx = 0; core_idx < helper.used_cha.size(); core_idx++) { + for (auto cha_idx = 0; cha_idx < helper.perf_conf.cha.size(); cha_idx++) { + out = fmt::format_to(out, "mon{}_{}_{}_{},", mon_id, std::get<0>(helper.perf_conf.cha[cha_idx]), + helper.used_cha[core_idx], core_idx); + } + } + + for (auto core_idx = 0; core_idx < helper.used_cpu.size(); core_idx++) { + for (auto cpu_idx = 0; cpu_idx < helper.perf_conf.cpu.size(); cpu_idx++) { + if (cpu_idx == helper.perf_conf.cpu.size() - 1 && core_idx == helper.used_cpu.size() - 1) { + out = fmt::format_to(out, "mon{}_{}_{}_{}", mon_id, + std::get<0>(helper.perf_conf.cpu[cpu_idx]), helper.used_cpu[core_idx], + core_idx); + } else { + out = fmt::format_to(out, "mon{}_{}_{}_{},", mon_id, + std::get<0>(helper.perf_conf.cpu[cpu_idx]), helper.used_cpu[core_idx], + core_idx); + } + } + } + } + } else { + + for (auto const &[mon_id, mon] : p.mon | enumerate) { + for (auto core_idx = 0; core_idx < helper.used_cha.size(); core_idx++) { + for (auto cha_idx = 0; cha_idx < helper.perf_conf.cha.size(); cha_idx++) { + out = fmt::format_to(out, "{},", + mon.after->chas[core_idx].cha[cha_idx] - + mon.before->chas[core_idx].cha[cha_idx]); + } + } + for (auto core_idx = 0; core_idx < helper.used_cpu.size(); core_idx++) { + for (auto cpu_idx = 0; cpu_idx < 
helper.perf_conf.cpu.size(); cpu_idx++) { + if (cpu_idx == helper.perf_conf.cpu.size() - 1 && core_idx == helper.used_cpu.size() - 1) { + out = fmt::format_to(out, "{}", + mon.after->cpus[core_idx].cpu[cpu_idx] - + mon.before->cpus[core_idx].cpu[cpu_idx]); + } else { + out = fmt::format_to(out, "{},", + mon.after->cpus[core_idx].cpu[cpu_idx] - + mon.before->cpus[core_idx].cpu[cpu_idx]); + } + } + } + } // visitor mode write to the file + } + // *out++ = '\n'; + ctx.advance_to(out); + return out; + }; }; -#endif // SLUGALLOCATOR_MONITOR_H +#endif // CXLMEMSIM_MONITOR_H diff --git a/include/pebs.h b/include/pebs.h index 03971cb..ba06035 100644 --- a/include/pebs.h +++ b/include/pebs.h @@ -2,12 +2,11 @@ // Created by victoryang00 on 1/13/23. // -#ifndef CXL_MEM_SIMULATOR_PEBS_H -#define CXL_MEM_SIMULATOR_PEBS_H +#ifndef CXLMEMSIM_PEBS_H +#define CXLMEMSIM_PEBS_H #include "cxlcontroller.h" #include "helper.h" -#include "logging.h" #include #include #include @@ -33,14 +32,13 @@ class PEBS { uint64_t sample_period; uint32_t seq{}; size_t rdlen{}; - size_t mplen; + size_t mplen{}; struct perf_event_mmap_page *mp; - bool is_page; - PEBS(pid_t, uint64_t, bool); + PEBS(pid_t, uint64_t); ~PEBS(); int read(CXLController *, struct PEBSElem *); int start(); int stop(); }; -#endif // CXL_MEM_SIMULATOR_PEBS_H +#endif // CXLMEMSIM_PEBS_H diff --git a/include/perf.h b/include/perf.h index 126418c..1e7568d 100644 --- a/include/perf.h +++ b/include/perf.h @@ -2,8 +2,8 @@ // Created by victoryang00 on 1/14/23. // -#ifndef CXL_MEM_SIMULATOR_PERF_H -#define CXL_MEM_SIMULATOR_PERF_H +#ifndef CXLMEMSIM_PERF_H +#define CXLMEMSIM_PERF_H #include #include @@ -23,33 +23,6 @@ #include #include -class ThreadSafeMap { -public: - ThreadSafeMap() = default; - - // Multiple threads/readers can read the Map's value at the same time. - std::map> get() const { - std::shared_lock lock(mutex_); - return res; - } - - // Only one thread/writer can increment/write the Map's value. - void insert(unsigned long address, unsigned long size, unsigned long long time) { - std::unique_lock lock(mutex_); - res[address] = std::make_tuple(size, time); - } - - // Only one thread/writer can reset/write the Map's value. - void reset() { - std::unique_lock lock(mutex_); - res.clear(); - } - -private: - mutable std::shared_mutex mutex_; - std::map> res; -}; - class PerfInfo { public: int fd; @@ -58,18 +31,14 @@ class PerfInfo { pid_t pid; unsigned long flags; struct perf_event_attr attr; - ThreadSafeMap *map; - std::jthread j; + PerfInfo() = default; PerfInfo(int group_fd, int cpu, pid_t pid, unsigned long flags, struct perf_event_attr attr); - PerfInfo(int fd, int group_fd, int cpu, pid_t pid, unsigned long flags, struct perf_event_attr attr); ~PerfInfo(); ssize_t read_pmu(uint64_t *value); - std::map read_trace_pipe(); int start(); int stop(); }; PerfInfo *init_incore_perf(const pid_t pid, const int cpu, uint64_t conf, uint64_t conf1); -PerfInfo *init_incore_bpf_perf(const pid_t pid, const int cpu); -void write_trace_to_map(ThreadSafeMap *map); -#endif // CXL_MEM_SIMULATOR_PERF_H +PerfInfo *init_uncore_perf(const pid_t pid, const int cpu, uint64_t conf, uint64_t conf1, int value); +#endif // CXLMEMSIM_PERF_H diff --git a/include/policy.h b/include/policy.h index 0039ee9..dae51e7 100644 --- a/include/policy.h +++ b/include/policy.h @@ -2,8 +2,8 @@ // Created by victoryang00 on 1/12/23. 
// -#ifndef CXL_MEM_SIMULATOR_POLICY_H -#define CXL_MEM_SIMULATOR_POLICY_H +#ifndef CXLMEMSIM_POLICY_H +#define CXLMEMSIM_POLICY_H #include "cxlcontroller.h" #include "cxlendpoint.h" #include "helper.h" @@ -11,7 +11,7 @@ // Saturate Local 90% and start interleave accrodingly the remote with topology // Say 3 remote, 2 200ns, 1 400ns, will give 40% 40% 20% -class InterleavePolicy : public Policy { +class InterleavePolicy : public AllocationPolicy { public: InterleavePolicy(); @@ -21,4 +21,4 @@ class InterleavePolicy : public Policy { int compute_once(CXLController *) override; }; -#endif // CXL_MEM_SIMULATOR_POLICY_H +#endif // CXLMEMSIM_POLICY_H diff --git a/include/sock.h b/include/sock.h new file mode 100644 index 0000000..8618a20 --- /dev/null +++ b/include/sock.h @@ -0,0 +1,29 @@ +// +// Created by root on 11/21/23. +// + +#ifndef CXLMEMSIM_SOCK_H +#define CXLMEMSIM_SOCK_H +#include + +#ifdef __cplusplus +extern "C" { +#endif +enum opcode { + CXLMEMSIM_PROCESS_CREATE = 0, + CXLMEMSIM_THREAD_CREATE = 1, + CXLMEMSIM_THREAD_EXIT = 2, + CXLMEMSIM_STABLE_SIGNAL = 3, +}; +struct op_data { + uint32_t tgid; + uint32_t tid; + uint32_t opcode; +}; +#define SOCKET_PATH "/tmp/cxl_mem_simulator.sock" +#define OUTPUT_PMU_PATH "./output_pmu.csv" + +#ifdef __cplusplus +} +#endif +#endif // CXLMEMSIM_SOCK_H diff --git a/include/uncore.h b/include/uncore.h index b83bcee..d12e1ce 100644 --- a/include/uncore.h +++ b/include/uncore.h @@ -2,21 +2,24 @@ // Created by victoryang00 on 1/12/23. // -#ifndef CXL_MEM_SIMULATOR_UNCORE_H -#define CXL_MEM_SIMULATOR_UNCORE_H +#ifndef CXLMEMSIM_UNCORE_H +#define CXLMEMSIM_UNCORE_H #include "helper.h" #include "perf.h" +#include #include struct PerfConfig; class Uncore { public: - uint32_t unc_idx; - PerfInfo *perf; - Uncore(const uint32_t unc_idx, PerfConfig *perf_config); + uint32_t unc_idx{}; + int fd{}; + std::array perf{nullptr, nullptr, nullptr, nullptr}; + Uncore(uint32_t unc_idx, PerfConfig *perf_config); + ~Uncore() = default; - int read_cbo_elems(struct CBOElem *elem); + int read_cha_elems(struct CHAElem *elem); }; -#endif // CXL_MEM_SIMULATOR_UNCORE_H +#endif // CXLMEMSIM_UNCORE_H diff --git a/microbench/CMakeLists.txt b/microbench/CMakeLists.txt index aeac01f..5ee4d65 100644 --- a/microbench/CMakeLists.txt +++ b/microbench/CMakeLists.txt @@ -1,5 +1,67 @@ -add_executable(many_calloc ./many_calloc.c) -add_executable(many_mmap_read ./many_mmap_read.c) -add_executable(many_mmap_write ./many_mmap_write.c) -add_executable(many_malloc ./many_malloc.c) -add_executable(many_sbrk ./many_sbrk.c) \ No newline at end of file +add_executable(calloc calloc.c) +add_executable(mmap_read mmap_read.c) +add_executable(mmap_write mmap_write.c) +add_executable(malloc malloc.c) +add_executable(sbrk sbrk.c) + +add_executable(ld_simple ld_simple.cpp) +add_executable(nt-ld nt-ld.cpp) +add_executable(nt-st nt-st.cpp) +add_executable(ptr-chasing ptr-chasing.cpp) + +add_executable(ld1 ld.cpp) +target_compile_definitions(ld1 PRIVATE -DFENCE_COUNT=1) +add_executable(ld2 ld.cpp) +target_compile_definitions(ld2 PRIVATE -DFENCE_COUNT=2) +add_executable(ld4 ld.cpp) +target_compile_definitions(ld4 PRIVATE -DFENCE_COUNT=4) +add_executable(ld8 ld.cpp) +target_compile_definitions(ld8 PRIVATE -DFENCE_COUNT=8) +add_executable(ld16 ld.cpp) +target_compile_definitions(ld16 PRIVATE -DFENCE_COUNT=16) +add_executable(ld32 ld.cpp) +target_compile_definitions(ld32 PRIVATE -DFENCE_COUNT=32) +add_executable(ld64 ld.cpp) +target_compile_definitions(ld64 PRIVATE -DFENCE_COUNT=64) +add_executable(ld128 ld.cpp) 
+target_compile_definitions(ld128 PRIVATE -DFENCE_COUNT=128) +add_executable(ld256 ld.cpp) +target_compile_definitions(ld256 PRIVATE -DFENCE_COUNT=256) + +add_executable(ld_base1 ld_base.cpp) +target_compile_definitions(ld_base1 PRIVATE -DFENCE_COUNT=1) +add_executable(ld_base2 ld_base.cpp) +target_compile_definitions(ld_base2 PRIVATE -DFENCE_COUNT=2) +add_executable(ld_base4 ld_base.cpp) +target_compile_definitions(ld_base4 PRIVATE -DFENCE_COUNT=4) +add_executable(ld_base8 ld_base.cpp) +target_compile_definitions(ld_base8 PRIVATE -DFENCE_COUNT=8) +add_executable(ld_base16 ld_base.cpp) +target_compile_definitions(ld_base16 PRIVATE -DFENCE_COUNT=16) +add_executable(ld_base32 ld_base.cpp) +target_compile_definitions(ld_base32 PRIVATE -DFENCE_COUNT=32) +add_executable(ld_base64 ld_base.cpp) +target_compile_definitions(ld_base64 PRIVATE -DFENCE_COUNT=64) +add_executable(ld_base128 ld_base.cpp) +target_compile_definitions(ld_base128 PRIVATE -DFENCE_COUNT=128) +add_executable(ld_base256 ld_base.cpp) +target_compile_definitions(ld_base256 PRIVATE -DFENCE_COUNT=256) + +add_executable(st1 st.cpp) +target_compile_definitions(st1 PRIVATE -DFENCE_COUNT=1) +add_executable(st2 st.cpp) +target_compile_definitions(st2 PRIVATE -DFENCE_COUNT=2) +add_executable(st4 st.cpp) +target_compile_definitions(st4 PRIVATE -DFENCE_COUNT=4) +add_executable(st8 st.cpp) +target_compile_definitions(st8 PRIVATE -DFENCE_COUNT=8) +add_executable(st16 st.cpp) +target_compile_definitions(st16 PRIVATE -DFENCE_COUNT=16) +add_executable(st32 st.cpp) +target_compile_definitions(st32 PRIVATE -DFENCE_COUNT=32) +add_executable(st64 st.cpp) +target_compile_definitions(st64 PRIVATE -DFENCE_COUNT=64) +add_executable(st128 st.cpp) +target_compile_definitions(st128 PRIVATE -DFENCE_COUNT=128) +add_executable(st256 st.cpp) +target_compile_definitions(st256 PRIVATE -DFENCE_COUNT=256) diff --git a/microbench/many_calloc.c b/microbench/calloc.c similarity index 100% rename from microbench/many_calloc.c rename to microbench/calloc.c diff --git a/microbench/ld.cpp b/microbench/ld.cpp new file mode 100644 index 0000000..d597a58 --- /dev/null +++ b/microbench/ld.cpp @@ -0,0 +1,119 @@ +/* + * Microbench testies for MLP and memory latency in CXLMS + * + * By: Andrew Quinn + * Yiwei Yang + * + * Copyright 2023 Regents of the Univeristy of California + * UC Santa Cruz Sluglab. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#define MOVE_SIZE 128 +#define MAP_SIZE (long)(1024 * 1024 * 1024) +#define CACHELINE_SIZE 64 + +#ifndef FENCE_COUNT +#define FENCE_COUNT 8 +#endif + +#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE) + +// we need to jump in MOVE_SIZE increments otherwise segfault! + +#define BODY(start) \ + "xor %%r8, %%r8 \n" \ + "LOOP_START%=: \n" \ + "lea (%[" #start "], %%r8), %%r9 \n" \ + "movdqa (%%r9), %%xmm0 \n" \ + "add $" STR(MOVE_SIZE) ", %%r8 \n" \ + "cmp $" STR(FENCE_BOUND) ",%%r8\n" \ + "jl LOOP_START%= \n" \ + "mfence \n" + + +int main(int argc, char **argv) { + + // in principle, you would want to clear out cache lines (and the + // pipeline) before doing any of the inline assembly stuff. But, + // that's hard. And, its probably noise when you execute over + // enough things. 
+ + + // allocate some meomery + char *base =(char *) mmap(nullptr, + MAP_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + -1, + 0); + + if (base == MAP_FAILED) { + fprintf(stderr, "oops, you suck %d\n", errno); + return -1; + } + char *addr = NULL; + + intptr_t *iaddr = (intptr_t*) base; + intptr_t hash = 0; + struct timespec tstart = {0,0}, tend = {0,0}; + + // Necessary so that we don't include allocation costs in our benchmark + while (iaddr < (intptr_t *)(base + MAP_SIZE)) { + hash = hash ^ (intptr_t) iaddr; + *iaddr = hash; + iaddr++; + } + + // should flush everything from the cache. But, how big is the cache? + addr = base; + while (addr < (base + MAP_SIZE)) { + asm volatile( + "mov %[buf], %%rsi\n" + "clflush (%%rsi)\n" + : + : [buf] "r" (addr) + : "rsi"); + addr += CACHELINE_SIZE; + } + + asm volatile ("mfence\n" :::); + + clock_gettime(CLOCK_MONOTONIC, &tstart); +for (int i=0;i<1e3;i++){ + addr = base; + while (addr < (base + MAP_SIZE)) { + //fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE); + asm volatile( + BODY(addr) + : + : [addr] "r" (addr) + : "r8", "r9", "xmm0"); + + addr += (FENCE_COUNT * MOVE_SIZE); + } + clock_gettime(CLOCK_MONOTONIC, &tend); + uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec); + nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec); + + + printf("%lu\n", nanos); +} + return 0; +} diff --git a/microbench/ld_base.cpp b/microbench/ld_base.cpp new file mode 100644 index 0000000..9a299ce --- /dev/null +++ b/microbench/ld_base.cpp @@ -0,0 +1,116 @@ +/* + * Microbench testies for MLP and memory latency in CXLMS + * + * By: Andrew Quinn + * Yiwei Yang + * + * Copyright 2023 Regents of the Univeristy of California + * UC Santa Cruz Sluglab. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#define MOVE_SIZE 128 +#define MAP_SIZE (long)(1024 * 1024 * 1024) +#define CACHELINE_SIZE 64 + +#ifndef FENCE_COUNT +#define FENCE_COUNT 8 +#endif + +#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE) + +// we need to jump in MOVE_SIZE increments otherwise segfault! + +#define BODY(start) \ + "xor %%r8, %%r8 \n" \ + "LOOP_START%=: \n" \ + "lea (%[" #start "], %%r8), %%r9 \n" \ + "add $" STR(MOVE_SIZE) ", %%r8 \n" \ + "cmp $" STR(FENCE_BOUND) ",%%r8\n" \ + "jl LOOP_START%= \n" \ + "mfence \n" + + +int main(int argc, char **argv) { + + // in principle, you would want to clear out cache lines (and the + // pipeline) before doing any of the inline assembly stuff. But, + // that's hard. And, its probably noise when you execute over + // enough things. + + + // allocate some meomery + char *base =(char *) mmap(nullptr, + MAP_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + -1, + 0); + + if (base == MAP_FAILED) { + fprintf(stderr, "oops, you suck %d\n", errno); + return -1; + } + char *addr = NULL; + + intptr_t *iaddr = (intptr_t*) base; + intptr_t hash = 0; + struct timespec tstart = {0,0}, tend = {0,0}; + + // Necessary so that we don't include allocation costs in our benchmark + while (iaddr < (intptr_t *)(base + MAP_SIZE)) { + hash = hash ^ (intptr_t) iaddr; + *iaddr = hash; + iaddr++; + } + + // should flush everything from the cache. But, how big is the cache? 
+ addr = base; + while (addr < (base + MAP_SIZE)) { + asm volatile( + "mov %[buf], %%rsi\n" + "clflush (%%rsi)\n" + : + : [buf] "r" (addr) + : "rsi"); + addr += CACHELINE_SIZE; + } + + asm volatile ("mfence\n" :::); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + addr = base; + while (addr < (base + MAP_SIZE)) { + //fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE); + asm volatile( + BODY(addr) + : + : [addr] "r" (addr) + : "r8", "r9", "xmm0"); + + addr += (FENCE_COUNT * MOVE_SIZE); + } + clock_gettime(CLOCK_MONOTONIC, &tend); + uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec); + nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec); + + + printf("%lu\n", nanos); + return 0; +} diff --git a/microbench/ld_simple.cpp b/microbench/ld_simple.cpp new file mode 100644 index 0000000..282d9f6 --- /dev/null +++ b/microbench/ld_simple.cpp @@ -0,0 +1,97 @@ +/* ********************************************************** + * Copyright (c) 2018-2023 Google LLC All rights reserved. + * **********************************************************/ + +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of Google, Inc. nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE LLC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* This microbenchmark suffers from a significant number of last-level cache + * (LLC) misses. SW prefetching can significantly improve its performance. + * + * The cache miss analyzer can be used to identify the load instruction that + * is suffering from most of the LLC misses in this microbenchmark. The analyzer + * can also produce prefetching hints for this microbenchmark. To run the + * analyzer on this microbenchmark and write the prefetching hints in a text + * file called "rec.csv", perform the following: + * * Compile the microbenchmark. Assuming g++ is the compiler being used: + * $ g++ -O3 -o stride_benchmark stride_benchmark.cpp + * * Run the analyzer: + * $ bin64/drrun -t drcachesim -simulator_type miss_analyzer -LL_miss_file rec.csv -- \ + * stride_benchmark + * + */ + +#include +#include +#include + +#define MEM_BARRIER() __asm__ __volatile__("" ::: "memory") + +int +main(int argc, const char *argv[]) +{ + // Cache line size in bytes. 
+ const int kLineSize = 64; + // Number of cache lines skipped by the stream every iteration. + const int kStride = 7; + // Number of 1-byte elements in the array. + const size_t kArraySize = 1024 * 1024 * 1024; + // Number of iterations in the main loop. + const int kIterations = 1e4; + // The main vector/array used for emulating pointer chasing. + unsigned char *buffer = new unsigned char[kArraySize]; + memset(buffer, kStride, kArraySize); + + // Add a memory barrier so the call doesn't get optimized away or + // reordered with respect to callers. + MEM_BARRIER(); + + int position = 0; + + // Here the code will pointer chase through the array skipping forward + // kStride cache lines at a time. Since kStride is an odd number, the main + // loop will touch different cache lines as it wraps around. + for (int loop = 0; loop < kIterations; ++loop) { + // This prefetching instruction results in a speedup of >2x + // on a Skylake machine running Linux when compiled with g++ -O3. + //const int prefetch_distance = 5 * kStride * kLineSize; + //__builtin_prefetch(&buffer[position + prefetch_distance], 0, 0); + + position += (buffer[position] * kLineSize); + position = loop; + position &= (kArraySize - 1); + } + + // Add a memory barrier so the call doesn't get optimized away or + // reordered with respect to callers. + MEM_BARRIER(); + +// std::cerr << "Value = " << position << std::endl; + + return 0; +} diff --git a/microbench/many_malloc.c b/microbench/malloc.c similarity index 100% rename from microbench/many_malloc.c rename to microbench/malloc.c diff --git a/microbench/many_mmap_read.c b/microbench/mmap_read.c similarity index 100% rename from microbench/many_mmap_read.c rename to microbench/mmap_read.c diff --git a/microbench/many_mmap_write.c b/microbench/mmap_write.c similarity index 100% rename from microbench/many_mmap_write.c rename to microbench/mmap_write.c diff --git a/microbench/nt-ld.cpp b/microbench/nt-ld.cpp new file mode 100644 index 0000000..4792322 --- /dev/null +++ b/microbench/nt-ld.cpp @@ -0,0 +1,119 @@ +#include "uarch.h" + +int main() { + int i; + long long aggregated = 0, aggregated2 = 0; + long seed = 0xdeadbeef1245678; + uint64_t a = 0xfc0; + int access_size = 64; + int stride_size = 64; + int delay = 64; + int count = 32; + uint64_t *cindex; + uint64_t csize; + int ret; + + // for (i = 0; i < 100000; i++) { + // char *buf = malloc(4096 * 1024); + // buf = buf + 64 - (((long)buf) % 64); + // // Separate RaW job + // RAW_BEFORE_WRITE + // stride_storeclwb(buf, access_size, stride_size, delay, count); + // asm volatile("mfence \n" :::); + // RAW_BEFORE_READ + // stride_nt(buf, access_size, stride_size, delay, count); + // asm volatile("mfence \n" :::); + // RAW_FINAL("raw-separate") + // + // aggregated += diff; + // aggregated2 += c_ntload_end - c_store_start; + // } + // + // printf("Separate RaW job %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count); + aggregated = 0; + aggregated2 = 0; + for (i = 0; i < 100000; i++) { + char *buf = static_cast(malloc(4096 * 1024)); + buf = buf + 64 - (((long)buf) % 64); + // Naive RaW job + RAW_BEFORE_WRITE + RAW_BEFORE_READ + stride_read_after_write(buf, access_size, stride_size, delay, count); + asm volatile("mfence \n" :::); + RAW_FINAL("raw-combined") + aggregated += diff; + aggregated2 += c_ntload_end - c_store_start; + } + printf("Naive RaW job %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count); + aggregated = 0; + aggregated2 = 0; + + // for (i = 0; i < 100000; i++) { + // char 
*buf = malloc(4096 * 1024); + // buf = buf + 64 - (((long)buf) % 64); + // RAW_BEFORE_WRITE + // sizebw_storeclwb(buf, access_size, count, &seed, a); + // asm volatile("mfence \n" :::); + // RAW_FINAL("sizebw_storeclwb") + // aggregated += diff; + // aggregated2 += c_ntload_end - c_store_start; + // } + // printf("sizebw_storeclwb %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count); + // + // aggregated = 0; + // aggregated2 = 0; + // for (i = 0; i < 100000; i++) { + // char *buf = malloc(4096 * 1024); + // buf = buf + 64 - (((long)buf) % 64); + // char *virt_addr = malloc(4096 * 1024); + // virt_addr = virt_addr + 64 - (((long)virt_addr) % 64); + // // Pointer chasing RaW job + // // No need to fill report fs page table, init_chasing_index will do that + // csize = access_size / CACHELINE_SIZE; + // cindex = (uint64_t *)(virt_addr); + // ret = init_chasing_index(cindex, csize); + // + // RAW_BEFORE_WRITE + // chasing_storeclwb(buf, access_size, stride_size, count, cindex); + // asm volatile("mfence \n" :::); + // RAW_BEFORE_READ + // chasing_loadnt(buf, access_size, stride_size, count, cindex); + // asm volatile("mfence \n" :::); + // RAW_FINAL("raw-chasing") + // + // aggregated += diff; + // aggregated2 += c_ntload_end - c_store_start; + // } + // printf("pointer chasing 2 hop %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count); + // + // aggregated = 0; + // aggregated2 = 0; + // for (i = 0; i < 100000; i++) { + // char *buf = malloc(4096 * 1024); + // buf = buf + 64 - (((long)buf) % 64); + // char *virt_addr = malloc(4096 * 1024); + // virt_addr = virt_addr + 64 - (((long)virt_addr) % 64); + // // Pointer chasing RaW job + // // No need to fill report fs page table, init_chasing_index will do that + // csize = access_size / CACHELINE_SIZE; + // cindex = (uint64_t *)(virt_addr); + // ret = init_chasing_index(cindex, csize); + // + // RAW_BEFORE_WRITE + // chasing_storeclwb(buf, access_size, stride_size, count, cindex); + // asm volatile("mfence \n" :::); + // RAW_BEFORE_READ + // chasing_loadnt(buf, access_size, stride_size, count, cindex); + // asm volatile("mfence \n" :::); + // chasing_storeclwb(buf, access_size, stride_size, count, cindex); + // asm volatile("mfence \n" :::); + // chasing_loadnt(buf, access_size, stride_size, count, cindex); + // asm volatile("mfence \n" :::); + // RAW_FINAL("raw-chasing") + // + // aggregated += diff; + // aggregated2 += c_ntload_end - c_store_start; + // } + // printf("pointer chasing 4 hop %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count); + return 0; +} diff --git a/microbench/nt-st.cpp b/microbench/nt-st.cpp new file mode 100644 index 0000000..3f03481 --- /dev/null +++ b/microbench/nt-st.cpp @@ -0,0 +1,34 @@ +#include "uarch.h" + +int main() { + int i; + long long aggregated = 0, aggregated2 = 0; + long seed = 0xdeadbeef1245678; + uint64_t a = 0xfc0; + int access_size = 64; + int stride_size = 64; + int delay = 64; + int count = 32; + uint64_t *cindex; + uint64_t csize; + int ret; + + for (i = 0; i < 100000; i++) { + char *buf = static_cast(malloc(4096 * 1024)); + buf = buf + 64 - (((long)buf) % 64); + // Separate RaW job + RAW_BEFORE_WRITE + stride_storeclwb(buf, access_size, stride_size, delay, count); + asm volatile("mfence \n" :::); + RAW_BEFORE_READ + stride_nt(buf, access_size, stride_size, delay, count); + asm volatile("mfence \n" :::); + RAW_FINAL("raw-separate") + + aggregated += diff; + aggregated2 += c_ntload_end - c_store_start; + } + + printf("Separate RaW job %lld 
%lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count); + return 0; +} diff --git a/microbench/ptr-chasing.cpp b/microbench/ptr-chasing.cpp new file mode 100644 index 0000000..ba5d7c2 --- /dev/null +++ b/microbench/ptr-chasing.cpp @@ -0,0 +1,46 @@ +#include "uarch.h" + +int main() { + int i; + long long aggregated = 0, aggregated2 = 0; + long seed = 0xdeadbeef1245678; + uint64_t a = 0xfc0; + int access_size = 64; + int stride_size = 64; + int delay = 64; + int count = 32; + uint64_t *cindex; + uint64_t csize; + int ret; + + aggregated = 0; + aggregated2 = 0; + for (i = 0; i < 100000; i++) { + char *buf = static_cast(malloc(4096 * 1024)); + buf = buf + 64 - (((long)buf) % 64); + char *virt_addr = static_cast(malloc(4096 * 1024)); + virt_addr = virt_addr + 64 - (((long)virt_addr) % 64); + // Pointer chasing RaW job + // No need to fill report fs page table, init_chasing_index will do that + csize = access_size / CACHELINE_SIZE; + cindex = (uint64_t *)(virt_addr); + ret = init_chasing_index(cindex, csize); + + RAW_BEFORE_WRITE + chasing_storeclwb(buf, access_size, stride_size, count, cindex); + asm volatile("mfence \n" :::); + RAW_BEFORE_READ + chasing_loadnt(buf, access_size, stride_size, count, cindex); + asm volatile("mfence \n" :::); + chasing_storeclwb(buf, access_size, stride_size, count, cindex); + asm volatile("mfence \n" :::); + chasing_loadnt(buf, access_size, stride_size, count, cindex); + asm volatile("mfence \n" :::); + RAW_FINAL("raw-chasing") + + aggregated += diff; + aggregated2 += c_ntload_end - c_store_start; + } + printf("pointer chasing 4 hop %lld %lld\n", aggregated / 100000 / count, aggregated2 / 100000 / count); + return 0; +} diff --git a/microbench/many_sbrk.c b/microbench/sbrk.c similarity index 95% rename from microbench/many_sbrk.c rename to microbench/sbrk.c index 1da226e..e1b53dd 100644 --- a/microbench/many_sbrk.c +++ b/microbench/sbrk.c @@ -136,9 +136,9 @@ void my_free(void *p) { if (ptr->succ != 0) ptr->succ->prev = pred; // end added - printf("BKR freeing block %#x merging with predecessor new size is %d.\n", p, pred->size); + printf("BKR freeing block %#p merging with predecessor new size is %d.\n", p, pred->size); } else { - printf("BKR freeing block %#x.\n", p); + printf("BKR freeing block %#p.\n", p); arr[i++] = ptr; ptr->isfree = 1; pred = ptr; @@ -153,7 +153,7 @@ void my_free(void *p) { succ->succ->prev = pred; // end added arr[i++] = ptr; - printf("BKR freeing block %#x merging with successor new size is %d.\n", p, pred->size); + printf("BKR freeing block %#p merging with successor new size is %d.\n", p, pred->size); } } @@ -161,7 +161,7 @@ int main(int argc, const char *const *argv) { size_t mbcount = 100; - printf("allocating %d MB\n", mbcount); + printf("allocating %ld MB\n", mbcount); uint8_t *p; p = (uint8_t *)my_malloc(mbcount * 1024ULL * 1024ULL); diff --git a/microbench/st.cpp b/microbench/st.cpp new file mode 100644 index 0000000..e798086 --- /dev/null +++ b/microbench/st.cpp @@ -0,0 +1,119 @@ +/* + * Microbench testies for MLP and memory latency in CXLMS + * + * By: Andrew Quinn + * Yiwei Yang + * + * Copyright 2023 Regents of the Univeristy of California + * UC Santa Cruz Sluglab. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#define MOVE_SIZE 128 +#define MAP_SIZE (long)(1024 * 1024 * 1024) +#define CACHELINE_SIZE 64 + +#ifndef FENCE_COUNT +#define FENCE_COUNT 8 +#endif + +#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE) + +// we need to jump in MOVE_SIZE increments otherwise segfault! + +#define BODY(start) \ + "xor %%r8, %%r8 \n" \ + "pxor %%xmm1, %%xmm1 \n" \ + "LOOP_START%=: \n" \ + "lea (%[" #start "], %%r8), %%r9 \n" \ + "movdqa %%xmm1, (%%r9) \n" \ + "add $" STR(MOVE_SIZE) ", %%r8 \n" \ + "cmp $" STR(FENCE_BOUND) ",%%r8\n" \ + "jl LOOP_START%= \n" \ + "mfence \n" + + +int main(int argc, char **argv) { + + // in principle, you would want to clear out cache lines (and the + // pipeline) before doing any of the inline assembly stuff. But, + // that's hard. And, its probably noise when you execute over + // enough things. + + + // allocate some meomery + char *base =(char *) mmap(nullptr, + MAP_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + -1, + 0); + + if (base == MAP_FAILED) { + fprintf(stderr, "oops, you suck %d\n", errno); + return -1; + } + char *addr = NULL; + + intptr_t *iaddr = (intptr_t*) base; + intptr_t hash = 0; + struct timespec tstart = {0,0}, tend = {0,0}; + + // Necessary so that we don't include allocation costs in our benchmark + while (iaddr < (intptr_t *)(base + MAP_SIZE)) { + hash = hash ^ (intptr_t) iaddr; + *iaddr = hash; + iaddr++; + } + + // should flush everything from the cache. But, how big is the cache? + addr = base; + while (addr < (base + MAP_SIZE)) { + asm volatile( + "mov %[buf], %%rsi\n" + "clflush (%%rsi)\n" + : + : [buf] "r" (addr) + : "rsi"); + addr += CACHELINE_SIZE; + } + + asm volatile ("mfence\n" :::); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + addr = base; + while (addr < (base + MAP_SIZE)) { + //fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE); + asm volatile( + BODY(addr) + : + : [addr] "r" (addr) + : "r8", "r9", "xmm0"); + + addr += (FENCE_COUNT * MOVE_SIZE); + } + clock_gettime(CLOCK_MONOTONIC, &tend); + uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec); + nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec); + + + printf("%lu\n", nanos); + return 0; +} + diff --git a/microbench/uarch.h b/microbench/uarch.h new file mode 100644 index 0000000..6b45b7d --- /dev/null +++ b/microbench/uarch.h @@ -0,0 +1,1036 @@ +#include +#include +#include +#include +#include + +uint32_t *lfs_random_array; +#define KERNEL_BEGIN \ + do { \ + } while (0); +#define KERNEL_END \ + do { \ + } while (0); +#define CACHELINE_SIZE 64 + +#define SIZEBTNT_64_AVX512 \ + "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ + "add $0x40, %%r10 \n" + +#define SIZEBTNT_128_AVX512 \ + "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ + "add $0x80, %%r10 \n" + +#define SIZEBTNT_256_AVX512 \ + "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x80(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "add $0x100, %%r10 \n" + +#define SIZEBTNT_512_AVX512 \ + "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x80(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x100(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x140(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x180(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x1c0(%%r9, %%r10) 
\n" \ + "add $0x200, %%r10 \n" + +#define SIZEBTNT_1024_AVX512 \ + "vmovntdq %%zmm0, 0x0(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x40(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x80(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x100(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x140(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x180(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x1c0(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x200(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x240(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x280(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x2c0(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x300(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x340(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x380(%%r9, %%r10) \n" \ + "vmovntdq %%zmm0, 0x3c0(%%r9, %%r10) \n" \ + "add $0x400, %%r10 \n" + +#define SIZEBTSTFLUSH_64_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "clwb 0x0(%%r9, %%r10) \n" \ + "add $0x40, %%r10 \n" + +#define SIZEBTSTFLUSH_128_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "clwb 0x0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ + "clwb 0x40(%%r9, %%r10) \n" \ + "add $0x80, %%r10 \n" + +#define SIZEBTSTFLUSH_256_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "clwb 0x0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ + "clwb 0x40(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ + "clwb 0x80(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "clwb 0xc0(%%r9, %%r10) \n" \ + "add $0x100, %%r10 \n" + +#define SIZEBTSTFLUSH_512_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "clwb 0x0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ + "clwb 0x40(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ + "clwb 0x80(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "clwb 0xc0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x100(%%r9, %%r10) \n" \ + "clwb 0x100(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x140(%%r9, %%r10) \n" \ + "clwb 0x140(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x180(%%r9, %%r10) \n" \ + "clwb 0x180(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x1c0(%%r9, %%r10) \n" \ + "clwb 0x1c0(%%r9, %%r10) \n" \ + "add $0x200, %%r10 \n" + +#define SIZEBTSTFLUSH_1024_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "clwb 0x0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ + "clwb 0x40(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ + "clwb 0x80(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "clwb 0xc0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x100(%%r9, %%r10) \n" \ + "clwb 0x100(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x140(%%r9, %%r10) \n" \ + "clwb 0x140(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x180(%%r9, %%r10) \n" \ + "clwb 0x180(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x1c0(%%r9, %%r10) \n" \ + "clwb 0x1c0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x200(%%r9, %%r10) \n" \ + "clwb 0x200(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x240(%%r9, %%r10) \n" \ + "clwb 0x240(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x280(%%r9, %%r10) \n" \ + "clwb 0x280(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x2c0(%%r9, %%r10) \n" \ + "clwb 0x2c0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x300(%%r9, %%r10) \n" \ + "clwb 0x300(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x340(%%r9, %%r10) \n" \ + "clwb 0x340(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x380(%%r9, %%r10) \n" \ + "clwb 0x380(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x3c0(%%r9, %%r10) \n" \ + "clwb 0x3c0(%%r9, %%r10) \n" \ + "add $0x400, %%r10 \n" + +#define 
SIZEBTST_64_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "add $0x40, %%r10 \n" + +#define SIZEBTST_128_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ + "add $0x80, %%r10 \n" + +#define SIZEBTST_256_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "add $0x100, %%r10 \n" + +#define SIZEBTST_512_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x100(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x140(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x180(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x1c0(%%r9, %%r10) \n" \ + "add $0x200, %%r10 \n" + +#define SIZEBTST_1024_AVX512 \ + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x40(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x80(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0xc0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x100(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x140(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x180(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x1c0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x200(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x240(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x280(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x2c0(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x300(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x340(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x380(%%r9, %%r10) \n" \ + "vmovdqa64 %%zmm0, 0x3c0(%%r9, %%r10) \n" \ + "add $0x400, %%r10 \n" + +#define SIZEBTLD_64_AVX512 \ + "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ + "add $0x40, %%r10 \n" + +#define SIZEBTLD_128_AVX512 \ + "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ + "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ + "add $0x80, %%r10 \n" + +#define SIZEBTLD_256_AVX512 \ + "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ + "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ + "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ + "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ + "add $0x100, %%r10 \n" + +#define SIZEBTLD_512_AVX512 \ + "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ + "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ + "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ + "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ + "vmovntdqa 0x100(%%r9, %%r10), %%zmm4 \n" \ + "vmovntdqa 0x140(%%r9, %%r10), %%zmm5 \n" \ + "vmovntdqa 0x180(%%r9, %%r10), %%zmm6 \n" \ + "vmovntdqa 0x1c0(%%r9, %%r10), %%zmm7 \n" \ + "add $0x200, %%r10 \n" + +#define SIZEBTLD_1024_AVX512 \ + "vmovntdqa 0x0(%%r9, %%r10), %%zmm0 \n" \ + "vmovntdqa 0x40(%%r9, %%r10), %%zmm1 \n" \ + "vmovntdqa 0x80(%%r9, %%r10), %%zmm2 \n" \ + "vmovntdqa 0xc0(%%r9, %%r10), %%zmm3 \n" \ + "vmovntdqa 0x100(%%r9, %%r10), %%zmm4 \n" \ + "vmovntdqa 0x140(%%r9, %%r10), %%zmm5 \n" \ + "vmovntdqa 0x180(%%r9, %%r10), %%zmm6 \n" \ + "vmovntdqa 0x1c0(%%r9, %%r10), %%zmm7 \n" \ + "vmovntdqa 0x200(%%r9, %%r10), %%zmm8 \n" \ + "vmovntdqa 0x240(%%r9, %%r10), %%zmm9 \n" \ + "vmovntdqa 0x280(%%r9, %%r10), %%zmm10 \n" \ + "vmovntdqa 0x2c0(%%r9, %%r10), %%zmm11 \n" \ + "vmovntdqa 0x300(%%r9, %%r10), %%zmm12 \n" \ + "vmovntdqa 0x340(%%r9, %%r10), %%zmm13 \n" \ + "vmovntdqa 0x380(%%r9, %%r10), %%zmm14 \n" \ + "vmovntdqa 0x3c0(%%r9, %%r10), %%zmm15 \n" \ + "add $0x400, %%r10 \n" + +#define SIZEBT_NT_64 \ + "movnti %[random], 0x0(%%r9, %%r10) \n" \ + "movnti %[random], 0x8(%%r9, %%r10) \n" \ + "movnti %[random], 0x10(%%r9, %%r10) \n" \ + "movnti 
%[random], 0x18(%%r9, %%r10) \n" \ + "movnti %[random], 0x20(%%r9, %%r10) \n" \ + "movnti %[random], 0x28(%%r9, %%r10) \n" \ + "movnti %[random], 0x30(%%r9, %%r10) \n" \ + "movnti %[random], 0x38(%%r9, %%r10) \n" \ + "add $0x40, %%r10 \n" + +#define SIZEBT_LOAD_64 \ + "mov 0x0(%%r9, %%r10), %%r13 \n" \ + "mov 0x8(%%r9, %%r10), %%r13 \n" \ + "mov 0x10(%%r9, %%r10), %%r13 \n" \ + "mov 0x18(%%r9, %%r10), %%r13 \n" \ + "mov 0x20(%%r9, %%r10), %%r13 \n" \ + "mov 0x28(%%r9, %%r10), %%r13 \n" \ + "mov 0x30(%%r9, %%r10), %%r13 \n" \ + "mov 0x38(%%r9, %%r10), %%r13 \n" + +/* Arbitrary sizes w/o clearing pipeline */ + +#define SIZEBTNT_MACRO SIZEBTNT_512_AVX512 +#define SIZEBTST_MACRO SIZEBTST_512_AVX512 +#define SIZEBTLD_MACRO SIZEBT_LOAD_64 +#define SIZEBTSTFLUSH_MACRO SIZEBTSTFLUSH_512_AVX512 + +// #define SIZEBTST_FENCE "mfence \n" +// #define SIZEBTLD_FENCE "mfence \n" +#define SIZEBTST_FENCE "" +#define SIZEBTLD_FENCE "" + +#define CACHEFENCE_FENCE "sfence \n" +// #define CACHEFENCE_FENCE "mfence \n" + +#define RandLFSR64_NEW(rand, accessmask, addr) \ + "mov (%[" #rand "]), %%r9 \n" \ + "mov %%r9, %%r12 \n" \ + "shr %%r9 \n" \ + "and $0x1, %%r12d \n" \ + "neg %%r12 \n" \ + "and %%rcx, %%r12 \n" \ + "xor %%r9, %%r12 \n" \ + "mov %%r12, (%[" #rand "]) \n" \ + "mov %%r12, %%r8 \n" \ + "and %[" #accessmask "], %%r8 \n" \ + "lea (%[" #addr "], %%r8), %%r9 \n" + +#define RandLFSR64 \ + "mov (%[random]), %%r9 \n" \ + "mov %%r9, %%r12 \n" \ + "shr %%r9 \n" \ + "and $0x1, %%r12d \n" \ + "neg %%r12 \n" \ + "and %%rcx, %%r12 \n" \ + "xor %%r9, %%r12 \n" \ + "mov %%r12, (%[random]) \n" \ + "mov %%r12, %%r8 \n" \ + "and %[accessmask], %%r8 \n" + + +void sizebw_load(char *start_addr, long size, long count, long *rand_seed, long access_mask) { + KERNEL_BEGIN + asm volatile("movabs $0xd800000000000000, %%rcx \n" /* rcx: bitmask used in LFSR */ + "xor %%r8, %%r8 \n" /* r8: access offset */ + "xor %%r11, %%r11 \n" /* r11: access counter */ + // 1 + "LOOP_FRNG_SIZEBWL_RLOOP: \n" /* outer (counter) loop */ + RandLFSR64 /* LFSR: uses r9, r12 (reusable), rcx (above), fill r8 */ + "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" /* r10: accessed size */ + "LOOP_FRNG_SIZEBWL_ONE1: \n" /* inner (access) loop, unroll 8 times */ + SIZEBTLD_MACRO /* Access: uses r8[rand_base], r10[size_accessed], r9 */ + "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWL_ONE1 \n" SIZEBTLD_FENCE + + // 2 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWL_ONE2: \n" SIZEBTLD_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWL_ONE2 \n" SIZEBTLD_FENCE + // 3 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWL_ONE3: \n" SIZEBTLD_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWL_ONE3 \n" SIZEBTLD_FENCE + // 4 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWL_ONE4: \n" SIZEBTLD_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWL_ONE4 \n" SIZEBTLD_FENCE + + "add $4, %%r11 \n" + "cmp %[count], %%r11\n" + "jl LOOP_FRNG_SIZEBWL_RLOOP \n" + + : [random] "=r"(rand_seed) + : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count), + "0"(rand_seed), [accessmask] "r"(access_mask) + : "%rcx", "%r12", "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void sizebw_load_new(char *start_addr, long count, long *rand_seed, uint64_t access_mask) { + KERNEL_BEGIN + asm volatile("movabs $0xd800000000000000, %%rcx \n" /* rcx: bitmask used in LFSR */ + "xor %%r8, %%r8 \n" /* r8: 
access offset */ + "xor %%r11, %%r11 \n" /* r11: access counter */ + // 1 + "LD_LOOP_NEW: \n" + RandLFSR64 /* LFSR: uses r9, r12 (reusable), rcx (above), fill r8 */ + "lea (%[start_addr], %%r8), %%r9 \n" + "mov 0x0(%%r9), %%r13 \n" + "add $1, %%r11 \n" + "cmp %[count], %%r11\n" + "jl LD_LOOP_NEW \n" + : [random] "=r"(rand_seed) + : [start_addr] "r"(start_addr), [count] "r"(count), + "0"(rand_seed), [accessmask] "r"(access_mask) + : "%rcx", "%r13", "%r12", "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + + +#define OPERATION + + +#define RANDOM_OPER(rand, mask, buf) \ + RandLFSR64_NEW(rand, mask, buf) \ + OPERATION + + +#define UNROLL_4(rand, mask, buf) \ + RANDOM_OPER(rand, mask, buf) \ + RANDOM_OPER(rand, mask, buf) \ + RANDOM_OPER(rand, mask, buf) \ + RANDOM_OPER(rand, mask, buf) + +#define UNROLL_16(rand, mask, buf) \ + UNROLL_4(rand, mask, buf) \ + UNROLL_4(rand, mask, buf) \ + UNROLL_4(rand, mask, buf) \ + UNROLL_4(rand, mask, buf) + + +#define LOAD_NEW(start_addr, rand_seed, access_mask) \ + do { \ + /*r8: rand number, r9: computed addr, r13: dest, r12: temp in lfsr, */ \ + /*rcx: bitmask for lfsr */ \ + asm volatile("movabs $0xd800000000000000, %%rcx \n" /* bitmask for LFSR */ \ + "xor %%r8, %%r8 \n" /* r8: access offset */ \ + UNROLL_16(random, accessmask, buf) \ + : [random] "=r"(rand_seed) \ + : [buf] "r"(start_addr), "0"(rand_seed), [accessmask] "r"(access_mask) \ + : "%rcx", "%r13", "%r12", "%r9", "%r8"); \ + } while(0); + + +void sizebw_nt(char *start_addr, long size, long count, long *rand_seed, long access_mask) { + KERNEL_BEGIN + asm volatile("movabs $0xd800000000000000, %%rcx \n" + "xor %%r11, %%r11 \n" + "movq %[start_addr], %%xmm0 \n" /* zmm0: read/write register */ + // 1 + "LOOP_FRNG_SIZEBWNT_RLOOP: \n" RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWNT_ONE1: \n" SIZEBTNT_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWNT_ONE1 \n" SIZEBTST_FENCE + + // 2 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWNT_ONE2: \n" SIZEBTNT_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWNT_ONE2 \n" SIZEBTST_FENCE + // 3 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWNT_ONE3: \n" SIZEBTNT_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWNT_ONE3 \n" SIZEBTST_FENCE + // 4 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWNT_ONE4: \n" SIZEBTNT_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWNT_ONE4 \n" SIZEBTST_FENCE + + "add $4, %%r11 \n" + "cmp %[count], %%r11\n" + "jl LOOP_FRNG_SIZEBWNT_RLOOP \n" + + : [random] "=r"(rand_seed) + : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count), + "0"(rand_seed), [accessmask] "r"(access_mask) + : "%rcx", "%r12", "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void sizebw_store(char *start_addr, long size, long count, long *rand_seed, long access_mask) { + KERNEL_BEGIN + asm volatile("movabs $0xd800000000000000, %%rcx \n" + "xor %%r11, %%r11 \n" + "movq %[start_addr], %%xmm0 \n" /* zmm0: read/write register */ + // 1 + "LOOP_FRNG_SIZEBWST_RLOOP: \n" RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWST_ONE1: \n" SIZEBTST_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWST_ONE1 \n" SIZEBTST_FENCE + + // 2 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWST_ONE2: \n" SIZEBTST_MACRO "cmp %[accesssize], %%r10 \n" + "jl 
LOOP_FRNG_SIZEBWST_ONE2 \n" SIZEBTST_FENCE + // 3 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWST_ONE3: \n" SIZEBTST_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWST_ONE3 \n" SIZEBTST_FENCE + // 4 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWST_ONE4: \n" SIZEBTST_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWST_ONE4 \n" SIZEBTST_FENCE + + "add $4, %%r11 \n" + "cmp %[count], %%r11\n" + "jl LOOP_FRNG_SIZEBWST_RLOOP \n" + + : [random] "=r"(rand_seed) + : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count), + "0"(rand_seed), [accessmask] "r"(access_mask) + : "%rcx", "%r12", "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void sizebw_storeclwb(char *start_addr, long size, long count, long *rand_seed, long access_mask) { + KERNEL_BEGIN + asm volatile("movabs $0xd800000000000000, %%rcx \n" + "xor %%r11, %%r11 \n" + "movq %[start_addr], %%xmm0 \n" /* zmm0: read/write register */ + // 1 + "LOOP_FRNG_SIZEBWSTFLUSH_RLOOP: \n" RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWSTFLUSH_ONE1: \n" SIZEBTSTFLUSH_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWSTFLUSH_ONE1 \n" SIZEBTST_FENCE + + // 2 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWSTFLUSH_ONE2: \n" SIZEBTSTFLUSH_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWSTFLUSH_ONE2 \n" SIZEBTST_FENCE + // 3 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWSTFLUSH_ONE3: \n" SIZEBTSTFLUSH_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWSTFLUSH_ONE3 \n" SIZEBTST_FENCE + // 4 + RandLFSR64 "lea (%[start_addr], %%r8), %%r9 \n" + "xor %%r10, %%r10 \n" + "LOOP_FRNG_SIZEBWSTFLUSH_ONE4: \n" SIZEBTSTFLUSH_MACRO "cmp %[accesssize], %%r10 \n" + "jl LOOP_FRNG_SIZEBWSTFLUSH_ONE4 \n" SIZEBTST_FENCE + + "add $4, %%r11 \n" + "cmp %[count], %%r11\n" + "jl LOOP_FRNG_SIZEBWSTFLUSH_RLOOP \n" + + : [random] "=r"(rand_seed) + : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count), + "0"(rand_seed), [accessmask] "r"(access_mask) + : "%rcx", "%r12", "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void stride_load(char *start_addr, long size, long skip, long delay, long count) { + KERNEL_BEGIN + asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */ + "xor %%r11, %%r11 \n" /* r11: counter */ + + // 1 + "LOOP_STRIDELOAD_OUTER: \n" /* outer (counter) loop */ + "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */ + "xor %%r10, %%r10 \n" /* r10: accessed size */ + "LOOP_STRIDELOAD_INNER: \n" /* inner (access) loop, unroll 8 times */ + SIZEBTLD_64_AVX512 /* Access: uses r10[size_accessed], r9 */ + "cmp %[accesssize], %%r10 \n" + "jl LOOP_STRIDELOAD_INNER \n" SIZEBTLD_FENCE + + "xor %%r10, %%r10 \n" + "LOOP_STRIDELOAD_DELAY: \n" /* delay cycles */ + "inc %%r10 \n" + "cmp %[delay], %%r10 \n" + "jl LOOP_STRIDELOAD_DELAY \n" + + "add %[skip], %%r8 \n" + "inc %%r11 \n" + "cmp %[count], %%r11 \n" + + "jl LOOP_STRIDELOAD_OUTER \n" + + ::[start_addr] "r"(start_addr), + [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay) + : "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void stride_nt(char *start_addr, long size, long skip, long delay, long count) { + KERNEL_BEGIN + asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */ + "xor %%r11, %%r11 \n" /* r11: counter */ + "movq %[start_addr], %%xmm0 \n" /* zmm0: read/write register */ + 
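The stride_load, stride_nt, stride_store, and stride_storeclwb kernels in this header all share one control structure: an outer loop over count blocks, an inner loop that touches size bytes of the current block in 64-byte steps, a software delay loop of delay iterations, and a jump of skip bytes to the next block. A plain-C paraphrase of stride_load (a sketch of the register-level loop, not code from the patch; it drops the AVX-512 non-temporal load in favor of a scalar load) looks like:

// Sketch only: mirrors the r8/r9/r10/r11 roles in the stride_load assembly.
void stride_load_sketch(char *start_addr, long size, long skip, long delay, long count) {
    long offset = 0;                                           // r8: block offset
    for (long n = 0; n < count; n++) {                         // r11: outer counter
        char *block = start_addr + offset;                     // r9: access location
        volatile char sink;
        for (long touched = 0; touched < size; touched += 64)  // r10: bytes accessed so far
            sink = block[touched];                             // one load per cache line
        for (volatile long d = 0; d < delay; d++)              // busy-wait between blocks
            ;
        offset += skip;                                        // advance by the stride
    }
}

The store variants replace the load with a 64-byte store (optionally followed by clwb), which is why all four kernels take the same argument list.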
// 1 + "LOOP_STRIDENT_OUTER: \n" /* outer (counter) loop */ + "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */ + "xor %%r10, %%r10 \n" /* r10: accessed size */ + "LOOP_STRIDENT_INNER: \n" /* inner (access) loop, unroll 8 times */ + SIZEBTNT_64_AVX512 /* Access: uses r10[size_accessed], r9 */ + "cmp %[accesssize], %%r10 \n" + "jl LOOP_STRIDENT_INNER \n" SIZEBTLD_FENCE + + "xor %%r10, %%r10 \n" + "LOOP_STRIDENT_DELAY: \n" /* delay cycles */ + "inc %%r10 \n" + "cmp %[delay], %%r10 \n" + "jl LOOP_STRIDENT_DELAY \n" + + "add %[skip], %%r8 \n" + "inc %%r11 \n" + "cmp %[count], %%r11 \n" + + "jl LOOP_STRIDENT_OUTER \n" + + ::[start_addr] "r"(start_addr), + [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay) + : "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void stride_store(char *start_addr, long size, long skip, long delay, long count) { + KERNEL_BEGIN + asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */ + "xor %%r11, %%r11 \n" /* r11: counter */ + "movq %[start_addr], %%xmm0 \n" /* zmm0: read/write register */ + // 1 + "LOOP_STRIDEST_OUTER: \n" /* outer (counter) loop */ + "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */ + "xor %%r10, %%r10 \n" /* r10: accessed size */ + "LOOP_STRIDEST_INNER: \n" /* inner (access) loop, unroll 8 times */ + SIZEBTST_64_AVX512 /* Access: uses r10[size_accessed], r9 */ + "cmp %[accesssize], %%r10 \n" + "jl LOOP_STRIDEST_INNER \n" SIZEBTST_FENCE + + "xor %%r10, %%r10 \n" + "LOOP_STRIDEST_DELAY: \n" /* delay cycles */ + "inc %%r10 \n" + "cmp %[delay], %%r10 \n" + "jl LOOP_STRIDEST_DELAY \n" + + "add %[skip], %%r8 \n" + "inc %%r11 \n" + "cmp %[count], %%r11 \n" + + "jl LOOP_STRIDEST_OUTER \n" + + ::[start_addr] "r"(start_addr), + [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay) + : "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void stride_storeclwb(char *start_addr, long size, long skip, long delay, long count) { + KERNEL_BEGIN + asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */ + "xor %%r11, %%r11 \n" /* r11: counter */ + "movq %[start_addr], %%xmm0 \n" /* zmm0: read/write register */ + // 1 + "LOOP_STRIDESTFLUSH_OUTER: \n" /* outer (counter) loop */ + "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */ + "xor %%r10, %%r10 \n" /* r10: accessed size */ + "LOOP_STRIDESTFLUSH_INNER: \n" /* inner (access) loop, unroll 8 times */ + SIZEBTSTFLUSH_64_AVX512 /* Access: uses r10[size_accessed], r9 */ + "cmp %[accesssize], %%r10 \n" + "jl LOOP_STRIDESTFLUSH_INNER \n" SIZEBTST_FENCE + + "xor %%r10, %%r10 \n" + "LOOP_STRIDESTFLUSH_DELAY: \n" /* delay cycles */ + "inc %%r10 \n" + "cmp %[delay], %%r10 \n" + "jl LOOP_STRIDESTFLUSH_DELAY \n" + + "add %[skip], %%r8 \n" + "inc %%r11 \n" + "cmp %[count], %%r11 \n" + + "jl LOOP_STRIDESTFLUSH_OUTER \n" + + ::[start_addr] "r"(start_addr), + [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay) + : "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +#define RDRAND_MAX_RETRY 32 + +/* + * Generate random number to [rd] within [range], return 0 if success, 1 if fail. 
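The helper documented by this comment, get_rand (defined a few lines below), retries the RDRAND instruction up to RDRAND_MAX_RETRY times and reduces the result modulo range. The same behaviour can be expressed with the compiler intrinsic rather than inline assembly; this is an alternative sketch, not the patch's code, and it requires -mrdrnd:

#include <cstdint>
#include <immintrin.h>

// Sketch: returns 0 on success and 1 if RDRAND kept failing, like get_rand below.
static int get_rand_sketch(uint64_t *rd, uint64_t range) {
    for (int i = 0; i < 32; i++) {            // 32 mirrors RDRAND_MAX_RETRY
        unsigned long long value;
        if (_rdrand64_step(&value)) {         // intrinsic form of "rdrand; setc"
            *rd = value % range;
            return 0;
        }
    }
    return 1;
}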
+ */ +void stride_read_after_write(char *start_addr, long size, long skip, long delay, long count) { + KERNEL_BEGIN + asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */ + "xor %%r11, %%r11 \n" /* r11: counter */ + "movq %[start_addr], %%xmm0 \n" /* zmm0: read/write register */ + + "LOOP_RAW_OUTER: \n" /* outer (counter) loop */ + "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */ + "xor %%r10, %%r10 \n" /* r10: accessed size */ + "LOOP_RAW_STRIDESTCLWB_INNER: \n" /* inner (access) loop, unroll 8 times */ + SIZEBTSTFLUSH_64_AVX512 /* Access: uses r10[size_accessed], r9 */ + "cmp %[accesssize], %%r10 \n" + "jl LOOP_RAW_STRIDESTCLWB_INNER \n" + "mfence \n" + + "xor %%r10, %%r10 \n" + "LOOP_RAW_STRIDELDNT_INNER: \n" SIZEBTNT_64_AVX512 "cmp %[accesssize], %%r10 \n" + "jl LOOP_RAW_STRIDELDNT_INNER \n" + "mfence \n" + + "xor %%r10, %%r10 \n" + "LOOP_RAW_DELAY: \n" /* delay cycles */ + "inc %%r10 \n" + "cmp %[delay], %%r10 \n" + "jl LOOP_RAW_DELAY \n" + + "add %[skip], %%r8 \n" + "inc %%r11 \n" + "cmp %[count], %%r11 \n" + + "jl LOOP_RAW_OUTER \n" + + ::[start_addr] "r"(start_addr), + [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), [delay] "r"(delay) + : "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +static inline int get_rand(uint64_t *rd, uint64_t range) { + uint8_t ok; + int i = 0; + for (i = 0; i < RDRAND_MAX_RETRY; i++) { + asm volatile("rdrand %0; setc %1\n\t" : "=r"(*rd), "=qm"(ok)); + + if (ok) { + *rd = *rd % range; + return 0; + } + } + + return 1; +} + +int init_chasing_index(uint64_t *cindex, uint64_t csize) { + uint64_t curr_pos = 0; + uint64_t next_pos = 0; + uint64_t i = 0; + int ret = 0; + + memset(cindex, 0, sizeof(uint64_t) * csize); + + for (i = 0; i < csize - 1; i++) { + do { + ret = get_rand(&next_pos, csize); + if (ret != 0) + return 1; + } while ((cindex[next_pos] != 0) || (next_pos == curr_pos)); + + cindex[curr_pos] = next_pos; + curr_pos = next_pos; + } + + return 0; +} + +void chasing_storeclwb(char *start_addr, long size, long skip, long count, uint64_t *cindex) { + KERNEL_BEGIN + asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */ + "xor %%r11, %%r11 \n" /* r11: counter */ + "LOOP_CHASING_STCLWB_OUTER: \n" /* outer (counter) loop */ + "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */ + "xor %%r10, %%r10 \n" /* r10: accessed size */ + "xor %%r12, %%r12 \n" /* r12: chasing index addr */ + "LOOP_CHASING_STCLWB_INNER: \n" + "movq (%[cindex], %%r12, 8), %%xmm0\n" + "shl $0x06, %%r12\n" + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r12) \n" + "clwb 0x0(%%r9, %%r12) \n" + "add $0x40, %%r10\n" + "movq %%xmm0, %%r12\n" /* Update to next chasing element */ + + "cmp %[accesssize], %%r10 \n" + "jl LOOP_CHASING_STCLWB_INNER \n" SIZEBTST_FENCE + + "xor %%r10, %%r10 \n" + + "add %[skip], %%r8 \n" + "inc %%r11 \n" + "cmp %[count], %%r11 \n" + + "jl LOOP_CHASING_STCLWB_OUTER \n" + + : + : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), + [cindex] "r"(cindex) + : "%r12", "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void chasing_loadnt(char *start_addr, long size, long skip, long count, uint64_t *cindex) { + KERNEL_BEGIN + asm volatile("xor %%r8, %%r8 \n" /* r8: access offset */ + "xor %%r11, %%r11 \n" /* r11: counter */ + "LOOP_CHASING_STRIDENT_OUTER: \n" /* outer (counter) loop */ + "lea (%[start_addr], %%r8), %%r9 \n" /* r9: access loc */ + "xor %%r10, %%r10 \n" /* r10: accessed size */ + "xor %%r12, %%r12 \n" /* r12: chasing index addr */ + "LOOP_CHASING_STRIDENT_INNER: \n" + "shl $0x06, %%r12\n" + 
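init_chasing_index above builds a random cyclic permutation over csize cache-line slots, and chasing_storeclwb / chasing_loadnt then walk it: the store pass plants each slot's successor inside the cache line itself, and the load pass reads the next index out of the line it just loaded, so every address depends on the previous access and the hardware prefetcher gets no usable pattern. A plain-C reading of the two assembly loops (a sketch, not the patch's code; it ignores the AVX-512/clwb details) is:

// Sketch: store pass embeds the successor in each line, load pass chases those pointers.
uint64_t chase_sketch(char *buf, const uint64_t *cindex, uint64_t csize) {
    uint64_t cur = 0;
    for (uint64_t i = 0; i < csize; i++) {       // chasing_storeclwb
        uint64_t next = cindex[cur];
        *(uint64_t *)(buf + cur * 64) = next;    // write the successor into line `cur`
        cur = next;
    }
    uint64_t sum = 0;
    cur = 0;
    for (uint64_t i = 0; i < csize; i++) {       // chasing_loadnt
        cur = *(uint64_t *)(buf + cur * 64);     // next address depends on this load
        sum += cur;
    }
    return sum;
}

Because each load's address comes from the previous load, the measured time approximates a serialized chain of memory accesses, which is what the "2 hop" and "4 hop" pointer-chasing benchmarks report.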
"vmovntdqa 0x0(%%r9, %%r12), %%zmm0\n" + "movq %%xmm0, %%r12\n" /* Update to next chasing element */ + "add $0x40, %%r10 \n" + + "cmp %[accesssize], %%r10 \n" + "jl LOOP_CHASING_STRIDENT_INNER \n" SIZEBTLD_FENCE + + //"mfence \n" /* !!!! */ + "add %[skip], %%r8 \n" + "inc %%r11 \n" + "cmp %[count], %%r11 \n" + + "jl LOOP_CHASING_STRIDENT_OUTER \n" + + : + : [start_addr] "r"(start_addr), [accesssize] "r"(size), [count] "r"(count), [skip] "r"(skip), + [cindex] "r"(cindex) + : "%r11", "%r10", "%r9", "%r8"); + KERNEL_END +} + +void cachefence(char *start_addr, long size, long cache, long fence) { + KERNEL_BEGIN + asm volatile("movq %[start_addr], %%xmm0 \n" + "xor %%r9, %%r9 \n" /* r9: offset of write */ + "CACHEFENCE_FENCEBEGIN: \n" + "xor %%r11, %%r11 \n" /* r11: fence counter */ + "CACHEFENCE_FLUSHBEGIN: \n" + "xor %%r10, %%r10 \n" /* r10: clwb counter */ + // "movq %%r9, %%rdx \n" /* rdx: flush start offset */ + "leaq (%[start_addr], %%r9), %%rdx \n" + "CACHEFENCE_WRITEONE: \n" + "vmovdqa64 %%zmm0, 0x0(%[start_addr], %%r9) \n" /* Write one addr */ + "add $0x40, %%r9 \n" + "add $0x40, %%r10 \n" + "add $0x40, %%r11 \n" + "cmp %[cache], %%r10 \n" /* check clwb */ + "jl CACHEFENCE_WRITEONE \n" + + "leaq (%[start_addr], %%r9), %%rcx \n" /* rcx: flush end offset, rdx->rcx */ + // "add %[start_addr], %%rcx" + "CACHEFENCE_FLUSHONE: \n" + "clwb (%%rdx) \n" /* Flush from rdx to rcx */ + "add $0x40, %%rdx \n" + "cmp %%rcx, %%rdx \n" + "jl CACHEFENCE_FLUSHONE \n" + + "cmp %[fence], %%r11 \n" + "jl CACHEFENCE_FLUSHBEGIN \n" CACHEFENCE_FENCE + + "cmp %[accesssize], %%r9 \n" + "jl CACHEFENCE_FENCEBEGIN \n" + + ::[start_addr] "r"(start_addr), + [accesssize] "r"(size), [cache] "r"(cache), [fence] "r"(fence) + : "%rdx", "%rcx", "%r11", "%r10", "%r9"); + KERNEL_END + return; +} + +void cacheprobe(char *start_addr, char *end_addr, long stride) { + KERNEL_BEGIN + asm volatile("mov %[start_addr], %%r8 \n" + "movq %[start_addr], %%xmm0 \n" + "LOOP_CACHEPROBE: \n" + "vmovdqa64 %%zmm0, 0x0(%%r8) \n" + "clflush (%%r8) \n" + "vmovdqa64 %%zmm0, 0x40(%%r8) \n" + "clflush 0x40(%%r8) \n" + "add %[stride], %%r8 \n" + "cmp %[end_addr], %%r8 \n" + "jl LOOP_CACHEPROBE \n" + "mfence \n" + + ::[start_addr] "r"(start_addr), + [end_addr] "r"(end_addr), [stride] "r"(stride) + : "%r8"); + KERNEL_END + return; +} + +void imcprobe(char *start_addr, char *end_addr, long loop) { + KERNEL_BEGIN + asm volatile("xor %%r9, %%r9 \n" + "movq %[start_addr], %%xmm0 \n" + + "LOOP1_IMCPROBE: \n" + "mov %[start_addr], %%r8 \n" + "LOOP2_IMCPROBE: \n" + "vmovntdq %%zmm0, 0x0(%%r8) \n" + "add $0x40, %%r8 \n" + "cmp %[end_addr], %%r8 \n" + "jl LOOP2_IMCPROBE \n" + + "add $1, %%r9 \n" + "cmp %[loop], %%r9 \n" + "jl LOOP1_IMCPROBE \n" + + ::[start_addr] "r"(start_addr), + [end_addr] "r"(end_addr), [loop] "r"(loop) + : "%r8", "%r9"); + KERNEL_END + return; +} + +void seq_load(char *start_addr, char *end_addr, long size) { + KERNEL_BEGIN + asm volatile("mov %[start_addr], %%r9 \n" + + "LOOP_SEQLOAD1: \n" + "xor %%r8, %%r8 \n" + "LOOP_SEQLOAD2: \n" + "vmovntdqa 0x0(%%r9, %%r8), %%zmm0 \n" + "add $0x40, %%r8 \n" + "cmp %[size], %%r8 \n" + "jl LOOP_SEQLOAD2 \n" + + "add %[size], %%r9 \n" + "cmp %[end_addr], %%r9 \n" + "jl LOOP_SEQLOAD1 \n" + + ::[start_addr] "r"(start_addr), + [end_addr] "r"(end_addr), [size] "r"(size) + : "%r8", "%r9"); + KERNEL_END + return; +} +void seq_store(char *start_addr, char *end_addr, long size) { + KERNEL_BEGIN + asm volatile("mov %[start_addr], %%r9 \n" + "movq %[start_addr], %%xmm0 \n" + + "LOOP_SEQSTORE1: \n" + "xor %%r8, 
%%r8 \n" + "LOOP_SEQSTORE2: \n" + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r8) \n" + "clwb (%%r9, %%r8) \n" + "add $0x40, %%r8 \n" + "cmp %[size], %%r8 \n" + "jl LOOP_SEQSTORE2 \n" + + "add %[size], %%r9 \n" + "cmp %[end_addr], %%r9 \n" + "jl LOOP_SEQSTORE1 \n" + + ::[start_addr] "r"(start_addr), + [end_addr] "r"(end_addr), [size] "r"(size) + : "%r8", "%r9"); + KERNEL_END + return; +} + +void seq_clwb(char *start_addr, char *end_addr, long size) { + KERNEL_BEGIN + asm volatile("mov %[start_addr], %%r9 \n" + "movq %[start_addr], %%xmm0 \n" + + "LOOP_SEQCLWB1: \n" + "xor %%r8, %%r8 \n" + "LOOP_SEQCLWB2: \n" + "vmovdqa64 %%zmm0, 0x0(%%r9, %%r8) \n" + "clwb (%%r9, %%r8) \n" + "add $0x40, %%r8 \n" + "cmp %[size], %%r8 \n" + "jl LOOP_SEQCLWB2 \n" + + "add %[size], %%r9 \n" + "cmp %[end_addr], %%r9 \n" + "jl LOOP_SEQCLWB1 \n" + + ::[start_addr] "r"(start_addr), + [end_addr] "r"(end_addr), [size] "r"(size) + : "%r8", "%r9"); + KERNEL_END +} + +void seq_nt(char *start_addr, char *end_addr, long size) { + KERNEL_BEGIN + asm volatile("mov %[start_addr], %%r9 \n" + "movq %[start_addr], %%xmm0 \n" + + "LOOP_SEQNT1: \n" + "xor %%r8, %%r8 \n" + "LOOP_SEQNT2: \n" + "vmovntdq %%zmm0, 0x0(%%r9, %%r8) \n" + "add $0x40, %%r8 \n" + "cmp %[size], %%r8 \n" + "jl LOOP_SEQNT2 \n" + + "add %[size], %%r9 \n" + "cmp %[end_addr], %%r9 \n" + "jl LOOP_SEQNT1 \n" + + ::[start_addr] "r"(start_addr), + [end_addr] "r"(end_addr), [size] "r"(size) + : "%r8", "%r9"); + KERNEL_END +} + +struct timespec tstart, tend; +unsigned int c_store_start_hi, c_store_start_lo; +unsigned int c_ntload_start_hi, c_ntload_start_lo; +unsigned int c_ntload_end_hi, c_ntload_end_lo; +unsigned long c_store_start; +unsigned long c_ntload_start, c_ntload_end; +long pages, diff; + + +#define BEFORE(buf, size, name) \ + asm volatile("xor %%r8, %%r8 \n" /* r8: counter */ \ + "FLUSH_LOOP" #name ": \n" \ + "lea (%[buf], %%r8), %%r9 \n" \ + "clflush (%%r9) \n" \ + "add $1, %%r8 \n" \ + "cmp %[size], %%r8 \n" \ + "jl FLUSH_LOOP" #name " \n" \ + "mfence \n" \ + :: [buf] "r" (buf), [size] "r"(size) \ + : "%r8", "%r9"); \ + clock_gettime(CLOCK_MONOTONIC_RAW, &tstart); \ + asm volatile("mfence \n\t" \ + "rdtscp \n\t" \ + "mfence \n\t" \ + "mov %%edx, %[hi]\n\t" \ + "mov %%eax, %[lo]\n\t" \ + : [hi] "=r"(c_store_start_hi), [lo] "=r"(c_store_start_lo) \ + : \ + : "rdx", "rax", "rcx"); + + +#define AFTER \ + asm volatile("mfence \n\t" \ + "rdtscp \n\t" \ + "mfence \n\t" \ + "mov %%edx, %[hi]\n\t" \ + "mov %%eax, %[lo]\n\t" \ + : [hi] "=r"(c_ntload_end_hi), [lo] "=r"(c_ntload_end_lo) \ + : \ + : "rdx", "rax", "rcx"); \ + if (clock_gettime(CLOCK_MONOTONIC_RAW, &tend) == 0) { \ + diff = (tend.tv_sec - tstart.tv_sec) * 1e9 + tend.tv_nsec - tstart.tv_nsec; \ + } \ + c_store_start = (((unsigned long)c_store_start_hi) << 32) | c_store_start_lo; \ + c_ntload_start = (((unsigned long)c_ntload_start_hi) << 32) | c_ntload_start_lo; \ + c_ntload_end = (((unsigned long)c_ntload_end_hi) << 32) | c_ntload_end_lo; + + + +#define LFS_PERMRAND_ENTRIES 0x1000 +#define RAW_BEFORE_WRITE \ + clock_gettime(CLOCK_MONOTONIC_RAW, &tstart); \ + asm volatile("rdtscp \n\t" \ + "lfence \n\t" \ + "mov %%edx, %[hi]\n\t" \ + "mov %%eax, %[lo]\n\t" \ + : [hi] "=r"(c_store_start_hi), [lo] "=r"(c_store_start_lo) \ + : \ + : "rdx", "rax", "rcx"); +#define RAW_BEFORE_READ \ + asm volatile("rdtscp \n\t" \ + "lfence \n\t" \ + "mov %%edx, %[hi]\n\t" \ + "mov %%eax, %[lo]\n\t" \ + : [hi] "=r"(c_ntload_start_hi), [lo] "=r"(c_ntload_start_lo) \ + : \ + : "rdx", "rax", "rcx"); +#define RAW_FINAL(job_name) \ + asm 
volatile("lfence \n\t" \ + "rdtscp \n\t" \ + "lfence \n\t" \ + "mov %%edx, %[hi]\n\t" \ + "mov %%eax, %[lo]\n\t" \ + : [hi] "=r"(c_ntload_end_hi), [lo] "=r"(c_ntload_end_lo) \ + : \ + : "rdx", "rax", "rcx"); \ + if (clock_gettime(CLOCK_MONOTONIC_RAW, &tend) == 0) { \ + diff = (tend.tv_sec - tstart.tv_sec) * 1e9 + tend.tv_nsec - tstart.tv_nsec; \ + } \ + c_store_start = (((unsigned long)c_store_start_hi) << 32) | c_store_start_lo; \ + c_ntload_start = (((unsigned long)c_ntload_start_hi) << 32) | c_ntload_start_lo; \ + c_ntload_end = (((unsigned long)c_ntload_end_hi) << 32) | c_ntload_end_lo; diff --git a/script/collect_weights.py b/script/collect_weights.py new file mode 100644 index 0000000..4d2c97d --- /dev/null +++ b/script/collect_weights.py @@ -0,0 +1,15 @@ +import os, subprocess +import time +workloads = ["mlc","ld","st","nt-ld","nt-st","ptr-chasing"] + + +def batch_run(): + os.system("../cmake-build-debug/CXLMemSim") + +def run_command(size): + start_time = time.time() + cmd = ["../cmake-build-debug/CXLMemSim" ,"-s"] + print(cmd) + subprocess.run(cmd) + end_time = time.time() + return end_time - start_time diff --git a/script/dump_pmu.py b/script/dump_pmu.py new file mode 100644 index 0000000..dcc2aea --- /dev/null +++ b/script/dump_pmu.py @@ -0,0 +1,53 @@ +import csv +import matplotlib.pyplot as plt +import os +import json + +pmu_list = ["INST_RETIRED.ANY"] +pmu_core_after = {"INST_RETIRED.ANY": (0, 0)} +pmu_core_before = {"INST_RETIRED.ANY": (0, 0)} + + +def get_perfmon(path: str, pmu: list) -> dict: + data_dict = {} + cur_csv = json.loads(f.read()) + + with open(path, "r") as f: + for line in pmu: + # Extract the EventName, UMask, and EventCode + event = cur_csv["Events"][0] + event_name = event["EventName"] + umask = event["UMask"] + event_code = event["EventCode"] + + # Combine UMask and EventCode + combined_code = ( + umask + event_code[2:] + ) # Concatenate and remove '0x' from EventCode + combined_code_hex = ( + "0x" + combined_code[2:] + ) # Add '0x' back for hex representation + + # Print the results + print(f"Event Name: {event_name}") + print(f"Combined UMask and EventCode: {combined_code_hex}") + return data_dict + + +def batch_pmu_run(pmu: dict): + for i, p in enumerate(pmu): + print(p) + if i % 4 == 0: + os.system( + "../cmake-build-debug/CXLMemSim -t ../cmake-build-debug/microbench/ld2 -i 100 --p"+ + ) + os.system("mv ./output_pmu.csv ./ld_pmu2_results.csv") + + +if __name__ == "__main__": + pmu = {"INST_RETIRED.ANY": 0} + get_perfmon("./perfmon/SPR/events/sapphirerapids_core.json", pmu) + get_perfmon("./perfmon/SPR/events/sapphirerapids_uncore.json", pmu) + get_perfmon("./perfmon/SPR/events/sapphirerapids_uncore_experimental.json", pmu) + # x, y = load_csv('data.csv') + # draw_graph(x, y) diff --git a/script/get_all_results.py b/script/get_all_results.py new file mode 100644 index 0000000..e69de29 diff --git a/script/ld_base_result.py b/script/ld_base_result.py new file mode 100644 index 0000000..b157c82 --- /dev/null +++ b/script/ld_base_result.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +import subprocess +import time +import csv +import sys, os + +workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"] + + +def run_command(size, mem_node): + start_time = time.time() + cmd = [ + f"/usr/bin/numactl -m {mem_node} ../cmake-build-debug/microbench/ld_base" + str(size), + ] + print(cmd) + process = subprocess.Popen( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + out, err = process.communicate() + print(f"err: {err}, out: {out}") + 
return int(out) + + +def run_cxlmemsim_command(size, mem_node): + # start_time = time.time() + cmd = [ + "LOGV=1", + f"/usr/bin/numactl -m {mem_node}", + "../cmake-build-debug/CXLMemSim", + "-t", + "../cmake-build-debug/microbench/ld" + str(size), + "-i", + "100", + ] + cmd = " ".join(cmd) + print(cmd) + os.system(cmd) + # end_time = time.time() + df = pd.read_csv("./output_pmu.csv") + os.system(f"mv ./output_pmu.csv ./ld_pmu{size}_results.csv") + return df + +def execute(cmd): + print(cmd) + process = subprocess.Popen( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + out, err = process.communicate() + print(f"err: {err}, out: {out}") + return out + + +def main(): + prefetching_off = [f"wrmsr -a 0x1a4 0xf"] + prefetching_on = [f"wrmsr -a 0x1a4 0xf"] + + sizes = [2**x for x in range(0, 9)] + + mode = "local" + mem_node = 0 if mode == "local" else 1 + + + execute(prefetching_off) + + f = open(f"ld_base_results_{mode}.csv", "a") + writer = csv.writer(f, delimiter=",") + writer.writerow(["size", "time"]) + for i in range(5): + for size in sizes: + exec_time = run_command(size, mem_node) + writer.writerow([size, exec_time]) + + execute(prefetching_on) + # for size in sizes: + # df = run_cxlmemsim_command(size,1) + + +if __name__ == "__main__": + main() diff --git a/script/ld_plot_result.py b/script/ld_plot_result.py new file mode 100644 index 0000000..d314146 --- /dev/null +++ b/script/ld_plot_result.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +import argparse +import subprocess +import time +from math import sqrt +import matplotlib.pyplot as plt +import pandas as pd + +workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"] +pmus = [ + "mon0_tatal_stall_0_0", + "mon0_all_dram_rds_0_0", + "mon0_l2stall_0_0", + "mon0_snoop_fw_wb_0_0", + "mon0_llcl_hits_0_0", + "mon0_llcl_miss_0_0", + "mon0_null_0_0", + "mon0_null_0_0", +] + + +def get_mean_and_ebars(df, groups, select): + """returns df with error bars. 
gropus includes columns to groupby""" + agg = df.groupby(groups)[select].agg(["mean", "count", "std"]) + error = [] + for i in agg.index: + mean, count, std = agg.loc[i] + error.append(1.95 * std / sqrt(count)) + + agg["error"] = error + + return agg[["mean"]], agg[["error"]] + + +def print_pmu_csv(): + sizes = [2**x for x in range(0, 9)] + for c in pmus: + for i in sizes: + df = pd.read_csv(f"ld_pmu{i}_results.csv") + col = df[df[c]<1844674407][c] + print(col) + if col[11] == "0": + print(col) + continue + # Plotting the data + plt.plot(col, marker="o", linestyle="-", label=i) + + # Adding title and labels + plt.title("PMU Plot for ld") + plt.xlabel("PMU gathered per epoch") + plt.ylabel(f"{c} Values") + plt.legend() + plt.savefig(f"ld_results_pmu_{c}.png") + + +def main(): + parser = argparse.ArgumentParser(description="plot results.") + # parser.add_argument( + # "-f", "--file_name", nargs="?", default="ld_results.csv", + # help="csv containing results.") + + # args = parser.parse_args() + + # df = pd.read_csv(args.file_name) + # means, error = get_mean_and_ebars(df, ["size"], "time") + + # fig,ax = plt.subplots() + + # ax.errorbar(means.index, means["mean"],yerr=error["error"], capsize=4) + # #means.plot(ax=ax, yerr=error, grid=True, rot=0, capsize=4) + # ax.set_xlabel("Size") + # ax.set_ylabel("Execution Time (seconds)") + # print(error) + + # fig.savefig("ld_results.png") + print_pmu_csv() + + +if __name__ == "__main__": + main() diff --git a/script/ld_result.py b/script/ld_result.py new file mode 100644 index 0000000..9fb5601 --- /dev/null +++ b/script/ld_result.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +import subprocess +import time +import csv +import sys, os + +workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"] + + +def run_command(size, mem_node): + start_time = time.time() + cmd = [ + f"/usr/bin/numactl -m {mem_node} ../cmake-build-debug/microbench/ld" + str(size), + ] + print(cmd) + process = subprocess.Popen( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + out, err = process.communicate() + print(f"err: {err}, out: {out}") + return int(out) + + +def run_cxlmemsim_command(size, mem_node): + # start_time = time.time() + cmd = [ + "LOGV=1", + f"/usr/bin/numactl -m {mem_node}", + "../cmake-build-debug/CXLMemSim", + "-t", + "../cmake-build-debug/microbench/ld" + str(size), + "-i", + "100", + ] + cmd = " ".join(cmd) + print(cmd) + os.system(cmd) + # end_time = time.time() + df = pd.read_csv("./output_pmu.csv") + os.system(f"mv ./output_pmu.csv ./ld_pmu{size}_results.csv") + return df + +def execute(cmd): + print(cmd) + process = subprocess.Popen( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + out, err = process.communicate() + print(f"err: {err}, out: {out}") + return out + + +def main(): + prefetching_off = [f"wrmsr -a 0x1a4 0xf"] + prefetching_on = [f"wrmsr -a 0x1a4 0xf"] + + sizes = [2**x for x in range(0, 9)] + + mode = "remote" + mem_node = 0 if mode == "local" else 1 + + + execute(prefetching_off) + f = open(f"ld_results_{mode}_noprefetch.csv", "a") + writer = csv.writer(f, delimiter=",") + writer.writerow(["size", "time"]) + for i in range(10): + for size in sizes: + exec_time = run_command(size, mem_node) + writer.writerow([size, exec_time]) + + execute(prefetching_on) + # for size in sizes: + # df = run_cxlmemsim_command(size,1) + + +if __name__ == "__main__": + main() diff --git a/script/st_result.py b/script/st_result.py new file mode 100644 index 0000000..322b75e --- /dev/null +++ 
b/script/st_result.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +import subprocess +import time +import csv +import sys, os +import pandas as pd + +workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"] + + +def run_command(size, mem_node): + start_time = time.time() + cmd = [ + f"/usr/bin/numactl -m {mem_node} ../cmake-build-debug/microbench/st" + str(size), + ] + print(cmd) + process = subprocess.Popen( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + out, err = process.communicate() + print(f"err: {err}, out: {out}") + return int(out) + + +def run_cxlmemsim_command(size, mem_node): + # start_time = time.time() + cmd = [ + "LOGV=1", + f"/usr/bin/numactl -m {mem_node}", + "../cmake-build-debug/CXLMemSim", + "-t", + "../cmake-build-debug/microbench/st" + str(size), + "-i", + "100", + ] + cmd = " ".join(cmd) + print(cmd) + os.system(cmd) + # end_time = time.time() + df = pd.read_csv("./output_pmu.csv") + os.system(f"mv ./output_pmu.csv ./st_pmu{size}_results.csv") + return df + + +def main(): + sizes = [2**x for x in range(0, 9)] + + mode = "remote" + mem_node = 0 if mode == "local" else 1 + + + f = open(f"st_results_{mode}.csv", "a") + + writer = csv.writer(f, delimiter=",") + + writer.writerow(["size", "time"]) + for i in range(25): + for size in sizes: + exec_time = run_command(size, mem_node) + writer.writerow([size, exec_time]) + + # for size in sizes: + # df = run_cxlmemsim_command(size,1) + + +if __name__ == "__main__": + main() diff --git a/script/wb_result.py b/script/wb_result.py new file mode 100644 index 0000000..e88208f --- /dev/null +++ b/script/wb_result.py @@ -0,0 +1,104 @@ +import subprocess +import time +import matplotlib.pyplot as plt +import pandas as pd +import os, csv +import re + +workloads = ["mlc", "ld", "st", "nt-ld", "nt-st", "ptr-chasing"] + + +def run_command(size, mem_node): + start_time = time.time() + cmd = [ + f"/usr/bin/numactl -m {mem_node} ../../MLC/Linux/mlc --loaded_latency -W" + + str(size), + ] + print(cmd) + process = subprocess.Popen( + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + print(f"err: {err}, out: {out}") + + out, err = process.communicate() + + regex_pattern = r"\t(\d+)\.\d+\t\s*(\d+)\.\d+" + + # Find all matches + matches = re.findall(regex_pattern, out) + + print(f"err: {err}, out: {matches}") + return int(out) + + +def run_cxlmemsim_command(size, mem_node): + # start_time = time.time() + cmd = [ + "LOGV=1", + f"/usr/bin/numactl -m {mem_node}", + "../cmake-build-debug/CXLMemSim", + "-t", + f"'../../MLC/Linux/mlc --loaded_latency -W{size}'", + "-i", + "100", + "-c", + '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23' + ] + cmd = " ".join(cmd) + print(cmd) + os.system(cmd) + # end_time = time.time() + df = pd.read_csv("./output_pmu.csv") + os.system(f"mv ./output_pmu.csv ./wb_pmu{size}_results.csv") + return df + + +def main(): + sizes = [x for x in range(2, 12)] + + mode = "remote" + mem_node = 0 if mode == "local" else 1 + + inject_latency = [ + "00000", + "00002", + "00008", + "00015", + "00050", + "00100", + "00200", + "00300", + "00400", + "00500", + "00700", + "01000", + "01300", + "01700", + "02500", + "03500", + "05000", + "09000", + "20000", + ] + writer = [] + for latency in inject_latency: + f = open(f"wb_results_{mode}_{latency}.csv", "a") + writer.append(csv.writer(f, delimiter=",")) + + writer[-1].writerow(["size", "time", "bw"]) + # f = open(f"wb_results_{mode}.csv", "a") + # writer.append(csv.writer(f, delimiter=",")) + + # writer.writerow(["size", 
"time", "bw"]) + for i in range(25): + for size in sizes: + exec_time = run_command(size, mem_node) + + writer.writerow([size, exec_time]) + + # for size in sizes: + # df = run_cxlmemsim_command(size,1) + + +if __name__ == "__main__": + main() diff --git a/src/collectmmap.c b/src/collectmmap.c deleted file mode 100644 index d24e3da..0000000 --- a/src/collectmmap.c +++ /dev/null @@ -1,85 +0,0 @@ -#include -#include -#include -#include - -/* helper macro to place programs, maps, license in - * different sections in elf_bpf file. Section names - * are interpreted by elf_bpf loader - */ -#define SEC(NAME) __attribute__((section(NAME), used)) - -/* helper functions called from eBPF programs written in C */ -static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = (void *)BPF_FUNC_probe_read; -static unsigned long long (*bpf_ktime_get_ns)(void) = (void *)BPF_FUNC_ktime_get_ns; -static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk; -static unsigned long long (*bpf_get_current_pid_tgid)(void) = (void *)BPF_FUNC_get_current_pid_tgid; -/* a helper structure used by eBPF C program - * to describe map attributes to elf_bpf loader - */ -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; - unsigned int inner_map_idx; -}; -#define PT_REGS_PARM1(x) ((x)->di) -#define PT_REGS_PARM2(x) ((x)->si) -SEC("kprobe/__x64_sys_munmap") -int munmap_init(struct pt_regs *ctx) { - long size; - long address; - char fmt[] = "munmap %ld %ld %u\n"; - u32 pid = bpf_get_current_pid_tgid(); - bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM2(ctx)); - bpf_probe_read(&address, sizeof(address), (void *)&PT_REGS_PARM1(ctx)); - if (size > 0) { - bpf_trace_printk(fmt, sizeof(fmt), size, address, pid); - } - return 0; -} -SEC("kprobe/__x64_sys_brk") -int brk_init(struct pt_regs *ctx) { - long address; - char fmt[] = "brk %ld %u\n"; - u32 pid = bpf_get_current_pid_tgid(); - bpf_probe_read(&address, sizeof(address), (void *)&PT_REGS_PARM1(ctx)); - bpf_trace_printk(fmt, sizeof(fmt), address, pid); - return 0; -} -SEC("kretprobe/__x64_sys_brk") -int brk_finish(struct pt_regs *ctx) { - int size; - char fmt[] = "brkret %d %u\n"; - u32 pid = bpf_get_current_pid_tgid(); - bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM1(ctx)); - if (size > 0) { - bpf_trace_printk(fmt, sizeof(fmt), size, pid); - } - return 0; -} -SEC("kprobe/__x64_sys_sbrk") -int sbrk_init(struct pt_regs *ctx) { - int size; - char fmt[] = "sbrkret %d %u\n"; - u32 pid = bpf_get_current_pid_tgid(); - bpf_probe_read(&size, sizeof(size), (void *)&PT_REGS_PARM1(ctx)); - if (size > 0) { - bpf_trace_printk(fmt, sizeof(fmt), size, pid); - } - return 0; -} -SEC("kretprobe/__x64_sys_sbrk") -int sbrk_finish(struct pt_regs *ctx) { - long address; - char fmt[] = "sbrkret %ld %u\n"; - u32 pid = bpf_get_current_pid_tgid(); - bpf_probe_read(&address, sizeof(address), (void *)&PT_REGS_PARM1(ctx)); - bpf_trace_printk(fmt, sizeof(fmt), address, pid); - - return 0; -} -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/src/cxlcontroller.cpp b/src/cxlcontroller.cpp index b40aa9e..e21f086 100644 --- a/src/cxlcontroller.cpp +++ b/src/cxlcontroller.cpp @@ -32,14 +32,18 @@ void CXLController::construct_topo(std::string_view newick_tree) { } } -CXLController::CXLController(Policy *p, int capacity, bool is_page, int epoch) - : CXLSwitch(0), capacity(capacity), policy(p), 
is_page(is_page) { +CXLController::CXLController(AllocationPolicy *p, int capacity, enum page_type page_type_, int epoch) + : CXLSwitch(0), capacity(capacity), policy(p), page_type_(static_cast(page_type_)) { for (auto switch_ : this->switches) { switch_->set_epoch(epoch); } for (auto expander : this->expanders) { expander->set_epoch(epoch); } + // TODO get LRU wb + // TODO BW type series + + // deferentiate R/W for multireader multi writer } double CXLController::calculate_latency(LatencyPass elem) { return CXLSwitch::calculate_latency(elem) * 1000; } @@ -119,3 +123,10 @@ std::tuple> CXLController::calculate_congestion() return CXLSwitch::calculate_congestion(); } void CXLController::set_epoch(int epoch) { CXLSwitch::set_epoch(epoch); } +// TODO: impl me +MigrationPolicy::MigrationPolicy() { + +} +PagingPolicy::PagingPolicy() { + +} diff --git a/src/cxlendpoint.cpp b/src/cxlendpoint.cpp index 795380a..324c798 100644 --- a/src/cxlendpoint.cpp +++ b/src/cxlendpoint.cpp @@ -5,7 +5,7 @@ #include "cxlendpoint.h" CXLMemExpander::CXLMemExpander(int read_bw, int write_bw, int read_lat, int write_lat, int id, int capacity) - : capacity(capacity), id(id) { + : capacity(capacity), id(id), lru_cache(capacity / 1000 / 64) { this->bandwidth.read = read_bw; this->bandwidth.write = write_bw; this->latency.read = read_lat; @@ -14,8 +14,8 @@ CXLMemExpander::CXLMemExpander(int read_bw, int write_bw, int read_lat, int writ double CXLMemExpander::calculate_latency(LatencyPass lat) { auto all_access = lat.all_access; auto dramlatency = lat.dramlatency; - auto ma_ro = lat.ma_ro; - auto ma_wb = lat.ma_wb; + auto ma_ro = lat.readonly; + auto ma_wb = lat.writeback; auto all_read = std::get<0>(all_access); auto all_write = std::get<1>(all_access); double read_sample = 0.; @@ -26,6 +26,23 @@ double CXLMemExpander::calculate_latency(LatencyPass lat) { if (all_write != 0) { write_sample = ((double)last_write / all_write); } + uint64_t mastall_wb = 0; + uint64_t mastall_ro = 0; + /** If both target_llchits and target_llcmiss are 0, it means that hit in L2. + * Stall by LLC misses is 0. 
+ * choose by vector */ + + // mastall_wb = (double)(target_l2stall / frequency) * + // ((double)(weight * llcmiss_wb) / (double)(target_llchits + (weight * target_llcmiss))) * 1000; + // // weight is a delay specific value current pro + // mastall_ro = (double)(target_l2stall / frequency) * + // ((double)(weight * llcmiss_ro) / (double)(target_llchits + (weight * target_llcmiss))) * + // 1000; // weight is a delay specific value + // LOG(DEBUG) << fmt::format("l2stall={}, mastall_wb={}, mastall_ro={}, target_llchits={}, target_llcmiss={}\n", + // target_l2stall, mastall_wb, mastall_ro, target_llchits, target_llcmiss); + + auto writeback = (double)mastall_wb / dramlatency; + auto readonly = (double)mastall_ro / dramlatency; this->last_latency = ma_ro * read_sample * (latency.read - dramlatency) + ma_wb * write_sample * (latency.write - dramlatency); return this->last_latency; @@ -43,7 +60,7 @@ double CXLMemExpander::calculate_bandwidth(BandwidthPass bw) { if (all_read != 0) { read_sample = ((double)last_read / all_read); } - double write_sample = 0.; + double write_sample = 0.; // based on time series if (all_write != 0) { write_sample = ((double)last_write / all_write); } @@ -51,13 +68,13 @@ double CXLMemExpander::calculate_bandwidth(BandwidthPass bw) { ((double)bandwidth.read)) { res += read_sample * 64 * read_config / 1024 / 1024 / (this->epoch + this->last_latency) * 1000 / bandwidth.read - - this->epoch * 0.001; + this->epoch * 0.001; // TODO: read } if ((((double)write_sample * 64 * write_config) / 1024 / 1024 / (this->epoch + this->last_latency) * 1000) > bandwidth.write) { res += (((double)write_sample * 64 * write_config) / 1024 / 1024 / (this->epoch + this->last_latency) * 1000 / bandwidth.write) - - this->epoch * 0.001; + this->epoch * 0.001; // TODO: wb+clflush } return res; } @@ -89,6 +106,7 @@ void CXLMemExpander::delete_entry(uint64_t addr, uint64_t length) { } int CXLMemExpander::insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr, int index) { + if (index == this->id) { last_timestamp = last_timestamp > timestamp ? last_timestamp : timestamp; // Update the last timestamp // Check if the address is already in the map) @@ -185,10 +203,11 @@ double CXLSwitch::calculate_bandwidth(BandwidthPass elem) { for (auto &switch_ : this->switches) { bw += switch_->calculate_bandwidth(elem); } + // time series return bw; } int CXLSwitch::insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr, int index) { - for (auto &expander : this->expanders) { + for (auto &expander : this->expanders) { // differ read and write。 auto ret = expander->insert(timestamp, phys_addr, virt_addr, index); if (ret == 1) { this->counter.inc_store(); @@ -212,6 +231,7 @@ int CXLSwitch::insert(uint64_t timestamp, uint64_t phys_addr, uint64_t virt_addr return 0; } } + return 0; } std::tuple> CXLSwitch::calculate_congestion() { double latency = 0.0; diff --git a/src/helper.cpp b/src/helper.cpp index 0fdb9d6..580ed43 100644 --- a/src/helper.cpp +++ b/src/helper.cpp @@ -2,260 +2,102 @@ // Created by victoryang00 on 1/12/23. 
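The calculate_latency change above keeps the existing model: the injected latency is the read-only and write-back access counts, scaled by the fraction of recent accesses this expander served, times the gap between the expander's configured latency and local DRAM latency. A small numeric illustration (made-up numbers, only to show how the terms combine; not values from the simulator):

// Assumed inputs: 40% of reads and 20% of writes landed on this expander,
// device read/write latency 250/400, local DRAM ~85, ma_ro/ma_wb per-epoch access counts.
double example_epoch_penalty() {
    double read_sample = 0.4, write_sample = 0.2;
    double ma_ro = 1000, ma_wb = 400;
    return ma_ro * read_sample * (250.0 - 85.0)     // 66000 from reads
         + ma_wb * write_sample * (400.0 - 85.0);   // 25200 from writes -> 91200 total
}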
// #include "helper.h" -#include "logging.h" - -const struct ModelContext model_ctx[] = {{CPU_MDL_BDX, - {"/sys/bus/event_source/devices/uncore_cbox_%u/type", - /* - * cbo_config: - * unc_c_llc_victims.m_state - * umask=0x1,event=0x37 - */ - 0x0137, - /* - * all_dram_rds_config: - * offcore_response.all_reads.llc_miss.local_dram - * cpu/umask=0x1,event=0xb7,offcore_rsp=0x40007f7/ - */ - 0x01b7, 0x6040007f7, - /* - * cpu_l2stall_config: - * cycle_activity.stalls_l2_pending - * cpu/umask=0x5,cmask=0x5,event=0xa3/ - */ - 0x50005a3, - /* - * cpu_llcl_hits_config: - * mem_load_uops_l3_hit_retired.xsnp_none - * cpu/umask=0x8,event=0xd2/ - */ - 0x08d2, - /* - * cpu_llcl_miss_config: - * mem_load_uops_l3_miss_retired.local_dram - * cpu/umask=0x1,event=0xd3/ - */ - 0x01d3, - /* - * cpu_bandwidth_read_config: - * UNC_M_CAS_COUNT.RD * 64 - * cpu/umask=0x03,event=0x04/ - */ - 0x0304, - /* - * cpu_bandwidth_write_config: - * UNC_M_CAS_COUNT.WR * 64 - * cpu/umask=0x0c,event=0x04/ - */ - 0x0c04}}, - {CPU_MDL_SKX, - {"/sys/bus/event_source/devices/uncore_cha_%u/type", - /* - * cbo_config: - * UNC_C_LLC_VICTIMS - * umask=0x21,event=37 - */ - 0x2137, - /* - * all_dram_rds_config: - * OCR.ALL_READS.L3_MISS.SNOOP_NONE - * cpu/umask=0x1,event=0xb7,offcore_rsp=0xBC408000/ - */ - 0x01b7, 0xBC408000, - /* - * cpu_l2stall_config: - * cycle_activity.stalls_l2_miss - * cpu/umask=0x5,cmask=0x5,event=0xa3/ - */ - 0x50005a3, - /* - * cpu_llcl_hits_config: - * mem_load_l3_hit_retired.xsnp_none - * cpu/umask=0x8,event=0xd2/ - */ - 0x08d2, - /* - * cpu_llcl_miss_config: - * mem_load_l3_miss_retired.local_dram - * cpu/umask=0x1,event=0xd3/ - */ - 0x01d3, - /* - * cpu_bandwidth_read_config: - * UNC_M_CAS_COUNT.RD * 64 - * cpu/umask=0x03,event=0x04/ - */ - 0x0304, - /* - * cpu_bandwidth_write_config: - * UNC_M_CAS_COUNT.WR * 64 - * cpu/umask=0x0c,event=0x04/ - */ - 0x0c04}}, - {CPU_MDL_SPR, - {"/sys/bus/event_source/devices/uncore_cha_%u/type", - /* - * cbo_config: - * UNC_C_LLC_VICTIMS => OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD - * umask=0x10,event=b0 - */ - 0x10b0, - /* - * all_dram_rds_config: - * OCR.ALL_READS.L3_MISS.SNOOP_NONE => L3_MISS.SNOOP_MISS_OR_NO_FWD - * cpu/umask=0x1,event=0xb7,offcore_rsp=0x63FC00491/ - */ - 0x01b7, 0x63FC00491, - /* - * cpu_l2stall_config: - * cycle_activity.stalls_l2_miss - * cpu/umask=0x5,cmask=0x5,event=0xa3/ - */ - 0x50005a3, - /* - * cpu_llcl_hits_config: - * mem_load_l3_hit_retired.xsnp_none - * cpu/umask=0x8,event=0xd2/ - */ - 0x08d2, - /* - * cpu_llcl_miss_config: - * mem_load_l3_miss_retired.local_dram - * cpu/umask=0x1,event=0xd3/ - */ - 0x01d3, - /* - * cpu_bandwidth_read_config: - * UNC_M_CAS_COUNT.RD * 64 - * cpu/umask=0xcf,event=0x05/ - */ - 0xcf05, - /* - * cpu_bandwidth_write_config: - * UNC_M_CAS_COUNT.WR * 64 - * cpu/umask=0xf0,event=0x05/ - */ - 0xf005}}, - {CPU_MDL_ADL, - {"/sys/bus/event_source/devices/uncore_cbox_%u/type", - /* - * cbo_config: - * UNC_C_LLC_VICTIMS => OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD - * umask=0x21,event=10 - */ - 0x2110, - /* - * all_dram_rds_config: - * OCR.ALL_READS.L3_MISS.SNOOP_NONE => OCR.DEMAND_DATA_RD.L3_MISS - * cpu/umask=0x1,event=0x2A,offcore_rsp=0x3FBFC00001/ - */ - 0x012a, 0x3fbfc00001, - /* - * cpu_l2stall_config: - * cycle_activity.stalls_l2_miss - * cpu/umask=0x5,cmask=0x5,event=0xa3/ - */ - 0x50005a3, - /* - * cpu_llcl_hits_config: - * mem_load_l3_hit_retired.xsnp_none - * cpu/umask=0x8,event=0xd2/ - */ - 0x08d2, - /* - * cpu_llcl_miss_config: - * mem_load_l3_miss_retired.local_dram - * cpu/umask=0x1,event=0xd3/ - */ - 0x01d3, - 
/* - * cpu_bandwidth_read_config: - * UNC_M_CAS_COUNT.RD * 64 - * cpu/umask=0xcf,event=0x05/ - */ - 0xcf05, - /* - * cpu_bandwidth_write_config: - * UNC_M_CAS_COUNT.WR * 64 - * cpu/umask=0xf0,event=0x05/ - */ - 0xf005}}, - {CPU_MDL_END, {0}}}; +#include +#include + +struct ModelContext model_ctx[] = {{CPU_MDL_BDX, + { + "/sys/bus/event_source/devices/uncore_cbo_%u/type", + }}, + {CPU_MDL_SKX, + { + "/sys/bus/event_source/devices/uncore_cha_%u/type", + }}, + {CPU_MDL_SPR, + { + "/sys/bus/event_source/devices/uncore_cha_%u/type", + }}, + {CPU_MDL_ADL, + { + "/sys/bus/event_source/devices/uncore_cbo_%u/type", + }}, + {CPU_MDL_END, {""}}}; int Helper::num_of_cpu() { - int ncpu; - ncpu = sysconf(_SC_NPROCESSORS_ONLN); + int ncpu = sysconf(_SC_NPROCESSORS_ONLN); if (ncpu < 0) { LOG(ERROR) << "sysconf"; } - LOG(DEBUG) << fmt::format("num_of_cpu={}\n", ncpu); return ncpu; } -int Helper::num_of_cbo() { - int ncbo = 0; - for (; ncbo < 128; ++ncbo) { - std::string path = fmt::format("/sys/bus/event_source/devices/uncore_cbox_{}/type", ncbo); - // LOG(DEBUG) << path; +int Helper::num_of_cha() { + int ncha = 0; + for (; ncha < cpu; ++ncha) { + std::string path = fmt::format("/sys/bus/event_source/devices/uncore_cha_{}/type", ncha); + // LOG(DEBUG) << path; if (!std::filesystem::exists(path)) { break; } } - LOG(DEBUG) << fmt::format("num_of_cbo={}\n", ncbo); - return ncbo; + return ncha; } -double Helper::cpu_frequency() const { - int i = 0; - int cpu = 0; +double Helper::cpu_frequency() { + int i, c = 0; double cpu_mhz = 0.0; double max_cpu_mhz = 0.0; std::ifstream fp("/proc/cpuinfo"); - for (std::string line; cpu != this->cpu - 1; std::getline(fp, line)) { + for (std::string line; c != this->num_of_cpu() - 1; std::getline(fp, line)) { // LOG(DEBUG) << fmt::format("line: {}\n", line); i = std::sscanf(line.c_str(), "cpu MHz : %lf", &cpu_mhz); max_cpu_mhz = i == 1 ? std::max(max_cpu_mhz, cpu_mhz) : max_cpu_mhz; - std::sscanf(line.c_str(), "processor : %d", &cpu); + std::sscanf(line.c_str(), "processor : %d", &c); } LOG(DEBUG) << fmt::format("cpu MHz: {}\n", cpu_mhz); return cpu_mhz; } -PerfConfig Helper::detect_model(uint32_t model) { +PerfConfig Helper::detect_model(uint32_t model, const std::vector &perf_name, + const std::vector &perf_conf1, const std::vector &perf_conf2) { int i = 0; LOG(INFO) << fmt::format("Detecting model...{}\n", model); while (model_ctx[i].model != CPU_MDL_END) { if (model_ctx[i].model == model) { this->perf_conf = model_ctx[i].perf_conf; - return model_ctx[i].perf_conf; + for (int j = 0; j < 4; ++j) { + this->perf_conf.cha[j] = std::make_tuple(perf_name[j], perf_conf1[j], perf_conf2[j]); + } + for (int j = 0; j < 4; ++j) { + this->perf_conf.cpu[j] = std::make_tuple(perf_name[j + 4], perf_conf1[j + 4], perf_conf2[j + 4]); + } + return this->perf_conf; } i++; } - LOG(ERROR) << "Failed to execute. This CPU model is not supported. Update src/types.c\n"; + LOG(ERROR) << "Failed to execute. This CPU model is not supported. 
Refer to perfmon or pcm to add support\n"; throw; } -Helper::Helper() : perf_conf({}) { +Helper::Helper() { cpu = num_of_cpu(); - LOG(DEBUG) << cpu; - cbo = num_of_cbo(); - cpu_freq = cpu_frequency(); + cha = num_of_cha(); } void Helper::noop_handler(int sig) { ; } void Helper::detach_children() { - struct sigaction sa; + struct sigaction sa {}; sa.sa_handler = noop_handler; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_RESTART | SA_NOCLDWAIT; - if (sigaction(SIGCHLD, &sa, NULL) < 0) { + if (sigaction(SIGCHLD, &sa, nullptr) < 0) { LOG(ERROR) << fmt::format("Failed to sigaction: %s", strerror(errno)); } } int PMUInfo::start_all_pmcs() { /* enable all pmcs to count */ - int i, r; - for (i = 0; i < helper->num_of_cpu(); i++) { + int r, i; + for (i = 0; i < this->cpus.size(); i++) { r = this->cpus[i].start(); if (r < 0) { LOG(ERROR) << fmt::format("start failed. cpu:{}\n", i); @@ -265,24 +107,19 @@ int PMUInfo::start_all_pmcs() { return 0; } PMUInfo::PMUInfo(pid_t pid, Helper *helper, struct PerfConfig *perf_config) : helper(helper) { - int i, r, n; + int r; - n = helper->num_of_cbo(); - - for (i = 0; i < n; i++) { - this->cbos.emplace_back(i, perf_config); + for (auto i : helper->used_cpu) { + this->chas.emplace_back(i, perf_config); } - // unfreeze counters - r = this->unfreeze_counters_cbo_all(); + r = this->unfreeze_counters_cha_all(); if (r < 0) { - LOG(DEBUG) << fmt::format("unfreeze_counters_cbo_all failed.\n"); + LOG(DEBUG) << fmt::format("unfreeze_counters_cha_all failed.\n"); throw; } - n = helper->num_of_cpu(); - - for (i = 0; i < n; i++) { + for (auto i : helper->used_cpu) { this->cpus.emplace_back(pid, i, perf_config); } @@ -295,7 +132,7 @@ int PMUInfo::stop_all_pmcs() { /* disable all pmcs to count */ int i, r; - for (i = 0; i < helper->num_of_cpu(); i++) { + for (i = 0; i < this->cpus.size(); i++) { r = this->cpus[i].stop(); if (r < 0) { LOG(ERROR) << fmt::format("stop failed. cpu:{}\n", i); @@ -305,31 +142,36 @@ int PMUInfo::stop_all_pmcs() { return 0; } -int PMUInfo::unfreeze_counters_cbo_all() { +int PMUInfo::unfreeze_counters_cha_all() { int i, r; - for (i = 0; i < helper->num_of_cbo(); i++) { - r = this->cbos[i].perf->start(); - if (r < 0) { - LOG(ERROR) << fmt::format("perf_start failed. cbo:{}\n", i); - return r; + for (i = 0; i < this->chas.size(); i++) { + for (int j : {0, 1, 2, 3}) { + r = this->chas[i].perf[j]->start(); + if (r < 0) { + LOG(ERROR) << fmt::format("perf_start failed. cha:{}\n", i); + return r; + } } } return 0; } -int PMUInfo::freeze_counters_cbo_all() { +int PMUInfo::freeze_counters_cha_all() { int i, r; - for (i = 0; i < helper->num_of_cbo(); i++) { - r = this->cbos[i].perf->stop(); - if (r < 0) { - LOG(ERROR) << fmt::format("perf_stop failed. cbo:{}\n", i); - return r; + for (i = 0; i < this->chas.size(); i++) { + for (int j : {0, 1, 2, 3}) { + r = this->chas[i].perf[j]->stop(); + if (r < 0) { + LOG(ERROR) << fmt::format("perf_stop failed. 
cha:{}\n", i); + return r; + } } } return 0; } PMUInfo::~PMUInfo() { this->cpus.clear(); - this->cbos.clear(); + this->chas.clear(); + stop_all_pmcs(); } diff --git a/src/incore.cpp b/src/incore.cpp index accda65..c2c39ab 100644 --- a/src/incore.cpp +++ b/src/incore.cpp @@ -4,7 +4,7 @@ #include "incore.h" #include "helper.h" - +extern Helper helper; void pcm_cpuid(const unsigned leaf, CPUID_INFO *info) { __asm__ __volatile__("cpuid" : "=a"(info->reg.eax), "=b"(info->reg.ebx), "=c"(info->reg.ecx), "=d"(info->reg.edx) @@ -14,7 +14,7 @@ void pcm_cpuid(const unsigned leaf, CPUID_INFO *info) { int Incore::start() { int i, r = -1; - for (i = 0; i < 4; i++) { + for (i = 0; i < this->perf.size(); i++) { r = this->perf[i]->start(); if (r < 0) { LOG(ERROR) << fmt::format("perf_start failed. i:{}\n", i); @@ -26,7 +26,7 @@ int Incore::start() { int Incore::stop() { int i, r = -1; - for (i = 0; i < 4; i++) { + for (i = 0; i < this->perf.size(); i++) { r = this->perf[i]->stop(); if (r < 0) { LOG(ERROR) << fmt::format("perf_stop failed. i:{}\n", i); @@ -35,95 +35,35 @@ int Incore::stop() { } return r; } -void Incore::init_all_dram_rds(const pid_t pid, const int cpu) { - this->perf[0] = init_incore_perf(pid, cpu, perf_config->all_dram_rds_config, perf_config->all_dram_rds_config1); -} -void Incore::init_cpu_mem_read(const pid_t pid, const int cpu) { - this->perf[0] = init_incore_perf(pid, cpu, perf_config->cpu_bandwidth_read_config, 0); -} -void Incore::init_cpu_l2stall(const pid_t pid, const int cpu) { - this->perf[1] = init_incore_perf(pid, cpu, perf_config->cpu_l2stall_config, 0); -} -void Incore::init_cpu_llcl_hits(const pid_t pid, const int cpu) { - this->perf[2] = init_incore_perf(pid, cpu, perf_config->cpu_llcl_hits_config, 0); -} -void Incore::init_cpu_llcl_miss(const pid_t pid, const int cpu) { - this->perf[3] = init_incore_perf(pid, cpu, perf_config->cpu_llcl_miss_config, 0); -} -void Incore::init_cpu_mem_write(const pid_t pid, const int cpu) { - this->perf[5] = init_incore_perf(pid, cpu, perf_config->cpu_bandwidth_write_config, 0); -} -void Incore::init_cpu_ebpf(const pid_t pid, const int cpu) { - if (cpu == 0) - this->perf[4] = init_incore_bpf_perf(pid, cpu); - else - this->perf[4] = nullptr; -} -int Incore::read_cpu_elems(struct CPUElem *elem) { - ssize_t r; - - r = this->perf[0]->read_pmu(&elem->cpu_bandwidth_read); - if (r < 0) { - LOG(ERROR) << fmt::format("read cpu_bandwidth_read failed.\n"); - return r; - } - LOG(DEBUG) << fmt::format("read cpu_bandwidth_read:{}\n", elem->cpu_bandwidth_read); - r = this->perf[1]->read_pmu(&elem->cpu_l2stall_t); - if (r < 0) { - LOG(ERROR) << fmt::format("read cpu_l2stall_t failled.\n"); - return r; - } - LOG(DEBUG) << fmt::format("read cpu_l2stall_t:{}\n", elem->cpu_l2stall_t); - - r = this->perf[2]->read_pmu(&elem->cpu_llcl_hits); - if (r < 0) { - LOG(ERROR) << fmt::format("read cpu_llcl_hits failed.\n"); - return r; +ssize_t Incore::read_cpu_elems(struct CPUElem *elem) { + ssize_t r; + for (auto const &[idx, value] : this->perf | enumerate) { + r = value->read_pmu(&elem->cpu[idx]); + if (r < 0) { + LOG(ERROR) << fmt::format("read cpu_elems[{}] failed.\n", std::get<0>(helper.perf_conf.cha[idx])); + return r; + } + LOG(DEBUG) << fmt::format("read cpu_elems[{}]:{}\n", std::get<0>(helper.perf_conf.cpu[idx]), elem->cpu[idx]); } - LOG(DEBUG) << fmt::format("read cpu_llcl_hits:{}\n", elem->cpu_llcl_hits); - r = this->perf[3]->read_pmu(&elem->cpu_llcl_miss); - if (r < 0) { - LOG(ERROR) << fmt::format("read cpu_llcl_miss failed.\n"); - return r; - } - 
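The rewritten `read_cpu_elems()` in this hunk replaces the per-event branches with one generic loop over the configured counters. A self-contained sketch of that pattern; `Counters` and the fixed-size fd array stand in for the project's real types:

```cpp
// Sketch only: stand-in types, not the project's CPUElem/PerfInfo.
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <unistd.h>

struct Counters { std::array<uint64_t, 4> cpu{}; };

int read_all(const std::array<int, 4> &perf_fds, Counters &out) {
    for (std::size_t idx = 0; idx < perf_fds.size(); ++idx) {
        // Each perf fd yields its 64-bit count via a plain read(2), as read_pmu() does.
        if (read(perf_fds[idx], &out.cpu[idx], sizeof(out.cpu[idx])) < 0) {
            std::perror("read perf counter");
            return -1;
        }
    }
    return 0;
}
```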
LOG(DEBUG) << fmt::format("read cpu_llcl_miss:{}\n", elem->cpu_llcl_miss); - - // r = this->perf[4]->read_pmu(&elem->cpu_bandwidth_read); - // if (r < 0) { - // LOG(ERROR) << fmt::format("read cpu_bandwidth_read failed.\n"); - // return r; - // } - // LOG(DEBUG) << fmt::format("read cpu_bandwidth_read:{}\n", elem->cpu_bandwidth_read); - // r = this->perf[5]->read_pmu(&elem->cpu_bandwidth_write); - // if (r < 0) { - // LOG(ERROR) << fmt::format("read cpu_bandwidth_write failed.\n"); - // return r; - // } - // LOG(DEBUG) << fmt::format("read cpu_bandwidth_write:{}\n", elem->cpu_bandwidth_write); - if (this->perf[4] != nullptr) { - elem->cpu_munmap_address_length = this->perf[4]->read_trace_pipe(); - LOG(DEBUG) << "read munmap result with size:" << elem->cpu_munmap_address_length.size() << "\n"; - } + return 0; } + Incore::Incore(const pid_t pid, const int cpu, struct PerfConfig *perf_config) : perf_config(perf_config) { /* reset all pmc values */ - // this->init_all_dram_rds(pid, cpu); - this->init_cpu_mem_read(pid, cpu); - this->init_cpu_l2stall(pid, cpu); - this->init_cpu_llcl_hits(pid, cpu); - this->init_cpu_llcl_miss(pid, cpu); - this->init_cpu_ebpf(pid, cpu); - // this->init_cpu_mem_write(pid, cpu); + for (int i = 0; i < perf_config->cpu.size(); i++) { + this->perf[i] = init_incore_perf(pid, cpu, std::get<1>(perf_config->cpu[i]), std::get<2>(perf_config->cpu[i])); + } } + bool get_cpu_info(struct CPUInfo *cpu_info) { char buffer[1024]; union { char cbuf[16]; int ibuf[16 / sizeof(int)]; - } buf; - CPUID_INFO cpuinfo; + } buf{}; + CPUID_INFO cpuinfo{}; pcm_cpuid(0, &cpuinfo); @@ -133,7 +73,7 @@ bool get_cpu_info(struct CPUInfo *cpu_info) { buf.ibuf[1] = cpuinfo.array[3]; buf.ibuf[2] = cpuinfo.array[2]; - if (strncmp(buf.cbuf, "GenuineIntel", 4 * 3) != 0) { + if (strncmp(buf.cbuf, "GenuineIntel", 12) != 0) { LOG(ERROR) << fmt::format("We only Support Intel CPU\n"); return false; } diff --git a/src/logging.cpp b/src/logging.cpp index 3ae18ec..83bc062 100644 --- a/src/logging.cpp +++ b/src/logging.cpp @@ -3,11 +3,17 @@ // #include "logging.h" +#include +#include void LogWriter::operator<(const LogStream &stream) { std::ostringstream msg; - msg << stream.sstream_->rdbuf(); - output_log(msg); + if (log_level_ == TRACE) + file_ << stream.sstream_->rdbuf(); + else { + msg << stream.sstream_->rdbuf(); + output_log(msg); + } } void LogWriter::output_log(const std::ostringstream &msg) { diff --git a/src/main.cpp b/src/main.cc similarity index 56% rename from src/main.cpp rename to src/main.cc index 3c38dde..59f2efa 100644 --- a/src/main.cpp +++ b/src/main.cc @@ -1,50 +1,56 @@ // // Created by victoryang00 on 1/12/23. 
// + #include "cxlendpoint.h" #include "helper.h" -#include "logging.h" #include "monitor.h" #include "policy.h" +#include "sock.h" #include #include -#include #include #include #include #include -#include #include #include #include #include -#define SOCKET_PATH "/tmp/cxl_mem_simulator.sock" - +Helper helper{}; int main(int argc, char *argv[]) { - - cxxopts::Options options("CXL-MEM-Simulator", - "For simulation of CXL.mem Type 3 on Broadwell, Skylake, and Saphire Rapids"); + cxxopts::Options options("CXLMemSim", "For simulation of CXL.mem Type 3 on Sapphire Rapids"); options.add_options()("t,target", "The script file to execute", - cxxopts::value()->default_value("./microbench/many_calloc"))( - "h,help", "The value for epoch value", cxxopts::value()->default_value("false"))( - "i,interval", "The value for epoch value", cxxopts::value()->default_value("5"))( - "c,cpuset", "The CPUSET for CPU to set affinity on", - cxxopts::value>()->default_value("0,1,2,3,4,5,6,7,8,9,10,11,12,13,14"))( - "d,dramlatency", "The current platform's dram latency", cxxopts::value()->default_value("85"))( - "p,pebsperiod", "The pebs sample period", cxxopts::value()->default_value("1"))( + cxxopts::value()->default_value("./microbench/ld_simple"))( + "h,help", "Help for CXLMemSim", cxxopts::value()->default_value("false"))( + "i,interval", "The value for epoch value", cxxopts::value()->default_value("1000"))( + "s,source", "Collection Phase or Validation Phase", cxxopts::value()->default_value("false"))( + "c,cpuset", "The CPUSET for CPU to set affinity on and only run the target process on those CPUs", + cxxopts::value>()->default_value("0"))("d,dramlatency", "The current platform's dram latency", + cxxopts::value()->default_value("110"))( + "p,pebsperiod", "The pebs sample period", cxxopts::value()->default_value("100"))( "m,mode", "Page mode or cacheline mode", cxxopts::value()->default_value("p"))( "o,topology", "The newick tree input for the CXL memory expander topology", cxxopts::value()->default_value("(1,(2,3))"))( - "s,capacity", "The capacity vector of the CXL memory expander with the firsgt local", + "e,capacity", "The capacity vector of the CXL memory expander with the firsgt local", cxxopts::value>()->default_value("0,20,20,20"))( "f,frequency", "The frequency for the running thread", cxxopts::value()->default_value("4000"))( "l,latency", "The simulated latency by epoch based calculation for injected latency", cxxopts::value>()->default_value("100,150,100,150,100,150"))( - "w,weight", "The simulated weight for multiplying with the LLC miss", - cxxopts::value()->default_value("4.1"))( "b,bandwidth", "The simulated bandwidth by linear regression", - cxxopts::value>()->default_value("50,50,50,50,50,50")); + cxxopts::value>()->default_value("50,50,50,50,50,50"))( + "x,pmu_name", "The input for Collected PMU", + cxxopts::value>()->default_value( + "tatal_stall,all_dram_rds,l2stall,snoop_fw_wb,llcl_hits,llcl_miss,null,null"))( + "y,pmu_config1", "The config0 for Collected PMU", + cxxopts::value>()->default_value("0x04004a3,0x01b7,0x05005a3,0x205c,0x08d2,0x01d3,0,0"))( + "z,pmu_config2", "The config1 for Collected PMU", + cxxopts::value>()->default_value("0,0x63FC00491,0,0,0,0,0,0"))( + "w,weight", "The weight for Linear Regression", + cxxopts::value>()->default_value("88, 88, 88, 88, 88, 88, 88"))( + "v,weight_vec", "The weight vector for Linear Regression", + cxxopts::value>()->default_value("400, 800, 1200, 1600, 2000, 2400, 3000")); auto result = options.parse(argc, argv); if (result["help"].as()) { @@ 
-56,31 +62,51 @@ int main(int argc, char *argv[]) { auto cpuset = result["cpuset"].as>(); auto pebsperiod = result["pebsperiod"].as(); auto latency = result["latency"].as>(); - auto weight = result["weight"].as(); auto bandwidth = result["bandwidth"].as>(); auto frequency = result["frequency"].as(); auto topology = result["topology"].as(); auto capacity = result["capacity"].as>(); auto dramlatency = result["dramlatency"].as(); - auto mode = result["mode"].as() == "p" ? true : false; - Helper helper{}; - InterleavePolicy *policy = new InterleavePolicy(); + auto pmu_name = result["pmu_name"].as>(); + auto pmu_config1 = result["pmu_config1"].as>(); + auto pmu_config2 = result["pmu_config2"].as>(); + auto weight = result["weight"].as>(); + auto weight_vec = result["weight_vec"].as>(); + auto source = result["source"].as(); + enum page_type mode; + if (result["mode"].as() == "hugepage_2M") { + mode = page_type::HUGEPAGE_2M; + } else if (result["mode"].as() == "hugepage_1G") { + mode = page_type::HUGEPAGE_1G; + } else if (result["mode"].as() == "cacheline") { + mode = page_type::CACHELINE; + } else { + mode = page_type::PAGE; + } + + auto *policy = new InterleavePolicy(); CXLController *controller; + uint64_t use_cpus = 0; cpu_set_t use_cpuset; CPU_ZERO(&use_cpuset); - for (int i = 0; i < helper.cpu; i++) { + for (auto i : cpuset) { if (!use_cpus || use_cpus & 1UL << i) { CPU_SET(i, &use_cpuset); - LOG(DEBUG) << fmt::format("use cpuid: {}{}\n", i, use_cpus); /** TODO: set CAT here */ + LOG(DEBUG) << fmt::format("use cpuid: {}{}\n", i, use_cpus); } } + auto tnum = CPU_COUNT(&use_cpuset); auto cur_processes = 0; - auto ncpu = helper.cpu; - auto ncbo = helper.cbo; - LOG(DEBUG) << fmt::format("tnum:{}, intrval:{}, weight:{}\n", tnum, interval, weight); - for (auto const &[idx, value] : capacity | ranges::views::enumerate) { + auto ncpu = helper.num_of_cpu(); + auto ncha = helper.num_of_cha(); + LOG(DEBUG) << fmt::format("tnum:{}, intrval:{}\n", tnum, interval); + for (auto const &[idx, value] : weight | enumerate) { + LOG(DEBUG) << fmt::format("weight[{}]:{}\n", weight_vec[idx], value); + } + + for (auto const &[idx, value] : capacity | enumerate) { if (idx == 0) { LOG(DEBUG) << fmt::format("local_memory_region capacity:{}\n", value); controller = new CXLController(policy, capacity[0], mode, interval); @@ -101,67 +127,62 @@ int main(int argc, char *argv[]) { int sock; struct sockaddr_un addr {}; + /** Hove been got by socket if it's not main thread and synchro */ sock = socket(AF_UNIX, SOCK_DGRAM, 0); addr.sun_family = AF_UNIX; strcpy(addr.sun_path, SOCKET_PATH); remove(addr.sun_path); - if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) { // can be blocked for multi thread LOG(ERROR) << "Failed to execute. 
Can't bind to a socket.\n"; exit(1); } + + size_t sock_buf_size = sizeof(op_data) + 1; + char *sock_buf = (char *)malloc(sock_buf_size); + LOG(DEBUG) << fmt::format("cpu_freq:{}\n", frequency); - LOG(DEBUG) << fmt::format("num_of_cbo:{}\n", ncbo); + LOG(DEBUG) << fmt::format("num_of_cha:{}\n", ncha); LOG(DEBUG) << fmt::format("num_of_cpu:{}\n", ncpu); - Monitors monitors{tnum, &use_cpuset, static_cast(capacity.size()) - 1, helper}; + for (auto j : cpuset) { + helper.used_cpu.push_back(cpuset[j]); + helper.used_cha.push_back(cpuset[j]); + } + Monitors monitors{tnum, &use_cpuset}; - // https://stackoverflow.com/questions/24796266/tokenizing-a-string-to-pass-as-char-into-execve + /** Reinterpret the input for the argv argc */ char cmd_buf[1024] = {0}; strncpy(cmd_buf, target.c_str(), sizeof(cmd_buf)); - - /* This strtok_r() call puts '\0' after the first token in the buffer, - * It saves the state to the strtok_state and subsequent calls resume from that point. */ char *strtok_state = nullptr; char *filename = strtok_r(cmd_buf, " ", &strtok_state); - - /* Allocate an array of pointers. - * We will make them point to certain locations inside the cmd_buf. */ char *args[32] = {nullptr}; args[0] = filename; - /* loop the strtok_r() call while there are tokens and free space in the array */ size_t current_arg_idx; for (current_arg_idx = 1; current_arg_idx < 32; ++current_arg_idx) { - /* Note that the first argument to strtok_r() is nullptr. - * That means resume from a point saved in the strtok_state. */ char *current_arg = strtok_r(nullptr, " ", &strtok_state); if (current_arg == nullptr) { break; } - args[current_arg_idx] = current_arg; LOG(INFO) << fmt::format("args[{}] = {}\n", current_arg_idx, args[current_arg_idx]); } - /* zombie avoid */ + /** Create target process */ Helper::detach_children(); - /* create target process */ auto t_process = fork(); if (t_process < 0) { LOG(ERROR) << "Fork: failed to create target process"; exit(1); } else if (t_process == 0) { - execv(filename, args); - /* We do not need to check the return value */ + execv(filename, args); // taskset in lpace LOG(ERROR) << "Exec: failed to create target process\n"; exit(1); } - /** TODO: bind the rest of core in 0-7 and affine the CXL Simulator to 8 */ - // In case of process, use SIGSTOP. - auto res = monitors.enable(t_process, t_process, true, pebsperiod, tnum, mode); + /** In case of process, use SIGSTOP. */ + auto res = monitors.enable(t_process, t_process, true, pebsperiod, tnum); if (res == -1) { LOG(ERROR) << fmt::format("Failed to enable monitor\n"); exit(0); } else if (res < 0) { - // pid not found. might be already terminated. LOG(DEBUG) << fmt::format("pid({}) not found. might be already terminated.\n", t_process); } cur_processes++; @@ -174,53 +195,117 @@ int main(int argc, char *argv[]) { exit(0); } - // Wait all the target processes until emulation process initialized. + /** Wait all the target processes until emulation process initialized. 
*/ monitors.stop_all(cur_processes); - /* get CPU information */ + /** Get CPU information */ if (!get_cpu_info(&monitors.mon[0].before->cpuinfo)) { LOG(DEBUG) << "Failed to obtain CPU information.\n"; } - - /* check the CPU model */ - auto perf_config = helper.detect_model(monitors.mon[0].before->cpuinfo.cpu_model); - + auto perf_config = + helper.detect_model(monitors.mon[0].before->cpuinfo.cpu_model, pmu_name, pmu_config1, pmu_config2); PMUInfo pmu{t_process, &helper, &perf_config}; - /* Caculate epoch time */ + /*% Caculate epoch time */ struct timespec waittime {}; waittime.tv_sec = interval / 1000; waittime.tv_nsec = (interval % 1000) * 1000000; LOG(DEBUG) << "The target process starts running.\n"; LOG(DEBUG) << fmt::format("set nano sec = {}\n", waittime.tv_nsec); + LOG(TRACE) << fmt::format("{}\n", monitors); + monitors.print_flag = false; - /* read CBo params */ - for (auto mon : monitors.mon) { - for (auto const &[idx, value] : pmu.cbos | ranges::views::enumerate) { - pmu.cbos[idx].read_cbo_elems(&mon.before->cbos[idx]); + /* read CHA params */ + for (const auto &mon : monitors.mon) { + for (auto const &[idx, value] : pmu.chas | enumerate) { + pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]); } - for (auto const &[idx, value] : pmu.cpus | ranges::views::enumerate) { + for (auto const &[idx, value] : pmu.cpus | enumerate) { pmu.cpus[idx].read_cpu_elems(&mon.before->cpus[idx]); } } uint32_t diff_nsec = 0; - struct timespec start_ts, end_ts; - struct timespec sleep_start_ts, sleep_end_ts; + struct timespec start_ts { + }, end_ts{}; + struct timespec sleep_start_ts { + }, sleep_end_ts{}; - // Wait all the target processes until emulation process initialized. + /** Wait all the target processes until emulation process initialized. */ monitors.run_all(cur_processes); for (int i = 0; i < cur_processes; i++) { clock_gettime(CLOCK_MONOTONIC, &monitors.mon[i].start_exec_ts); } while (true) { + /** Get from the CXLMemSimHook */ + int n; + do { + memset(sock_buf, 0, sock_buf_size); + // without blocking + n = recv(sock, sock_buf, sock_buf_size, MSG_DONTWAIT); + if (n < 1) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + // no data + break; + } else { + LOG(ERROR) << "Failed to recv"; + exit(-1); + } + } else if (n >= sizeof(struct op_data) && n <= sock_buf_size - 1) { + auto *opd = (struct op_data *)sock_buf; + LOG(ERROR) << fmt::format("received data: size={}, tgid={}, tid=[], opcode={}\n", n, opd->tgid, + opd->tid, opd->opcode); + + if (opd->opcode == CXLMEMSIM_THREAD_CREATE || opd->opcode == CXLMEMSIM_PROCESS_CREATE) { + int t; + bool is_process = opd->opcode == CXLMEMSIM_PROCESS_CREATE; + // register to monitor + + t = monitors.enable(opd->tgid, opd->tid, is_process, pebsperiod, tnum); + if (t == -1) { + LOG(ERROR) << "Failed to enable monitor\n"; + } else if (t < 0) { + // tid not found. might be already terminated. + continue; + } + auto mon = monitors.mon[t]; + // Wait the t processes until emulation process initialized. + mon.stop(); + /* read CHA params */ + for (auto const &[idx, value] : pmu.chas | enumerate) { + pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]); + } + for (auto const &[idx, value] : pmu.chas | enumerate) { + pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]); + } + // Run the t processes. + mon.run(); + clock_gettime(CLOCK_MONOTONIC, &mon.start_exec_ts); + } else if (opd->opcode == CXLMEMSIM_THREAD_EXIT) { + // unregister from monitor, and display results. 
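The main loop drains the control socket without blocking: it keeps calling `recv` with `MSG_DONTWAIT` until `EAGAIN`/`EWOULDBLOCK`, then dispatches on the opcode carried in `op_data`. A minimal sketch of that drain, using a stand-in `op_data_example` struct that carries only the fields the loop actually inspects:

```cpp
// Sketch only: op_data_example is a stand-in for the patch's op_data.
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <sys/socket.h>

struct op_data_example { uint32_t tgid, tid, opcode; };

void drain_control_socket(int sock) {
    op_data_example opd{};
    for (;;) {
        ssize_t n = recv(sock, &opd, sizeof(opd), MSG_DONTWAIT);
        if (n < 0) {
            if (errno == EAGAIN || errno == EWOULDBLOCK)
                break; // queue drained, go back to the epoch timer
            std::perror("recv");
            break;
        }
        if (n < static_cast<ssize_t>(sizeof(opd)))
            continue; // ignore short datagrams
        std::printf("op %u from tgid=%u tid=%u\n", opd.opcode, opd.tgid, opd.tid);
    }
}
```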
+ // get the tid from the tgid + auto mon = monitors.get_mon(opd->tgid, opd->tid); + mon.stop(); + } else if (opd->opcode == CXLMEMSIM_STABLE_SIGNAL) { + for (auto const &[i, mon] : monitors.mon | enumerate) { + if (mon.status == MONITOR_ON) { + mon.stop(); + mon.status = MONITOR_SUSPEND; + } + } + } + + } else { + LOG(ERROR) << fmt::format("received data is invalid size: size={}", n); + } + } while (n > 0); // check the next message. + /* wait for pre-defined interval */ clock_gettime(CLOCK_MONOTONIC, &sleep_start_ts); /** Here was a definition for the multi process and thread to enable multiple monitor */ - struct timespec req = waittime; struct timespec rem = {0}; while (true) { @@ -242,53 +327,61 @@ int main(int argc, char *argv[]) { } } } - clock_gettime(CLOCK_MONOTONIC, &sleep_end_ts); - for (auto const &[i, mon] : monitors.mon | ranges::views::enumerate) { + uint64_t calibrated_delay; + for (auto const &[i, mon] : monitors.mon | enumerate) { + // check other process if (mon.status == MONITOR_DISABLE) { continue; } - if (mon.status == MONITOR_ON) { + if (mon.status == MONITOR_ON || mon.status == MONITOR_SUSPEND) { clock_gettime(CLOCK_MONOTONIC, &start_ts); LOG(DEBUG) << fmt::format("[{}:{}:{}] start_ts: {}.{}\n", i, mon.tgid, mon.tid, start_ts.tv_sec, start_ts.tv_nsec); mon.stop(); - /* read CBo values */ + /** Read CHA values */ uint64_t wb_cnt = 0; - for (int j = 0; j < ncbo; j++) { - pmu.cbos[j].read_cbo_elems(&mon.after->cbos[j]); - wb_cnt += mon.after->cbos[j].llc_wb - mon.before->cbos[j].llc_wb; + std::vector cha_vec, cpu_vec{}; + // for (int j = 0; j < ncha; j++) { + // pmu.chas[j].read_cha_elems(&mon.after->chas[j]); + // wb_cnt += mon.after->chas[j].cpu_llc_wb - mon.before->chas[j].cpu_llc_wb; + // } + // LOG(INFO) << fmt::format("[{}:{}:{}] LLC_WB = {}\n", i, mon.tgid, mon.tid, wb_cnt); + // } + for (int j = 0; j < helper.used_cha.size(); j++) { + for (auto const &[idx, value] : pmu.chas | enumerate) { + value.read_cha_elems(&mon.after->chas[j]); + cha_vec.emplace_back(mon.after->chas[j].cha[idx] - mon.before->chas[j].cha[idx]); + } } - LOG(INFO) << fmt::format("[{}:{}:{}] LLC_WB = {}\n", i, mon.tgid, mon.tid, wb_cnt); - - /* read CPU params */ + /*** read CPU params */ uint64_t read_config = 0; uint64_t target_l2stall = 0, target_llcmiss = 0, target_llchits = 0; - for (int j = 0; j < ncpu; ++j) { - pmu.cpus[j].read_cpu_elems(&mon.after->cpus[j]); - if (pmu.cpus[j].perf[4] != nullptr) { - for (auto &i : mon.after->cpus[j].cpu_munmap_address_length) { // delete by ebpf - LOG(DEBUG) << fmt::format("munmap address:{}, length:{}\n", i.first, i.second); - controller->delete_entry(i.first, i.second); - } - } - read_config += mon.after->cpus[j].cpu_bandwidth_read - mon.before->cpus[j].cpu_bandwidth_read; - } + // for (int j = 0; j < ncpu; ++j) { + // pmu.cpus[j].read_cpu_elems(&mon.after->cpus[j]); + // read_config += mon.after->cpus[j].cpu_bandwidth - mon.before->cpus[j].cpu_bandwidth; + // } /* read PEBS sample */ if (mon.pebs_ctx->read(controller, &mon.after->pebs) < 0) { LOG(ERROR) << fmt::format("[{}:{}:{}] Warning: Failed PEBS read\n", i, mon.tgid, mon.tid); } - target_llcmiss = mon.after->pebs.total - mon.before->pebs.total; + // target_llcmiss = mon.after->pebs.total - mon.before->pebs.total; // target_l2stall = // mon.after->cpus[mon.cpu_core].cpu_l2stall_t - mon.before->cpus[mon.cpu_core].cpu_l2stall_t; // target_llchits = // mon.after->cpus[mon.cpu_core].cpu_llcl_hits - mon.before->cpus[mon.cpu_core].cpu_llcl_hits; - for (auto const &[idx, value] : pmu.cpus | 
ranges::views::enumerate) { - target_l2stall += mon.after->cpus[idx].cpu_l2stall_t - mon.before->cpus[idx].cpu_l2stall_t; - target_llchits += mon.after->cpus[idx].cpu_llcl_hits - mon.before->cpus[idx].cpu_llcl_hits; + // for (auto const &[idx, value] : pmu.cpus | enumerate) { + // target_l2stall += mon.after->cpus[idx].cpu_l2stall_t - mon.before->cpus[idx].cpu_l2stall_t; + // target_llchits += mon.after->cpus[idx].cpu_llcl_hits - mon.before->cpus[idx].cpu_llcl_hits; + // } + for (int j = 0; j < helper.used_cpu.size(); j++) { + for (auto const &[idx, value] : pmu.cpus | enumerate) { + value.read_cpu_elems(&mon.after->cpus[j]); + // wb_cnt = mon.after->cpus[j].cpu[idx] - mon.before->cpus[j].cpu[idx]; + cpu_vec.emplace_back(mon.after->cpus[j].cpu[idx] - mon.before->cpus[j].cpu[idx]); + } } - uint64_t llcmiss_wb = 0; // To estimate the number of the writeback-involving LLC // misses of the CPU core (llcmiss_wb), the total number of @@ -297,10 +390,10 @@ int main(int argc, char *argv[]) { // the LLC misses of the CPU core (target_llcmiss) to that // of the LLC misses of all the CPU cores and the // prefetchers (cpus_dram_rds). - llcmiss_wb = wb_cnt * ((double)target_llcmiss / read_config); - + // llcmiss_wb = wb_cnt * std::lround(((double)target_llcmiss) / ((double)read_config)); + // TODO Calculate through the vector !!! target latency uint64_t llcmiss_ro = 0; - if (target_llcmiss < llcmiss_wb) { + if (target_llcmiss < llcmiss_wb) { // tunning LOG(ERROR) << fmt::format("[{}:{}:{}] cpus_dram_rds {}, llcmiss_wb {}, target_llcmiss {}\n", i, mon.tgid, mon.tid, read_config, llcmiss_wb, target_llcmiss); llcmiss_wb = target_llcmiss; @@ -311,23 +404,6 @@ int main(int argc, char *argv[]) { LOG(DEBUG) << fmt::format("[{}:{}:{}]llcmiss_wb={}, llcmiss_ro={}\n", i, mon.tgid, mon.tid, llcmiss_wb, llcmiss_ro); - uint64_t mastall_wb = 0; - uint64_t mastall_ro = 0; - // If both target_llchits and target_llcmiss are 0, it means that hit in L2. - // Stall by LLC misses is 0. 
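The write-back apportioning kept in this hunk follows the comment above: the CHA write-back count is split in proportion to this core's share of all DRAM reads, and the result is clamped to the core's own miss count. A sketch under those assumptions, with illustrative names; the inputs correspond to `wb_cnt`, `target_llcmiss`, and the all-cores DRAM-read count:

```cpp
// Sketch only: names are illustrative, not the patch's variables.
#include <algorithm>
#include <cstdint>

struct MissSplit { uint64_t writeback, readonly; };

MissSplit split_llc_misses(uint64_t wb_cnt, uint64_t target_llcmiss, uint64_t all_dram_reads) {
    uint64_t wb = 0;
    if (all_dram_reads != 0)
        wb = static_cast<uint64_t>(wb_cnt * (static_cast<double>(target_llcmiss) / all_dram_reads));
    wb = std::min(wb, target_llcmiss); // clamp, as the patch does when the ratio overshoots
    return {wb, target_llcmiss - wb};
}
```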
- mastall_wb = (double)(target_l2stall / frequency) * - ((double)(weight * llcmiss_wb) / (double)(target_llchits + (weight * target_llcmiss))) * - 1000; - mastall_ro = (double)(target_l2stall / frequency) * - ((double)(weight * llcmiss_ro) / (double)(target_llchits + (weight * target_llcmiss))) * - 1000; - LOG(DEBUG) << fmt::format( - "l2stall={}, mastall_wb={}, mastall_ro={}, target_llchits={}, target_llcmiss={}, weight={}\n", - target_l2stall, mastall_wb, mastall_ro, target_llchits, target_llcmiss, weight); - - auto ma_wb = (double)mastall_wb / dramlatency; - auto ma_ro = (double)mastall_ro / dramlatency; - uint64_t emul_delay = 0; LOG(DEBUG) << fmt::format("[{}:{}:{}] pebs: total={}, \n", i, mon.tgid, mon.tid, mon.after->pebs.total); @@ -338,65 +414,53 @@ int main(int argc, char *argv[]) { LatencyPass lat_pass = { .all_access = all_access, .dramlatency = dramlatency, - .ma_ro = ma_ro, - .ma_wb = ma_wb, + .readonly = llcmiss_ro, + .writeback = llcmiss_wb, }; BandwidthPass bw_pass = { .all_access = all_access, .read_config = read_config, .write_config = read_config, }; - emul_delay += controller->calculate_latency(lat_pass); + emul_delay += std::lround(controller->calculate_latency(lat_pass)); emul_delay += controller->calculate_bandwidth(bw_pass); emul_delay += std::get<0>(controller->calculate_congestion()); mon.before->pebs.total = mon.after->pebs.total; - LOG(DEBUG) << fmt::format("ma_wb={}, ma_ro={}, delay={}\n", ma_wb, ma_ro, emul_delay); + LOG(DEBUG) << fmt::format("delay={}\n", emul_delay); /* compensation of delay END(1) */ clock_gettime(CLOCK_MONOTONIC, &end_ts); diff_nsec += (end_ts.tv_sec - start_ts.tv_sec) * 1000000000 + (end_ts.tv_nsec - start_ts.tv_nsec); LOG(DEBUG) << fmt::format("dif:{}\n", diff_nsec); - uint64_t calibrated_delay = (diff_nsec > emul_delay) ? 0 : emul_delay - diff_nsec; - // uint64_t calibrated_delay = emul_delay; + calibrated_delay = (diff_nsec > emul_delay) ? 
0 : emul_delay - diff_nsec; mon.total_delay += (double)calibrated_delay / 1000000000; diff_nsec = 0; /* insert emulated NVM latency */ - mon.injected_delay.tv_sec += (calibrated_delay / 1000000000); - mon.injected_delay.tv_nsec += (calibrated_delay % 1000000000); + mon.injected_delay.tv_sec += std::lround(calibrated_delay / 1000000000); + mon.injected_delay.tv_nsec += std::lround(calibrated_delay % 1000000000); LOG(DEBUG) << fmt::format("[{}:{}:{}]delay:{} , total delay:{}\n", i, mon.tgid, mon.tid, calibrated_delay, mon.total_delay); - auto swap = mon.before; - mon.before = mon.after; - mon.after = swap; - /* continue suspended processes: send SIGCONT */ - // unfreeze_counters_cbo_all(fds.msr[0]); - // start_pmc(&fds, i); - if (calibrated_delay == 0) { - mon.clear_time(&mon.wasted_delay); - mon.clear_time(&mon.injected_delay); - mon.run(); - } } else if (mon.status == MONITOR_OFF) { // Wasted epoch time clock_gettime(CLOCK_MONOTONIC, &start_ts); uint64_t sleep_diff = (sleep_end_ts.tv_sec - sleep_start_ts.tv_sec) * 1000000000 + (sleep_end_ts.tv_nsec - sleep_start_ts.tv_nsec); - struct timespec sleep_time; - sleep_time.tv_sec = sleep_diff / 1000000000; - sleep_time.tv_nsec = sleep_diff % 1000000000; + struct timespec sleep_time {}; + sleep_time.tv_sec = std::lround(sleep_diff / 1000000000); + sleep_time.tv_nsec = std::lround(sleep_diff % 1000000000); mon.wasted_delay.tv_sec += sleep_time.tv_sec; mon.wasted_delay.tv_nsec += sleep_time.tv_nsec; LOG(DEBUG) << fmt::format("[{}:{}:{}][OFF] total: {}| wasted : {}| waittime : {}| squabble : {}\n", i, mon.tgid, mon.tid, mon.injected_delay.tv_nsec, mon.wasted_delay.tv_nsec, waittime.tv_nsec, mon.squabble_delay.tv_nsec); if (monitors.check_continue(i, sleep_time)) { - mon.clear_time(&mon.wasted_delay); - mon.clear_time(&mon.injected_delay); + Monitor::clear_time(&mon.wasted_delay); + Monitor::clear_time(&mon.injected_delay); mon.run(); } clock_gettime(CLOCK_MONOTONIC, &end_ts); @@ -412,16 +476,33 @@ int main(int argc, char *argv[]) { LOG(DEBUG) << fmt::format("[SQ]total: {}| wasted : {}| waittime : {}| squabble : {}\n", mon.injected_delay.tv_nsec, mon.wasted_delay.tv_nsec, waittime.tv_nsec, mon.squabble_delay.tv_nsec); - mon.clear_time(&mon.wasted_delay); - mon.clear_time(&mon.injected_delay); + Monitor::clear_time(&mon.wasted_delay); + Monitor::clear_time(&mon.injected_delay); mon.run(); } else { mon.injected_delay.tv_nsec += mon.squabble_delay.tv_nsec; - mon.clear_time(&mon.squabble_delay); + Monitor::clear_time(&mon.squabble_delay); } } } } // End for-loop for all target processes + LOG(TRACE) << fmt::format("{}\n", monitors); + for (auto mon : monitors.mon) { + if (mon.status == MONITOR_ON) { + auto swap = mon.before; + mon.before = mon.after; + mon.after = swap; + + /* continue suspended processes: send SIGCONT */ + // mon.unfreeze_counters_cha_all(fds.msr[0]); + // start_pmc(&fds, i); + if (calibrated_delay == 0) { + Monitor::clear_time(&mon.wasted_delay); + Monitor::clear_time(&mon.injected_delay); + mon.run(); + } + } + } if (monitors.check_all_terminated(tnum)) { break; } diff --git a/src/module.cc b/src/module.cc new file mode 100644 index 0000000..7ac58cd --- /dev/null +++ b/src/module.cc @@ -0,0 +1,207 @@ +// +// Created by victoryang00 on 11/9/23. 
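The new `module.cc` is an LD_PRELOAD interposer: it exports its own allocator and mmap entry points, resolves the real implementations with `dlsym(RTLD_NEXT, ...)`, signals the simulator, then forwards. A minimal sketch of that pattern for `malloc` only; `notify()` is a placeholder for the socket/int3 signalling the file uses:

```cpp
// Sketch only: notify() is a placeholder, not the patch's signalling code.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <dlfcn.h>
#include <cstddef>
#include <cstdio>

typedef void *(*malloc_fn)(std::size_t);
static malloc_fn real_malloc;

static void notify(std::size_t size) { std::fprintf(stderr, "malloc %zu\n", size); }

// extern "C" keeps the symbol unmangled so the dynamic linker resolves the
// application's malloc calls to this definition when the library is preloaded.
extern "C" void *malloc(std::size_t size) {
    if (!real_malloc)
        real_malloc = reinterpret_cast<malloc_fn>(dlsym(RTLD_NEXT, "malloc"));
    notify(size);
    return real_malloc(size);
}
```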
+// + +/** for thread creation and memory monitor */ +#include "sock.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#define CXLMEMSIM_EXPORT __attribute__((visibility("default"))) +#define CXLMEMSIM_CONSTRUCTOR(n) __attribute__((constructor((n)))) +#define CXLMEMSIM_CONSTRUCTOR_PRIORITY 102 + +typedef void *(*mmap_ptr_t)(void *, size_t, int, int, int, off_t); +typedef int (*munmap_ptr_t)(void *, size_t); +typedef void *(*malloc_ptr_t)(size_t); +typedef int (*calloc_ptr_t)(void *, size_t); +typedef void *(*realloc_ptr_t)(void *, size_t); +typedef int (*posix_memalign_ptr_t)(void **, size_t, size_t); +typedef void *(*aligned_alloc_ptr_t)(size_t, size_t); +typedef int (*free_ptr_t)(void *); +typedef int (*pthread_create_ptr_t)(pthread_t *, const pthread_attr_t *, void *(*)(void *), void *); +typedef int (*pthread_join_ptr_t)(pthread_t, void **); +typedef int (*pthread_detach_ptr_t)(pthread_t); +typedef size_t (*malloc_usable_size_ptr_t)(void *); +// typedef int (*mpi_send_t)(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm); + +typedef struct cxlmemsim_param { + int sock; + struct sockaddr_un addr; + mmap_ptr_t mmap; + munmap_ptr_t munmap; + malloc_ptr_t malloc; + calloc_ptr_t calloc; + realloc_ptr_t realloc; + posix_memalign_ptr_t posix_memalign; + aligned_alloc_ptr_t aligned_alloc; + free_ptr_t free; + pthread_create_ptr_t pthread_create; + pthread_join_ptr_t pthread_join; + pthread_detach_ptr_t pthread_detach; + malloc_usable_size_ptr_t malloc_usable_size; +} cxlmemsim_param_t; + +cxlmemsim_param_t param = {.sock = 0, + .addr = {}, + .mmap = nullptr, + .munmap = nullptr, + .malloc = nullptr, + .free = nullptr, + .pthread_create = nullptr, + .pthread_join = nullptr, + .pthread_detach = nullptr}; + +inline void call_socket_with_int3() { + const char *message = "hello"; + fprintf(stderr, "call_socket_with_int3\n"); + // sendback tid + + __asm__("int $0x3"); +} + +inline int init_mmap_ptr(void) { + if (param.mmap == nullptr) { + param.mmap = (mmap_ptr_t)dlsym(RTLD_NEXT, "mmap64"); + if (!param.mmap) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"mmap\")\n"); + return -1; + } + } + return 0; +} + +CXLMEMSIM_EXPORT +void *malloc(size_t size) { + call_socket_with_int3(); + fprintf(stderr, "malloc%ld\n", size); + return param.malloc(size); +} + +CXLMEMSIM_EXPORT +void *calloc(size_t num, size_t size) { + call_socket_with_int3(); + if (param.mmap == nullptr) { + return (void *)param.calloc; + } + + return param.malloc(num * size); +} + +CXLMEMSIM_EXPORT +void *realloc(void *ptr, size_t size) { + call_socket_with_int3(); + return param.realloc(ptr, size); +} + +CXLMEMSIM_EXPORT +int posix_memalign(void **memptr, size_t alignment, size_t size) { + call_socket_with_int3(); + return param.posix_memalign(memptr, alignment, size); +} + +CXLMEMSIM_EXPORT +void *aligned_alloc(size_t alignment, size_t size) { + call_socket_with_int3(); + return param.aligned_alloc(alignment, size); +} + +CXLMEMSIM_EXPORT +void free(void *ptr) { + call_socket_with_int3(); + if (ptr == (void *)param.calloc) { + return; + } + + param.free(ptr); +} + +CXLMEMSIM_EXPORT +void *mmap(void *start, size_t len, int prot, int flags, int fd, off_t off) { + call_socket_with_int3(); + void *ret = NULL; + int mmap_initialized = init_mmap_ptr(); + + if (mmap_initialized != 0) { + fprintf(stderr, "init_mmap_ptr() failed\n"); + return ret; + } + ret = param.mmap(start, len, prot, flags, fd, off); + + return ret; +} + +CXLMEMSIM_EXPORT +void *mmap64(void *start, size_t 
len, int prot, int flags, int fd, off_t off) { + call_socket_with_int3(); + return mmap(start, len, prot, flags, fd, off); +} + +CXLMEMSIM_EXPORT +size_t malloc_usable_size(void *ptr) { /* added for redis */ + call_socket_with_int3(); + return param.malloc_usable_size(ptr); +} + +CXLMEMSIM_CONSTRUCTOR(CXLMEMSIM_CONSTRUCTOR_PRIORITY) static void cxlmemsim_constructor() { + // save the original impl of mmap + + init_mmap_ptr(); + param.munmap = (munmap_ptr_t)dlsym(RTLD_NEXT, "munmap"); + if (!param.munmap) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"munmap\")\n"); + exit(-1); + } + param.malloc = (malloc_ptr_t)dlsym(RTLD_NEXT, "malloc"); + if (!param.malloc) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"malloc\")\n"); + exit(-1); + } + param.free = (free_ptr_t)dlsym(RTLD_NEXT, "free"); + if (!param.free) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"free\")\n"); + exit(-1); + } + param.calloc = (calloc_ptr_t)dlsym(RTLD_NEXT, "calloc"); + if (!param.calloc) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"calloc\")\n"); + exit(-1); + } + param.realloc = (realloc_ptr_t)dlsym(RTLD_NEXT, "realloc"); + if (!param.realloc) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"realloc\")\n"); + exit(-1); + } + param.pthread_create = (pthread_create_ptr_t)dlsym(RTLD_NEXT, "pthread_create"); + if (!param.pthread_create) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"pthread_create\")\n"); + exit(-1); + } + + param.pthread_detach = (pthread_detach_ptr_t)dlsym(RTLD_NEXT, "pthread_detach"); + if (!param.pthread_detach) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"pthread_detach\")\n"); + exit(-1); + } + + param.pthread_join = (pthread_join_ptr_t)dlsym(RTLD_NEXT, "pthread_join"); + if (!param.pthread_join) { + fprintf(stderr, "Error in dlsym(RTLD_NEXT,\"pthread_join\")\n"); + exit(-1); + } + param.sock = socket(AF_UNIX, SOCK_DGRAM, 0); + /** register the original impl */ + struct sockaddr_un addr {}; + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, SOCKET_PATH, sizeof(addr.sun_path) - 1); + fprintf(stderr, "start\n"); +} + +__attribute__((destructor)) static void cxlmemsim_destructor() { fprintf(stderr, "fini"); } diff --git a/src/monitor.cpp b/src/monitor.cpp index 11b4f7c..20d3992 100644 --- a/src/monitor.cpp +++ b/src/monitor.cpp @@ -3,15 +3,14 @@ // #include "monitor.h" -Monitors::Monitors(int tnum, cpu_set_t *use_cpuset, int nmem, Helper h) { - mon = std::vector(tnum, Monitor(nmem, h)); - /* init mon */ +Monitors::Monitors(int tnum, cpu_set_t *use_cpuset) : print_flag(true) { + mon = std::vector(tnum, Monitor()); + /** Init mon */ for (int i = 0; i < tnum; i++) { disable(i); - int cpucnt = 0; - int cpuid = 0; - for (cpuid = 0; cpuid < h.cpu; cpuid++) { - if (CPU_ISSET(cpuid, use_cpuset)) { + int cpucnt = 0, cpuid; + for (cpuid = 0; cpuid < helper.num_of_cpu(); cpuid++) { + if (!CPU_ISSET(cpuid, use_cpuset)) { if (i == cpucnt) { mon[i].cpu_core = cpuid; break; @@ -35,8 +34,15 @@ void Monitors::run_all(const int processes) { } } } +Monitor Monitors::get_mon(int tgid, int tid) { + for (auto &i : mon) { + if (i.tgid == tgid && i.tid == tid) { + return i; + } + } +} int Monitors::enable(const uint32_t tgid, const uint32_t tid, bool is_process, uint64_t pebs_sample_period, - const int32_t tnum, bool is_page) { + const int32_t tnum) { int target = -1; for (int i = 0; i < tnum; i++) { @@ -77,22 +83,22 @@ int Monitors::enable(const uint32_t tgid, const uint32_t tid, bool is_process, u disable(target); mon[target].status = MONITOR_ON; mon[target].tgid 
= tgid; - mon[target].tid = tid; + mon[target].tid = tid; // We can setup the process here mon[target].is_process = is_process; if (pebs_sample_period) { /* pebs start */ - mon[target].pebs_ctx = new PEBS(tgid, pebs_sample_period, is_page); + mon[target].pebs_ctx = new PEBS(tgid, pebs_sample_period); LOG(DEBUG) << fmt::format("{}Process [tgid={}, tid={}]: enable to pebs.\n", target, mon[target].tgid, - mon[target].tid); + mon[target].tid); // multiple tid multiple pid } - LOG(INFO) << fmt::format("========== Process {}[tgid={}, tid={}] monitoring start ==========\n", target, - mon[target].tgid, mon[target].tid); - return 0; + LOG(INFO) << fmt::format("pid {}[tgid={}, tid={}] monitoring start\n", target, mon[target].tgid, mon[target].tid); + + return target; } void Monitors::disable(const uint32_t target) { - mon[target].is_process = false; + mon[target].is_process = false; // Here to add the multi process. mon[target].status = MONITOR_DISABLE; mon[target].tgid = 0; mon[target].tid = 0; @@ -114,9 +120,9 @@ void Monitors::disable(const uint32_t target) { mon[target].pebs_ctx->mp = nullptr; mon[target].pebs_ctx->sample_period = 0; } - for (int j = 0; j < 2; j++) { - mon[target].elem[j].pebs.total = 0; - mon[target].elem[j].pebs.llcmiss = 0; + for (auto &j : mon[target].elem) { + j.pebs.total = 0; + j.pebs.llcmiss = 0; } } bool Monitors::check_all_terminated(const uint32_t processes) { @@ -181,8 +187,8 @@ bool Monitors::check_continue(const uint32_t target, const struct timespec w) { return false; } -void Monitor::stop() { - int ret = -1; +void Monitor::stop() { // thread create and proecess create get the pmu + int ret; if (this->is_process) { // In case of process, use SIGSTOP. @@ -211,6 +217,7 @@ void Monitor::stop() { LOG(DEBUG) << fmt::format("Process [{}:{}] is stopped.\n", this->tgid, this->tid); } } + void Monitor::run() { LOG(DEBUG) << fmt::format("Send SIGCONT to tid={}(tgid={})\n", this->tid, this->tgid); if (syscall(SYS_tgkill, this->tgid, this->tid, SIGCONT) == -1) { @@ -229,24 +236,19 @@ void Monitor::run() { LOG(DEBUG) << fmt::format("Process [{}:{}] is running.\n", this->tgid, this->tid); } } + void Monitor::clear_time(struct timespec *time) { time->tv_sec = 0; time->tv_nsec = 0; } -Monitor::Monitor(const int nmem, Helper h) + +Monitor::Monitor() // which one to hook : tgid(0), tid(0), cpu_core(0), status(0), injected_delay({0}), wasted_delay({0}), squabble_delay({0}), before(nullptr), after(nullptr), total_delay(0), start_exec_ts({0}), end_exec_ts({0}), is_process(false), pebs_ctx(nullptr) { + for (auto &j : this->elem) { - j.cpus = (struct CPUElem *)calloc(sizeof(struct CPUElem), h.cpu); - if (j.cpus == nullptr) { - LOG(ERROR) << "calloc"; - throw; - } - j.cbos = (struct CBOElem *)calloc(sizeof(struct CBOElem), h.cbo); - if (j.cbos == nullptr) { - LOG(ERROR) << "calloc"; - throw; - } + j.cpus = std::vector(helper.used_cpu.size()); + j.chas = std::vector(helper.used_cha.size()); } -} +} \ No newline at end of file diff --git a/src/pebs.cpp b/src/pebs.cpp index 20f5f6c..ffe699e 100644 --- a/src/pebs.cpp +++ b/src/pebs.cpp @@ -24,17 +24,17 @@ struct perf_sample { long perf_event_open(struct perf_event_attr *event_attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, event_attr, pid, cpu, group_fd, flags); } -PEBS::PEBS(pid_t pid, uint64_t sample_period, bool is_page) : pid(pid), sample_period(sample_period), is_page(is_page) { +PEBS::PEBS(pid_t pid, uint64_t sample_period) : pid(pid), sample_period(sample_period) { // Configure 
perf_event_attr struct struct perf_event_attr pe = { .type = PERF_TYPE_RAW, .size = sizeof(struct perf_event_attr), .config = 0x20d1, // mem_load_retired.l3_miss - .sample_period = 1, + .sample_period = sample_period, .sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR | PERF_SAMPLE_READ | PERF_SAMPLE_PHYS_ADDR, .read_format = PERF_FORMAT_TOTAL_TIME_ENABLED, .disabled = 1, // Event is initially disabled - .exclude_kernel = 0, + .exclude_kernel = 1, .precise_ip = 1, .config1 = 3, }; // excluding events that happen in the kernel-space @@ -50,7 +50,7 @@ PEBS::PEBS(pid_t pid, uint64_t sample_period, bool is_page) : pid(pid), sample_p } this->mplen = MMAP_SIZE; - this->mp = (perf_event_mmap_page *)mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, this->fd, 0); + this->mp = (perf_event_mmap_page *)mmap(nullptr, MMAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, this->fd, 0); if (this->mp == MAP_FAILED) { perror("mmap"); @@ -68,14 +68,13 @@ int PEBS::read(CXLController *controller, struct PEBSElem *elem) { return -1; int r = 0; - int i; struct perf_event_header *header; struct perf_sample *data; uint64_t last_head; char *dp = ((char *)mp) + PAGE_SIZE; do { - this->seq = mp->lock; + this->seq = mp->lock; // explicit copy barrier(); last_head = mp->data_head; while ((uint64_t)this->rdlen < last_head) { @@ -99,7 +98,7 @@ int PEBS::read(CXLController *controller, struct PEBSElem *elem) { data->value, data->timestamp); controller->insert(data->timestamp, data->phys_addr, data->addr, 0); elem->total++; - elem->llcmiss = data->value; + elem->llcmiss = data->value; // this is the number of llc miss } break; case PERF_RECORD_THROTTLE: @@ -122,7 +121,7 @@ int PEBS::read(CXLController *controller, struct PEBSElem *elem) { mp->data_tail = last_head; barrier(); } while (mp->lock != this->seq); - + return r; } int PEBS::start() { diff --git a/src/perf.cpp b/src/perf.cpp index c70c8ce..b8c81c5 100644 --- a/src/perf.cpp +++ b/src/perf.cpp @@ -4,359 +4,7 @@ #include "perf.h" #include "pebs.h" -#include -#define MAX_MAPS 32 -#define DEBUGFS "/sys/kernel/debug/tracing/" -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; - unsigned int inner_map_idx; -}; -struct bpf_map_data { - int fd; - char *name; - size_t elf_offset; - struct bpf_map_def def; -}; -static char license[128]; -static int kern_version; -static bool processed_sec[128]; -char bpf_log_buf[BPF_LOG_BUF_SIZE]; -int map_fd[MAX_MAPS]; -int prog_fd; -int event_fd; -int prog_array_fd = -1; -struct bpf_map_data map_data[MAX_MAPS]; -int map_data_count = 0; -static int cmp_symbols(const void *l, const void *r) { - const GElf_Sym *lsym = (const GElf_Sym *)l; - const GElf_Sym *rsym = (const GElf_Sym *)r; - - if (lsym->st_value < rsym->st_value) - return -1; - else if (lsym->st_value > rsym->st_value) - return 1; - else - return 0; -} -static int load_maps(struct bpf_map_data *maps, int nr_maps) { - int i; - - for (i = 0; i < nr_maps; i++) { - if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { - int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; - struct bpf_map_create_opts opt = { - .inner_map_fd = static_cast<__u32>(inner_map_fd), - .map_flags = maps[i].def.map_flags, - .numa_node = 0, - }; - - map_fd[i] = bpf_map_create(((enum bpf_map_type)maps[i].def.type), "my_map", maps[i].def.key_size, - maps[i].def.value_size, maps[i].def.max_entries, &opt); - } else { - struct bpf_map_create_opts opt = 
{ - .map_flags = maps[i].def.map_flags, - .numa_node = 0, - }; - map_fd[i] = bpf_map_create(((enum bpf_map_type)maps[i].def.type), "my_map", maps[i].def.key_size, - maps[i].def.value_size, maps[i].def.max_entries, &opt); - } - if (map_fd[i] < 0) { - LOG(ERROR) << fmt::format("failed to create a map: {} {}\n", errno, strerror(errno)); - return 1; - } - maps[i].fd = map_fd[i]; - - if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) - prog_array_fd = map_fd[i]; - } - return 0; -} - -static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, GElf_Shdr *shdr, struct bpf_insn *insn, - struct bpf_map_data *maps, int nr_maps) { - int i, nrels; - - nrels = shdr->sh_size / shdr->sh_entsize; - - for (i = 0; i < nrels; i++) { - GElf_Sym sym; - GElf_Rel rel; - unsigned int insn_idx; - bool match = false; - int map_idx; - - gelf_getrel(data, i, &rel); - - insn_idx = rel.r_offset / sizeof(struct bpf_insn); - - gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); - - if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { - LOG(ERROR) << fmt::format("invalid relo for insn[{}].code 0x%x\n", insn_idx, insn[insn_idx].code); - return 1; - } - insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; - - /* Match FD relocation against recorded map_data[] offset */ - for (map_idx = 0; map_idx < nr_maps; map_idx++) { - if (maps[map_idx].elf_offset == sym.st_value) { - match = true; - break; - } - } - if (match) { - insn[insn_idx].imm = maps[map_idx].fd; - } else { - LOG(ERROR) << fmt::format("invalid relo for insn[{}] no map_data match\n", insn_idx); - return 1; - } - } - - return 0; -} - -static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, GElf_Shdr *shdr, Elf_Data **data) { - Elf_Scn *scn; - - scn = elf_getscn(elf, i); - if (!scn) - return 1; - - if (gelf_getshdr(scn, shdr) != shdr) - return 2; - - *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); - if (!*shname || !shdr->sh_size) - return 3; - - *data = elf_getdata(scn, 0); - if (!*data || elf_getdata(scn, *data) != nullptr) - return 4; - - return 0; -} - -static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, Elf *elf, Elf_Data *symbols, - int strtabidx) { - int map_sz_elf, map_sz_copy; - bool validate_zero = false; - Elf_Data *data_maps; - int i, nr_maps; - GElf_Sym *sym; - Elf_Scn *scn; - int copy_sz; - - if (maps_shndx < 0) - return -EINVAL; - if (!symbols) - return -EINVAL; - - /* Get data for maps section via elf index */ - scn = elf_getscn(elf, maps_shndx); - if (scn) - data_maps = elf_getdata(scn, NULL); - if (!scn || !data_maps) { - printf("Failed to get Elf_Data from maps section {}\n", maps_shndx); - return -EINVAL; - } - - /* For each map get corrosponding symbol table entry */ - sym = static_cast(calloc(MAX_MAPS + 1, sizeof(GElf_Sym))); - for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { - if (!gelf_getsym(symbols, i, &sym[nr_maps])) - continue; - if (sym[nr_maps].st_shndx != maps_shndx) - continue; - /* Only increment iif maps section */ - nr_maps++; - } - - /* Align to map_fd[] order, via sort on offset in sym.st_value */ - qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); - - map_sz_elf = data_maps->d_size / nr_maps; - map_sz_copy = sizeof(struct bpf_map_def); - if (map_sz_elf < map_sz_copy) { - /* - * Backward compat, loading older ELF file with - * smaller struct, keeping remaining bytes zero. - */ - map_sz_copy = map_sz_elf; - } else if (map_sz_elf > map_sz_copy) { - /* - * Forward compat, loading newer ELF file with larger - * struct with unknown features. 
Assume zero means - * feature not used. Thus, validate rest of struct - * data is zero. - */ - validate_zero = true; - } - - /* Memcpy relevant part of ELF maps data to loader maps */ - for (i = 0; i < nr_maps; i++) { - unsigned char *addr, *end; - struct bpf_map_def *def; - const char *map_name; - size_t offset; - - map_name = elf_strptr(elf, strtabidx, sym[i].st_name); - maps[i].name = strdup(map_name); - if (!maps[i].name) { - printf("strdup({}): {}({})\n", map_name, strerror(errno), errno); - free(sym); - return -errno; - } - - /* Symbol value is offset into ELF maps section data area */ - offset = sym[i].st_value; - def = (struct bpf_map_def *)(((long)data_maps->d_buf) + offset); - maps[i].elf_offset = offset; - memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); - memcpy(&maps[i].def, def, map_sz_copy); - - /* Verify no newer features were requested */ - if (validate_zero) { - addr = (unsigned char *)def + map_sz_copy; - end = (unsigned char *)def + map_sz_elf; - for (; addr < end; addr++) { - if (*addr != 0) { - free(sym); - return -EFBIG; - } - } - } - } - - free(sym); - return nr_maps; -} - -static perf_event_attr load_and_attach(const char *event, struct bpf_insn *prog, int size, int pid, int cpu) { - bool is_socket = strncmp(event, "socket", 6) == 0; - bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; - bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; - bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; - bool is_xdp = strncmp(event, "xdp", 3) == 0; - bool is_perf_event = strncmp(event, "perf_event", 10) == 0; - bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; - bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; - size_t insns_cnt = size / sizeof(struct bpf_insn); - enum bpf_prog_type prog_type; - char buf[256]; - int fd, efd, err, id; - struct perf_event_attr attr = {}; - - attr.type = PERF_TYPE_TRACEPOINT; - attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; - - if (is_socket) { - prog_type = BPF_PROG_TYPE_SOCKET_FILTER; - } else if (is_kprobe || is_kretprobe) { - prog_type = BPF_PROG_TYPE_KPROBE; - } else if (is_tracepoint) { - prog_type = BPF_PROG_TYPE_TRACEPOINT; - } else if (is_xdp) { - prog_type = BPF_PROG_TYPE_XDP; - } else if (is_perf_event) { - prog_type = BPF_PROG_TYPE_PERF_EVENT; - } else if (is_cgroup_skb) { - prog_type = BPF_PROG_TYPE_CGROUP_SKB; - } else if (is_cgroup_sk) { - prog_type = BPF_PROG_TYPE_CGROUP_SOCK; - } else { - LOG(ERROR) << fmt::format("Unknown event '{}'\n", event); - throw; - } - - fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, bpf_log_buf, BPF_LOG_BUF_SIZE); - if (fd < 0) { - LOG(ERROR) << fmt::format("bpf_load_program() err={}\n{}", errno, bpf_log_buf); - throw; - } - - prog_fd = fd; - - if (is_kprobe || is_kretprobe) { - if (is_kprobe) - event += 7; - else - event += 10; - - if (*event == 0) { - LOG(ERROR) << fmt::format("event name cannot be empty\n"); - throw; - } - - snprintf(buf, sizeof(buf), "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", is_kprobe ? 'p' : 'r', - event, event); - err = system(buf); - LOG(INFO) << fmt::format("echo '{}:{} {}' >> /sys/kernel/debug/tracing/kprobe_events", is_kprobe ? 
'p' : 'r', - event, event); - if (err < 0) { - LOG(ERROR) << fmt::format("failed to create kprobe '{}' error '{}'\n", event, strerror(errno)); - } - - strcpy(buf, DEBUGFS); - strcat(buf, "events/kprobes/"); - strcat(buf, event); - strcat(buf, "/id"); - } else if (is_tracepoint) { - event += 11; - - if (*event == 0) { - LOG(ERROR) << fmt::format("event name cannot be empty\n"); - throw; - } - strcpy(buf, DEBUGFS); - strcat(buf, "events/"); - strcat(buf, event); - strcat(buf, "/id"); - } - - efd = open(buf, O_RDONLY, 0); - if (efd < 0) { - LOG(ERROR) << fmt::format("failed to open event {}\n", event); - throw; - } - - err = read(efd, buf, sizeof(buf)); - if (err < 0 || err >= sizeof(buf)) { - LOG(ERROR) << fmt::format("read from '{}' failed '{}'\n", event, strerror(errno)); - throw; - } - - close(efd); - - buf[err] = 0; - id = atoi(buf); - attr.config = id; - - efd = perf_event_open(&attr, pid, cpu, -1, 0); - if (efd < 0) { - LOG(ERROR) << fmt::format("event {} fd {} err {}\n", id, efd, strerror(errno)); - throw; - } - event_fd = efd; - ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); - ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); - return attr; -} - -// PerfInfo::PerfInfo() { -// this->fd = perf_event_open(&this->attr, this->pid, this->cpu, this->group_fd, this->flags); -// if (this->fd == -1) { -// LOG(ERROR) << "perf_event_open"; -// throw; -// } -// ioctl(this->fd, PERF_EVENT_IOC_RESET, 0); -// } PerfInfo::PerfInfo(int group_fd, int cpu, pid_t pid, unsigned long flags, struct perf_event_attr attr) : group_fd(group_fd), cpu(cpu), pid(pid), flags(flags), attr(attr) { this->fd = perf_event_open(&this->attr, this->pid, this->cpu, this->group_fd, this->flags); @@ -366,13 +14,7 @@ PerfInfo::PerfInfo(int group_fd, int cpu, pid_t pid, unsigned long flags, struct } ioctl(this->fd, PERF_EVENT_IOC_RESET, 0); } -PerfInfo::PerfInfo(int fd, int group_fd, int cpu, pid_t pid, unsigned long flags, struct perf_event_attr attr) - : fd(fd), group_fd(group_fd), cpu(cpu), pid(pid), flags(flags), attr(attr) { - this->map = new ThreadSafeMap(); - this->j = std::jthread{[&] { write_trace_to_map(map); }}; -} PerfInfo::~PerfInfo() { - this->j.join(); if (this->fd != -1) { close(this->fd); this->fd = -1; @@ -384,212 +26,53 @@ PerfInfo::~PerfInfo() { * This can be avoided by executing nanosleep with 0. 
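The `init_incore_perf()` kept just below in this hunk reduces to filling a raw `perf_event_attr` (config/config1 come from the per-model tables or the new `-x/-y/-z` command-line vectors) and calling `perf_event_open` pinned to one CPU; the literal `flags = 0x08` it passes is `PERF_FLAG_FD_CLOEXEC`. A sketch with placeholder helper names and example values:

```cpp
// Sketch only: open_raw_event()/perf_open() are placeholder names.
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <cstdint>

static long perf_open(perf_event_attr *attr, pid_t pid, int cpu) {
    return syscall(__NR_perf_event_open, attr, pid, cpu, /*group_fd=*/-1,
                   /*flags=*/PERF_FLAG_FD_CLOEXEC);
}

int open_raw_event(int cpu, uint64_t config, uint64_t config1) {
    perf_event_attr attr{};
    attr.type = PERF_TYPE_RAW;
    attr.size = sizeof(attr);
    attr.config = config;   // e.g. 0x01b7 for an OCR.* offcore event
    attr.config1 = config1; // offcore-response MSR value, if the event needs one
    attr.disabled = 1;      // armed later with PERF_EVENT_IOC_ENABLE
    attr.inherit = 1;
    // pid = -1 with an explicit cpu counts everything that runs on that CPU.
    return static_cast<int>(perf_open(&attr, /*pid=*/-1, cpu));
}
```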
 */
 ssize_t PerfInfo::read_pmu(uint64_t *value) {
-    struct timespec zero = {0};
-    nanosleep(&zero, nullptr);
     ssize_t r = read(this->fd, value, sizeof(*value));
     if (r < 0) {
-        LOG(ERROR) << "read";
+        LOG(ERROR) << "read\n";
     }
     return r;
 }
 int PerfInfo::start() {
     if (ioctl(this->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
-        LOG(ERROR) << "ioctl";
+        LOG(ERROR) << "ioctl\n";
         return -1;
     }
     return 0;
 }
 int PerfInfo::stop() {
     if (ioctl(this->fd, PERF_EVENT_IOC_DISABLE, 0) < 0) {
-        LOG(ERROR) << "ioctl";
+        LOG(ERROR) << "ioctl\n";
         return -1;
     }
     return 0;
 }
-std::map PerfInfo::read_trace_pipe() {
-    auto traces = map->get();
-    std::map addr_map;
-    for (auto r : traces) {
-        std::cout << r.first << " " << std::get<0>(r.second) << " " << std::get<1>(r.second) << std::endl;
-        // address, length, time -> address, length no lazyaccess
-        addr_map[r.first] = std::get<0>(r.second);
-    }
-    map->reset();
-    return addr_map;
-}
 PerfInfo *init_incore_perf(const pid_t pid, const int cpu, uint64_t conf, uint64_t conf1) {
-    int r, n_pid, n_cpu, group_fd, flags;
+    int n_pid, n_cpu, group_fd, flags;
     struct perf_event_attr attr {
         .type = PERF_TYPE_RAW, .size = sizeof(attr), .config = conf, .disabled = 1, .inherit = 1, .config1 = conf1,
         .clockid = 0
     };
-    if ((0 <= cpu) && (cpu < Helper::num_of_cpu())) {
-        n_pid = -1;
-        n_cpu = cpu;
-    } else {
-        n_pid = pid;
-        n_cpu = -1;
-    }
+    n_pid = -1;
+    n_cpu = cpu;
     group_fd = -1;
     flags = 0x08;
-    return new PerfInfo(group_fd, n_cpu, n_pid, static_cast<unsigned long>(flags), attr);
+    return new PerfInfo{group_fd, n_cpu, n_pid, static_cast<unsigned long>(flags), attr};
 }
-PerfInfo *init_incore_bpf_perf(const pid_t pid, const int cpu) {
-    int fd, i, ret, maps_shndx = -1, strtabidx = -1;
-    struct perf_event_attr attr {};
-    Elf *elf;
-    GElf_Ehdr ehdr;
-    GElf_Shdr shdr, shdr_prog;
-    Elf_Data *data, *data_prog, *data_maps = nullptr, *symbols = nullptr;
-    char *shname, *shname_prog;
-    int nr_maps = 0;
-
-    /* reset global variables */
-    kern_version = 0;
-    memset(license, 0, sizeof(license));
-    memset(processed_sec, 0, sizeof(processed_sec));
-
-    if (elf_version(EV_CURRENT) == EV_NONE)
-        throw;
-
-    fd = open("./collectmmap.o", O_RDONLY, 0);
-    if (fd < 0)
-        throw;
-
-    elf = elf_begin(fd, ELF_C_READ, nullptr);
-
-    if (!elf)
-        throw;
-
-    if (gelf_getehdr(elf, &ehdr) != &ehdr)
-        throw;
-
-    /* clear all kprobes */
-    i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events");
-
-    /* scan over all elf sections to get license and map info */
-    for (i = 1; i < ehdr.e_shnum; i++) {
-
-        if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
-            continue;
-
-        if (strcmp(shname, "license") == 0) {
-            processed_sec[i] = true;
-            memcpy(license, data->d_buf, data->d_size);
-        } else if (strcmp(shname, "version") == 0) {
-            processed_sec[i] = true;
-            if (data->d_size != sizeof(int)) {
-                LOG(ERROR) << fmt::format("invalid size of version section %zd\n", data->d_size);
-                throw;
-            }
-            memcpy(&kern_version, data->d_buf, sizeof(int));
-        } else if (strcmp(shname, "maps") == 0) {
-            int j;
-
-            maps_shndx = i;
-            data_maps = data;
-            for (j = 0; j < MAX_MAPS; j++)
-                map_data[j].fd = -1;
-        } else if (shdr.sh_type == SHT_SYMTAB) {
-            strtabidx = shdr.sh_link;
-            symbols = data;
-        }
-    }
-
-    ret = 1;
-
-    if (!symbols) {
-        LOG(ERROR) << fmt::format("missing SHT_SYMTAB section\n");
-        throw;
-    }
-
-    if (data_maps) {
-        nr_maps = load_elf_maps_section(map_data, maps_shndx, elf, symbols, strtabidx);
-        if (nr_maps < 0) {
-            LOG(ERROR) << fmt::format("Error: Failed loading ELF maps (errno:{}):{}\n", nr_maps, strerror(-nr_maps));
-            ret = 1;
-            throw;
-        }
-        if (load_maps(map_data, nr_maps))
-            throw;
-        map_data_count = nr_maps;
-
-        processed_sec[maps_shndx] = true;
-    }
-
-    /* load programs that need map fixup (relocations) */
-    for (i = 1; i < ehdr.e_shnum; i++) {
-        if (processed_sec[i])
-            continue;
-
-        if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
-            continue;
-        if (shdr.sh_type == SHT_REL) {
-            struct bpf_insn *insns;
-
-            if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, &shdr_prog, &data_prog))
-                continue;
-
-            if (shdr_prog.sh_type != SHT_PROGBITS || !(shdr_prog.sh_flags & SHF_EXECINSTR))
-                continue;
-
-            insns = (struct bpf_insn *)data_prog->d_buf;
-
-            processed_sec[shdr.sh_info] = true;
-            processed_sec[i] = true;
-
-            if (parse_relo_and_apply(data, symbols, &shdr, insns, map_data, nr_maps))
-                continue;
-
-            if (memcmp(shname_prog, "kprobe/", 7) == 0 || memcmp(shname_prog, "kretprobe/", 10) == 0 ||
-                memcmp(shname_prog, "tracepoint/", 11) == 0 || memcmp(shname_prog, "xdp", 3) == 0 ||
-                memcmp(shname_prog, "perf_event", 10) == 0 || memcmp(shname_prog, "socket", 6) == 0 ||
-                memcmp(shname_prog, "cgroup/", 7) == 0)
-                attr = load_and_attach(shname_prog, insns, data_prog->d_size, pid, cpu);
-            return new PerfInfo(event_fd, -1, cpu, pid, 0, attr);
-        }
-    }
-
-    /* load programs that don't use maps */
-    for (i = 1; i < ehdr.e_shnum; i++) {
-
-        if (processed_sec[i])
-            continue;
-
-        if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
-            continue;
-
-        if (memcmp(shname, "kprobe/", 7) == 0 || memcmp(shname, "kretprobe/", 10) == 0 ||
-            memcmp(shname, "tracepoint/", 11) == 0 || memcmp(shname, "xdp", 3) == 0 ||
-            memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 ||
-            memcmp(shname, "cgroup/", 7) == 0)
-            attr = load_and_attach(shname, (struct bpf_insn *)data->d_buf, data->d_size, pid, cpu);
-        return new PerfInfo(event_fd, -1, cpu, pid, 0, attr);
-    }
-
-    return nullptr;
-}
+PerfInfo *init_uncore_perf(const pid_t pid, const int cpu, uint64_t conf, uint64_t conf1, int value) {
+    int group_fd = -1;
+    auto attr = perf_event_attr{
+        .type = (uint32_t)value,
+        .size = sizeof(struct perf_event_attr),
+        .config = conf,
+        .disabled = 1,
+        .inherit = 1,
+        .enable_on_exec = 1,
+        .config1 = conf1,
+    };
 
-void write_trace_to_map(ThreadSafeMap *map) {
-    std::ifstream fp(DEBUGFS "trace_pipe");
-    int i;
-    unsigned long size;
-    unsigned long address;
-    unsigned long long time;
-    std::string line;
-    while (std::getline(fp, line)) {
-        if (line.size() > 50 && line.contains("ls")) {
-            i = std::sscanf(line.substr(51, 57).c_str(), "bpf_trace_printk: munmap %lu %lu %llu", &size, &address,
-                            &time);
-            std::cout << line.substr(51, 57).c_str() << " " << i << std::endl;
-            if (i > 1) {
-                map->insert(address, size, time);
-                std::cout << address << " " << size << " " << time << std::endl;
-            }
-        }
-    }
+    return new PerfInfo{group_fd, cpu, pid, 0, attr};
 }
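After this change, perf.cpp keeps only the thin perf_event_open() wrappers. The sketch below is illustrative usage, not part of the patch: the header name `perf.h`, the raw event encoding `0x412e`, and the choice of CPU 0 are assumptions; the real config values come from PerfConfig/helper detection.

```cpp
// Illustrative only: drive one core counter with the refactored helpers.
#include <sys/types.h>
#include <cstdint>
#include <cstdio>

#include "perf.h" // assumed header declaring PerfInfo and init_incore_perf

void sample_one_counter(pid_t pid) {
    // conf/conf1 are raw PMU encodings normally taken from PerfConfig; 0x412e is a placeholder.
    PerfInfo *perf = init_incore_perf(pid, /*cpu=*/0, /*conf=*/0x412e, /*conf1=*/0);
    uint64_t value = 0;
    perf->start(); // PERF_EVENT_IOC_ENABLE
    /* ... run the code being measured ... */
    perf->stop(); // PERF_EVENT_IOC_DISABLE
    if (perf->read_pmu(&value) >= 0)
        std::printf("counter=%llu\n", static_cast<unsigned long long>(value));
    delete perf; // ~PerfInfo() closes the perf fd
}
```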
diff --git a/src/policy.cpp b/src/policy.cpp
index 4139be5..b1ca4b1 100644
--- a/src/policy.cpp
+++ b/src/policy.cpp
@@ -4,11 +4,26 @@
 #include "policy.h"
 #include
 
-Policy::Policy() {}
-InterleavePolicy::InterleavePolicy() {}
+// TODO:
+AllocationPolicy::AllocationPolicy() = default;
+InterleavePolicy::InterleavePolicy() = default;
 // If the number is -1 for local, else it is the index of the remote server
 int InterleavePolicy::compute_once(CXLController *controller) {
-    auto per_size = controller->is_page ? 4096 : 64;
+    int per_size;
+    switch (controller->page_type_) {
+    case CACHELINE:
+        per_size = 64;
+        break;
+    case PAGE:
+        per_size = 4096;
+        break;
+    case HUGEPAGE_2M:
+        per_size = 2 * 1024 * 1024;
+        break;
+    case HUGEPAGE_1G:
+        per_size = 1024 * 1024 * 1024;
+        break;
+    };
     if (controller->occupation.size() * per_size / 1024 / 1024 < controller->capacity * 0.9) {
         return -1;
     } else {
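The new page_type_ switch feeds the same occupancy test as before: the policy keeps allocating locally until the tracked occupation reaches 90% of the emulated capacity. A standalone restatement of that check is sketched below; it mirrors, but is not taken from, InterleavePolicy::compute_once(), and it assumes (as the division by 1024/1024 suggests) that capacity is expressed in MiB.

```cpp
// Sketch of the occupancy check used by InterleavePolicy::compute_once() (capacity assumed to be in MiB).
#include <cstddef>

inline bool fits_locally(std::size_t tracked_entries, std::size_t per_size_bytes, double capacity_mib) {
    const double used_mib = static_cast<double>(tracked_entries) * per_size_bytes / 1024.0 / 1024.0;
    return used_mib < capacity_mib * 0.9; // below 90% of local capacity -> allocate locally (-1)
}
```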
diff --git a/src/sock.cc b/src/sock.cc
new file mode 100644
index 0000000..54fc44b
--- /dev/null
+++ b/src/sock.cc
@@ -0,0 +1,121 @@
+#include "sock.h"
+#include "cxlendpoint.h"
+#include "helper.h"
+#include "monitor.h"
+#include "policy.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+Helper helper{};
+int main() {
+    // auto tnum = 1;
+    // auto pebsperiod = 1000000;
+    // std::vector cpuset = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    // std::vector pmu_name = {"1","2","3","4","5","6","7","8"};
+    // std::vector pmu_config1 = {0, 1, 2, 3, 4, 5, 6, 7};
+    // std::vector pmu_config2 = {0, 1, 2, 3, 4, 5, 6, 7};
+    // uint64_t use_cpus = 0;
+    // cpu_set_t use_cpuset;
+    // CPU_ZERO(&use_cpuset);
+    // for (auto i : cpuset) {
+    //     if (!use_cpus || use_cpus & 1UL << i) {
+    //         CPU_SET(i, &use_cpuset);
+    //         LOG(DEBUG) << fmt::format("use cpuid: {}{}\n", i, use_cpus);
+    //     }
+    // }
+    auto sock = socket(AF_UNIX, SOCK_DGRAM, 0);
+    struct sockaddr_un addr {};
+
+    addr.sun_family = AF_UNIX;
+    strcpy(addr.sun_path, SOCKET_PATH);
+    remove(addr.sun_path);
+    if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) { // can be blocked for multi thread
+        LOG(ERROR) << "Failed to execute. Can't bind to a socket.\n";
+        exit(1);
+    }
+
+    size_t sock_buf_size = sizeof(op_data) + 1;
+    char *sock_buf = (char *)malloc(sock_buf_size);
+
+    // Monitors monitors{tnum, &use_cpuset};
+    // auto perf_config =
+    //     helper.detect_model(monitors.mon[0].before->cpuinfo.cpu_model, pmu_name, pmu_config1, pmu_config2);
+    // PMUInfo pmu{1234, &helper, &perf_config};
+
+    while (true) {
+        /** Get from the CXLMemSimHook */
+        int n;
+        do {
+            memset(sock_buf, 0, sock_buf_size);
+            // without blocking
+            n = recv(sock, sock_buf, sock_buf_size, MSG_DONTWAIT);
+            if (n < 1) {
+                if (errno == EAGAIN || errno == EWOULDBLOCK) {
+                    // no data
+                    break;
+                } else {
+                    LOG(ERROR) << "Failed to recv";
+                    exit(-1);
+                }
+            } else if (n >= sizeof(struct op_data) && n <= sock_buf_size - 1) {
+                auto *opd = (struct op_data *)sock_buf;
+                LOG(ERROR) << fmt::format("received data: size={}, tgid={}, tid={}, opcode={}\n", n, opd->tgid,
+                                          opd->tid, opd->opcode);
+
+                if (opd->opcode == CXLMEMSIM_THREAD_CREATE || opd->opcode == CXLMEMSIM_PROCESS_CREATE) {
+                    int t;
+                    bool is_process = opd->opcode == CXLMEMSIM_PROCESS_CREATE;
+                    // register to monitor
+                    LOG(DEBUG) << fmt::format("enable monitor: tgid={}, tid={}, is_process={}\n", opd->tgid, opd->tid,
+                                              is_process);
+
+                    // t = monitors.enable(opd->tgid, opd->tid, is_process, pebsperiod, tnum);
+                    if (t == -1) {
+                        LOG(ERROR) << "Failed to enable monitor\n";
+                    } else if (t < 0) {
+                        // tid not found. might be already terminated.
+                        continue;
+                    }
+                    // auto mon = monitors.mon[t];
+                    // Wait the t processes until emulation process initialized.
+                    // mon.stop();
+                    /* read CHA params */
+                    // for (auto const &[idx, value] : pmu.chas | enumerate) {
+                    //     pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]);
+                    // }
+                    // for (auto const &[idx, value] : pmu.chas | enumerate) {
+                    //     pmu.chas[idx].read_cha_elems(&mon.before->chas[idx]);
+                    // }
+                    // // Run the t processes.
+                    // mon.run();
+                    // clock_gettime(CLOCK_MONOTONIC, &mon.start_exec_ts);
+                } else if (opd->opcode == CXLMEMSIM_THREAD_EXIT) {
+                    // unregister from monitor, and display results.
+                    // get the tid from the tgid
+                    LOG(ERROR) << fmt::format("disable monitor: tgid={}, tid={}\n", opd->tgid, opd->tid);
+                    // auto mon = monitors.get_mon(opd->tgid, opd->tid);
+                    // mon.stop();
+                } else if (opd->opcode == CXLMEMSIM_STABLE_SIGNAL) {
+                    // for (auto const &[i, mon] : monitors.mon | enumerate) {
+                    //     if (mon.status == MONITOR_ON) {
+                    //         mon.stop();
+                    //         mon.status = MONITOR_SUSPEND;
+                    //     }
+                    // }
+                }
+
+            } else {
+                LOG(ERROR) << fmt::format("received data is invalid size: size={}", n);
+            }
+        } while (n > 0); // check the next message.
+    }
+}
\ No newline at end of file
diff --git a/src/uncore.cpp b/src/uncore.cpp
index 5101fc6..8dbdc00 100644
--- a/src/uncore.cpp
+++ b/src/uncore.cpp
@@ -3,14 +3,13 @@
 //
 #include "uncore.h"
+extern Helper helper;
 Uncore::Uncore(const uint32_t unc_idx, PerfConfig *perf_config) {
-    int ret, fd;
-    ssize_t r;
     unsigned long value;
+    int r;
     char path[64], buf[32];
-    memset(path, 0, sizeof(path));
-    snprintf(path, sizeof(path) - 1, perf_config->path_format_cbo_type, unc_idx);
+    snprintf(path, sizeof(path) - 1, perf_config->path_format_cha_type.c_str(), unc_idx);
 
     fd = open(path, O_RDONLY);
     if (fd < 0) {
@@ -21,7 +20,7 @@ Uncore::Uncore(const uint32_t unc_idx, PerfConfig *perf_config) {
     memset(buf, 0, sizeof(buf));
     r = read(fd, buf, sizeof(buf) - 1);
     if (r < 0) {
-        LOG(ERROR) << fmt::format("read {} failed", path);
+        LOG(ERROR) << fmt::format("read {} failed", fd);
         close(fd);
         throw std::runtime_error("read");
     }
@@ -29,32 +28,26 @@ Uncore::Uncore(const uint32_t unc_idx, PerfConfig *perf_config) {
     value = strtoul(buf, nullptr, 10);
 
     if (value == ULONG_MAX) {
-        LOG(ERROR) << fmt::format("strtoul {} failed", path);
+        LOG(ERROR) << fmt::format("strtoul {} failed", fd);
         throw std::runtime_error("strtoul");
     }
 
-    int cpu = (int)unc_idx;
-    pid_t pid = -1; /* when using uncore, pid must be -1. */
-    int group_fd = -1;
-    auto attr = perf_event_attr{
-        .type = (uint32_t)value,
-        .size = sizeof(struct perf_event_attr),
-        .config = perf_config->cbo_config,
-        .disabled = 1,
-        .inherit = 1,
-        .enable_on_exec = 1,
-    };
-
-    /* when using uncore, don't set exclude_xxx flags. */
-    this->perf = new PerfInfo(group_fd, cpu, pid, 0, attr);
+    for (auto const &[k, v] : this->perf | enumerate) {
+        v = init_uncore_perf(-1, (int)unc_idx, std::get<1>(perf_config->cha[k]), std::get<2>(perf_config->cha[k]),
+                             value);
+    }
 }
 
-int Uncore::read_cbo_elems(struct CBOElem *elem) {
-    int r = this->perf->read_pmu(&elem->llc_wb);
-    if (r < 0) {
-        LOG(ERROR) << fmt::format("perf_read_pmu failed.\n");
+int Uncore::read_cha_elems(struct CHAElem *elem) {
+    ssize_t r;
+    for (auto const &[idx, value] : this->perf | enumerate) {
+        r = value->read_pmu(&elem->cha[idx]);
+        if (r < 0) {
+            LOG(ERROR) << fmt::format("read cha_elems[{}] failed.\n", std::get<0>(helper.perf_conf.cha[idx]));
+            return r;
+        }
+        LOG(DEBUG) << fmt::format("read cha_elems[{}]:{}\n", std::get<0>(helper.perf_conf.cha[idx]), elem->cha[idx]);
     }
-    LOG(DEBUG) << fmt::format("llc_wb:{}\n", elem->llc_wb);
-    return r;
+    return 0;
 }
diff --git a/workloads/CMakeLists.txt b/workloads/CMakeLists.txt
new file mode 100644
index 0000000..e69de29
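sock.cc above is only the server half of the AF_UNIX datagram protocol. A hypothetical client-side helper is sketched below to show the message flow; the exact layout and field types of struct op_data, SOCKET_PATH, and the CXLMEMSIM_* opcodes all come from sock.h and are assumed here, not defined by this patch.

```cpp
// Hypothetical client (e.g. the injected hook) announcing a new thread to the sock.cc server.
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>
#include <cstring>

#include "sock.h" // assumed to define op_data, SOCKET_PATH and CXLMEMSIM_THREAD_CREATE

static int notify_thread_create(pid_t tgid, pid_t tid) {
    int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
    if (fd < 0)
        return -1;

    sockaddr_un addr{};
    addr.sun_family = AF_UNIX;
    std::strncpy(addr.sun_path, SOCKET_PATH, sizeof(addr.sun_path) - 1);

    op_data op{}; // field names taken from the server code; types assumed
    op.tgid = tgid;
    op.tid = tid;
    op.opcode = CXLMEMSIM_THREAD_CREATE;

    ssize_t n = sendto(fd, &op, sizeof(op), 0, reinterpret_cast<const sockaddr *>(&addr), sizeof(addr));
    close(fd);
    return n == static_cast<ssize_t>(sizeof(op)) ? 0 : -1;
}
```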