diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index fcd57864..00000000 --- a/.travis.yml +++ /dev/null @@ -1,30 +0,0 @@ -sudo: false -branches: - only: - - master -language: cpp -compiler: - - gcc - - clang -install: -- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi -addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-4.8 - - g++-4.8 - - clang - - zlib1g-dev -script: - - scons - - scons debug=1 -git: - submodules: false -notifications: - email: - recipients: - - macsim-dev-commits@googlegroups.com - on_success: change - on_failure: always diff --git a/INSTALL b/INSTALL index 3696c4fc..78407e38 100644 --- a/INSTALL +++ b/INSTALL @@ -2,10 +2,10 @@ === Requirement === - SCons + SCons and dependencies * How to get SCons - Ubuntu --> apt-get install scons + Ubuntu --> apt-get install scons python-metaconfig === Build steps === diff --git a/README.md b/README.md index a4de7e95..c96ef7db 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[![Build Status](https://travis-ci.org/gthparch/macsim.svg?branch=master)](https://travis-ci.org/gthparch/macsim) - # Macsim ## Introduction @@ -16,7 +14,7 @@ cores) and SMT or MT architectures as well. * Currently interconnection network model (based on IRIS) and power model (based on McPAT) are connected. -* MacSim is also one of the components of SST, so muiltiple MacSim simulatore +* MacSim is also one of the components of SST, so multiple MacSim simulators can run concurrently. * The project has been supported by Intel, NSF, Sandia National Lab. 
diff --git a/SConscript b/SConscript index 83afce8c..5cfcdb8b 100644 --- a/SConscript +++ b/SConscript @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ######################################################################################### # Author : Jaekyu Lee (jq.lee17@gmail.com) @@ -38,7 +38,7 @@ warn_flags = ' '.join(warn_flags) env = Environment() custom_vars = set(['AS', 'AR', 'CC', 'CXX', 'HOME', 'LD_LIBRARY_PATH', 'PATH', 'RANLIB']) -for key,val in os.environ.iteritems(): +for key,val in os.environ.items(): if key in custom_vars: env[key] = val diff --git a/SConstruct b/SConstruct index a65d4a57..2995cf00 100644 --- a/SConstruct +++ b/SConstruct @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ######################################################################################### # Author : Jaekyu Lee (jq.lee17@gmail.com) @@ -8,7 +8,7 @@ import os import sys -import ConfigParser +import configparser ## Check c++14 support @@ -48,7 +48,7 @@ def pre_compile_check(): env = Environment() custom_vars = set(['AS', 'AR', 'CC', 'CXX', 'HOME', 'LD_LIBRARY_PATH', 'PATH', 'RANLIB']) - for key,val in os.environ.iteritems(): + for key,val in os.environ.items(): if key in custom_vars: env[key] = val @@ -68,17 +68,17 @@ flags = {} ## Configuration from file -Config = ConfigParser.ConfigParser() +Config = configparser.ConfigParser() Config.read('macsim.config') -flags['dram'] = Config.get('Library', 'dram', '0') -flags['power'] = Config.get('Library', 'power', '0') -flags['iris'] = Config.get('Library', 'iris', '0') -flags['qsim'] = Config.get('Library', 'qsim', '0') -flags['debug'] = Config.get('Build', 'debug', '0') -flags['gprof'] = Config.get('Build', 'gprof', '0') -flags['pin_3_13_trace'] = Config.get('Build', 'pin_3_13_trace', '0') -flags['val'] = Config.get('Build_Extra', 'val', '0') -flags['ramulator'] = Config.get('Library', 'ramulator', '0') +flags['dram'] = Config.get('Library', 'dram', fallback='0') +flags['power'] = 
Config.get('Library', 'power', fallback='0') +flags['iris'] = Config.get('Library', 'iris', fallback='0') +flags['qsim'] = Config.get('Library', 'qsim', fallback='0') +flags['debug'] = Config.get('Build', 'debug', fallback='0') +flags['gprof'] = Config.get('Build', 'gprof', fallback='0') +flags['pin_3_13_trace'] = Config.get('Build', 'pin_3_13_trace', fallback='0') +flags['val'] = Config.get('Build_Extra', 'val', fallback='0') +flags['ramulator'] = Config.get('Library', 'ramulator', fallback='0') ## Configuration from commandline flags['debug'] = ARGUMENTS.get('debug', flags['debug']) diff --git a/build.py b/build.py index d62e320b..757528d0 100755 --- a/build.py +++ b/build.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ######################################################################################### # Author : Jaekyu Lee (jq.lee17@gmail.com) diff --git a/def/general.stat.def b/def/general.stat.def index fba83e5a..b0508bfe 100644 --- a/def/general.stat.def +++ b/def/general.stat.def @@ -38,7 +38,7 @@ DEF_STAT(EXE_TIME, COUNT, NO_RATIO) DEF_STAT(NUM_REPEAT, COUNT, NO_RATIO) DEF_STAT(CYC_COUNT_X86, COUNT, NO_RATIO) -DEF_STAT(CYC_COUNT_PTX, COUNT, NO_RATIO) +DEF_STAT(CYC_COUNT_ACC, COUNT, NO_RATIO) DEF_STAT(AVG_BLOCK_EXE_CYCLE, COUNT, NO_RATIO) DEF_STAT(AVG_BLOCK_EXE_CYCLE_BASE, COUNT, NO_RATIO) diff --git a/doc/latex/Makefile b/doc/latex/Makefile index 51c48f90..2290b38b 100644 --- a/doc/latex/Makefile +++ b/doc/latex/Makefile @@ -9,4 +9,4 @@ all: cp macsim.pdf ../; clean: - $(RM) *.log *.aux *.blg *.bbl *.dvi *.brf macsim.ps macsim.pdf + $(RM) *.out *.log *.aux *.blg *.bbl *.toc *.styles *.dvi *.brf macsim.ps macsim.pdf diff --git a/doc/latex/trace.tex b/doc/latex/trace.tex index cafd2f80..3aa5c7c6 100644 --- a/doc/latex/trace.tex +++ b/doc/latex/trace.tex @@ -381,42 +381,81 @@ \subsection{trace\_xx.raw} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The trace\_xx.raw file is generated for each thread/warp and 
contains the -dynamic instruction trace for the thread/warp in binary -format. The structure/format for encoding instructions is the same in -both x86 and PTX traces and looks like as follows (in order): +dynamic instruction trace for the thread/warp in binary format. The +structure/format for encoding instructions is different for x86 and for PTX. In the +source, this is defined in \Verb+src/trace_read.h+. +For x86, the trace entry format is (\Verb+struct trace_info_cpu_s+): -%trace format for an instruction in trace_xx.raw +\vspace{0.2in} +\begin{footnotesize} +\begin{tabular}{l c c l l} +Type & Size (Bytes) & Offset(Bytes) & Field & Description \\ \hline \hline +\Verb+uint8_t+ & 1 & 0 & \Verb+m_num_read_regs+ & number of source registers \\ +\Verb+uint8_t+ & 1 & 1 & \Verb+m_num_dest_regs+ & number of destination registers \\ +\Verb+uint8_t+ & 9 & 2 & \Verb+m_src[MAX_SRC_NUM]+ & source register IDs \\ +\Verb+uint8_t+ & 6 & 11 & \Verb+m_dst[MAX_DST_NUM]+ & destination register IDs \\ +\Verb+uint8_t+ & 1 & 17 & \Verb+m_cf_type+ & branch type \\ +\Verb+bool+ & 1 & 18 & \Verb+m_has_immediate+ & indicates whether this instruction has immediate field \\ +\Verb+uint8_t+ & 1 & 19 & \Verb+m_opcode+ & opcode \\ +\Verb+bool+ & 1 & 20 & \Verb+m_has_st+ & indicates whether this instruction has store operation \\ +\Verb+bool+ & 1 & 21 & \Verb+m_is_fp+ & indicates whether this instruction is a FP operation \\ +\Verb+bool+ & 1 & 22 & \Verb+m_write_flg+ & write flag \\ +\Verb+uint8_t+ & 1 & 23 & \Verb+m_num_ld+ & number of load operations \\ +\Verb+uint8_t+ & 1 & 24 & \Verb+m_size+ & instruction size \\ +\Verb+uint32_t+ & 4 & 28 & \Verb+m_ld_vaddr1+ & load address 1 \\ +\Verb+uint32_t+ & 4 & 32 & \Verb+m_ld_vaddr2+ & load address 2 \\ +\Verb+uint32_t+ & 4 & 36 & \Verb+m_st_vaddr+ & store address \\ +\Verb+uint32_t+ & 4 & 40 & \Verb+m_instruction_addr+ & PC address \\ +\Verb+uint32_t+ & 4 & 44 & \Verb+m_branch_target+ & branch target address \\ +\Verb+uint8_t+ & 1 & 48 & 
\Verb+m_mem_read_size+ & memory read size \\ +\Verb+uint8_t+ & 1 & 49 & \Verb+m_mem_write_size+ & memory write size \\ +\Verb+bool+ & 1 & 50 & \Verb+m_rep_dir+ & repetition direction \\ +\Verb+bool+ & 1 & 51 & \Verb+m_actually_taken+ & indicates whether branch is actually taken \\ +\end{tabular} +\end{footnotesize} +\vspace{0.2in} + +For PTX, the trace entry format is (\Verb+struct trace_info_gpu_small_s+): \vspace{0.2in} \begin{footnotesize} -\begin{tabular}{l c l l} -Type & Size (Bytes) & Field & Description \\ \hline \hline -\Verb+uint8_t+ & 1 & \Verb+m_num_read_regs+ & number of source registers \\ -\Verb+uint8_t+ & 1 & \Verb+m_num_dest_regs+ & number of destination registers \\ -\Verb+uint8_t+ & 9 & \Verb+m_src[MAX_SRC_NUM]+ & source register IDs \\ -\Verb+uint8_t+ & 6 & \Verb+m_dst[MAX_DST_NUM]+ & destination register IDs \\ -\Verb+uint8_t+ & 1 & \Verb+m_cf_type+ & branch type \\ -\Verb+bool+ & 1 & \Verb+m_has_immediate+ & indicates whether this instruction has immediate field \\ -\Verb+uint8_t+ & 1 & \Verb+m_opcode+ & opcode \\ -\Verb+bool+ & 1 & \Verb+m_has_st+ & indicates whether this instruction has store operation \\ -\Verb+bool+ & 1 & \Verb+m_is_fp+ & indicates whether this instruction is a FP operation \\ -\Verb+bool+ & 1 & \Verb+m_write_flg+ & write flag \\ -\Verb+uint8_t+ & 1 & \Verb+m_num_ld+ & number of load operations \\ -\Verb+uint8_t+ & 1 & \Verb+m_size+ & instruction size \\ -\Verb+uint32_t+ & 4 & \Verb+m_ld_vaddr1+ & load address 1 \\ -\Verb+uint32_t+ & 4 & \Verb+m_ld_vaddr2+ & load address 2 \\ -\Verb+uint32_t+ & 4 & \Verb+m_st_vaddr+ & store address \\ -\Verb+uint32_t+ & 4 & \Verb+m_instruction_addr+ & PC address \\ -\Verb+uint32_t+ & 4 & \Verb+m_branch_target+ & branch target address \\ -\Verb+uint8_t+ & 1 & \Verb+m_mem_read_size+ & memory read size \\ -\Verb+uint8_t+ & 1 & \Verb+m_mem_write_size+ & memory write size \\ -\Verb+bool+ & 1 & \Verb+m_rep_dir+ & repetition direction \\ -\Verb+bool+ & 1 & \Verb+m_actually_taken+ & indicates 
whether branch is actually taken \\ +\begin{tabular}{l c c l l} +Type & Size (Bytes) & Offset(Bytes) & Field & Description \\ \hline \hline +\Verb+uint8_t+ & 1 & 0 & \Verb+m_opcode+ & from \Verb+GPU_OPCODE_ENUM+ \\ +\Verb+bool+ & 1 & 1 & \Verb+m_is_fp+ & whether this instruction deals with \Verb+float+s \\ +\Verb+bool+ & 1 & 2 & \Verb+m_is_load+ & whether this instruction loads from memory \\ +\Verb+uint8_t+ & 1 & 3 & \Verb+m_cf_type+ & branch type \\ +\Verb+uint8_t+ & 1 & 4 & \Verb+m_num_read_regs+ & number of source registers \\ +\Verb+uint8_t+ & 1 & 5 & \Verb+m_num_dest_regs+ & number of destination registers \\ +\Verb+uint16_t+ & 10 & 6 & \Verb+m_src[MAX_GPU_SRC_NUM]+ & source register IDs \\ +\Verb+uint16_t+ & 8 & 16 & \Verb+m_dst[MAX_GPU_DST_NUM]+ & destination register IDs \\ +\Verb+uint8_t+ & 1 & 24 & \Verb+m_size+ & instruction size \\ +\Verb+uint32_t+ & 4 & 28 & \Verb+m_active_mask+ & warp's non-blocked threads \\ +\Verb+uint32_t+ & 4 & 32 & \Verb+m_br_taken_mask+ & warp's threads that took branch \\ +\Verb+uint64_t+ & 8 & 40 & \Verb+m_inst_addr+ & address of current instruction \\ +\Verb+uint64_t+ & 8 & 48 & \Verb+m_br_target_addr+ & address to branch to \\ +\Verb+uint64_t+ & 8 & 56 & \Verb+m_reconv_inst_addr+ & address of branch reconvergence \\ +\Verb+uint64_t+ & 8 & 56 & \Verb+m_mem_addr+ & memory address to load/store to \\ +\Verb+uint8_t+ & 1 & 64 & \Verb+m_barrier_id+ & \\ +\Verb+uint8_t+ & 1 & 64 & \Verb+m_mem_access_size+ & memory access granularity in bytes \\ +\Verb+uint16_t+ & 2 & 66 & \Verb+m_num_barrier_threads+ & \\ +\Verb+uint8_t+ & 1 & 68 & \Verb+m_addr_space+ & memory region to work on \\ +\Verb+uint8_t+ & 1 & 68 & \Verb+m_level+ & \\ +\Verb+uint8_t+ & 1 & 69 & \Verb+m_cache_level+ & \\ +\Verb+uint8_t+ & 1 & 69 & \Verb+m_cache_operator+ & \\ \end{tabular} \end{footnotesize} \vspace{0.2in} +Additionally, if a PTX trace entry is a load or a store, it will be followed by +32 8-byte (aligned) addresses, each corresponding to the address used 
by one of +the warps. Notably, the code for handling this was built on top of the existing +simulator. As a result, the output generated by \Verb+debug_trace_read+ or +\Verb+debug_print_trace+ will contain one extra instruction after the memory +operation, erroneously decoded from the memory address data. Nonetheless, the +simulation results are correct --- only the debug output is affected. + Note that the raw trace is compressed with zlib to reduce the sizes of the generated trace files, and the size of each field is the size diff --git a/doc/macsim.pdf b/doc/macsim.pdf index 54aa847c..ab79e0e3 100644 Binary files a/doc/macsim.pdf and b/doc/macsim.pdf differ diff --git a/internal b/internal index 12d986cd..2a78c449 160000 --- a/internal +++ b/internal @@ -1 +1 @@ -Subproject commit 12d986cd890180ec7cfb0be0e6f4358f8da888c1 +Subproject commit 2a78c4493c05682101c0ce6ff66c673a2f332800 diff --git a/macsimComponent.cpp b/macsimComponent.cpp index 80888627..4bfc53af 100644 --- a/macsimComponent.cpp +++ b/macsimComponent.cpp @@ -60,7 +60,16 @@ macsimComponent::macsimComponent(ComponentId_t id, Params& params) m_clock_freq, new Clock::Handler(this, &macsimComponent::ticReceived)); - m_ptx_core = params.find("ptx_core", 0); + if (params.find("ptx_core", 0)) { + m_acc_type = PTX_CORE; + m_acc_core = 1; + } else if (params.find("igpu_core", 0)) { + m_acc_type = IGPU_CORE; + m_acc_core = 1; + } else { + m_acc_core = 0; + m_acc_type = NO_ACC; + } m_num_link = params.find("num_link", 1); configureLinks(params, tc); @@ -150,7 +159,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) { m_data_cache_requests.push_back(std::map()); m_data_cache_responses.push_back(std::set()); - if (m_ptx_core) { + if (m_acc_core) { auto ccache_link = loadUserSubComponent( "core" + std::to_string(l) + "-ccache", ComponentInfo::SHARE_NONE, tc, new Interfaces::SimpleMem::Handler( @@ -194,7 +203,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* 
tc) { m_data_cache_request_counters = std::vector(m_num_link, 0); m_data_cache_response_counters = std::vector(m_num_link, 0); - if (m_ptx_core) { + if (m_acc_core) { m_const_cache_request_counters = std::vector(m_num_link, 0); m_const_cache_response_counters = std::vector(m_num_link, 0); m_texture_cache_request_counters = std::vector(m_num_link, 0); @@ -275,7 +284,7 @@ void macsimComponent::setup() { new Callback( this, &macsimComponent::strobeDataCacheRespQ); - if (m_ptx_core) { + if (m_acc_core) { CallbackSendConstCacheRequest* scr = new Callback( this, &macsimComponent::sendConstCacheRequest); @@ -347,7 +356,7 @@ bool macsimComponent::ticReceived(Cycle_t) { // Debugging if (m_cycle % 100000 == 0) { for (unsigned int l = 0; l < m_num_link; ++l) { - if (m_ptx_core) { + if (m_acc_core) { MSC_DEBUG( "Core[%2d] I$: (%lu, %lu), D$: (%lu, %lu) C$: (%lu, %lu), T$: (%lu, " "%lu)\n", diff --git a/macsimComponent.h b/macsimComponent.h index c7d40781..cc625f9e 100644 --- a/macsimComponent.h +++ b/macsimComponent.h @@ -105,7 +105,8 @@ class macsimComponent : public SST::Component macsim_c *m_macsim; bool m_sim_running; - bool m_ptx_core; + bool m_acc_core; + ACC_Type m_acc_type; bool m_cube_connected; bool m_debug_all; int64_t m_debug_addr; diff --git a/src/config.h b/src/config.h index bda86158..8a334276 100644 --- a/src/config.h +++ b/src/config.h @@ -50,6 +50,7 @@ POSSIBILITY OF SUCH DAMAGE. m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L1_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS); \ } else if (level == MEM_L2) { \ @@ -65,6 +66,7 @@ POSSIBILITY OF SUCH DAMAGE. m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \ ? 
true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L2_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS); \ } else if (level == MEM_L3) { \ @@ -79,6 +81,7 @@ POSSIBILITY OF SUCH DAMAGE. m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L3_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS); \ } else if (m_level == MEM_LLC) { \ @@ -93,6 +96,7 @@ POSSIBILITY OF SUCH DAMAGE. m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS); \ } \ @@ -112,6 +116,7 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L1_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS); \ } else if (level == MEM_L2) { \ @@ -129,6 +134,7 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L2_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS); \ } else if (level == MEM_L3) { \ @@ -145,6 +151,7 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L3_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS); \ } else if (level == MEM_LLC) { \ @@ -161,6 +168,7 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu" \ ? 
true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS); \ } \ @@ -180,6 +188,7 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L1_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS); \ } else if (level == MEM_L2) { \ @@ -197,6 +206,7 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L2_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS); \ } else if (level == MEM_L3) { \ @@ -213,6 +223,7 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_L3_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS); \ } else if (level == MEM_LLC) { \ @@ -229,6 +240,7 @@ POSSIBILITY OF SUCH DAMAGE. m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS); \ m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS); \ } \ @@ -274,42 +286,44 @@ POSSIBILITY OF SUCH DAMAGE. break; \ } -#define RETIRE_CONFIG() \ - switch (m_unit_type) { \ - case UNIT_SMALL: \ - m_knob_width = *m_simBase->m_knobs->KNOB_WIDTH; \ - m_knob_ptx_sim = \ - static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx" \ - ? true \ - : false; \ - m_knob_igpu_sim = \ - static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \ - ? true \ - : false; \ - break; \ - case UNIT_MEDIUM: \ - m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH; \ - m_knob_ptx_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx" \ - ? 
true \ - : false; \ - m_knob_igpu_sim = \ - static_cast(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == \ - "igpu" \ - ? true \ - : false; \ - break; \ - case UNIT_LARGE: \ - m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH; \ - m_knob_ptx_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx" \ - ? true \ - : false; \ - m_knob_igpu_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu" \ - ? true \ - : false; \ - break; \ +#define RETIRE_CONFIG() \ + switch (m_unit_type) { \ + case UNIT_SMALL: \ + m_knob_width = *m_simBase->m_knobs->KNOB_WIDTH; \ + m_ptx_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = \ + static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + break; \ + case UNIT_MEDIUM: \ + m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH; \ + m_ptx_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + break; \ + case UNIT_LARGE: \ + m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH; \ + m_ptx_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx" \ + ? true \ + : false; \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ + break; \ } #define EXEC_CONFIG() \ @@ -332,6 +346,7 @@ POSSIBILITY OF SUCH DAMAGE. static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ break; \ \ case UNIT_MEDIUM: \ @@ -344,10 +359,7 @@ POSSIBILITY OF SUCH DAMAGE. *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx" \ ? 
true \ : false; \ - m_igpu_sim = static_cast( \ - *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ - ? true \ - : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ break; \ \ case UNIT_LARGE: \ @@ -364,6 +376,7 @@ POSSIBILITY OF SUCH DAMAGE. *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu" \ ? true \ : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ break; \ } \ m_max_port[gen_ALLOCQ] = int_sched_rate; \ @@ -444,14 +457,18 @@ POSSIBILITY OF SUCH DAMAGE. m_knob_icache_line_size = *m_simBase->m_knobs->KNOB_ICACHE_LINE_SIZE; \ m_fetch_modulo = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO - 1; \ if (static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx") { \ - m_knob_ptx_sim = true; \ + m_ptx_sim = true; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO; \ } else { \ - m_knob_ptx_sim = false; \ + m_ptx_sim = false; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO; \ } \ break; \ - \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ case UNIT_MEDIUM: \ m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH; \ m_knob_fetch_width = *m_simBase->m_knobs->KNOB_FETCH_MEDIUM_WDITH; \ @@ -460,13 +477,18 @@ POSSIBILITY OF SUCH DAMAGE. m_fetch_modulo = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO - 1; \ if (static_cast(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == \ "ptx") { \ - m_knob_ptx_sim = true; \ + m_ptx_sim = true; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO; \ } else { \ - m_knob_ptx_sim = false; \ + m_ptx_sim = false; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO; \ } \ break; \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ \ case UNIT_LARGE: \ m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH; \ @@ -476,13 +498,18 @@ POSSIBILITY OF SUCH DAMAGE. 
m_fetch_modulo = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO - 1; \ if (static_cast(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == \ "ptx") { \ - m_knob_ptx_sim = true; \ + m_ptx_sim = true; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO; \ } else { \ - m_knob_ptx_sim = false; \ + m_ptx_sim = false; \ m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO; \ } \ break; \ + m_igpu_sim = static_cast( \ + *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \ + ? true \ + : false; \ + m_acc_sim = (m_igpu_sim || m_ptx_sim); \ } #define CORE_CONFIG() \ diff --git a/src/dram_ctrl.cc b/src/dram_ctrl.cc index b8b57e6c..bfd8384d 100644 --- a/src/dram_ctrl.cc +++ b/src/dram_ctrl.cc @@ -578,9 +578,9 @@ void dram_ctrl_c::send(void) { for (auto I = m_output_buffer->begin(), E = m_output_buffer->end(); I != E; ++I) { mem_req_s* req = (*I); - if (req_type_allowed[req->m_ptx] == false) continue; + if (req_type_allowed[req->m_acc] == false) continue; - req_type_checked[req->m_ptx] = true; + req_type_checked[req->m_acc] = true; req->m_msg_type = NOC_FILL; bool insert_packet = @@ -764,7 +764,7 @@ void dram_ctrl_c::channel_schedule_data(void) { m_current_list[bank]->m_req->m_id); ASSERT(m_current_list[bank]->m_state == DRAM_DATA); m_data_ready[bank] = acquire_data_bus( - ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_ptx); + ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_acc); m_data_avail[bank] = ULLONG_MAX; m_current_list[bank]->m_state = DRAM_DATA_WAIT; } else diff --git a/src/exec.cc b/src/exec.cc index 7fb9600f..d9ba5932 100644 --- a/src/exec.cc +++ b/src/exec.cc @@ -538,7 +538,7 @@ bool exec_c::exec(int thread_id, int entry, uop_c* uop) { use_port(thread_id, entry); // GPU : if we use load-block policy, block current thread due to load instruction - if (uop_latency == -1 && m_ptx_sim && + if (uop_latency == -1 && m_acc_sim && *m_simBase->m_knobs->KNOB_FETCH_ONLY_LOAD_READY) { m_frontend->set_load_wait(uop->m_thread_id, 
uop->m_uop_num); @@ -741,7 +741,7 @@ void exec_c::br_exec(uop_c* uop) { } // GPU : stall on branch policy - if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) { + if (m_acc_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) { m_frontend->set_br_ready(uop->m_thread_id); } } @@ -793,7 +793,7 @@ void exec_c::run_a_cycle(void) { if (responseArrived) { DEBUG_CORE(m_core_id, "key found: 0x%lx, addr = 0x%llx\n", key, uop->m_vaddr); - if (m_ptx_sim || m_igpu_sim) { + if (m_acc_sim || m_igpu_sim) { if (uop->m_parent_uop) { uop_c* puop = uop->m_parent_uop; ++puop->m_num_child_uops_done; @@ -883,7 +883,7 @@ int exec_c::access_data_cache(uop_c* uop) { auto i = m_uop_buffer.find(key); ASSERTM(m_uop_buffer.end() == i, "uop has already been executed!\n"); - int block_size = m_ptx_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue() + int block_size = m_acc_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue() : KNOB(KNOB_L1_LARGE_LINE_SIZE)->getValue(); // Addr block_addr = uop->m_vaddr & ~((uint64_t)block_size-1); @@ -936,7 +936,7 @@ int exec_c::access_data_cache(uop_c* uop) { } int exec_c::access_const_texture_cache(uop_c* uop) { - ASSERT(m_ptx_sim); + ASSERT(m_acc_sim); ASSERT(uop->m_mem_type == MEM_LD_CM || uop->m_mem_type == MEM_LD_TM); // assign unique key to each memory request; this will be used later in time for strobbing diff --git a/src/exec.h b/src/exec.h index 6f7a3914..ab80e142 100644 --- a/src/exec.h +++ b/src/exec.h @@ -184,8 +184,9 @@ class exec_c uns16 m_mem_sched_rate; /**< memory schedule rate */ uns16 m_fp_sched_rate; /**< fp schedule rate */ uns8 m_dcache_cycles; /**< L1 cache latency */ - bool m_ptx_sim; /**< gpu simulation */ + bool m_acc_sim; /**< gpu simulation */ bool m_igpu_sim; /**< intel gpu simulation */ + bool m_ptx_sim; /**< PTX simulation */ int m_latency[NUM_UOP_TYPES]; /**< latency map */ Counter m_cur_core_cycle; /**< current core cycle */ int m_max_port[max_ALLOCQ]; /**< maximum port */ diff --git a/src/frontend.cc b/src/frontend.cc index 
c9146543..fd41e17d 100644 --- a/src/frontend.cc +++ b/src/frontend.cc @@ -191,6 +191,8 @@ void frontend_c::run_a_cycle(void) { // fetch every KNOB_FETCH_RATIO cycle // CPU : every cycle // NVIDIA G80 : 1/4 cycles, NVIDIA Fermi: 1/2 cycles + // check core type for the fetch rate + // Hyesoon: Aug-2020 please check whether this need to be changed with heteroe and igpu if (m_fetch_ratio != 1) { m_fetch_modulo++; if (m_fetch_modulo == m_fetch_ratio) @@ -300,7 +302,7 @@ void frontend_c::run_a_cycle(void) { // TONAGESH // nagesh - comments for BAR are incomplete... - if (m_knob_ptx_sim) { + if (m_ptx_sim) { // handling of BAR instruction in PTX - can/should this be moved? // do we have any blocks for which all warps have reached (retired) // their next barrier? @@ -346,7 +348,7 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid, // First time : set up traces for current thread if (fetch_data->m_first_time) { - m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_knob_ptx_sim); + m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_ptx_sim); fetch_data->m_first_time = false; ++m_core->m_inst_fetched[tid]; /*! 
initial increase */ @@ -356,11 +358,18 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid, // set up initial fetch address thread_s *thread = m_core->get_trace_info(tid); - if (thread->m_ptx) { - trace_info_gpu_s *prev_trace_info = - static_cast(thread->m_prev_trace_info); - fetch_data->m_MT_scheduler.m_next_fetch_addr = - prev_trace_info->m_inst_addr; + if (thread->m_acc) { + if (m_ptx_sim) { + trace_info_gpu_s *prev_trace_info = + static_cast(thread->m_prev_trace_info); + fetch_data->m_MT_scheduler.m_next_fetch_addr = + prev_trace_info->m_inst_addr; + } else if (m_igpu_sim) { + trace_info_igpu_s *prev_trace_info = + static_cast(thread->m_prev_trace_info); + fetch_data->m_MT_scheduler.m_next_fetch_addr = + prev_trace_info->m_instruction_addr; + } } else { if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "x86") { trace_info_cpu_s *prev_trace_info = @@ -372,11 +381,6 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid, static_cast(thread->m_prev_trace_info); fetch_data->m_MT_scheduler.m_next_fetch_addr = prev_trace_info->m_instruction_addr; - } else if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu") { - trace_info_igpu_s *prev_trace_info = - static_cast(thread->m_prev_trace_info); - fetch_data->m_MT_scheduler.m_next_fetch_addr = - prev_trace_info->m_instruction_addr; } else { ASSERTM(0, "Wrong core type %s\n", KNOB(KNOB_LARGE_CORE_TYPE)->getValue().c_str()); @@ -457,8 +461,8 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid, ASSERT(new_uop); // read an uop from the traces - if (!m_simBase->m_trace_reader->get_uops_from_traces( - m_core_id, new_uop, tid, m_knob_ptx_sim)) { + if (!m_simBase->m_trace_reader->get_uops_from_traces(m_core_id, new_uop, + tid, m_ptx_sim)) { // couldn't get an uop DEBUG_CORE(m_core_id, "not success\n"); m_uop_pool->release_entry(new_uop->free()); @@ -631,7 +635,7 @@ bool frontend_c::access_icache(int tid, Addr fetch_addr, int result = m_simBase->m_memory->new_mem_req( MRT_IFETCH, line_addr, 
m_knob_icache_line_size, false, false, 0, NULL, icache_fill_line_wrapper, m_core->get_unique_uop_num(), NULL, m_core_id, - tid, m_knob_ptx_sim); + tid, m_ptx_sim); // mshr full if (!result) return false; @@ -712,7 +716,7 @@ bool frontend_c::icache_fill_line(mem_req_s *req) { if (m_icache->access_cache(req->m_addr, &line_addr, false, req->m_appl_id) == NULL) { m_icache->insert_cache(req->m_addr, &line_addr, &repl_line_addr, - req->m_appl_id, req->m_ptx); + req->m_appl_id, req->m_acc); POWER_CORE_EVENT(req->m_core_id, POWER_ICACHE_W); } @@ -806,7 +810,7 @@ int frontend_c::predict_bpu(uop_c *uop) { // no branch prediction else { // GPU : stall on branch policy, stop fetching - if (m_knob_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) { + if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) { set_br_wait(uop->m_thread_id); mispredicted = false; } @@ -906,7 +910,7 @@ int frontend_c::fetch_rr(void) { } // check the thread is ready to fetch - if (m_knob_ptx_sim) { + if (m_ptx_sim) { // GPU : stall on branch policy, check whether previous branch has been resolved if (*m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR && !check_br_ready(fetch_id)) { diff --git a/src/frontend.h b/src/frontend.h index e6399f06..d193731f 100644 --- a/src/frontend.h +++ b/src/frontend.h @@ -407,7 +407,9 @@ class frontend_c uns m_knob_icache_line_size; /**< icache line size */ bool m_fe_stall; /**< frontend stalled */ bool m_fe_running; /**< enabled frontend */ - bool m_knob_ptx_sim; /**< GPU simulation */ + bool m_ptx_sim; /**< PTX simulation */ + bool m_igpu_sim; /**< iGPU simulation */ + bool m_acc_sim; /**< Accelerator simulation */ bool m_ready_thread_available; /**< ready thread available */ bool m_last_fetch_tid_failed; core_c* m_core; /**< core pointer */ diff --git a/src/global_types.h b/src/global_types.h index 42fbaffc..6cf323ac 100644 --- a/src/global_types.h +++ b/src/global_types.h @@ -80,4 +80,9 @@ typedef enum uop_latency_map { // enum for x86 latency maps - Michael 
NUM_LATENCY_MAPS } latency_map; +typedef enum _ACC_Type_enum { + NO_ACC = 0, /**< no accelerator */ + PTX_CORE, /**< PTX core */ + IGPU_CORE /**< IGPU core */ +} ACC_Type; #endif diff --git a/src/macsim.cc b/src/macsim.cc index c284675f..92d5e414 100644 --- a/src/macsim.cc +++ b/src/macsim.cc @@ -336,7 +336,7 @@ void macsim_c::init_cores(int num_max_core) { // insert to the core type pool if (static_cast(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx") - m_ptx_core_pool.push(ii); + m_acc_core_pool.push(ii); else m_x86_core_pool.push(ii); } @@ -352,7 +352,7 @@ void macsim_c::init_cores(int num_max_core) { // insert to the core type pool if (static_cast(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx") - m_ptx_core_pool.push(ii + total_core); + m_acc_core_pool.push(ii + total_core); else m_x86_core_pool.push(ii + total_core); } @@ -367,7 +367,7 @@ void macsim_c::init_cores(int num_max_core) { // insert to the core type pool if (static_cast(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx") - m_ptx_core_pool.push(ii + total_core); + m_acc_core_pool.push(ii + total_core); else m_x86_core_pool.push(ii + total_core); } diff --git a/src/macsim.h b/src/macsim.h index 34dc50ec..9c12ddca 100644 --- a/src/macsim.h +++ b/src/macsim.h @@ -243,7 +243,7 @@ class macsim_c // process manager process_manager_c *m_process_manager; /**< process manager */ queue m_x86_core_pool; /**< x86 cores pool */ - queue m_ptx_core_pool; /**< GPU cores pool */ + queue m_acc_core_pool; /**< GPU cores pool */ multi_key_map_c *m_block_id_mapper; /**< block id mapper */ // data structure pools (to reduce overhead of memory allocation) diff --git a/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc b/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc index 4c0f4e58..add1b9f0 100644 --- a/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc +++ b/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc @@ -249,13 +249,12 @@ SimpleRouter::handle_link_arrival( int 
port, LinkData* data ) } //track flit_id of tail flit - cout << manifold::kernel::Manifold::NowTicks() << ",p," - << ((HeadFlit*)data->f)->req->m_id << "," - << ((HeadFlit*)data->f)->req->m_ptx << "," - << mem_state_copy[((HeadFlit*)data->f)->req->m_state] << "," - << mem_req_noc_type_name[((HeadFlit*)data->f)->req->m_msg_type] << "," - << node_id << "," << ((HeadFlit*)data->f)->dst_node << "," - << endl; + cout << manifold::kernel::Manifold::NowTicks() << ",p," + << ((HeadFlit*)data->f)->req->m_id << "," + << ((HeadFlit*)data->f)->req->m_acc << "," + << mem_state_copy[((HeadFlit*)data->f)->req->m_state] << "," + << mem_req_noc_type_name[((HeadFlit*)data->f)->req->m_msg_type] << "," + << node_id << "," << ((HeadFlit*)data->f)->dst_node << "," << endl; /* cout << manifold::kernel::Manifold::NowTicks() << ",b," << node_id << ","; for(uint i=0; im_knobs->KNOB_HETERO_MEM_PRIORITY_CPU) { - if (a->m_ptx != true && b->m_ptx == true) { + if (a->m_acc != true && b->m_acc == true) { return true; - } else if (a->m_ptx == true && b->m_ptx != true) + } else if (a->m_acc == true && b->m_acc != true) return false; } else if (*m_simBase->m_knobs->KNOB_HETERO_MEM_PRIORITY_GPU) { - if (a->m_ptx != true && b->m_ptx == true) { + if (a->m_acc != true && b->m_acc == true) { return false; - } else if (a->m_ptx == true && b->m_ptx != true) + } else if (a->m_acc == true && b->m_acc != true) return true; } @@ -479,7 +479,7 @@ int dcu_c::access(uop_c* uop) { } else { POWER_EVENT(POWER_LLC_R); } - STAT_EVENT(L1_HIT_CPU + this->m_ptx_sim); + STAT_EVENT(L1_HIT_CPU + this->m_acc_sim); DEBUG_CORE(uop->m_core_id, "L%d[%d] uop_num:%lld cache hit\n", m_level, m_id, uop->m_uop_num); // stat @@ -496,7 +496,7 @@ int dcu_c::access(uop_c* uop) { if (*m_simBase->m_knobs->KNOB_ENABLE_CACHE_COHERENCE) { } - if (this->m_ptx_sim && + if (this->m_acc_sim && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && type == MEM_ST) { // evict global data on write hit in L1 @@ -504,7 +504,7 @@ int 
dcu_c::access(uop_c* uop) { int req_size; Addr req_addr; - if (m_ptx_sim && *m_simBase->m_knobs->KNOB_BYTE_LEVEL_ACCESS) { + if (m_acc_sim && *m_simBase->m_knobs->KNOB_BYTE_LEVEL_ACCESS) { req_size = uop->m_mem_size; req_addr = vaddr; } else { @@ -518,7 +518,7 @@ int dcu_c::access(uop_c* uop) { int result = m_simBase->m_memory->new_mem_req( req_type, req_addr, req_size, cache_hit, true, m_latency, uop, - done_func, uop->m_unique_num, NULL, m_id, uop->m_thread_id, m_ptx_sim); + done_func, uop->m_unique_num, NULL, m_id, uop->m_thread_id, m_acc_sim); if (!result) { uop->m_state = OS_DCACHE_MEM_ACCESS_DENIED; @@ -534,7 +534,7 @@ int dcu_c::access(uop_c* uop) { // DCACHE miss // ------------------------------------- else { // !cache_hit - STAT_EVENT(L1_MISS_CPU + this->m_ptx_sim); + STAT_EVENT(L1_MISS_CPU + this->m_acc_sim); DEBUG_CORE(uop->m_core_id, "L%d[%d] uop_num:%lld cache miss\n", m_level, m_id, uop->m_uop_num); @@ -579,7 +579,7 @@ int dcu_c::access(uop_c* uop) { // ------------------------------------- int req_size; Addr req_addr; - if (m_ptx_sim && *m_simBase->m_knobs->KNOB_BYTE_LEVEL_ACCESS) { + if (m_acc_sim && *m_simBase->m_knobs->KNOB_BYTE_LEVEL_ACCESS) { req_size = uop->m_mem_size; req_addr = vaddr; } else { @@ -600,7 +600,7 @@ int dcu_c::access(uop_c* uop) { // Generate a new memory request (MSHR access) // ------------------------------------- function done_func = NULL; - if (this->m_ptx_sim && + if (this->m_acc_sim && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && (type == MEM_ST || type == MEM_ST_LM)) { done_func = dcache_write_ack_wrapper; @@ -612,7 +612,7 @@ int dcu_c::access(uop_c* uop) { result = m_simBase->m_memory->new_mem_req( req_type, req_addr, req_size, cache_hit, (type == MEM_ST_GM || type == MEM_ST_LM), m_latency, uop, done_func, - uop->m_unique_num, NULL, m_id, uop->m_thread_id, m_ptx_sim); + uop->m_unique_num, NULL, m_id, uop->m_thread_id, m_acc_sim); // ------------------------------------- // MSHR full @@ -772,7 +772,7 @@ void 
dcu_c::process_in_queue() { m_level, req->m_thread_id, req->m_addr, req->m_pc, req->m_uop ? req->m_uop : NULL, true); - STAT_EVENT(L1_HIT_CPU + (m_level - 1) * 4 + req->m_ptx); + STAT_EVENT(L1_HIT_CPU + (m_level - 1) * 4 + req->m_acc); if (line && req->m_type == MRT_DSTORE) { line->m_dirty = true; @@ -841,7 +841,7 @@ void dcu_c::process_in_queue() { // req->m_addr, req->m_pc, req->m_uop ? req->m_uop : NULL, false); } - STAT_EVENT(L1_HIT_CPU + (m_level - 1) * 4 + 2 + req->m_ptx); + STAT_EVENT(L1_HIT_CPU + (m_level - 1) * 4 + 2 + req->m_acc); // handle_coherence(m_level, false, ); @@ -895,7 +895,7 @@ void dcu_c::process_in_queue() { m_level, m_id, req->m_id, mem_req_c::mem_req_type_name[req->m_type], m_cycle - req->m_in); - if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && + if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && req->m_type == MRT_DSTORE) { m_simBase->m_memory->free_write_req(req); } else { @@ -989,7 +989,7 @@ void dcu_c::process_out_queue() { // ------------------------------------- if (req->m_state == MEM_OUTQUEUE_NEW) { int msg_type; - if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && + if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && req->m_with_data && m_level != MEM_LLC) { // can change if to req->m_type == MRT_DSTORE msg_type = NOC_NEW_WITH_DATA; @@ -1009,7 +1009,7 @@ void dcu_c::process_out_queue() { // ------------------------------------- else if (req->m_state == MEM_OUT_FILL) { int msg_type; - if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && + if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && req->m_with_data && m_level == MEM_LLC) { // can change if to req->m_type == MRT_DSTORE msg_type = NOC_ACK; @@ -1068,7 +1068,7 @@ void dcu_c::process_fill_queue() { mem_req_s* req = (*I); - if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && + if (req->m_acc && 
*m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && m_level == MEM_L1 && req->m_type == MRT_DSTORE) { ASSERTM(m_done && req->m_done_func && req->m_done_func(req), "done function failed\n"); @@ -1127,7 +1127,7 @@ void dcu_c::process_fill_queue() { dcache_data_s* data; data = (dcache_data_s*)m_cache->insert_cache( req->m_addr, &line_addr, &victim_line_addr, req->m_appl_id, - req->m_ptx); + req->m_acc); if (m_level != MEM_LLC) { POWER_CORE_EVENT(req->m_core_id, POWER_DCACHE_W + (m_level - 1)); @@ -1147,7 +1147,7 @@ void dcu_c::process_fill_queue() { // new write-back request mem_req_s* wb = m_simBase->m_memory->new_wb_req( - victim_line_addr, m_line_size, m_ptx_sim, data, m_level); + victim_line_addr, m_line_size, m_acc_sim, data, m_level); wb->m_rdy_cycle = m_cycle + 1; @@ -1322,7 +1322,7 @@ void dcu_c::process_fill_queue() { m_level, m_id, req->m_id, mem_req_c::mem_req_type_name[req->m_type], m_cycle - req->m_in); - if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && + if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f && req->m_type == MRT_DSTORE) { m_simBase->m_memory->free_write_req(req); } else { @@ -1426,7 +1426,7 @@ bool dcu_c::done(mem_req_s* req) { // DCACHE insertion // ------------------------------------- data = (dcache_data_s*)m_cache->insert_cache( - addr, &line_addr, &repl_line_addr, req->m_appl_id, req->m_ptx); + addr, &line_addr, &repl_line_addr, req->m_appl_id, req->m_acc); if (m_level != MEM_LLC) { POWER_CORE_EVENT(req->m_core_id, POWER_DCACHE_W + (m_level - 1)); @@ -1450,7 +1450,7 @@ bool dcu_c::done(mem_req_s* req) { // new write back request mem_req_s* wb = m_simBase->m_memory->new_wb_req( - repl_line_addr, m_line_size, m_ptx_sim, data, m_level); + repl_line_addr, m_line_size, m_acc_sim, data, m_level); wb->m_rdy_cycle = m_cycle + 1; @@ -1489,7 +1489,7 @@ bool dcu_c::done(mem_req_s* req) { uop->m_inst_num, uop->m_uop_num, req->m_in_global); uop->m_done_cycle = m_simBase->m_core_cycle[uop->m_core_id] 
+ 1; uop->m_state = OS_SCHEDULED; - if (m_ptx_sim || m_igpu_sim) { + if (m_acc_sim) { if (uop->m_parent_uop) { uop_c* puop = uop->m_parent_uop; ++puop->m_num_child_uops_done; @@ -1521,7 +1521,7 @@ bool dcu_c::write_done(mem_req_s* req) { uop_c* uop = req->m_uop; uop->m_done_cycle = m_simBase->m_core_cycle[uop->m_core_id] + 1; uop->m_state = OS_SCHEDULED; - if (m_ptx_sim || m_igpu_sim) { + if (m_acc_sim || m_igpu_sim) { if (uop->m_parent_uop) { uop_c* puop = uop->m_parent_uop; ++puop->m_num_child_uops_done; @@ -1575,17 +1575,20 @@ memory_c::memory_c(macsim_c* simBase) { m_num_gpu = 0; m_num_cpu = 0; - if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "ptx") + if ((KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "ptx") || + (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu")) m_num_gpu += *KNOB(KNOB_NUM_SIM_LARGE_CORES); else m_num_cpu += *KNOB(KNOB_NUM_SIM_LARGE_CORES); - if (KNOB(KNOB_MEDIUM_CORE_TYPE)->getValue() == "ptx") + if ((KNOB(KNOB_MEDIUM_CORE_TYPE)->getValue() == "ptx") || + (KNOB(KNOB_MEDIUM_CORE_TYPE)->getValue() == "igpu")) m_num_gpu += *KNOB(KNOB_NUM_SIM_MEDIUM_CORES); else m_num_cpu += *KNOB(KNOB_NUM_SIM_MEDIUM_CORES); - if (KNOB(KNOB_CORE_TYPE)->getValue() == "ptx") + if ((KNOB(KNOB_CORE_TYPE)->getValue() == "ptx") || + (KNOB(KNOB_CORE_TYPE)->getValue() == "igpu")) m_num_gpu += *KNOB(KNOB_NUM_SIM_SMALL_CORES); else m_num_cpu += *KNOB(KNOB_NUM_SIM_SMALL_CORES); @@ -1912,7 +1915,7 @@ void memory_c::init_new_req(mem_req_s* req, Mem_Req_Type type, Addr addr, req->m_pc = uop ? uop->m_pc : 0; req->m_prefetcher_id = 0; req->m_pref_loadPC = 0; - req->m_ptx = ptx; + req->m_acc = ptx; req->m_done_func = done_func; req->m_uop = uop ? uop : NULL; if (type == MRT_DPRF) req->m_uop = NULL; @@ -1948,7 +1951,7 @@ void memory_c::adjust_req(mem_req_s* req, Mem_Req_Type type, Addr addr, req->m_pc = uop ? uop->m_pc : 0; req->m_prefetcher_id = 0; req->m_pref_loadPC = 0; - req->m_ptx = ptx; + req->m_acc = ptx; req->m_done_func = done_func; req->m_uop = uop ? 
uop : NULL; if (type == MRT_DPRF) req->m_uop = NULL; @@ -2110,7 +2113,7 @@ mem_req_s* memory_c::new_wb_req(Addr addr, int size, bool ptx, req->m_pc = data->m_pc; req->m_prefetcher_id = 0; req->m_pref_loadPC = 0; - req->m_ptx = ptx; + req->m_acc = ptx; req->m_done_func = NULL; req->m_uop = NULL; req->m_in = m_cycle; diff --git a/src/memory.h b/src/memory.h index 97719bf6..3efa846f 100644 --- a/src/memory.h +++ b/src/memory.h @@ -303,8 +303,9 @@ class dcu_c int m_line_size; /**< cache line size */ int m_banks; /**< number of cache banks */ int m_latency; /**< cache access latency */ - bool m_ptx_sim; /**< gpu cache */ + bool m_acc_sim; /**< gpu cache */ bool m_igpu_sim; /**< intel gpu cache */ + bool m_ptx_sim; /**< gpu cache */ queue_c* m_in_queue; /**< input queue */ queue_c* m_wb_queue; /**< write-back queue */ queue_c* m_fill_queue; /**< fill queue */ diff --git a/src/memreq_info.cc b/src/memreq_info.cc index 5f184c1b..80dbc963 100644 --- a/src/memreq_info.cc +++ b/src/memreq_info.cc @@ -82,7 +82,7 @@ void mem_req_s::init(void) { m_pc = 0; m_prefetcher_id = 0; m_pref_loadPC = 0; - m_ptx = false; + m_acc = false; m_queue = NULL; for (int ii = 0; ii < MEM_LAST; ++ii) m_cache_id[ii] = 0; m_uop = NULL; diff --git a/src/memreq_info.h b/src/memreq_info.h index 4fad80d4..354f5b6f 100644 --- a/src/memreq_info.h +++ b/src/memreq_info.h @@ -152,7 +152,7 @@ typedef struct mem_req_s { Addr m_pc; /**< load pc */ uns8 m_prefetcher_id; /**< prefetcher id, if prefetch request */ Addr m_pref_loadPC; /**< prefetch load pc */ - bool m_ptx; /**< GPU request */ + bool m_acc; /**< GPU request */ queue_c* m_queue; /**< current memory queue in */ int m_cache_id[MEM_LAST]; /**< each level cache id */ uop_c* m_uop; /**< uop that generates this request */ diff --git a/src/network.cc b/src/network.cc index c995b30d..36aa0243 100644 --- a/src/network.cc +++ b/src/network.cc @@ -319,12 +319,12 @@ void router_c::local_packet_injection(void) { if (m_input_buffer[0][ii].size() + num_flit <= 
m_buffer_max_size) { #endif // flit generation and insert into the buffer - STAT_EVENT(TOTAL_PACKET_CPU + req->m_ptx); + STAT_EVENT(TOTAL_PACKET_CPU + req->m_acc); req->m_noc_cycle = m_cycle; // stat handling ++g_total_packet; - if (req->m_ptx) { + if (req->m_acc) { ++g_total_gpu_packet; STAT_EVENT(NOC_AVG_ACTIVE_PACKET_BASE_GPU); STAT_EVENT_N(NOC_AVG_ACTIVE_PACKET_GPU, g_total_gpu_packet); @@ -460,7 +460,7 @@ void router_c::stage_vca(void) { "op:%d oc:%d ptx:%d\n", m_cycle, m_id, flit->m_req->m_id, flit->m_id, flit->m_req->m_msg_src, flit->m_req->m_msg_dst, iport, ivc, - m_route_fixed[iport][ivc], ovc, flit->m_req->m_ptx); + m_route_fixed[iport][ivc], ovc, flit->m_req->m_acc); } } } @@ -652,7 +652,7 @@ void router_c::stage_lt(void) { if (port == LOCAL) { --g_total_packet; - if (f->m_req->m_ptx) { + if (f->m_req->m_acc) { --g_total_gpu_packet; } else { --g_total_cpu_packet; @@ -667,8 +667,8 @@ void router_c::stage_lt(void) { STAT_EVENT(NOC_AVG_LATENCY_BASE); STAT_EVENT_N(NOC_AVG_LATENCY, m_cycle - f->m_req->m_noc_cycle); - STAT_EVENT(NOC_AVG_LATENCY_BASE_CPU + f->m_req->m_ptx); - STAT_EVENT_N(NOC_AVG_LATENCY_CPU + f->m_req->m_ptx, + STAT_EVENT(NOC_AVG_LATENCY_BASE_CPU + f->m_req->m_acc); + STAT_EVENT_N(NOC_AVG_LATENCY_CPU + f->m_req->m_acc, m_cycle - f->m_req->m_noc_cycle); } } diff --git a/src/noc.cc b/src/noc.cc index 42b9007c..30e1e350 100644 --- a/src/noc.cc +++ b/src/noc.cc @@ -67,12 +67,12 @@ bool noc_c::insert(int src, int dst, int msg, mem_req_s* req) { noc_entry_s* new_entry = m_pool->acquire_entry(); if (src > dst) { - if (req->m_ptx == true) + if (req->m_acc == true) m_cpu_entry_up->push_back(new_entry); else m_gpu_entry_up->push_back(new_entry); } else { - if (req->m_ptx == true) + if (req->m_acc == true) m_cpu_entry_down->push_back(new_entry); else m_gpu_entry_down->push_back(new_entry); diff --git a/src/process_manager.cc b/src/process_manager.cc index e5da115a..1456f5f8 100644 --- a/src/process_manager.cc +++ b/src/process_manager.cc @@ -127,7 +127,7 
@@ process_s::process_s() { m_no_of_threads_created = 0; m_no_of_threads_terminated = 0; m_core_pool = NULL; - m_ptx = false; + m_acc = false; m_repeat = 0; m_current_file_name_base = ""; m_kernel_config_name = ""; @@ -253,7 +253,7 @@ void process_manager_c::create_thread_node(process_s *process, int tid, node->m_process = process; node->m_tid = tid; node->m_main = main; - node->m_ptx = process->m_ptx; + node->m_acc = process->m_acc; // create a new thread start information thread_start_info_s *start_info = &(process->m_thread_start_info[tid]); @@ -284,7 +284,7 @@ void process_manager_c::create_thread_node(process_s *process, int tid, process->m_block_list[node->m_block_id] = true; // add a new node to m_thread_queue (for x86) or m_block_queue (for ptx) - if (process->m_ptx == true) + if (process->m_acc == true) insert_block(node); else insert_thread(node); @@ -399,8 +399,8 @@ int process_manager_c::create_process(string appl, int repeat, int pid) { // setup core pool if (trace_type == "ptx" || trace_type == "newptx") { - process->m_ptx = true; - process->m_core_pool = &m_simBase->m_ptx_core_pool; + process->m_acc = true; + process->m_core_pool = &m_simBase->m_acc_core_pool; // most basic check to ensure that offsets of members in structure that we // use for reading traces (which contains one extra field compared to the @@ -414,7 +414,7 @@ int process_manager_c::create_process(string appl, int repeat, int pid) { (sizeof(trace_info_gpu_small_s) + sizeof(uint32_t))); } } else { - process->m_ptx = false; + process->m_acc = false; process->m_core_pool = &m_simBase->m_x86_core_pool; } @@ -569,7 +569,7 @@ void process_manager_c::setup_process(process_s *process) { trace_config_file.close(); // GPU simulation - if (true == process->m_ptx) { + if (true == process->m_acc) { string path = process->m_current_file_name_base; path += "_info.txt"; @@ -670,7 +670,7 @@ void process_manager_c::setup_process(process_s *process) { // Insert the main thread to the pool 
create_thread_node(process, 0, true); - if (process->m_ptx) { + if (process->m_acc) { for (int tid = 1; tid < thread_count; ++tid) { if (process->m_thread_start_info[tid].m_inst_count == 0) { create_thread_node(process, process->m_no_of_threads_created++, false); @@ -765,7 +765,7 @@ thread_s *process_manager_c::create_thread(process_s *process, int tid, process->m_thread_trace_info[tid] = trace_info; // TODO - nbl (apr-17-2013): use pools - if (process->m_ptx) { + if (process->m_acc) { trace_info->m_prev_trace_info = new trace_info_gpu_s; trace_info->m_next_trace_info = new trace_info_gpu_s; } else { @@ -817,7 +817,7 @@ thread_s *process_manager_c::create_thread(process_s *process, int tid, trace_info->m_file_opened = true; trace_info->m_trace_ended = false; - trace_info->m_ptx = process->m_ptx; + trace_info->m_acc = process->m_acc; trace_info->m_buffer_index = 0; trace_info->m_buffer_index_max = 0; trace_info->m_buffer_exhausted = true; @@ -882,7 +882,7 @@ int process_manager_c::terminate_thread(int core_id, thread_s *trace_info, --m_simBase->m_num_active_threads; // GPU simulation - if (trace_info->m_ptx == true) { + if (trace_info->m_acc == true) { int t_process_id = trace_info->m_process->m_process_id; int t_thread_id = trace_info->m_unique_thread_id; m_simBase->m_thread_stats[t_process_id][t_thread_id].m_thread_end_cycle = @@ -972,7 +972,7 @@ int process_manager_c::terminate_thread(int core_id, thread_s *trace_info, } // TODO - nbl (apr-17-2013): use pools - if (trace_info->m_process->m_ptx) { + if (trace_info->m_process->m_acc) { trace_info_gpu_s *temp = static_cast(trace_info->m_prev_trace_info); delete temp; diff --git a/src/process_manager.h b/src/process_manager.h index 1e386ec0..ee59b13a 100644 --- a/src/process_manager.h +++ b/src/process_manager.h @@ -73,7 +73,7 @@ typedef struct thread_trace_info_node_s { process_s* m_process; /**< pointer to the process */ int m_tid; /**< thread id */ bool m_main; /**< main thread */ - bool m_ptx; /**< GPU simulation 
*/ + bool m_acc; /**< GPU simulation */ int m_block_id; /**< block id */ } thread_trace_info_node_s; @@ -188,7 +188,7 @@ typedef struct thread_s { uint64_t m_uop_count; /**< total uop counts */ bool m_trace_ended; /**< trace ended */ process_s* m_process; /**< point to the application belongs to */ - bool m_ptx; /**< GPU thread */ + bool m_acc; /**< GPU thread */ char* m_buffer; /**< trace buffer */ int m_buffer_index; /**< current trace buffer index */ int m_buffer_index_max; /**< maximum buffer index */ @@ -258,7 +258,7 @@ typedef struct process_s { map m_core_list; /**< list of cores that this process is executed */ queue* m_core_pool; /**< core pool pointer */ - bool m_ptx; /**< GPU application */ + bool m_acc; /**< GPU application */ int m_repeat; /**< application has been re-executed */ vector m_applications; /**< list of sub-applications */ vector diff --git a/src/readonly_cache.cc b/src/readonly_cache.cc index 5bc917cc..a1fdd167 100644 --- a/src/readonly_cache.cc +++ b/src/readonly_cache.cc @@ -188,7 +188,7 @@ bool readonly_cache_c::cache_fill_line(mem_req_s* req) { // insert cache m_cache->insert_cache(req->m_addr, &line_addr, &repl_line_addr, - req->m_appl_id, req->m_ptx); + req->m_appl_id, req->m_acc); if (req->m_uop) { uop_c* uop = req->m_uop; diff --git a/src/retire.cc b/src/retire.cc index d1e80b58..1a80efb7 100644 --- a/src/retire.cc +++ b/src/retire.cc @@ -108,7 +108,7 @@ retire_c::retire_c(RETIRE_INTERFACE_PARAMS(), macsim_c* simBase) RETIRE_CONFIG(); - if (m_knob_ptx_sim || m_knob_igpu_sim) m_knob_width = 1000; + if (m_ptx_sim || m_igpu_sim) m_knob_width = 1000; } // retire_c destructor @@ -130,7 +130,7 @@ void retire_c::run_a_cycle() { vector* uop_list = NULL; unsigned int uop_list_index = 0; - if (m_knob_ptx_sim || m_knob_igpu_sim) { + if (m_ptx_sim || m_igpu_sim) { // GPU : many retireable uops from multiple threads. 
Get entire retireable uops uop_list = m_gpu_rob->get_n_uops_in_ready_order(m_knob_width, m_cur_core_cycle); @@ -144,7 +144,7 @@ void retire_c::run_a_cycle() { // we need to handle retirement for x86 and ptx separately // retirement logic for GPU - if (m_knob_ptx_sim || m_knob_igpu_sim) { + if (m_ptx_sim || m_igpu_sim) { // GPU : many retireable uops from multiple threads. Get entire retireable uops if (uop_list_index == uop_list->size()) { uop_list->clear(); @@ -281,7 +281,7 @@ void retire_c::run_a_cycle() { STAT_EVENT(UOP_COUNT_TOT); // GPU : barrier - if (m_knob_ptx_sim && cur_uop->m_bar_type == BAR_FETCH) { + if (m_ptx_sim && cur_uop->m_bar_type == BAR_FETCH) { frontend_c* frontend = core->get_frontend(); frontend->synch_thread(cur_uop->m_block_id, cur_uop->m_thread_id); } @@ -545,7 +545,7 @@ void retire_c::update_stats(process_s* process) { core->get_core_type() == "ptx") { if ((process->m_repeat + 1) == *m_simBase->m_knobs->KNOB_REPEAT_TRACE_N) { --m_simBase->m_process_count_without_repeat; - STAT_EVENT_N(CYC_COUNT_PTX, CYCLE); + STAT_EVENT_N(CYC_COUNT_ACC, CYCLE); report("application " << process->m_process_id << " terminated " << "(" @@ -555,7 +555,7 @@ void retire_c::update_stats(process_s* process) { } else { if (process->m_repeat == 0) { if (core->get_core_type() == "ptx") { - STAT_EVENT_N(CYC_COUNT_PTX, CYCLE); + STAT_EVENT_N(CYC_COUNT_ACC, CYCLE); } else { STAT_EVENT_N(CYC_COUNT_X86, CYCLE); } diff --git a/src/retire.h b/src/retire.h index 35729d0a..6bb64a8b 100644 --- a/src/retire.h +++ b/src/retire.h @@ -218,8 +218,9 @@ class retire_c Counter m_total_insts_retired; /**< total retired instructions */ Counter m_cur_core_cycle; /**< current core cycle */ uns16 m_knob_width; /**< pipeline width */ - bool m_knob_ptx_sim; /**< gpu simulation */ - bool m_knob_igpu_sim; /**< intel gpu simulation */ + bool m_ptx_sim; /**< ptx simulation */ + bool m_acc_sim; /**< accelerator simulation */ + bool m_igpu_sim; /**< intel gpu simulation */ unordered_map 
m_insts_retired; /**< number of retired inst. per thread */ unordered_map diff --git a/src/trace_read.cc b/src/trace_read.cc index 24304cdd..2f10b11f 100644 --- a/src/trace_read.cc +++ b/src/trace_read.cc @@ -697,11 +697,11 @@ trace_reader_wrapper_c::trace_reader_wrapper_c(macsim_c *simBase) { } trace_reader_wrapper_c::trace_reader_wrapper_c() { - m_dprint_output->close(); - delete m_dprint_output; } trace_reader_wrapper_c::~trace_reader_wrapper_c() { + m_dprint_output->close(); + delete m_dprint_output; delete m_cpu_decoder; delete m_gpu_decoder; } diff --git a/sst-unit-test/references/vectoradd/sdl1/general.stat.out b/sst-unit-test/references/vectoradd/sdl1/general.stat.out index 2199040a..1ec1fd4f 100644 --- a/sst-unit-test/references/vectoradd/sdl1/general.stat.out +++ b/sst-unit-test/references/vectoradd/sdl1/general.stat.out @@ -14,7 +14,7 @@ NUM_REPEAT 0 CYC_COUNT_X86 0 0 -CYC_COUNT_PTX 0 0 +CYC_COUNT_ACC 0 0 AVG_BLOCK_EXE_CYCLE 0 0 diff --git a/sst-unit-test/references/vectoradd/sdl2/general.stat.out b/sst-unit-test/references/vectoradd/sdl2/general.stat.out index 2199040a..1ec1fd4f 100644 --- a/sst-unit-test/references/vectoradd/sdl2/general.stat.out +++ b/sst-unit-test/references/vectoradd/sdl2/general.stat.out @@ -14,7 +14,7 @@ NUM_REPEAT 0 CYC_COUNT_X86 0 0 -CYC_COUNT_PTX 0 0 +CYC_COUNT_ACC 0 0 AVG_BLOCK_EXE_CYCLE 0 0 diff --git a/sst-unit-test/references/vectoradd/sdl3/general.stat.out b/sst-unit-test/references/vectoradd/sdl3/general.stat.out index bea42a99..1845bae1 100644 --- a/sst-unit-test/references/vectoradd/sdl3/general.stat.out +++ b/sst-unit-test/references/vectoradd/sdl3/general.stat.out @@ -14,7 +14,7 @@ NUM_REPEAT 0 CYC_COUNT_X86 0 0 -CYC_COUNT_PTX 3346 3346 +CYC_COUNT_ACC 3346 3346 AVG_BLOCK_EXE_CYCLE 29130 29130 diff --git a/sst-unit-test/references/vectoradd/sdl3/general.stat.out.0 b/sst-unit-test/references/vectoradd/sdl3/general.stat.out.0 index e7b4c179..9ec3568d 100644 --- a/sst-unit-test/references/vectoradd/sdl3/general.stat.out.0 
+++ b/sst-unit-test/references/vectoradd/sdl3/general.stat.out.0 @@ -14,7 +14,7 @@ NUM_REPEAT 0 CYC_COUNT_X86 0 0 -CYC_COUNT_PTX 3346 3346 +CYC_COUNT_ACC 3346 3346 AVG_BLOCK_EXE_CYCLE 29130 29130 diff --git a/sst-unit-test/references/vectoradd/sdl4/general.stat.out b/sst-unit-test/references/vectoradd/sdl4/general.stat.out index da987abc..149f8531 100644 --- a/sst-unit-test/references/vectoradd/sdl4/general.stat.out +++ b/sst-unit-test/references/vectoradd/sdl4/general.stat.out @@ -14,7 +14,7 @@ NUM_REPEAT 0 CYC_COUNT_X86 0 0 -CYC_COUNT_PTX 3278 3278 +CYC_COUNT_ACC 3278 3278 AVG_BLOCK_EXE_CYCLE 28314 28314 diff --git a/tools/x86_trace_generator/trace_generator.cpp b/tools/x86_trace_generator/trace_generator.cpp index c508447f..e9527e0f 100644 --- a/tools/x86_trace_generator/trace_generator.cpp +++ b/tools/x86_trace_generator/trace_generator.cpp @@ -151,6 +151,7 @@ Knob(UINT32, Knob_num_thread, "thread", "1", "Total number of threads to gather Knob(string, Knob_compiler, "compiler", "gcc", "Which compiler was used?"); Knob(string, Knob_pl, "pl", "normal", "Programming Language"); Knob(UINT64, Knob_skip, "skipinst", "0", "Instructions to skip"); +Knob(UINT64, Knob_skip_thread0, "skip_thread0", "0", "skip thread 0"); Knob(UINT64, Knob_max, "max", "0", "Max number of instruction to collect"); Knob(UINT64, Knob_rtn_min, "rmin", "0", "Max number of function calls to collect data"); Knob(UINT64, Knob_rtn_max, "rmax", "0", "Max number of function calls to collect data"); @@ -441,6 +442,9 @@ UINT64 last_count[MAX_THREADS] = {0}; VOID PIN_FAST_ANALYSIS_CALL INST_count(UINT32 count) { THREADID tid = threadMap[PIN_ThreadId()]; + + if ((Knob_skip_thread0.Value()==1) && (tid == 0)) + return; if (tid == 100000) return; @@ -565,6 +569,10 @@ VOID INST_trace(TRACE trace, VOID *v) void instrument(INS ins) { THREADID tid = threadMap[PIN_ThreadId()]; + + if ((Knob_skip_thread0.Value()==1) && (tid == 0)) + return; + if (tid == 100000) return; @@ -908,6 +916,10 @@ VOID InstHMC(ADDRINT 
pc) if (!Knob_enable_hmc.Value()) return; THREADID tid = threadMap[PIN_ThreadId()]; + + if ((Knob_skip_thread0.Value()==1) && (tid == 0)) + return; + if (tid != 0) return; @@ -947,6 +959,9 @@ void ThreadStart(THREADID tid, CONTEXT *ctxt, INT32 flags, void *v) cout << "-> Thread[" << tid << "->" << threadMap[tid] << "] begins." << endl; THREADID threadid = threadMap[tid]; + if ((Knob_skip_thread0.Value()==1) && (threadid == 0)) + return; + if (threadid == 100000) return; @@ -1028,6 +1043,9 @@ void thread_end(void) void thread_end(THREADID threadid) { // THREADID threadid = threadMap[PIN_ThreadId()]; + + if ((Knob_skip_thread0.Value()==1) && (threadid == 0)) + return; if (threadid == 100000) return; @@ -1205,6 +1223,10 @@ void sanity_check(void) void write_inst_to_file(ofstream *file, Inst_info *t_info) { THREADID tid = threadMap[PIN_ThreadId()]; + + if ((Knob_skip_thread0.Value()==1) && (tid == 0)) + return; + if (tid == 100000 || !g_enable_thread_instrument[tid] || g_inst_print_count[tid] > Knob_dump_max.Value()) return;