diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index fcd57864..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-sudo: false
-branches:
-  only:
-    - master
-language: cpp
-compiler:
-  - gcc
-  - clang
-install:
-- if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
-addons:
-  apt:
-    sources:
-    - ubuntu-toolchain-r-test
-    packages:
-    - gcc-4.8
-    - g++-4.8
-    - clang
-    - zlib1g-dev
-script:
-        - scons
-        - scons debug=1
-git:
-        submodules: false
-notifications:
-        email:
-            recipients:
-                - macsim-dev-commits@googlegroups.com
-            on_success: change
-            on_failure: always
diff --git a/INSTALL b/INSTALL
index 3696c4fc..78407e38 100644
--- a/INSTALL
+++ b/INSTALL
@@ -2,10 +2,10 @@
 
 === Requirement ===
 
-  SCons
+  SCons and dependencies
 
   * How to get SCons
-    Ubuntu --> apt-get install scons
+    Ubuntu --> apt-get install scons python-metaconfig
 
 
 === Build steps ===
diff --git a/README.md b/README.md
index a4de7e95..c96ef7db 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,3 @@
-[![Build Status](https://travis-ci.org/gthparch/macsim.svg?branch=master)](https://travis-ci.org/gthparch/macsim)
-
 # Macsim 
 ## Introduction
 
@@ -16,7 +14,7 @@
   cores) and SMT or MT architectures as well.
 * Currently interconnection network model (based on IRIS) and power model (based
   on McPAT) are connected.
-* MacSim is also one of the components of SST, so muiltiple MacSim simulatore
+* MacSim is also one of the components of SST, so multiple MacSim simulators
   can run concurrently.
 * The project has been supported by Intel, NSF, Sandia National Lab.
 
diff --git a/SConscript b/SConscript
index 83afce8c..5cfcdb8b 100644
--- a/SConscript
+++ b/SConscript
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 
 #########################################################################################
 # Author      : Jaekyu Lee (jq.lee17@gmail.com)
@@ -38,7 +38,7 @@ warn_flags = ' '.join(warn_flags)
 env = Environment()
 custom_vars = set(['AS', 'AR', 'CC', 'CXX', 'HOME', 'LD_LIBRARY_PATH', 'PATH', 'RANLIB'])
 
-for key,val in os.environ.iteritems():
+for key,val in os.environ.items():
   if key in custom_vars:
     env[key] = val
 
diff --git a/SConstruct b/SConstruct
index a65d4a57..2995cf00 100644
--- a/SConstruct
+++ b/SConstruct
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 
 #########################################################################################
 # Author      : Jaekyu Lee (jq.lee17@gmail.com)
@@ -8,7 +8,7 @@
 
 import os
 import sys
-import ConfigParser
+import configparser
 
 
 ## Check c++14 support
@@ -48,7 +48,7 @@ def pre_compile_check():
   env = Environment()
   custom_vars = set(['AS', 'AR', 'CC', 'CXX', 'HOME', 'LD_LIBRARY_PATH', 'PATH', 'RANLIB'])
 
-  for key,val in os.environ.iteritems():
+  for key,val in os.environ.items():
     if key in custom_vars:
       env[key] = val
 
@@ -68,17 +68,17 @@ flags = {}
 
 
 ## Configuration from file
-Config = ConfigParser.ConfigParser()
+Config = configparser.ConfigParser()
 Config.read('macsim.config')
-flags['dram']          = Config.get('Library', 'dram', '0')
-flags['power']         = Config.get('Library', 'power', '0')
-flags['iris']          = Config.get('Library', 'iris', '0')
-flags['qsim']          = Config.get('Library', 'qsim', '0')
-flags['debug']         = Config.get('Build', 'debug', '0')
-flags['gprof']         = Config.get('Build', 'gprof', '0')
-flags['pin_3_13_trace'] = Config.get('Build', 'pin_3_13_trace', '0')
-flags['val']           = Config.get('Build_Extra', 'val', '0')
-flags['ramulator']     = Config.get('Library', 'ramulator', '0')
+flags['dram']          = Config.get('Library', 'dram', fallback='0')
+flags['power']         = Config.get('Library', 'power', fallback='0')
+flags['iris']          = Config.get('Library', 'iris', fallback='0')
+flags['qsim']          = Config.get('Library', 'qsim', fallback='0')
+flags['debug']         = Config.get('Build', 'debug', fallback='0')
+flags['gprof']         = Config.get('Build', 'gprof', fallback='0')
+flags['pin_3_13_trace'] = Config.get('Build', 'pin_3_13_trace', fallback='0')
+flags['val']           = Config.get('Build_Extra', 'val', fallback='0')
+flags['ramulator']     = Config.get('Library', 'ramulator', fallback='0')
 
 ## Configuration from commandline
 flags['debug']         = ARGUMENTS.get('debug', flags['debug'])
diff --git a/build.py b/build.py
index d62e320b..757528d0 100755
--- a/build.py
+++ b/build.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 
 #########################################################################################
 # Author      : Jaekyu Lee (jq.lee17@gmail.com)
diff --git a/def/general.stat.def b/def/general.stat.def
index fba83e5a..b0508bfe 100644
--- a/def/general.stat.def
+++ b/def/general.stat.def
@@ -38,7 +38,7 @@ DEF_STAT(EXE_TIME, COUNT, NO_RATIO)
 DEF_STAT(NUM_REPEAT, COUNT, NO_RATIO)
 
 DEF_STAT(CYC_COUNT_X86, COUNT, NO_RATIO)
-DEF_STAT(CYC_COUNT_PTX, COUNT, NO_RATIO)
+DEF_STAT(CYC_COUNT_ACC, COUNT, NO_RATIO)
 
 DEF_STAT(AVG_BLOCK_EXE_CYCLE, COUNT, NO_RATIO)
 DEF_STAT(AVG_BLOCK_EXE_CYCLE_BASE, COUNT, NO_RATIO)
diff --git a/doc/latex/Makefile b/doc/latex/Makefile
index 51c48f90..2290b38b 100644
--- a/doc/latex/Makefile
+++ b/doc/latex/Makefile
@@ -9,4 +9,4 @@ all:
 	cp macsim.pdf ../;
 
 clean: 
-	$(RM) *.log *.aux *.blg *.bbl *.dvi *.brf macsim.ps macsim.pdf
+	$(RM) *.out *.log *.aux *.blg *.bbl *.toc *.styles *.dvi *.brf macsim.ps macsim.pdf
diff --git a/doc/latex/trace.tex b/doc/latex/trace.tex
index cafd2f80..3aa5c7c6 100644
--- a/doc/latex/trace.tex
+++ b/doc/latex/trace.tex
@@ -381,42 +381,81 @@ \subsection{trace\_xx.raw}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 The trace\_xx.raw file is generated for each thread/warp and contains the
-dynamic instruction trace for the thread/warp in binary
-format. The structure/format for encoding instructions is the same in
-both x86 and PTX traces and looks like as follows (in order):
+dynamic instruction trace for the thread/warp in binary format. The
+structure/format for encoding instructions differs between x86 and PTX. In the
+source, this is defined in \Verb+src/trace_read.h+.
 
+For x86, the trace entry format is (\Verb+struct trace_info_cpu_s+):
 
-%trace format for an instruction in trace_xx.raw
+\vspace{0.2in}
+\begin{footnotesize}
+\begin{tabular}{l c c l l}
+Type            & Size (Bytes) & Offset(Bytes) & Field                     & Description                                            \\ \hline \hline
+\Verb+uint8_t+  & 1            & 0             & \Verb+m_num_read_regs+    & number of source registers                             \\
+\Verb+uint8_t+  & 1            & 1             & \Verb+m_num_dest_regs+    & number of destination registers                        \\
+\Verb+uint8_t+  & 9            & 2             & \Verb+m_src[MAX_SRC_NUM]+ & source register IDs                                    \\
+\Verb+uint8_t+  & 6            & 11             & \Verb+m_dst[MAX_DST_NUM]+ & destination register IDs                               \\
+\Verb+uint8_t+  & 1            & 17             & \Verb+m_cf_type+          & branch type                                            \\
+\Verb+bool+     & 1            & 18             & \Verb+m_has_immediate+    & indicates whether this instruction has immediate field \\
+\Verb+uint8_t+  & 1            & 19             & \Verb+m_opcode+           & opcode                                                 \\
+\Verb+bool+     & 1            & 20             & \Verb+m_has_st+           & indicates whether this instruction has store operation \\
+\Verb+bool+     & 1            & 21             & \Verb+m_is_fp+            & indicates whether this instruction is a FP operation   \\
+\Verb+bool+     & 1            & 22             & \Verb+m_write_flg+        & write flag                                             \\
+\Verb+uint8_t+  & 1            & 23             & \Verb+m_num_ld+           & number of load operations                              \\
+\Verb+uint8_t+  & 1            & 24             & \Verb+m_size+             & instruction size                                       \\
+\Verb+uint32_t+ & 4            & 28             & \Verb+m_ld_vaddr1+        & load address 1                                         \\
+\Verb+uint32_t+ & 4            & 32             & \Verb+m_ld_vaddr2+        & load address 2                                         \\
+\Verb+uint32_t+ & 4            & 36             & \Verb+m_st_vaddr+         & store address                                          \\
+\Verb+uint32_t+ & 4            & 40             & \Verb+m_instruction_addr+ & PC address                                             \\
+\Verb+uint32_t+ & 4            & 44             & \Verb+m_branch_target+    & branch target address                                  \\
+\Verb+uint8_t+  & 1            & 48             & \Verb+m_mem_read_size+    & memory read size                                       \\ 
+\Verb+uint8_t+  & 1            & 49             & \Verb+m_mem_write_size+   & memory write size                                      \\
+\Verb+bool+     & 1            & 50             & \Verb+m_rep_dir+          & repetition direction                                   \\
+\Verb+bool+     & 1            & 51             & \Verb+m_actually_taken+   & indicates whether branch is actually taken             \\
+\end{tabular}
+\end{footnotesize}
+\vspace{0.2in}
+
+For PTX, the trace entry format is (\Verb+struct trace_info_gpu_small_s+):
 
 \vspace{0.2in}
 \begin{footnotesize}
-\begin{tabular}{l c l l}
-Type            & Size (Bytes) & Field                     & Description                                            \\ \hline \hline
-\Verb+uint8_t+  & 1            & \Verb+m_num_read_regs+    & number of source registers                             \\
-\Verb+uint8_t+  & 1            & \Verb+m_num_dest_regs+    & number of destination registers                        \\
-\Verb+uint8_t+  & 9            & \Verb+m_src[MAX_SRC_NUM]+ & source register IDs                                    \\
-\Verb+uint8_t+  & 6            & \Verb+m_dst[MAX_DST_NUM]+ & destination register IDs                               \\
-\Verb+uint8_t+  & 1            & \Verb+m_cf_type+          & branch type                                            \\
-\Verb+bool+     & 1            & \Verb+m_has_immediate+    & indicates whether this instruction has immediate field \\
-\Verb+uint8_t+  & 1            & \Verb+m_opcode+           & opcode                                                 \\
-\Verb+bool+     & 1            & \Verb+m_has_st+           & indicates whether this instruction has store operation \\
-\Verb+bool+     & 1            & \Verb+m_is_fp+            & indicates whether this instruction is a FP operation   \\
-\Verb+bool+     & 1            & \Verb+m_write_flg+        & write flag                                             \\
-\Verb+uint8_t+  & 1            & \Verb+m_num_ld+           & number of load operations                              \\
-\Verb+uint8_t+  & 1            & \Verb+m_size+             & instruction size                                       \\
-\Verb+uint32_t+ & 4            & \Verb+m_ld_vaddr1+        & load address 1                                         \\
-\Verb+uint32_t+ & 4            & \Verb+m_ld_vaddr2+        & load address 2                                         \\
-\Verb+uint32_t+ & 4            & \Verb+m_st_vaddr+         & store address                                          \\
-\Verb+uint32_t+ & 4            & \Verb+m_instruction_addr+ & PC address                                             \\
-\Verb+uint32_t+ & 4            & \Verb+m_branch_target+    & branch target address                                  \\
-\Verb+uint8_t+  & 1            & \Verb+m_mem_read_size+    & memory read size                                       \\ 
-\Verb+uint8_t+  & 1            & \Verb+m_mem_write_size+   & memory write size                                      \\
-\Verb+bool+     & 1            & \Verb+m_rep_dir+          & repetition direction                                   \\
-\Verb+bool+     & 1            & \Verb+m_actually_taken+   & indicates whether branch is actually taken             \\
+\begin{tabular}{l c c l l}
+Type             & Size (Bytes) & Offset(Bytes) & Field                         & Description                                       \\ \hline \hline
+\Verb+uint8_t+   & 1            & 0             & \Verb+m_opcode+               & from \Verb+GPU_OPCODE_ENUM+                       \\
+\Verb+bool+      & 1            & 1             & \Verb+m_is_fp+                & whether this instruction deals with \Verb+float+s \\
+\Verb+bool+      & 1            & 2             & \Verb+m_is_load+              & whether this instruction loads from memory        \\
+\Verb+uint8_t+   & 1            & 3             & \Verb+m_cf_type+              & branch type                                       \\
+\Verb+uint8_t+   & 1            & 4             & \Verb+m_num_read_regs+        & number of source registers                        \\
+\Verb+uint8_t+   & 1            & 5             & \Verb+m_num_dest_regs+        & number of destination registers                   \\
+\Verb+uint16_t+  & 10           & 6             & \Verb+m_src[MAX_GPU_SRC_NUM]+ & source register IDs                               \\
+\Verb+uint16_t+  & 8            & 16            & \Verb+m_dst[MAX_GPU_DST_NUM]+ & destination register IDs                          \\
+\Verb+uint8_t+   & 1            & 24            & \Verb+m_size+                 & instruction size                                  \\
+\Verb+uint32_t+  & 4            & 28            & \Verb+m_active_mask+          & warp's non-blocked threads                        \\
+\Verb+uint32_t+  & 4            & 32            & \Verb+m_br_taken_mask+        & warp's threads that took branch                   \\
+\Verb+uint64_t+  & 8            & 40            & \Verb+m_inst_addr+            & address of current instruction                    \\
+\Verb+uint64_t+  & 8            & 48            & \Verb+m_br_target_addr+       & address to branch to                              \\
+\Verb+uint64_t+  & 8            & 56            & \Verb+m_reconv_inst_addr+     & address of branch reconvergence                   \\
+\Verb+uint64_t+  & 8            & 56            & \Verb+m_mem_addr+             & memory address to load/store to                   \\
+\Verb+uint8_t+   & 1            & 64            & \Verb+m_barrier_id+           &                                                   \\
+\Verb+uint8_t+   & 1            & 64            & \Verb+m_mem_access_size+      & memory access granularity in bytes                \\
+\Verb+uint16_t+  & 2            & 66            & \Verb+m_num_barrier_threads+  &                                                   \\
+\Verb+uint8_t+   & 1            & 68            & \Verb+m_addr_space+           & memory region to work on                          \\
+\Verb+uint8_t+   & 1            & 68            & \Verb+m_level+                &                                                   \\
+\Verb+uint8_t+   & 1            & 69            & \Verb+m_cache_level+          &                                                   \\
+\Verb+uint8_t+   & 1            & 69            & \Verb+m_cache_operator+       &                                                   \\
 \end{tabular}
 \end{footnotesize}
 \vspace{0.2in}
 
+Additionally, if a PTX trace entry is a load or a store, it will be followed by
+32 8-byte (aligned) addresses, each corresponding to the address used by one of
+the warps. Notably, the code for handling this was built on top of the existing
+simulator. As a result, the output generated by \Verb+debug_trace_read+ or
+\Verb+debug_print_trace+ will contain one extra instruction after the memory
+operation, erroneously decoded from the memory address data. Nonetheless, the
+simulation results are correct --- only the debug output is affected.
+
 
 Note that the raw trace is compressed with zlib to reduce the sizes of
 the generated trace files, and the size of each field is the size
diff --git a/doc/macsim.pdf b/doc/macsim.pdf
index 54aa847c..ab79e0e3 100644
Binary files a/doc/macsim.pdf and b/doc/macsim.pdf differ
diff --git a/internal b/internal
index 12d986cd..2a78c449 160000
--- a/internal
+++ b/internal
@@ -1 +1 @@
-Subproject commit 12d986cd890180ec7cfb0be0e6f4358f8da888c1
+Subproject commit 2a78c4493c05682101c0ce6ff66c673a2f332800
diff --git a/macsimComponent.cpp b/macsimComponent.cpp
index 80888627..4bfc53af 100644
--- a/macsimComponent.cpp
+++ b/macsimComponent.cpp
@@ -60,7 +60,16 @@ macsimComponent::macsimComponent(ComponentId_t id, Params& params)
     m_clock_freq,
     new Clock::Handler<macsimComponent>(this, &macsimComponent::ticReceived));
 
-  m_ptx_core = params.find<bool>("ptx_core", 0);
+  if (params.find<bool>("ptx_core", 0)) {
+    m_acc_type = PTX_CORE;
+    m_acc_core = 1;
+  } else if (params.find<bool>("igpu_core", 0)) {
+    m_acc_type = IGPU_CORE;
+    m_acc_core = 1;
+  } else {
+    m_acc_core = 0;
+    m_acc_type = NO_ACC;
+  }
   m_num_link = params.find<uint32_t>("num_link", 1);
   configureLinks(params, tc);
 
@@ -150,7 +159,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) {
     m_data_cache_requests.push_back(std::map<uint64_t, uint64_t>());
     m_data_cache_responses.push_back(std::set<uint64_t>());
 
-    if (m_ptx_core) {
+    if (m_acc_core) {
       auto ccache_link = loadUserSubComponent<Interfaces::SimpleMem>(
         "core" + std::to_string(l) + "-ccache", ComponentInfo::SHARE_NONE, tc,
         new Interfaces::SimpleMem::Handler<macsimComponent>(
@@ -194,7 +203,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) {
   m_data_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
   m_data_cache_response_counters = std::vector<uint64_t>(m_num_link, 0);
 
-  if (m_ptx_core) {
+  if (m_acc_core) {
     m_const_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
     m_const_cache_response_counters = std::vector<uint64_t>(m_num_link, 0);
     m_texture_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
@@ -275,7 +284,7 @@ void macsimComponent::setup() {
     new Callback<macsimComponent, bool, int, uint64_t>(
       this, &macsimComponent::strobeDataCacheRespQ);
 
-  if (m_ptx_core) {
+  if (m_acc_core) {
     CallbackSendConstCacheRequest* scr =
       new Callback<macsimComponent, void, int, uint64_t, uint64_t, int>(
         this, &macsimComponent::sendConstCacheRequest);
@@ -347,7 +356,7 @@ bool macsimComponent::ticReceived(Cycle_t) {
   // Debugging
   if (m_cycle % 100000 == 0) {
     for (unsigned int l = 0; l < m_num_link; ++l) {
-      if (m_ptx_core) {
+      if (m_acc_core) {
         MSC_DEBUG(
           "Core[%2d] I$: (%lu, %lu), D$: (%lu, %lu) C$: (%lu, %lu), T$: (%lu, "
           "%lu)\n",
diff --git a/macsimComponent.h b/macsimComponent.h
index c7d40781..cc625f9e 100644
--- a/macsimComponent.h
+++ b/macsimComponent.h
@@ -105,7 +105,8 @@ class macsimComponent : public SST::Component
 
   macsim_c *m_macsim;
   bool m_sim_running;
-  bool m_ptx_core;
+  bool m_acc_core;
+  ACC_Type m_acc_type;
   bool m_cube_connected;
   bool m_debug_all;
   int64_t m_debug_addr;
diff --git a/src/config.h b/src/config.h
index bda86158..8a334276 100644
--- a/src/config.h
+++ b/src/config.h
@@ -50,6 +50,7 @@ POSSIBILITY OF SUCH DAMAGE.
         m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \
                        ? true                                                 \
                        : false;                                               \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L1_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS);                        \
       } else if (level == MEM_L2) {                                           \
@@ -65,6 +66,7 @@ POSSIBILITY OF SUCH DAMAGE.
         m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \
                        ? true                                                 \
                        : false;                                               \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L2_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS);                        \
       } else if (level == MEM_L3) {                                           \
@@ -79,6 +81,7 @@ POSSIBILITY OF SUCH DAMAGE.
         m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \
                        ? true                                                 \
                        : false;                                               \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L3_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS);                        \
       } else if (m_level == MEM_LLC) {                                        \
@@ -93,6 +96,7 @@ POSSIBILITY OF SUCH DAMAGE.
         m_igpu_sim = m_simBase->m_knobs->KNOB_CORE_TYPE->getValue() == "igpu" \
                        ? true                                                 \
                        : false;                                               \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS);                         \
         m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS);                       \
       }                                                                       \
@@ -112,6 +116,7 @@ POSSIBILITY OF SUCH DAMAGE.
           m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu"     \
             ? true                                                            \
             : false;                                                          \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L1_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS);                        \
       } else if (level == MEM_L2) {                                           \
@@ -129,6 +134,7 @@ POSSIBILITY OF SUCH DAMAGE.
           m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu"     \
             ? true                                                            \
             : false;                                                          \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L2_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS);                        \
       } else if (level == MEM_L3) {                                           \
@@ -145,6 +151,7 @@ POSSIBILITY OF SUCH DAMAGE.
           m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu"     \
             ? true                                                            \
             : false;                                                          \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L3_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS);                        \
       } else if (level == MEM_LLC) {                                          \
@@ -161,6 +168,7 @@ POSSIBILITY OF SUCH DAMAGE.
           m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE->getValue() == "igpu"     \
             ? true                                                            \
             : false;                                                          \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS);                         \
         m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS);                       \
       }                                                                       \
@@ -180,6 +188,7 @@ POSSIBILITY OF SUCH DAMAGE.
           m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu"      \
             ? true                                                            \
             : false;                                                          \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L1_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L1_WRITE_PORTS);                        \
       } else if (level == MEM_L2) {                                           \
@@ -197,6 +206,7 @@ POSSIBILITY OF SUCH DAMAGE.
           m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu"      \
             ? true                                                            \
             : false;                                                          \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L2_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L2_WRITE_PORTS);                        \
       } else if (level == MEM_L3) {                                           \
@@ -213,6 +223,7 @@ POSSIBILITY OF SUCH DAMAGE.
           m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu"      \
             ? true                                                            \
             : false;                                                          \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_L3_READ_PORTS);                          \
         m_num_write_port = *KNOB(KNOB_L3_WRITE_PORTS);                        \
       } else if (level == MEM_LLC) {                                          \
@@ -229,6 +240,7 @@ POSSIBILITY OF SUCH DAMAGE.
           m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE->getValue() == "igpu"      \
             ? true                                                            \
             : false;                                                          \
+        m_acc_sim = (m_igpu_sim || m_ptx_sim);                                \
         m_num_read_port = *KNOB(KNOB_LLC_READ_PORTS);                         \
         m_num_write_port = *KNOB(KNOB_LLC_WRITE_PORTS);                       \
       }                                                                       \
@@ -274,42 +286,44 @@ POSSIBILITY OF SUCH DAMAGE.
       break;                                                     \
   }
 
-#define RETIRE_CONFIG()                                                        \
-  switch (m_unit_type) {                                                       \
-    case UNIT_SMALL:                                                           \
-      m_knob_width = *m_simBase->m_knobs->KNOB_WIDTH;                          \
-      m_knob_ptx_sim =                                                         \
-        static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx"      \
-          ? true                                                               \
-          : false;                                                             \
-      m_knob_igpu_sim =                                                        \
-        static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu"     \
-          ? true                                                               \
-          : false;                                                             \
-      break;                                                                   \
-    case UNIT_MEDIUM:                                                          \
-      m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH;                   \
-      m_knob_ptx_sim = static_cast<string>(                                    \
-                         *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx"  \
-                         ? true                                                \
-                         : false;                                              \
-      m_knob_igpu_sim =                                                        \
-        static_cast<string>(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) ==     \
-            "igpu"                                                             \
-          ? true                                                               \
-          : false;                                                             \
-      break;                                                                   \
-    case UNIT_LARGE:                                                           \
-      m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH;                    \
-      m_knob_ptx_sim = static_cast<string>(                                    \
-                         *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx"   \
-                         ? true                                                \
-                         : false;                                              \
-      m_knob_igpu_sim = static_cast<string>(                                   \
-                          *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu" \
-                          ? true                                               \
-                          : false;                                             \
-      break;                                                                   \
+#define RETIRE_CONFIG()                                                    \
+  switch (m_unit_type) {                                                   \
+    case UNIT_SMALL:                                                       \
+      m_knob_width = *m_simBase->m_knobs->KNOB_WIDTH;                      \
+      m_ptx_sim =                                                          \
+        static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx"  \
+          ? true                                                           \
+          : false;                                                         \
+      m_igpu_sim =                                                         \
+        static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \
+          ? true                                                           \
+          : false;                                                         \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                               \
+      break;                                                               \
+    case UNIT_MEDIUM:                                                      \
+      m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH;               \
+      m_ptx_sim = static_cast<string>(                                     \
+                    *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx"   \
+                    ? true                                                 \
+                    : false;                                               \
+      m_igpu_sim = static_cast<string>(                                    \
+                     *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \
+                     ? true                                                \
+                     : false;                                              \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                               \
+      break;                                                               \
+    case UNIT_LARGE:                                                       \
+      m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH;                \
+      m_ptx_sim = static_cast<string>(                                     \
+                    *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx"    \
+                    ? true                                                 \
+                    : false;                                               \
+      m_igpu_sim = static_cast<string>(                                    \
+                     *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu"  \
+                     ? true                                                \
+                     : false;                                              \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                               \
+      break;                                                               \
   }
 
 #define EXEC_CONFIG()                                                      \
@@ -332,6 +346,7 @@ POSSIBILITY OF SUCH DAMAGE.
         static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "igpu" \
           ? true                                                           \
           : false;                                                         \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                               \
       break;                                                               \
                                                                            \
     case UNIT_MEDIUM:                                                      \
@@ -344,10 +359,7 @@ POSSIBILITY OF SUCH DAMAGE.
                     *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "ptx"   \
                     ? true                                                 \
                     : false;                                               \
-      m_igpu_sim = static_cast<string>(                                    \
-                     *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu" \
-                     ? true                                                \
-                     : false;                                              \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                               \
       break;                                                               \
                                                                            \
     case UNIT_LARGE:                                                       \
@@ -364,6 +376,7 @@ POSSIBILITY OF SUCH DAMAGE.
                      *m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "igpu"  \
                      ? true                                                \
                      : false;                                              \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                               \
       break;                                                               \
   }                                                                        \
   m_max_port[gen_ALLOCQ] = int_sched_rate;                                 \
@@ -444,14 +457,18 @@ POSSIBILITY OF SUCH DAMAGE.
       m_knob_icache_line_size = *m_simBase->m_knobs->KNOB_ICACHE_LINE_SIZE;    \
       m_fetch_modulo = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO - 1;          \
       if (static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx") { \
-        m_knob_ptx_sim = true;                                                 \
+        m_ptx_sim = true;                                                      \
         m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO;             \
       } else {                                                                 \
-        m_knob_ptx_sim = false;                                                \
+        m_ptx_sim = false;                                                     \
         m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO;             \
       }                                                                        \
       break;                                                                   \
-                                                                               \
+      m_igpu_sim = static_cast<string>(                                        \
+                     *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu"     \
+                     ? true                                                    \
+                     : false;                                                  \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                                   \
     case UNIT_MEDIUM:                                                          \
       m_knob_width = *m_simBase->m_knobs->KNOB_MEDIUM_WIDTH;                   \
       m_knob_fetch_width = *m_simBase->m_knobs->KNOB_FETCH_MEDIUM_WDITH;       \
@@ -460,13 +477,18 @@ POSSIBILITY OF SUCH DAMAGE.
       m_fetch_modulo = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO - 1;          \
       if (static_cast<string>(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) ==   \
           "ptx") {                                                             \
-        m_knob_ptx_sim = true;                                                 \
+        m_ptx_sim = true;                                                      \
         m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO;             \
       } else {                                                                 \
-        m_knob_ptx_sim = false;                                                \
+        m_ptx_sim = false;                                                     \
         m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO;             \
       }                                                                        \
       break;                                                                   \
+      m_igpu_sim = static_cast<string>(                                        \
+                     *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu"     \
+                     ? true                                                    \
+                     : false;                                                  \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                                   \
                                                                                \
     case UNIT_LARGE:                                                           \
       m_knob_width = *m_simBase->m_knobs->KNOB_LARGE_WIDTH;                    \
@@ -476,13 +498,18 @@ POSSIBILITY OF SUCH DAMAGE.
       m_fetch_modulo = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO - 1;          \
       if (static_cast<string>(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) ==    \
           "ptx") {                                                             \
-        m_knob_ptx_sim = true;                                                 \
+        m_ptx_sim = true;                                                      \
         m_fetch_ratio = *m_simBase->m_knobs->KNOB_GPU_FETCH_RATIO;             \
       } else {                                                                 \
-        m_knob_ptx_sim = false;                                                \
+        m_ptx_sim = false;                                                     \
         m_fetch_ratio = *m_simBase->m_knobs->KNOB_CPU_FETCH_RATIO;             \
       }                                                                        \
       break;                                                                   \
+      m_igpu_sim = static_cast<string>(                                        \
+                     *m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) == "igpu"     \
+                     ? true                                                    \
+                     : false;                                                  \
+      m_acc_sim = (m_igpu_sim || m_ptx_sim);                                   \
   }
 
 #define CORE_CONFIG()                                                          \
diff --git a/src/dram_ctrl.cc b/src/dram_ctrl.cc
index b8b57e6c..bfd8384d 100644
--- a/src/dram_ctrl.cc
+++ b/src/dram_ctrl.cc
@@ -578,9 +578,9 @@ void dram_ctrl_c::send(void) {
     for (auto I = m_output_buffer->begin(), E = m_output_buffer->end(); I != E;
          ++I) {
       mem_req_s* req = (*I);
-      if (req_type_allowed[req->m_ptx] == false) continue;
+      if (req_type_allowed[req->m_acc] == false) continue;
 
-      req_type_checked[req->m_ptx] = true;
+      req_type_checked[req->m_acc] = true;
       req->m_msg_type = NOC_FILL;
 
       bool insert_packet =
@@ -764,7 +764,7 @@ void dram_ctrl_c::channel_schedule_data(void) {
               m_current_list[bank]->m_req->m_id);
         ASSERT(m_current_list[bank]->m_state == DRAM_DATA);
         m_data_ready[bank] = acquire_data_bus(
-          ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_ptx);
+          ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_acc);
         m_data_avail[bank] = ULLONG_MAX;
         m_current_list[bank]->m_state = DRAM_DATA_WAIT;
       } else
diff --git a/src/exec.cc b/src/exec.cc
index 7fb9600f..d9ba5932 100644
--- a/src/exec.cc
+++ b/src/exec.cc
@@ -538,7 +538,7 @@ bool exec_c::exec(int thread_id, int entry, uop_c* uop) {
       use_port(thread_id, entry);
 
       // GPU : if we use load-block policy, block current thread due to load instruction
-      if (uop_latency == -1 && m_ptx_sim &&
+      if (uop_latency == -1 && m_acc_sim &&
           *m_simBase->m_knobs->KNOB_FETCH_ONLY_LOAD_READY) {
         m_frontend->set_load_wait(uop->m_thread_id, uop->m_uop_num);
 
@@ -741,7 +741,7 @@ void exec_c::br_exec(uop_c* uop) {
   }
 
   // GPU : stall on branch policy
-  if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
+  if (m_acc_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
     m_frontend->set_br_ready(uop->m_thread_id);
   }
 }
@@ -793,7 +793,7 @@ void exec_c::run_a_cycle(void) {
     if (responseArrived) {
       DEBUG_CORE(m_core_id, "key found: 0x%lx, addr = 0x%llx\n", key,
                  uop->m_vaddr);
-      if (m_ptx_sim || m_igpu_sim) {
+      if (m_acc_sim || m_igpu_sim) {
         if (uop->m_parent_uop) {
           uop_c* puop = uop->m_parent_uop;
           ++puop->m_num_child_uops_done;
@@ -883,7 +883,7 @@ int exec_c::access_data_cache(uop_c* uop) {
   auto i = m_uop_buffer.find(key);
   ASSERTM(m_uop_buffer.end() == i, "uop has already been executed!\n");
 
-  int block_size = m_ptx_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue()
+  int block_size = m_acc_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue()
                              : KNOB(KNOB_L1_LARGE_LINE_SIZE)->getValue();
   // Addr block_addr = uop->m_vaddr & ~((uint64_t)block_size-1);
 
@@ -936,7 +936,7 @@ int exec_c::access_data_cache(uop_c* uop) {
 }
 
 int exec_c::access_const_texture_cache(uop_c* uop) {
-  ASSERT(m_ptx_sim);
+  ASSERT(m_acc_sim);
   ASSERT(uop->m_mem_type == MEM_LD_CM || uop->m_mem_type == MEM_LD_TM);
 
   // assign unique key to each memory request; this will be used later in time for strobbing
diff --git a/src/exec.h b/src/exec.h
index 6f7a3914..ab80e142 100644
--- a/src/exec.h
+++ b/src/exec.h
@@ -184,8 +184,9 @@ class exec_c
   uns16 m_mem_sched_rate; /**< memory schedule rate */
   uns16 m_fp_sched_rate; /**< fp schedule rate */
   uns8 m_dcache_cycles; /**< L1 cache latency */
-  bool m_ptx_sim; /**< gpu simulation */
+  bool m_acc_sim; /**< accelerator (PTX or iGPU) simulation */
   bool m_igpu_sim; /**< intel gpu simulation */
+  bool m_ptx_sim; /**< PTX simulation */
   int m_latency[NUM_UOP_TYPES]; /**< latency map */
   Counter m_cur_core_cycle; /**< current core cycle */
   int m_max_port[max_ALLOCQ]; /**< maximum port */
diff --git a/src/frontend.cc b/src/frontend.cc
index c9146543..fd41e17d 100644
--- a/src/frontend.cc
+++ b/src/frontend.cc
@@ -191,6 +191,8 @@ void frontend_c::run_a_cycle(void) {
   // fetch every KNOB_FETCH_RATIO cycle
   // CPU : every cycle
   // NVIDIA G80 : 1/4 cycles, NVIDIA Fermi: 1/2 cycles
+  // check core type for the fetch rate
+  // Hyesoon: Aug-2020 please check whether this needs to be changed with hetero and igpu
   if (m_fetch_ratio != 1) {
     m_fetch_modulo++;
     if (m_fetch_modulo == m_fetch_ratio)
@@ -300,7 +302,7 @@ void frontend_c::run_a_cycle(void) {
 
   // TONAGESH
   // nagesh - comments for BAR are incomplete...
-  if (m_knob_ptx_sim) {
+  if (m_ptx_sim) {
     // handling of BAR instruction in PTX - can/should this be moved?
     // do we have any blocks for which all warps have reached (retired)
     // their next barrier?
@@ -346,7 +348,7 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
 
   // First time : set up traces for current thread
   if (fetch_data->m_first_time) {
-    m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_knob_ptx_sim);
+    m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_ptx_sim);
     fetch_data->m_first_time = false;
 
     ++m_core->m_inst_fetched[tid]; /*! initial increase */
@@ -356,11 +358,18 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
 
     // set up initial fetch address
     thread_s *thread = m_core->get_trace_info(tid);
-    if (thread->m_ptx) {
-      trace_info_gpu_s *prev_trace_info =
-        static_cast<trace_info_gpu_s *>(thread->m_prev_trace_info);
-      fetch_data->m_MT_scheduler.m_next_fetch_addr =
-        prev_trace_info->m_inst_addr;
+    if (thread->m_acc) {
+      if (m_ptx_sim) {
+        trace_info_gpu_s *prev_trace_info =
+          static_cast<trace_info_gpu_s *>(thread->m_prev_trace_info);
+        fetch_data->m_MT_scheduler.m_next_fetch_addr =
+          prev_trace_info->m_inst_addr;
+      } else if (m_igpu_sim) {
+        trace_info_igpu_s *prev_trace_info =
+          static_cast<trace_info_igpu_s *>(thread->m_prev_trace_info);
+        fetch_data->m_MT_scheduler.m_next_fetch_addr =
+          prev_trace_info->m_instruction_addr;
+      }
     } else {
       if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "x86") {
         trace_info_cpu_s *prev_trace_info =
@@ -372,11 +381,6 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
           static_cast<trace_info_a64_s *>(thread->m_prev_trace_info);
         fetch_data->m_MT_scheduler.m_next_fetch_addr =
           prev_trace_info->m_instruction_addr;
-      } else if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu") {
-        trace_info_igpu_s *prev_trace_info =
-          static_cast<trace_info_igpu_s *>(thread->m_prev_trace_info);
-        fetch_data->m_MT_scheduler.m_next_fetch_addr =
-          prev_trace_info->m_instruction_addr;
       } else {
         ASSERTM(0, "Wrong core type %s\n",
                 KNOB(KNOB_LARGE_CORE_TYPE)->getValue().c_str());
@@ -457,8 +461,8 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
         ASSERT(new_uop);
 
         // read an uop from the traces
-        if (!m_simBase->m_trace_reader->get_uops_from_traces(
-              m_core_id, new_uop, tid, m_knob_ptx_sim)) {
+        if (!m_simBase->m_trace_reader->get_uops_from_traces(m_core_id, new_uop,
+                                                             tid, m_ptx_sim)) {
           // couldn't get an uop
           DEBUG_CORE(m_core_id, "not success\n");
           m_uop_pool->release_entry(new_uop->free());
@@ -631,7 +635,7 @@ bool frontend_c::access_icache(int tid, Addr fetch_addr,
     int result = m_simBase->m_memory->new_mem_req(
       MRT_IFETCH, line_addr, m_knob_icache_line_size, false, false, 0, NULL,
       icache_fill_line_wrapper, m_core->get_unique_uop_num(), NULL, m_core_id,
-      tid, m_knob_ptx_sim);
+      tid, m_ptx_sim);
 
     // mshr full
     if (!result) return false;
@@ -712,7 +716,7 @@ bool frontend_c::icache_fill_line(mem_req_s *req) {
   if (m_icache->access_cache(req->m_addr, &line_addr, false, req->m_appl_id) ==
       NULL) {
     m_icache->insert_cache(req->m_addr, &line_addr, &repl_line_addr,
-                           req->m_appl_id, req->m_ptx);
+                           req->m_appl_id, req->m_acc);
     POWER_CORE_EVENT(req->m_core_id, POWER_ICACHE_W);
   }
 
@@ -806,7 +810,7 @@ int frontend_c::predict_bpu(uop_c *uop) {
   // no branch prediction
   else {
     // GPU : stall on branch policy, stop fetching
-    if (m_knob_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
+    if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
       set_br_wait(uop->m_thread_id);
       mispredicted = false;
     }
@@ -906,7 +910,7 @@ int frontend_c::fetch_rr(void) {
     }
 
     // check the thread is ready to fetch
-    if (m_knob_ptx_sim) {
+    if (m_ptx_sim) {
       // GPU : stall on branch policy, check whether previous branch has been resolved
       if (*m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR &&
           !check_br_ready(fetch_id)) {
diff --git a/src/frontend.h b/src/frontend.h
index e6399f06..d193731f 100644
--- a/src/frontend.h
+++ b/src/frontend.h
@@ -407,7 +407,9 @@ class frontend_c
   uns m_knob_icache_line_size; /**< icache line size */
   bool m_fe_stall; /**< frontend stalled */
   bool m_fe_running; /**< enabled frontend */
-  bool m_knob_ptx_sim; /**< GPU simulation */
+  bool m_ptx_sim; /**< PTX simulation */
+  bool m_igpu_sim; /**< iGPU simulation */
+  bool m_acc_sim; /**< Accelerator simulation */
   bool m_ready_thread_available; /**< ready thread available */
   bool m_last_fetch_tid_failed;
   core_c* m_core; /**< core pointer */
diff --git a/src/global_types.h b/src/global_types.h
index 42fbaffc..6cf323ac 100644
--- a/src/global_types.h
+++ b/src/global_types.h
@@ -80,4 +80,9 @@ typedef enum uop_latency_map {  // enum for x86 latency maps - Michael
   NUM_LATENCY_MAPS
 } latency_map;
 
+typedef enum _ACC_Type_enum {
+  NO_ACC = 0, /**< no accelerator */
+  PTX_CORE, /**< PTX core */
+  IGPU_CORE /**< IGPU core */
+} ACC_Type;
 #endif
diff --git a/src/macsim.cc b/src/macsim.cc
index c284675f..92d5e414 100644
--- a/src/macsim.cc
+++ b/src/macsim.cc
@@ -336,7 +336,7 @@ void macsim_c::init_cores(int num_max_core) {
 
     // insert to the core type pool
     if (static_cast<string>(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx")
-      m_ptx_core_pool.push(ii);
+      m_acc_core_pool.push(ii);
     else
       m_x86_core_pool.push(ii);
   }
@@ -352,7 +352,7 @@ void macsim_c::init_cores(int num_max_core) {
     // insert to the core type pool
     if (static_cast<string>(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) ==
         "ptx")
-      m_ptx_core_pool.push(ii + total_core);
+      m_acc_core_pool.push(ii + total_core);
     else
       m_x86_core_pool.push(ii + total_core);
   }
@@ -367,7 +367,7 @@ void macsim_c::init_cores(int num_max_core) {
 
     // insert to the core type pool
     if (static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx")
-      m_ptx_core_pool.push(ii + total_core);
+      m_acc_core_pool.push(ii + total_core);
     else
       m_x86_core_pool.push(ii + total_core);
   }
diff --git a/src/macsim.h b/src/macsim.h
index 34dc50ec..9c12ddca 100644
--- a/src/macsim.h
+++ b/src/macsim.h
@@ -243,7 +243,7 @@ class macsim_c
   // process manager
   process_manager_c *m_process_manager; /**< process manager */
   queue<int> m_x86_core_pool; /**< x86 cores pool */
-  queue<int> m_ptx_core_pool; /**< GPU cores pool */
+  queue<int> m_acc_core_pool; /**< GPU cores pool */
   multi_key_map_c *m_block_id_mapper; /**< block id mapper */
 
   // data structure pools (to reduce overhead of memory allocation)
diff --git a/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc b/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc
index 4c0f4e58..add1b9f0 100644
--- a/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc
+++ b/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc
@@ -249,13 +249,12 @@ SimpleRouter::handle_link_arrival( int port, LinkData* data )
   }
   
   //track flit_id of tail flit
-                    cout << manifold::kernel::Manifold::NowTicks() << ",p," 
-                        << ((HeadFlit*)data->f)->req->m_id << "," 
-                        << ((HeadFlit*)data->f)->req->m_ptx << ","
-                        << mem_state_copy[((HeadFlit*)data->f)->req->m_state] << ","
-                        << mem_req_noc_type_name[((HeadFlit*)data->f)->req->m_msg_type] << ","
-                        << node_id << "," << ((HeadFlit*)data->f)->dst_node << ","
-                        << endl;
+  cout << manifold::kernel::Manifold::NowTicks() << ",p,"
+       << ((HeadFlit*)data->f)->req->m_id << ","
+       << ((HeadFlit*)data->f)->req->m_acc << ","
+       << mem_state_copy[((HeadFlit*)data->f)->req->m_state] << ","
+       << mem_req_noc_type_name[((HeadFlit*)data->f)->req->m_msg_type] << ","
+       << node_id << "," << ((HeadFlit*)data->f)->dst_node << "," << endl;
          /*           
 		    cout << manifold::kernel::Manifold::NowTicks() << ",b," << node_id << ","; 
                     for(uint i=0; i<ports; i++)
diff --git a/src/memory.cc b/src/memory.cc
index cfef8d72..a7e609f3 100644
--- a/src/memory.cc
+++ b/src/memory.cc
@@ -202,14 +202,14 @@ cache_c* default_llc(macsim_c* m_simBase) {
 
 bool queue_c::sort_func::operator()(mem_req_s* a, mem_req_s* b) {
   if (*m_simBase->m_knobs->KNOB_HETERO_MEM_PRIORITY_CPU) {
-    if (a->m_ptx != true && b->m_ptx == true) {
+    if (a->m_acc != true && b->m_acc == true) {
       return true;
-    } else if (a->m_ptx == true && b->m_ptx != true)
+    } else if (a->m_acc == true && b->m_acc != true)
       return false;
   } else if (*m_simBase->m_knobs->KNOB_HETERO_MEM_PRIORITY_GPU) {
-    if (a->m_ptx != true && b->m_ptx == true) {
+    if (a->m_acc != true && b->m_acc == true) {
       return false;
-    } else if (a->m_ptx == true && b->m_ptx != true)
+    } else if (a->m_acc == true && b->m_acc != true)
       return true;
   }
 
@@ -479,7 +479,7 @@ int dcu_c::access(uop_c* uop) {
     } else {
       POWER_EVENT(POWER_LLC_R);
     }
-    STAT_EVENT(L1_HIT_CPU + this->m_ptx_sim);
+    STAT_EVENT(L1_HIT_CPU + this->m_acc_sim);
     DEBUG_CORE(uop->m_core_id, "L%d[%d] uop_num:%lld cache hit\n", m_level,
                m_id, uop->m_uop_num);
     // stat
@@ -496,7 +496,7 @@ int dcu_c::access(uop_c* uop) {
     if (*m_simBase->m_knobs->KNOB_ENABLE_CACHE_COHERENCE) {
     }
 
-    if (this->m_ptx_sim &&
+    if (this->m_acc_sim &&
         *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
         type == MEM_ST) {
       // evict global data on write hit in L1
@@ -504,7 +504,7 @@ int dcu_c::access(uop_c* uop) {
 
       int req_size;
       Addr req_addr;
-      if (m_ptx_sim && *m_simBase->m_knobs->KNOB_BYTE_LEVEL_ACCESS) {
+      if (m_acc_sim && *m_simBase->m_knobs->KNOB_BYTE_LEVEL_ACCESS) {
         req_size = uop->m_mem_size;
         req_addr = vaddr;
       } else {
@@ -518,7 +518,7 @@ int dcu_c::access(uop_c* uop) {
 
       int result = m_simBase->m_memory->new_mem_req(
         req_type, req_addr, req_size, cache_hit, true, m_latency, uop,
-        done_func, uop->m_unique_num, NULL, m_id, uop->m_thread_id, m_ptx_sim);
+        done_func, uop->m_unique_num, NULL, m_id, uop->m_thread_id, m_acc_sim);
 
       if (!result) {
         uop->m_state = OS_DCACHE_MEM_ACCESS_DENIED;
@@ -534,7 +534,7 @@ int dcu_c::access(uop_c* uop) {
   // DCACHE miss
   // -------------------------------------
   else {  // !cache_hit
-    STAT_EVENT(L1_MISS_CPU + this->m_ptx_sim);
+    STAT_EVENT(L1_MISS_CPU + this->m_acc_sim);
     DEBUG_CORE(uop->m_core_id, "L%d[%d] uop_num:%lld cache miss\n", m_level,
                m_id, uop->m_uop_num);
 
@@ -579,7 +579,7 @@ int dcu_c::access(uop_c* uop) {
     // -------------------------------------
     int req_size;
     Addr req_addr;
-    if (m_ptx_sim && *m_simBase->m_knobs->KNOB_BYTE_LEVEL_ACCESS) {
+    if (m_acc_sim && *m_simBase->m_knobs->KNOB_BYTE_LEVEL_ACCESS) {
       req_size = uop->m_mem_size;
       req_addr = vaddr;
     } else {
@@ -600,7 +600,7 @@ int dcu_c::access(uop_c* uop) {
     // Generate a new memory request (MSHR access)
     // -------------------------------------
     function<bool(mem_req_s*)> done_func = NULL;
-    if (this->m_ptx_sim &&
+    if (this->m_acc_sim &&
         *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
         (type == MEM_ST || type == MEM_ST_LM)) {
       done_func = dcache_write_ack_wrapper;
@@ -612,7 +612,7 @@ int dcu_c::access(uop_c* uop) {
     result = m_simBase->m_memory->new_mem_req(
       req_type, req_addr, req_size, cache_hit,
       (type == MEM_ST_GM || type == MEM_ST_LM), m_latency, uop, done_func,
-      uop->m_unique_num, NULL, m_id, uop->m_thread_id, m_ptx_sim);
+      uop->m_unique_num, NULL, m_id, uop->m_thread_id, m_acc_sim);
 
     // -------------------------------------
     // MSHR full
@@ -772,7 +772,7 @@ void dcu_c::process_in_queue() {
         m_level, req->m_thread_id, req->m_addr, req->m_pc,
         req->m_uop ? req->m_uop : NULL, true);
 
-      STAT_EVENT(L1_HIT_CPU + (m_level - 1) * 4 + req->m_ptx);
+      STAT_EVENT(L1_HIT_CPU + (m_level - 1) * 4 + req->m_acc);
 
       if (line && req->m_type == MRT_DSTORE) {
         line->m_dirty = true;
@@ -841,7 +841,7 @@ void dcu_c::process_in_queue() {
         //    req->m_addr, req->m_pc, req->m_uop ? req->m_uop : NULL, false);
       }
 
-      STAT_EVENT(L1_HIT_CPU + (m_level - 1) * 4 + 2 + req->m_ptx);
+      STAT_EVENT(L1_HIT_CPU + (m_level - 1) * 4 + 2 + req->m_acc);
 
       //      handle_coherence(m_level, false, );
 
@@ -895,7 +895,7 @@ void dcu_c::process_in_queue() {
         m_level, m_id, req->m_id, mem_req_c::mem_req_type_name[req->m_type],
         m_cycle - req->m_in);
 
-      if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
+      if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
           req->m_type == MRT_DSTORE) {
         m_simBase->m_memory->free_write_req(req);
       } else {
@@ -989,7 +989,7 @@ void dcu_c::process_out_queue() {
     // -------------------------------------
     if (req->m_state == MEM_OUTQUEUE_NEW) {
       int msg_type;
-      if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
+      if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
           req->m_with_data && m_level != MEM_LLC) {
         // can change if to req->m_type == MRT_DSTORE
         msg_type = NOC_NEW_WITH_DATA;
@@ -1009,7 +1009,7 @@ void dcu_c::process_out_queue() {
     // -------------------------------------
     else if (req->m_state == MEM_OUT_FILL) {
       int msg_type;
-      if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
+      if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
           req->m_with_data && m_level == MEM_LLC) {
         // can change if to req->m_type == MRT_DSTORE
         msg_type = NOC_ACK;
@@ -1068,7 +1068,7 @@ void dcu_c::process_fill_queue() {
 
     mem_req_s* req = (*I);
 
-    if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
+    if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
         m_level == MEM_L1 && req->m_type == MRT_DSTORE) {
       ASSERTM(m_done && req->m_done_func && req->m_done_func(req),
               "done function failed\n");
@@ -1127,7 +1127,7 @@ void dcu_c::process_fill_queue() {
           dcache_data_s* data;
           data = (dcache_data_s*)m_cache->insert_cache(
             req->m_addr, &line_addr, &victim_line_addr, req->m_appl_id,
-            req->m_ptx);
+            req->m_acc);
 
           if (m_level != MEM_LLC) {
             POWER_CORE_EVENT(req->m_core_id, POWER_DCACHE_W + (m_level - 1));
@@ -1147,7 +1147,7 @@ void dcu_c::process_fill_queue() {
 
               // new write-back request
               mem_req_s* wb = m_simBase->m_memory->new_wb_req(
-                victim_line_addr, m_line_size, m_ptx_sim, data, m_level);
+                victim_line_addr, m_line_size, m_acc_sim, data, m_level);
 
               wb->m_rdy_cycle = m_cycle + 1;
 
@@ -1322,7 +1322,7 @@ void dcu_c::process_fill_queue() {
         m_level, m_id, req->m_id, mem_req_c::mem_req_type_name[req->m_type],
         m_cycle - req->m_in);
 
-      if (req->m_ptx && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
+      if (req->m_acc && *m_simBase->m_knobs->KNOB_COMPUTE_CAPABILITY == 2.0f &&
           req->m_type == MRT_DSTORE) {
         m_simBase->m_memory->free_write_req(req);
       } else {
@@ -1426,7 +1426,7 @@ bool dcu_c::done(mem_req_s* req) {
       // DCACHE insertion
       // -------------------------------------
       data = (dcache_data_s*)m_cache->insert_cache(
-        addr, &line_addr, &repl_line_addr, req->m_appl_id, req->m_ptx);
+        addr, &line_addr, &repl_line_addr, req->m_appl_id, req->m_acc);
 
       if (m_level != MEM_LLC) {
         POWER_CORE_EVENT(req->m_core_id, POWER_DCACHE_W + (m_level - 1));
@@ -1450,7 +1450,7 @@ bool dcu_c::done(mem_req_s* req) {
 
           // new write back request
           mem_req_s* wb = m_simBase->m_memory->new_wb_req(
-            repl_line_addr, m_line_size, m_ptx_sim, data, m_level);
+            repl_line_addr, m_line_size, m_acc_sim, data, m_level);
 
           wb->m_rdy_cycle = m_cycle + 1;
 
@@ -1489,7 +1489,7 @@ bool dcu_c::done(mem_req_s* req) {
                uop->m_inst_num, uop->m_uop_num, req->m_in_global);
     uop->m_done_cycle = m_simBase->m_core_cycle[uop->m_core_id] + 1;
     uop->m_state = OS_SCHEDULED;
-    if (m_ptx_sim || m_igpu_sim) {
+    if (m_acc_sim) {
       if (uop->m_parent_uop) {
         uop_c* puop = uop->m_parent_uop;
         ++puop->m_num_child_uops_done;
@@ -1521,7 +1521,7 @@ bool dcu_c::write_done(mem_req_s* req) {
   uop_c* uop = req->m_uop;
   uop->m_done_cycle = m_simBase->m_core_cycle[uop->m_core_id] + 1;
   uop->m_state = OS_SCHEDULED;
-  if (m_ptx_sim || m_igpu_sim) {
+  if (m_acc_sim || m_igpu_sim) {
     if (uop->m_parent_uop) {
       uop_c* puop = uop->m_parent_uop;
       ++puop->m_num_child_uops_done;
@@ -1575,17 +1575,20 @@ memory_c::memory_c(macsim_c* simBase) {
   m_num_gpu = 0;
   m_num_cpu = 0;
 
-  if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "ptx")
+  if ((KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "ptx") ||
+      (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu"))
     m_num_gpu += *KNOB(KNOB_NUM_SIM_LARGE_CORES);
   else
     m_num_cpu += *KNOB(KNOB_NUM_SIM_LARGE_CORES);
 
-  if (KNOB(KNOB_MEDIUM_CORE_TYPE)->getValue() == "ptx")
+  if ((KNOB(KNOB_MEDIUM_CORE_TYPE)->getValue() == "ptx") ||
+      (KNOB(KNOB_MEDIUM_CORE_TYPE)->getValue() == "igpu"))
     m_num_gpu += *KNOB(KNOB_NUM_SIM_MEDIUM_CORES);
   else
     m_num_cpu += *KNOB(KNOB_NUM_SIM_MEDIUM_CORES);
 
-  if (KNOB(KNOB_CORE_TYPE)->getValue() == "ptx")
+  if ((KNOB(KNOB_CORE_TYPE)->getValue() == "ptx") ||
+      (KNOB(KNOB_CORE_TYPE)->getValue() == "igpu"))
     m_num_gpu += *KNOB(KNOB_NUM_SIM_SMALL_CORES);
   else
     m_num_cpu += *KNOB(KNOB_NUM_SIM_SMALL_CORES);
@@ -1912,7 +1915,7 @@ void memory_c::init_new_req(mem_req_s* req, Mem_Req_Type type, Addr addr,
   req->m_pc = uop ? uop->m_pc : 0;
   req->m_prefetcher_id = 0;
   req->m_pref_loadPC = 0;
-  req->m_ptx = ptx;
+  req->m_acc = ptx;
   req->m_done_func = done_func;
   req->m_uop = uop ? uop : NULL;
   if (type == MRT_DPRF) req->m_uop = NULL;
@@ -1948,7 +1951,7 @@ void memory_c::adjust_req(mem_req_s* req, Mem_Req_Type type, Addr addr,
   req->m_pc = uop ? uop->m_pc : 0;
   req->m_prefetcher_id = 0;
   req->m_pref_loadPC = 0;
-  req->m_ptx = ptx;
+  req->m_acc = ptx;
   req->m_done_func = done_func;
   req->m_uop = uop ? uop : NULL;
   if (type == MRT_DPRF) req->m_uop = NULL;
@@ -2110,7 +2113,7 @@ mem_req_s* memory_c::new_wb_req(Addr addr, int size, bool ptx,
   req->m_pc = data->m_pc;
   req->m_prefetcher_id = 0;
   req->m_pref_loadPC = 0;
-  req->m_ptx = ptx;
+  req->m_acc = ptx;
   req->m_done_func = NULL;
   req->m_uop = NULL;
   req->m_in = m_cycle;
diff --git a/src/memory.h b/src/memory.h
index 97719bf6..3efa846f 100644
--- a/src/memory.h
+++ b/src/memory.h
@@ -303,8 +303,9 @@ class dcu_c
   int m_line_size; /**< cache line size */
   int m_banks; /**< number of cache banks */
   int m_latency; /**< cache access latency */
-  bool m_ptx_sim; /**< gpu cache */
+  bool m_acc_sim; /**< accelerator (PTX or iGPU) cache */
   bool m_igpu_sim; /**< intel gpu cache */
+  bool m_ptx_sim; /**< PTX gpu cache */
   queue_c* m_in_queue; /**< input queue */
   queue_c* m_wb_queue; /**< write-back queue */
   queue_c* m_fill_queue; /**< fill queue */
diff --git a/src/memreq_info.cc b/src/memreq_info.cc
index 5f184c1b..80dbc963 100644
--- a/src/memreq_info.cc
+++ b/src/memreq_info.cc
@@ -82,7 +82,7 @@ void mem_req_s::init(void) {
   m_pc = 0;
   m_prefetcher_id = 0;
   m_pref_loadPC = 0;
-  m_ptx = false;
+  m_acc = false;
   m_queue = NULL;
   for (int ii = 0; ii < MEM_LAST; ++ii) m_cache_id[ii] = 0;
   m_uop = NULL;
diff --git a/src/memreq_info.h b/src/memreq_info.h
index 4fad80d4..354f5b6f 100644
--- a/src/memreq_info.h
+++ b/src/memreq_info.h
@@ -152,7 +152,7 @@ typedef struct mem_req_s {
   Addr m_pc; /**< load pc */
   uns8 m_prefetcher_id; /**< prefetcher id, if prefetch request */
   Addr m_pref_loadPC; /**< prefetch load pc */
-  bool m_ptx; /**< GPU request */
+  bool m_acc; /**< GPU request */
   queue_c* m_queue; /**< current memory queue in */
   int m_cache_id[MEM_LAST]; /**< each level cache id */
   uop_c* m_uop; /**< uop that generates this request */
diff --git a/src/network.cc b/src/network.cc
index c995b30d..36aa0243 100644
--- a/src/network.cc
+++ b/src/network.cc
@@ -319,12 +319,12 @@ void router_c::local_packet_injection(void) {
       if (m_input_buffer[0][ii].size() + num_flit <= m_buffer_max_size) {
 #endif
         // flit generation and insert into the buffer
-        STAT_EVENT(TOTAL_PACKET_CPU + req->m_ptx);
+        STAT_EVENT(TOTAL_PACKET_CPU + req->m_acc);
         req->m_noc_cycle = m_cycle;
 
         // stat handling
         ++g_total_packet;
-        if (req->m_ptx) {
+        if (req->m_acc) {
           ++g_total_gpu_packet;
           STAT_EVENT(NOC_AVG_ACTIVE_PACKET_BASE_GPU);
           STAT_EVENT_N(NOC_AVG_ACTIVE_PACKET_GPU, g_total_gpu_packet);
@@ -460,7 +460,7 @@ void router_c::stage_vca(void) {
             "op:%d oc:%d ptx:%d\n",
             m_cycle, m_id, flit->m_req->m_id, flit->m_id,
             flit->m_req->m_msg_src, flit->m_req->m_msg_dst, iport, ivc,
-            m_route_fixed[iport][ivc], ovc, flit->m_req->m_ptx);
+            m_route_fixed[iport][ivc], ovc, flit->m_req->m_acc);
         }
       }
     }
@@ -652,7 +652,7 @@ void router_c::stage_lt(void) {
 
         if (port == LOCAL) {
           --g_total_packet;
-          if (f->m_req->m_ptx) {
+          if (f->m_req->m_acc) {
             --g_total_gpu_packet;
           } else {
             --g_total_cpu_packet;
@@ -667,8 +667,8 @@ void router_c::stage_lt(void) {
           STAT_EVENT(NOC_AVG_LATENCY_BASE);
           STAT_EVENT_N(NOC_AVG_LATENCY, m_cycle - f->m_req->m_noc_cycle);
 
-          STAT_EVENT(NOC_AVG_LATENCY_BASE_CPU + f->m_req->m_ptx);
-          STAT_EVENT_N(NOC_AVG_LATENCY_CPU + f->m_req->m_ptx,
+          STAT_EVENT(NOC_AVG_LATENCY_BASE_CPU + f->m_req->m_acc);
+          STAT_EVENT_N(NOC_AVG_LATENCY_CPU + f->m_req->m_acc,
                        m_cycle - f->m_req->m_noc_cycle);
         }
       }
diff --git a/src/noc.cc b/src/noc.cc
index 42b9007c..30e1e350 100644
--- a/src/noc.cc
+++ b/src/noc.cc
@@ -67,12 +67,12 @@ bool noc_c::insert(int src, int dst, int msg, mem_req_s* req) {
   noc_entry_s* new_entry = m_pool->acquire_entry();
 
   if (src > dst) {
-    if (req->m_ptx == true)
+    if (req->m_acc == true)
       m_cpu_entry_up->push_back(new_entry);
     else
       m_gpu_entry_up->push_back(new_entry);
   } else {
-    if (req->m_ptx == true)
+    if (req->m_acc == true)
       m_cpu_entry_down->push_back(new_entry);
     else
       m_gpu_entry_down->push_back(new_entry);
diff --git a/src/process_manager.cc b/src/process_manager.cc
index e5da115a..1456f5f8 100644
--- a/src/process_manager.cc
+++ b/src/process_manager.cc
@@ -127,7 +127,7 @@ process_s::process_s() {
   m_no_of_threads_created = 0;
   m_no_of_threads_terminated = 0;
   m_core_pool = NULL;
-  m_ptx = false;
+  m_acc = false;
   m_repeat = 0;
   m_current_file_name_base = "";
   m_kernel_config_name = "";
@@ -253,7 +253,7 @@ void process_manager_c::create_thread_node(process_s *process, int tid,
   node->m_process = process;
   node->m_tid = tid;
   node->m_main = main;
-  node->m_ptx = process->m_ptx;
+  node->m_acc = process->m_acc;
 
   // create a new thread start information
   thread_start_info_s *start_info = &(process->m_thread_start_info[tid]);
@@ -284,7 +284,7 @@ void process_manager_c::create_thread_node(process_s *process, int tid,
   process->m_block_list[node->m_block_id] = true;
 
   // add a new node to m_thread_queue (for x86) or m_block_queue (for ptx)
-  if (process->m_ptx == true)
+  if (process->m_acc == true)
     insert_block(node);
   else
     insert_thread(node);
@@ -399,8 +399,8 @@ int process_manager_c::create_process(string appl, int repeat, int pid) {
 
   // setup core pool
   if (trace_type == "ptx" || trace_type == "newptx") {
-    process->m_ptx = true;
-    process->m_core_pool = &m_simBase->m_ptx_core_pool;
+    process->m_acc = true;
+    process->m_core_pool = &m_simBase->m_acc_core_pool;
 
     // most basic check to ensure that offsets of members in structure that we
     // use for reading traces (which contains one extra field compared to the
@@ -414,7 +414,7 @@ int process_manager_c::create_process(string appl, int repeat, int pid) {
              (sizeof(trace_info_gpu_small_s) + sizeof(uint32_t)));
     }
   } else {
-    process->m_ptx = false;
+    process->m_acc = false;
     process->m_core_pool = &m_simBase->m_x86_core_pool;
   }
 
@@ -569,7 +569,7 @@ void process_manager_c::setup_process(process_s *process) {
   trace_config_file.close();
 
   // GPU simulation
-  if (true == process->m_ptx) {
+  if (true == process->m_acc) {
     string path = process->m_current_file_name_base;
     path += "_info.txt";
 
@@ -670,7 +670,7 @@ void process_manager_c::setup_process(process_s *process) {
   // Insert the main thread to the pool
   create_thread_node(process, 0, true);
 
-  if (process->m_ptx) {
+  if (process->m_acc) {
     for (int tid = 1; tid < thread_count; ++tid) {
       if (process->m_thread_start_info[tid].m_inst_count == 0) {
         create_thread_node(process, process->m_no_of_threads_created++, false);
@@ -765,7 +765,7 @@ thread_s *process_manager_c::create_thread(process_s *process, int tid,
   process->m_thread_trace_info[tid] = trace_info;
 
   // TODO - nbl (apr-17-2013): use pools
-  if (process->m_ptx) {
+  if (process->m_acc) {
     trace_info->m_prev_trace_info = new trace_info_gpu_s;
     trace_info->m_next_trace_info = new trace_info_gpu_s;
   } else {
@@ -817,7 +817,7 @@ thread_s *process_manager_c::create_thread(process_s *process, int tid,
 
   trace_info->m_file_opened = true;
   trace_info->m_trace_ended = false;
-  trace_info->m_ptx = process->m_ptx;
+  trace_info->m_acc = process->m_acc;
   trace_info->m_buffer_index = 0;
   trace_info->m_buffer_index_max = 0;
   trace_info->m_buffer_exhausted = true;
@@ -882,7 +882,7 @@ int process_manager_c::terminate_thread(int core_id, thread_s *trace_info,
   --m_simBase->m_num_active_threads;
 
   // GPU simulation
-  if (trace_info->m_ptx == true) {
+  if (trace_info->m_acc == true) {
     int t_process_id = trace_info->m_process->m_process_id;
     int t_thread_id = trace_info->m_unique_thread_id;
     m_simBase->m_thread_stats[t_process_id][t_thread_id].m_thread_end_cycle =
@@ -972,7 +972,7 @@ int process_manager_c::terminate_thread(int core_id, thread_s *trace_info,
   }
 
   // TODO - nbl (apr-17-2013): use pools
-  if (trace_info->m_process->m_ptx) {
+  if (trace_info->m_process->m_acc) {
     trace_info_gpu_s *temp =
       static_cast<trace_info_gpu_s *>(trace_info->m_prev_trace_info);
     delete temp;
diff --git a/src/process_manager.h b/src/process_manager.h
index 1e386ec0..ee59b13a 100644
--- a/src/process_manager.h
+++ b/src/process_manager.h
@@ -73,7 +73,7 @@ typedef struct thread_trace_info_node_s {
   process_s* m_process; /**< pointer to the process */
   int m_tid; /**< thread id */
   bool m_main; /**< main thread */
-  bool m_ptx; /**< GPU simulation */
+  bool m_acc; /**< GPU simulation */
   int m_block_id; /**< block id */
 } thread_trace_info_node_s;
 
@@ -188,7 +188,7 @@ typedef struct thread_s {
   uint64_t m_uop_count; /**< total uop counts */
   bool m_trace_ended; /**< trace ended */
   process_s* m_process; /**< point to the application belongs to */
-  bool m_ptx; /**< GPU thread */
+  bool m_acc; /**< GPU thread */
   char* m_buffer; /**< trace buffer */
   int m_buffer_index; /**< current trace buffer index */
   int m_buffer_index_max; /**< maximum buffer index */
@@ -258,7 +258,7 @@ typedef struct process_s {
   map<int, bool>
     m_core_list; /**< list of cores that this process is executed */
   queue<int>* m_core_pool; /**< core pool pointer */
-  bool m_ptx; /**< GPU application */
+  bool m_acc; /**< GPU application */
   int m_repeat; /**< application has been re-executed */
   vector<string> m_applications; /**< list of sub-applications */
   vector<int>
diff --git a/src/readonly_cache.cc b/src/readonly_cache.cc
index 5bc917cc..a1fdd167 100644
--- a/src/readonly_cache.cc
+++ b/src/readonly_cache.cc
@@ -188,7 +188,7 @@ bool readonly_cache_c::cache_fill_line(mem_req_s* req) {
 
   // insert cache
   m_cache->insert_cache(req->m_addr, &line_addr, &repl_line_addr,
-                        req->m_appl_id, req->m_ptx);
+                        req->m_appl_id, req->m_acc);
 
   if (req->m_uop) {
     uop_c* uop = req->m_uop;
diff --git a/src/retire.cc b/src/retire.cc
index d1e80b58..1a80efb7 100644
--- a/src/retire.cc
+++ b/src/retire.cc
@@ -108,7 +108,7 @@ retire_c::retire_c(RETIRE_INTERFACE_PARAMS(), macsim_c* simBase)
 
   RETIRE_CONFIG();
 
-  if (m_knob_ptx_sim || m_knob_igpu_sim) m_knob_width = 1000;
+  if (m_ptx_sim || m_igpu_sim) m_knob_width = 1000;
 }
 
 // retire_c destructor
@@ -130,7 +130,7 @@ void retire_c::run_a_cycle() {
 
   vector<uop_c*>* uop_list = NULL;
   unsigned int uop_list_index = 0;
-  if (m_knob_ptx_sim || m_knob_igpu_sim) {
+  if (m_ptx_sim || m_igpu_sim) {
     // GPU : many retireable uops from multiple threads. Get entire retireable uops
     uop_list =
       m_gpu_rob->get_n_uops_in_ready_order(m_knob_width, m_cur_core_cycle);
@@ -144,7 +144,7 @@ void retire_c::run_a_cycle() {
     // we need to handle retirement for x86 and ptx separately
 
     // retirement logic for GPU
-    if (m_knob_ptx_sim || m_knob_igpu_sim) {
+    if (m_ptx_sim || m_igpu_sim) {
       // GPU : many retireable uops from multiple threads. Get entire retireable uops
       if (uop_list_index == uop_list->size()) {
         uop_list->clear();
@@ -281,7 +281,7 @@ void retire_c::run_a_cycle() {
     STAT_EVENT(UOP_COUNT_TOT);
 
     // GPU : barrier
-    if (m_knob_ptx_sim && cur_uop->m_bar_type == BAR_FETCH) {
+    if (m_ptx_sim && cur_uop->m_bar_type == BAR_FETCH) {
       frontend_c* frontend = core->get_frontend();
       frontend->synch_thread(cur_uop->m_block_id, cur_uop->m_thread_id);
     }
@@ -545,7 +545,7 @@ void retire_c::update_stats(process_s* process) {
       core->get_core_type() == "ptx") {
     if ((process->m_repeat + 1) == *m_simBase->m_knobs->KNOB_REPEAT_TRACE_N) {
       --m_simBase->m_process_count_without_repeat;
-      STAT_EVENT_N(CYC_COUNT_PTX, CYCLE);
+      STAT_EVENT_N(CYC_COUNT_ACC, CYCLE);
       report("application "
              << process->m_process_id << " terminated "
              << "("
@@ -555,7 +555,7 @@ void retire_c::update_stats(process_s* process) {
   } else {
     if (process->m_repeat == 0) {
       if (core->get_core_type() == "ptx") {
-        STAT_EVENT_N(CYC_COUNT_PTX, CYCLE);
+        STAT_EVENT_N(CYC_COUNT_ACC, CYCLE);
       } else {
         STAT_EVENT_N(CYC_COUNT_X86, CYCLE);
       }
diff --git a/src/retire.h b/src/retire.h
index 35729d0a..6bb64a8b 100644
--- a/src/retire.h
+++ b/src/retire.h
@@ -218,8 +218,9 @@ class retire_c
   Counter m_total_insts_retired; /**< total retired instructions */
   Counter m_cur_core_cycle; /**< current core cycle */
   uns16 m_knob_width; /**< pipeline width */
-  bool m_knob_ptx_sim; /**< gpu simulation */
-  bool m_knob_igpu_sim; /**< intel gpu simulation */
+  bool m_ptx_sim; /**< ptx simulation */
+  bool m_acc_sim; /**< accelerator simulation */
+  bool m_igpu_sim; /**< intel gpu simulation */
   unordered_map<int, Counter>
     m_insts_retired; /**< number of retired inst. per thread */
   unordered_map<int, Counter>
diff --git a/src/trace_read.cc b/src/trace_read.cc
index 24304cdd..2f10b11f 100644
--- a/src/trace_read.cc
+++ b/src/trace_read.cc
@@ -697,11 +697,11 @@ trace_reader_wrapper_c::trace_reader_wrapper_c(macsim_c *simBase) {
 }
 
 trace_reader_wrapper_c::trace_reader_wrapper_c() {
-  m_dprint_output->close();
-  delete m_dprint_output;
 }
 
 trace_reader_wrapper_c::~trace_reader_wrapper_c() {
+  m_dprint_output->close();
+  delete m_dprint_output;
   delete m_cpu_decoder;
   delete m_gpu_decoder;
 }
diff --git a/sst-unit-test/references/vectoradd/sdl1/general.stat.out b/sst-unit-test/references/vectoradd/sdl1/general.stat.out
index 2199040a..1ec1fd4f 100644
--- a/sst-unit-test/references/vectoradd/sdl1/general.stat.out
+++ b/sst-unit-test/references/vectoradd/sdl1/general.stat.out
@@ -14,7 +14,7 @@ NUM_REPEAT                                                      0
 
 CYC_COUNT_X86                                                   0                             0
 
-CYC_COUNT_PTX                                                   0                             0
+CYC_COUNT_ACC                                                   0                             0
 
 AVG_BLOCK_EXE_CYCLE                                             0                             0
 
diff --git a/sst-unit-test/references/vectoradd/sdl2/general.stat.out b/sst-unit-test/references/vectoradd/sdl2/general.stat.out
index 2199040a..1ec1fd4f 100644
--- a/sst-unit-test/references/vectoradd/sdl2/general.stat.out
+++ b/sst-unit-test/references/vectoradd/sdl2/general.stat.out
@@ -14,7 +14,7 @@ NUM_REPEAT                                                      0
 
 CYC_COUNT_X86                                                   0                             0
 
-CYC_COUNT_PTX                                                   0                             0
+CYC_COUNT_ACC                                                   0                             0
 
 AVG_BLOCK_EXE_CYCLE                                             0                             0
 
diff --git a/sst-unit-test/references/vectoradd/sdl3/general.stat.out b/sst-unit-test/references/vectoradd/sdl3/general.stat.out
index bea42a99..1845bae1 100644
--- a/sst-unit-test/references/vectoradd/sdl3/general.stat.out
+++ b/sst-unit-test/references/vectoradd/sdl3/general.stat.out
@@ -14,7 +14,7 @@ NUM_REPEAT                                                      0
 
 CYC_COUNT_X86                                                   0                             0
 
-CYC_COUNT_PTX                                                3346                          3346
+CYC_COUNT_ACC                                                3346                          3346
 
 AVG_BLOCK_EXE_CYCLE                                         29130                         29130
 
diff --git a/sst-unit-test/references/vectoradd/sdl3/general.stat.out.0 b/sst-unit-test/references/vectoradd/sdl3/general.stat.out.0
index e7b4c179..9ec3568d 100644
--- a/sst-unit-test/references/vectoradd/sdl3/general.stat.out.0
+++ b/sst-unit-test/references/vectoradd/sdl3/general.stat.out.0
@@ -14,7 +14,7 @@ NUM_REPEAT                                                      0
 
 CYC_COUNT_X86                                                   0                             0
 
-CYC_COUNT_PTX                                                3346                          3346
+CYC_COUNT_ACC                                                3346                          3346
 
 AVG_BLOCK_EXE_CYCLE                                         29130                         29130
 
diff --git a/sst-unit-test/references/vectoradd/sdl4/general.stat.out b/sst-unit-test/references/vectoradd/sdl4/general.stat.out
index da987abc..149f8531 100644
--- a/sst-unit-test/references/vectoradd/sdl4/general.stat.out
+++ b/sst-unit-test/references/vectoradd/sdl4/general.stat.out
@@ -14,7 +14,7 @@ NUM_REPEAT                                                      0
 
 CYC_COUNT_X86                                                   0                             0
 
-CYC_COUNT_PTX                                                3278                          3278
+CYC_COUNT_ACC                                                3278                          3278
 
 AVG_BLOCK_EXE_CYCLE                                         28314                         28314
 
diff --git a/tools/x86_trace_generator/trace_generator.cpp b/tools/x86_trace_generator/trace_generator.cpp
index c508447f..e9527e0f 100644
--- a/tools/x86_trace_generator/trace_generator.cpp
+++ b/tools/x86_trace_generator/trace_generator.cpp
@@ -151,6 +151,7 @@ Knob(UINT32, Knob_num_thread, "thread", "1", "Total number of threads to gather
 Knob(string, Knob_compiler, "compiler", "gcc", "Which compiler was used?");
 Knob(string, Knob_pl, "pl", "normal", "Programming Language");
 Knob(UINT64, Knob_skip, "skipinst", "0", "Instructions to skip");
+Knob(UINT64, Knob_skip_thread0, "skip_thread0", "0", "skip thread 0");
 Knob(UINT64, Knob_max, "max", "0", "Max number of instruction to collect");
 Knob(UINT64, Knob_rtn_min, "rmin", "0", "Max number of function calls to collect data");
 Knob(UINT64, Knob_rtn_max, "rmax", "0", "Max number of function calls to collect data");
@@ -441,6 +442,9 @@ UINT64 last_count[MAX_THREADS] = {0};
 VOID PIN_FAST_ANALYSIS_CALL INST_count(UINT32 count)
 {
   THREADID tid = threadMap[PIN_ThreadId()];
+
+  if ((Knob_skip_thread0.Value()==1) && (tid == 0))
+    return; 
   if (tid == 100000)
     return;
 
@@ -565,6 +569,10 @@ VOID INST_trace(TRACE trace, VOID *v)
 void instrument(INS ins)
 {
   THREADID tid = threadMap[PIN_ThreadId()];
+
+  if ((Knob_skip_thread0.Value()==1) && (tid == 0))
+    return; 
+
   if (tid == 100000)
     return;
 
@@ -908,6 +916,10 @@ VOID InstHMC(ADDRINT pc)
   if (!Knob_enable_hmc.Value())
     return;
   THREADID tid = threadMap[PIN_ThreadId()];
+
+  if ((Knob_skip_thread0.Value()==1) && (tid == 0))
+    return; 
+
   if (tid != 0)
     return;
 
@@ -947,6 +959,9 @@ void ThreadStart(THREADID tid, CONTEXT *ctxt, INT32 flags, void *v)
    cout << "-> Thread[" << tid << "->" << threadMap[tid] << "] begins." << endl;
   THREADID threadid = threadMap[tid];
 
+  if ((Knob_skip_thread0.Value()==1) && (threadid == 0))
+    return; 
+
   if (threadid == 100000)
     return;
 
@@ -1028,6 +1043,9 @@ void thread_end(void)
 void thread_end(THREADID threadid)
 {
   //	THREADID threadid = threadMap[PIN_ThreadId()];
+
+  if ((Knob_skip_thread0.Value()==1) && (threadid == 0))
+    return; 
   if (threadid == 100000)
     return;
 
@@ -1205,6 +1223,10 @@ void sanity_check(void)
 void write_inst_to_file(ofstream *file, Inst_info *t_info)
 {
   THREADID tid = threadMap[PIN_ThreadId()];
+
+  if ((Knob_skip_thread0.Value()==1) && (tid == 0))
+    return; 
+
   if (tid == 100000 || !g_enable_thread_instrument[tid] || g_inst_print_count[tid] > Knob_dump_max.Value())
     return;