diff --git a/.vscode/launch.json b/.vscode/launch.json
index 5fcfad64..f1bba173 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -159,8 +159,10 @@
             "program": "${workspaceFolder}/samples/fir",
             "args": [
                 "-timing",
-                "-length=64",
+                "-length=8192",
                 "-report-all",
+                "-gpus=1,2",
+                "-use-unified-memory=false",
             ],
         },
         {
diff --git a/accelsim_tracing/.gitignore b/accelsim_tracing/.gitignore
new file mode 100644
index 00000000..5588025c
--- /dev/null
+++ b/accelsim_tracing/.gitignore
@@ -0,0 +1,3 @@
+accelsim_tracing
+output/
+example/
\ No newline at end of file
diff --git a/accelsim_tracing/README.md b/accelsim_tracing/README.md
new file mode 100644
index 00000000..92051850
Binary files /dev/null and b/accelsim_tracing/README.md differ
diff --git a/accelsim_tracing/alu/alu.go b/accelsim_tracing/alu/alu.go
new file mode 100644
index 00000000..71b05002
--- /dev/null
+++ b/accelsim_tracing/alu/alu.go
@@ -0,0 +1,22 @@
+package alu
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// ALU is a single execution unit owned by an ALUGroup.
+type ALU interface {
+	// withParent records the owning group and returns the ALU.
+	withParent(aluGroup *ALUGroup) ALU
+	// Execute is called with the instruction to run on this ALU.
+	Execute(inst nvidia.Instruction)
+}
+
+// newALU creates one ALU of the kind named by the group's aluType and
+// attaches it to the group. It panics when the type is not recognized;
+// "int32" is the only type handled here.
+func (a *ALUGroup) newALU() ALU {
+	if a.meta.aluType == "int32" {
+		return newInt32ALU().withParent(a)
+	}
+
+	panic("Unknown ALU type")
+}
diff --git a/accelsim_tracing/alu/builder.go b/accelsim_tracing/alu/builder.go
new file mode 100644
index 00000000..921f6780
--- /dev/null
+++ b/accelsim_tracing/alu/builder.go
@@ -0,0 +1,50 @@
+package alu
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// ALUGroup is a builder-configured set of ALUs that share one type.
+type ALUGroup struct {
+	meta *aluGroupMetaData
+	alus []ALU
+}
+
+// aluGroupMetaData holds the settings that Build consumes.
+type aluGroupMetaData struct {
+	aluType string // key used by newALU to choose the implementation
+	aluNum  int32  // number of ALUs that Build creates
+}
+
+// NewALUGroup returns a group with type "undefined" and zero ALUs;
+// configure it with the With* methods before calling Build.
+func NewALUGroup() *ALUGroup {
+	meta := &aluGroupMetaData{aluType: "undefined", aluNum: 0}
+	return &ALUGroup{meta: meta}
+}
+
+// WithALUType sets the ALU type to build and returns the group.
+func (a *ALUGroup) WithALUType(aluType string) *ALUGroup {
+	a.meta.aluType = aluType
+	return a
+}
+
+// WithALUNum sets how many ALUs to build and returns the group.
+func (a *ALUGroup) WithALUNum(num int32) *ALUGroup {
+	a.meta.aluNum = num
+	return a
+}
+
+// Build allocates aluNum slots and fills each with a new ALU of the
+// configured type.
+func (a *ALUGroup) Build() {
+	a.alus = make([]ALU, a.meta.aluNum)
+	for idx := 0; idx < len(a.alus); idx++ {
+		a.alus[idx] = a.newALU()
+	}
+}
+
+// Execute passes inst to every ALU in the group, in slice order.
+func (a *ALUGroup) Execute(inst nvidia.Instruction) {
+	for idx := range a.alus {
+		a.alus[idx].Execute(inst)
+	}
+}
diff --git a/accelsim_tracing/alu/doc.go b/accelsim_tracing/alu/doc.go
new file mode 100644
index 00000000..ff02a5be
--- /dev/null
+++ b/accelsim_tracing/alu/doc.go
@@ -0,0 +1,2 @@
+// Package alu contains the ALU implementation.
+package alu
diff --git a/accelsim_tracing/alu/int32.go b/accelsim_tracing/alu/int32.go
new file mode 100644
index 00000000..caa45c68
--- /dev/null
+++ b/accelsim_tracing/alu/int32.go
@@ -0,0 +1,23 @@
+package alu
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// int32ALU is the ALU implementation selected by the "int32" type key.
+type int32ALU struct {
+	parent *ALUGroup
+}
+
+// newInt32ALU returns an int32ALU that has no parent group yet.
+func newInt32ALU() *int32ALU {
+	return new(int32ALU)
+}
+
+// withParent stores the owning group and returns the receiver as an ALU.
+func (a *int32ALU) withParent(aluGroup *ALUGroup) ALU {
+	a.parent = aluGroup
+	return a
+}
+
+// Execute is a placeholder: it currently does nothing with inst.
+func (a *int32ALU) Execute(inst nvidia.Instruction) {
+}
diff --git a/accelsim_tracing/benchmark/build.go b/accelsim_tracing/benchmark/build.go
new file mode 100644
index 00000000..7de4dbb9
--- /dev/null
+++ b/accelsim_tracing/benchmark/build.go
@@ -0,0 +1,58 @@
+package benchmark
+
+import (
+	"errors"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/gpu"
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/trace"
+)
+
+// BenchMark drives a GPU model with the contents of a trace directory.
+type BenchMark struct {
+	fromTrace    bool         // set once a trace directory was supplied
+	traceDirPath string       // directory handed to the trace package
+	trace        *trace.Trace // created by Build
+}
+
+// NewBenchMark returns a benchmark with no trace source configured.
+func NewBenchMark() *BenchMark {
+	return &BenchMark{
+		fromTrace:    false,
+		traceDirPath: "",
+		trace:        nil,
+	}
+}
+
+// WithTraceDirPath selects path as the trace directory and returns bm.
+func (bm *BenchMark) WithTraceDirPath(path string) *BenchMark {
+	bm.traceDirPath = path
+	bm.fromTrace = true
+	return bm
+}
+
+// Build creates and builds the trace for the configured directory.
+// It returns an error when WithTraceDirPath was never called.
+func (bm *BenchMark) Build() error {
+	if !bm.fromTrace {
+		return errors.New("no trace dir path specified")
+	}
+
+	bm.trace = trace.NewTrace().WithTraceDirPath(bm.traceDirPath)
+	bm.trace.Build()
+	return nil
+}
+
+// Exec runs the built trace on gpu and returns the trace's error.
+// A benchmark without a trace directory, or one that was never built,
+// is reported as an error so callers can handle both failure modes
+// through the same return value that Build uses.
+func (bm *BenchMark) Exec(gpu *gpu.GPU) error {
+	if !bm.fromTrace {
+		return errors.New("no trace dir path specified")
+	}
+	if bm.trace == nil {
+		return errors.New("benchmark has not been built")
+	}
+
+	return bm.trace.Exec(gpu)
+}
diff --git a/accelsim_tracing/benchmark/doc.go b/accelsim_tracing/benchmark/doc.go
new file mode 100644
index 00000000..bc000d26
--- /dev/null
+++ b/accelsim_tracing/benchmark/doc.go
@@ -0,0 +1,2 @@
+// Package benchmark contains the driver which links traces and the simulator.
+package benchmark
diff --git a/accelsim_tracing/gpc/builder.go b/accelsim_tracing/gpc/builder.go
new file mode 100644
index 00000000..8b78cba3
--- /dev/null
+++ b/accelsim_tracing/gpc/builder.go
@@ -0,0 +1,149 @@
+package gpc
+
+import (
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/sm"
+)
+
+// GPC groups a dispatcher with the SMs that Build creates from the
+// stored configuration.
+type GPC struct {
+	meta       *gpcMetaData
+	dispatcher gpcDispatcher
+	sms        []*sm.SM
+}
+
+// aluSpec is one WithALU request: an ALU type and how many to create.
+type aluSpec struct {
+	aluType string
+	aluNum  int32
+}
+
+// gpcMetaData collects the builder settings. Apart from gpcStrategy,
+// smNum and l2CacheSize, the values are forwarded to every SM in Build.
+type gpcMetaData struct {
+	smNum     int32
+	smUnitNum int32
+
+	gpcStrategy    string
+	smStrategy     string
+	smUnitStrategy string
+
+	l2CacheSize int32
+	l1CacheSize int32
+	l0CacheSize int32
+
+	registerFileSize int32
+	laneSize         int32
+
+	alus []aluSpec
+}
+
+// NewGPC returns a GPC whose strategies are all "default" and whose
+// counts and sizes are zero; no dispatcher or SM exists until Build.
+func NewGPC() *GPC {
+	return &GPC{
+		meta: &gpcMetaData{
+			gpcStrategy:    "default",
+			smStrategy:     "default",
+			smUnitStrategy: "default",
+		},
+	}
+}
+
+// WithSMNum sets how many SMs Build creates and returns the GPC.
+func (g *GPC) WithSMNum(num int32) *GPC {
+	g.meta.smNum = num
+	return g
+}
+
+// WithGPCStrategy sets the strategy name read by buildDispatcher.
+func (g *GPC) WithGPCStrategy(strategy string) *GPC {
+	g.meta.gpcStrategy = strategy
+	return g
+}
+
+// WithSMUnitNum sets the SM unit count forwarded to every SM.
+func (g *GPC) WithSMUnitNum(num int32) *GPC {
+	g.meta.smUnitNum = num
+	return g
+}
+
+// WithSMStrategy sets the SM strategy forwarded to every SM.
+func (g *GPC) WithSMStrategy(strategy string) *GPC {
+	g.meta.smStrategy = strategy
+	return g
+}
+
+// WithSMUnitStrategy sets the SM unit strategy forwarded to every SM.
+func (g *GPC) WithSMUnitStrategy(strategy string) *GPC {
+	g.meta.smUnitStrategy = strategy
+	return g
+}
+
+// WithL2CacheSize stores the L2 cache size; Build does not forward it.
+func (g *GPC) WithL2CacheSize(size int32) *GPC {
+	g.meta.l2CacheSize = size
+	return g
+}
+
+// WithL1CacheSize sets the L1 cache size forwarded to every SM.
+func (g *GPC) WithL1CacheSize(size int32) *GPC {
+	g.meta.l1CacheSize = size
+	return g
+}
+
+// WithL0CacheSize sets the L0 cache size forwarded to every SM.
+func (g *GPC) WithL0CacheSize(size int32) *GPC {
+	g.meta.l0CacheSize = size
+	return g
+}
+
+// WithRegisterFileSize sets the register file size forwarded to every SM.
+func (g *GPC) WithRegisterFileSize(size int32) *GPC {
+	g.meta.registerFileSize = size
+	return g
+}
+
+// WithLaneSize sets the lane size forwarded to every SM.
+func (g *GPC) WithLaneSize(size int32) *GPC {
+	g.meta.laneSize = size
+	return g
+}
+
+// WithALU appends an ALU request (type and count) forwarded to every SM.
+func (g *GPC) WithALU(aluType string, num int32) *GPC {
+	g.meta.alus = append(g.meta.alus, aluSpec{aluType: aluType, aluNum: num})
+	return g
+}
+
+// Build creates the dispatcher for the configured GPC strategy and then
+// constructs and builds smNum SMs from the stored settings.
+func (g *GPC) Build() {
+	g.buildDispatcher()
+	g.sms = make([]*sm.SM, g.meta.smNum)
+	for i := range g.sms {
+		g.sms[i] = sm.NewSM().
+			WithSMStrategy(g.meta.smStrategy).
+			WithSMUnitNum(g.meta.smUnitNum).
+			WithSMUnitStrategy(g.meta.smUnitStrategy).
+			WithL1CacheSize(g.meta.l1CacheSize).
+			WithL0CacheSize(g.meta.l0CacheSize).
+			WithRegisterFileSize(g.meta.registerFileSize).
+			WithLaneSize(g.meta.laneSize)
+		for _, spec := range g.meta.alus {
+			g.sms[i].WithALU(spec.aluType, spec.aluNum)
+		}
+		g.sms[i].Build()
+	}
+}
+
+// IsFree always reports true in this implementation.
+func (g *GPC) IsFree() bool {
+	return true
+}
+
+// Execute hands tb to the GPC's dispatcher.
+func (g *GPC) Execute(tb *nvidia.ThreadBlock) {
+	g.dispatcher.dispatch(tb)
+}
diff --git a/accelsim_tracing/gpc/default.go b/accelsim_tracing/gpc/default.go
new file mode 100644
index 00000000..40a8b5ab
--- /dev/null
+++ b/accelsim_tracing/gpc/default.go
@@ -0,0 +1,39 @@
+package gpc
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// defaultDispatcher hands each thread block to the first free SM.
+type defaultDispatcher struct {
+	parent *GPC
+}
+
+// newDefaultDispatcher returns a dispatcher that has no parent GPC yet.
+func newDefaultDispatcher() *defaultDispatcher {
+	return &defaultDispatcher{}
+}
+
+// withParent stores the owning GPC and returns the dispatcher.
+func (d *defaultDispatcher) withParent(gpc *GPC) gpcDispatcher {
+	d.parent = gpc
+	return d
+}
+
+// dispatch scans the parent's SMs in order and runs tb on the first one
+// that reports free, rescanning until one does.
+//
+// It panics when the GPC has no SMs: the scan could never succeed and
+// would otherwise spin forever.
+func (d *defaultDispatcher) dispatch(tb *nvidia.ThreadBlock) {
+	if len(d.parent.sms) == 0 {
+		panic("no SM available to dispatch thread block")
+	}
+
+	for {
+		for _, sm := range d.parent.sms {
+			if sm.IsFree() {
+				sm.Execute(tb)
+				return
+			}
+		}
+	}
+}
diff --git a/accelsim_tracing/gpc/dispatcher.go b/accelsim_tracing/gpc/dispatcher.go
new file mode 100644
index 00000000..a9eed59d
--- /dev/null
+++ b/accelsim_tracing/gpc/dispatcher.go
@@ -0,0 +1,21 @@
+package gpc
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// gpcDispatcher decides how a GPC hands a thread block to its SMs.
+type gpcDispatcher interface {
+	// withParent binds the dispatcher to gpc and returns it.
+	withParent(gpc *GPC) gpcDispatcher
+	// dispatch schedules tb within the parent GPC.
+	dispatch(tb *nvidia.ThreadBlock)
+}
+
+// buildDispatcher installs the dispatcher named by the GPC strategy.
+// Only "default" is known; any other value panics.
+func (g *GPC) buildDispatcher() {
+	if g.meta.gpcStrategy != "default" {
+		panic("Unknown dispatcher strategy")
+	}
+
+	g.dispatcher = newDefaultDispatcher().withParent(g)
+}
diff --git a/accelsim_tracing/gpc/doc.go b/accelsim_tracing/gpc/doc.go
new file mode 100644
index 00000000..26c9aba3
--- /dev/null
+++ b/accelsim_tracing/gpc/doc.go
@@ -0,0 +1,2 @@
+// Package gpc implements the simulation components for the GPC Level.
+package gpc
diff --git a/accelsim_tracing/gpu/builder.go b/accelsim_tracing/gpu/builder.go
new file mode 100644
index 00000000..1667cfc7
--- /dev/null
+++ b/accelsim_tracing/gpu/builder.go
@@ -0,0 +1,163 @@
+package gpu
+
+import (
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/gpc"
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+)
+
+// GPU groups a dispatcher with the GPCs that Build creates from the
+// stored configuration.
+type GPU struct {
+	meta       *gpuMetaData
+	dispatcher gpuDispatcher
+	gpcs       []*gpc.GPC
+}
+
+// aluSpec is one WithALU request: an ALU type and how many to create.
+type aluSpec struct {
+	aluType string
+	aluNum  int32
+}
+
+// gpuMetaData collects the builder settings. Apart from gpuStrategy and
+// gpcNum, the values are forwarded to every GPC in Build.
+type gpuMetaData struct {
+	gpcNum    int32
+	smNum     int32
+	smUnitNum int32
+
+	gpuStrategy    string
+	gpcStrategy    string
+	smStrategy     string
+	smUnitStrategy string
+
+	l2CacheSize int32
+	l1CacheSize int32
+	l0CacheSize int32
+
+	registerFileSize int32
+	laneSize         int32
+
+	alus []aluSpec
+}
+
+// NewGPU returns a GPU whose strategies are all "default" and whose
+// counts and sizes are zero; no dispatcher or GPC exists until Build.
+func NewGPU() *GPU {
+	return &GPU{
+		meta: &gpuMetaData{
+			gpuStrategy:    "default",
+			gpcStrategy:    "default",
+			smStrategy:     "default",
+			smUnitStrategy: "default",
+		},
+	}
+}
+
+// WithGPCNum sets how many GPCs Build creates and returns the GPU.
+func (g *GPU) WithGPCNum(num int32) *GPU {
+	g.meta.gpcNum = num
+	return g
+}
+
+// WithSMNum sets the SM count forwarded to every GPC.
+func (g *GPU) WithSMNum(num int32) *GPU {
+	g.meta.smNum = num
+	return g
+}
+
+// WithSMUnitNum sets the SM unit count forwarded to every GPC.
+func (g *GPU) WithSMUnitNum(num int32) *GPU {
+	g.meta.smUnitNum = num
+	return g
+}
+
+// WithGPUStrategy sets the strategy name read by buildDispatcher.
+func (g *GPU) WithGPUStrategy(strategy string) *GPU {
+	g.meta.gpuStrategy = strategy
+	return g
+}
+
+// WithGPCStrategy sets the GPC strategy forwarded to every GPC.
+func (g *GPU) WithGPCStrategy(strategy string) *GPU {
+	g.meta.gpcStrategy = strategy
+	return g
+}
+
+// WithSMStrategy sets the SM strategy forwarded to every GPC.
+func (g *GPU) WithSMStrategy(strategy string) *GPU {
+	g.meta.smStrategy = strategy
+	return g
+}
+
+// WithSMUnitStrategy sets the SM unit strategy forwarded to every GPC.
+func (g *GPU) WithSMUnitStrategy(strategy string) *GPU {
+	g.meta.smUnitStrategy = strategy
+	return g
+}
+
+// WithL2CacheSize sets the L2 cache size forwarded to every GPC.
+func (g *GPU) WithL2CacheSize(size int32) *GPU {
+	g.meta.l2CacheSize = size
+	return g
+}
+
+// WithL1CacheSize sets the L1 cache size forwarded to every GPC.
+func (g *GPU) WithL1CacheSize(size int32) *GPU {
+	g.meta.l1CacheSize = size
+	return g
+}
+
+// WithL0CacheSize sets the L0 cache size forwarded to every GPC.
+func (g *GPU) WithL0CacheSize(size int32) *GPU {
+	g.meta.l0CacheSize = size
+	return g
+}
+
+// WithRegisterFileSize sets the register file size forwarded to every GPC.
+func (g *GPU) WithRegisterFileSize(size int32) *GPU {
+	g.meta.registerFileSize = size
+	return g
+}
+
+// WithLaneSize sets the lane size forwarded to every GPC.
+func (g *GPU) WithLaneSize(size int32) *GPU {
+	g.meta.laneSize = size
+	return g
+}
+
+// WithALU appends an ALU request (type and count) forwarded to every GPC.
+func (g *GPU) WithALU(aluType string, num int32) *GPU {
+	g.meta.alus = append(g.meta.alus, aluSpec{aluType: aluType, aluNum: num})
+	return g
+}
+
+// Build creates the dispatcher for the configured GPU strategy and then
+// constructs and builds gpcNum GPCs from the stored settings.
+func (g *GPU) Build() {
+	g.buildDispatcher()
+	g.gpcs = make([]*gpc.GPC, g.meta.gpcNum)
+	for i := range g.gpcs {
+		g.gpcs[i] = gpc.NewGPC().
+			WithSMNum(g.meta.smNum).
+			WithSMUnitNum(g.meta.smUnitNum).
+			WithGPCStrategy(g.meta.gpcStrategy).
+			WithSMStrategy(g.meta.smStrategy).
+			WithSMUnitStrategy(g.meta.smUnitStrategy).
+			WithL2CacheSize(g.meta.l2CacheSize).
+			WithL1CacheSize(g.meta.l1CacheSize).
+			WithL0CacheSize(g.meta.l0CacheSize).
+			WithRegisterFileSize(g.meta.registerFileSize).
+			WithLaneSize(g.meta.laneSize)
+		for _, spec := range g.meta.alus {
+			g.gpcs[i].WithALU(spec.aluType, spec.aluNum)
+		}
+		g.gpcs[i].Build()
+	}
+}
+
+// RunThreadBlock runs a threadblock on the GPU by handing it to the dispatcher.
+// [todo] how to handle the relationship between trace.threadblock and truethreadblock
+func (g *GPU) RunThreadBlock(tb *nvidia.ThreadBlock) {
+	g.dispatcher.dispatch(tb)
+}
diff --git a/accelsim_tracing/gpu/default.go b/accelsim_tracing/gpu/default.go
new file mode 100644
index 00000000..56fc9101
--- /dev/null
+++ b/accelsim_tracing/gpu/default.go
@@ -0,0 +1,41 @@
+package gpu
+
+import (
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+)
+
+// defaultDispatcher hands each thread block to the first free GPC.
+type defaultDispatcher struct {
+	parent *GPU
+}
+
+// newDefaultDispatcher returns a dispatcher that has no parent GPU yet.
+func newDefaultDispatcher() *defaultDispatcher {
+	return &defaultDispatcher{}
+}
+
+// withParent stores the owning GPU and returns the dispatcher.
+func (d *defaultDispatcher) withParent(gpu *GPU) gpuDispatcher {
+	d.parent = gpu
+	return d
+}
+
+// dispatch scans the parent's GPCs in order and runs tb on the first one
+// that reports free, rescanning until one does.
+//
+// It panics when the GPU has no GPCs: the scan could never succeed and
+// would otherwise spin forever.
+func (d *defaultDispatcher) dispatch(tb *nvidia.ThreadBlock) {
+	if len(d.parent.gpcs) == 0 {
+		panic("no GPC available to dispatch thread block")
+	}
+
+	for {
+		for _, gpc := range d.parent.gpcs {
+			if gpc.IsFree() {
+				gpc.Execute(tb)
+				return
+			}
+		}
+	}
+}
diff --git a/accelsim_tracing/gpu/dispatcher.go b/accelsim_tracing/gpu/dispatcher.go
new file mode 100644
index 00000000..515ab81a
--- /dev/null
+++ b/accelsim_tracing/gpu/dispatcher.go
@@ -0,0 +1,21 @@
+package gpu
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// gpuDispatcher decides how a GPU hands a thread block to its GPCs.
+type gpuDispatcher interface {
+	// withParent binds the dispatcher to gpu and returns it.
+	withParent(gpu *GPU) gpuDispatcher
+	// dispatch schedules tb within the parent GPU.
+	dispatch(tb *nvidia.ThreadBlock)
+}
+
+// buildDispatcher installs the dispatcher named by the GPU strategy.
+// Only "default" is known; any other value panics.
+func (g *GPU) buildDispatcher() {
+	if g.meta.gpuStrategy != "default" {
+		panic("Unknown dispatcher strategy")
+	}
+
+	g.dispatcher = newDefaultDispatcher().withParent(g)
+}
diff --git a/accelsim_tracing/gpu/doc.go b/accelsim_tracing/gpu/doc.go
new file mode 100644
index 00000000..2f735511
--- /dev/null
+++ b/accelsim_tracing/gpu/doc.go
@@ -0,0 +1,2 @@
+// Package gpu implements the simulation components for the GPU level.
+package gpu
diff --git a/accelsim_tracing/nvidia/byte.go b/accelsim_tracing/nvidia/byte.go
new file mode 100644
index 00000000..72d614d6
--- /dev/null
+++ b/accelsim_tracing/nvidia/byte.go
@@ -0,0 +1,7 @@
+package nvidia
+
+const ( // NOTE(review): values look like widths in bits (8/16/32) — confirm
+	BYTE  = 8
+	WORD  = 16
+	DWORD = 32
+)
diff --git a/accelsim_tracing/nvidia/dim3.go b/accelsim_tracing/nvidia/dim3.go
new file mode 100644
index 00000000..cd231980
--- /dev/null
+++ b/accelsim_tracing/nvidia/dim3.go
@@ -0,0 +1,3 @@
+package nvidia
+
+type Dim3 = [3]int // Dim3 is an alias for an array of three ints.
diff --git a/accelsim_tracing/nvidia/doc.go b/accelsim_tracing/nvidia/doc.go
new file mode 100644
index 00000000..bd4c0d06
--- /dev/null
+++ b/accelsim_tracing/nvidia/doc.go
@@ -0,0 +1,2 @@
+// Package nvidia includes basic constants, types and structs for nvidia tracing.
+package nvidia
diff --git a/accelsim_tracing/nvidia/opcode.go b/accelsim_tracing/nvidia/opcode.go
new file mode 100644
index 00000000..7413edca
--- /dev/null
+++ b/accelsim_tracing/nvidia/opcode.go
@@ -0,0 +1,64 @@
+package nvidia
+
+import "log"
+
+// VariableType [todo] how to construct these?
+type VariableType int32
+
+// Variable types attached to opcodes. VariableDefault is the zero value.
+const (
+	VariableDefault VariableType = iota
+	VariableError
+	VariableINT32
+	VariableFP32
+	VariableFP64
+)
+
+// OpCodeType identifies an instruction opcode.
+type OpCodeType int32
+
+// Known opcode identifiers. OpCodeDefault is the zero value.
+const (
+	OpCodeDefault OpCodeType = iota
+	OpCodeError
+	IMADMOVU32
+)
+
+// Opcode pairs the raw opcode text with its opcode type and variable
+// type as listed in the opcode table.
+type Opcode struct {
+	rawText string
+	opType  OpCodeType
+	varType VariableType
+}
+
+// opcodeTable maps raw opcode text to its table entry.
+var opcodeTable = map[string]Opcode{
+	"IMAD.MOV.U32": {"IMAD.MOV.U32", IMADMOVU32, VariableINT32},
+}
+
+// NewOpcode looks rawText up in the opcode table and returns a pointer to
+// a private copy of the entry, so callers cannot alter the table.
+// It panics (via log.Panic) when rawText is not a known opcode.
+func NewOpcode(rawText string) *Opcode {
+	op, ok := opcodeTable[rawText]
+	if !ok {
+		log.Panic("Unknown opcode: ", rawText)
+	}
+	return &op
+}
+
+// String returns the raw opcode text.
+func (op *Opcode) String() string {
+	return op.rawText
+}
+
+// OpcodeType returns the opcode identifier stored in the table.
+func (op *Opcode) OpcodeType() OpCodeType {
+	return op.opType
+}
+
+// VariableType returns the variable type stored in the table.
+func (op *Opcode) VariableType() VariableType {
+	return op.varType
+}
diff --git a/accelsim_tracing/nvidia/register.go b/accelsim_tracing/nvidia/register.go
new file mode 100644
index 00000000..c7e7f074
--- /dev/null
+++ b/accelsim_tracing/nvidia/register.go
@@ -0,0 +1,53 @@
+package nvidia
+
+import (
+	"fmt"
+	"log"
+)
+
+// Register is one entry of the register table: its name, ID and zero flag.
+type Register struct {
+	rawText string
+	regID   int32
+	isZero  bool
+}
+
+// NewRegister looks rawText up in the register table and returns a
+// pointer to a private copy of the entry.
+// It panics (via log.Panic) when rawText is not a known register name.
+func NewRegister(rawText string) *Register {
+	reg, ok := registerTable[rawText]
+	if !ok {
+		log.Panic("Unknown register: ", rawText)
+	}
+	return &reg
+}
+
+// String returns the raw register text.
+func (r *Register) String() string {
+	return r.rawText
+}
+
+// ID returns the numeric register ID stored in the table.
+func (r *Register) ID() int32 {
+	return r.regID
+}
+
+// IsZeroRegister reports whether the entry is flagged as the zero register.
+func (r *Register) IsZeroRegister() bool {
+	return r.isZero
+}
+
+// registerTable maps register names to their table entries.
+var registerTable map[string]Register
+
+// init registers R0 through R31 and R255, the latter flagged as zero.
+func init() {
+	registerTable = make(map[string]Register, 33)
+
+	for i := 0; i < 32; i++ {
+		name := fmt.Sprintf("R%d", i)
+		registerTable[name] = Register{name, int32(i), false}
+	}
+	registerTable["R255"] = Register{"R255", 255, true}
+}
diff --git a/accelsim_tracing/nvidia/thread_block.go b/accelsim_tracing/nvidia/thread_block.go
new file mode 100644
index 00000000..2c4754ef
--- /dev/null
+++ b/accelsim_tracing/nvidia/thread_block.go
@@ -0,0 +1,26 @@
+package nvidia
+
+type ThreadBlock struct { // ThreadBlock is a list of warps plus their count.
+	WarpNum int     // trace/convert.go fills this with the number of warps
+	Warps   []*Warp // warps in the order they were appended
+}
+
+type Warp struct { // Warp is a list of instructions plus their count.
+	InstNum int            // trace/convert.go fills this with the number of instructions
+	Insts   []*Instruction // instructions in the order they were appended
+}
+
+type Instruction struct { // Instruction is filled field-for-field from the trace package's parsed instruction.
+	PC                int32
+	Mask              int64
+	DestNum           int32
+	DestRegs          []*Register
+	OpCode            *Opcode
+	SrcNum            int32
+	SrcRegs           []*Register
+	MemWidth          int32
+	AddressCompress   int32
+	MemAddress        int64
+	MemAddressSuffix1 int32
+	MemAddressSuffix2 []int32
+}
diff --git a/accelsim_tracing/sm/builder.go b/accelsim_tracing/sm/builder.go
new file mode 100644
index 00000000..812f1c71
--- /dev/null
+++ b/accelsim_tracing/sm/builder.go
@@ -0,0 +1,125 @@
+package sm
+
+import (
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/smunit"
+)
+
+// SM groups a dispatcher with the SM units that Build creates from the
+// stored configuration.
+type SM struct {
+	meta       *smMetaData
+	dispatcher smDispatcher
+	smUnits    []*smunit.SMUnit
+}
+
+// aluSpec is one WithALU request: an ALU type and how many to create.
+type aluSpec struct {
+	aluType string
+	aluNum  int32
+}
+
+// smMetaData collects the builder settings. Build forwards the SM unit
+// strategy, L0 cache size, register file size, lane size and ALU requests.
+type smMetaData struct {
+	smUnitNum int32
+
+	smStrategy     string
+	smUnitStrategy string
+
+	l2CacheSize int32
+	l1CacheSize int32
+	l0CacheSize int32
+
+	registerFileSize int32
+	laneSize         int32
+
+	alus []aluSpec
+}
+
+// NewSM returns an SM whose strategies are "default" and whose counts and
+// sizes are zero; no dispatcher or SM unit exists until Build.
+func NewSM() *SM {
+	return &SM{
+		meta: &smMetaData{
+			smStrategy:     "default",
+			smUnitStrategy: "default",
+		},
+	}
+}
+
+// WithSMStrategy sets the strategy name read by buildDispatcher.
+func (s *SM) WithSMStrategy(strategy string) *SM {
+	s.meta.smStrategy = strategy
+	return s
+}
+
+// WithSMUnitNum sets how many SM units Build creates and returns the SM.
+func (s *SM) WithSMUnitNum(num int32) *SM {
+	s.meta.smUnitNum = num
+	return s
+}
+
+// WithSMUnitStrategy sets the strategy forwarded to every SM unit.
+func (s *SM) WithSMUnitStrategy(strategy string) *SM {
+	s.meta.smUnitStrategy = strategy
+	return s
+}
+
+// WithL1CacheSize stores the L1 cache size; Build does not forward it.
+func (s *SM) WithL1CacheSize(size int32) *SM {
+	s.meta.l1CacheSize = size
+	return s
+}
+
+// WithL0CacheSize sets the L0 cache size forwarded to every SM unit.
+func (s *SM) WithL0CacheSize(size int32) *SM {
+	s.meta.l0CacheSize = size
+	return s
+}
+
+// WithRegisterFileSize sets the register file size forwarded to every SM unit.
+func (s *SM) WithRegisterFileSize(size int32) *SM {
+	s.meta.registerFileSize = size
+	return s
+}
+
+// WithLaneSize sets the lane size forwarded to every SM unit.
+func (s *SM) WithLaneSize(size int32) *SM {
+	s.meta.laneSize = size
+	return s
+}
+
+// WithALU appends an ALU request (type and count) forwarded to every SM unit.
+func (s *SM) WithALU(aluType string, aluNum int32) *SM {
+	s.meta.alus = append(s.meta.alus, aluSpec{aluType: aluType, aluNum: aluNum})
+	return s
+}
+
+// Build creates the dispatcher for the configured SM strategy and then
+// constructs and builds smUnitNum SM units from the stored settings.
+func (s *SM) Build() {
+	s.buildDispatcher()
+	s.smUnits = make([]*smunit.SMUnit, s.meta.smUnitNum)
+	for i := range s.smUnits {
+		s.smUnits[i] = smunit.NewSMUnit().
+			WithSMUnitStrategy(s.meta.smUnitStrategy).
+			WithL0CacheSize(s.meta.l0CacheSize).
+			WithRegisterFileSize(s.meta.registerFileSize).
+			WithLaneSize(s.meta.laneSize)
+		for _, spec := range s.meta.alus {
+			s.smUnits[i].WithALU(spec.aluType, spec.aluNum)
+		}
+		s.smUnits[i].Build()
+	}
+}
+
+// IsFree always reports true in this implementation.
+func (s *SM) IsFree() bool {
+	return true
+}
+
+// Execute hands tb to the SM's dispatcher.
+func (s *SM) Execute(tb *nvidia.ThreadBlock) {
+	s.dispatcher.dispatch(tb)
+}
diff --git a/accelsim_tracing/sm/default.go b/accelsim_tracing/sm/default.go
new file mode 100644
index 00000000..d13f70f5
--- /dev/null
+++ b/accelsim_tracing/sm/default.go
@@ -0,0 +1,46 @@
+package sm
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// defaultDispatcher hands each warp to the first free SM unit.
+type defaultDispatcher struct {
+	parent *SM
+}
+
+// newDefaultDispatcher returns a dispatcher that has no parent SM yet.
+func newDefaultDispatcher() *defaultDispatcher {
+	return &defaultDispatcher{}
+}
+
+// withParent stores the owning SM and returns the dispatcher.
+func (d *defaultDispatcher) withParent(sm *SM) smDispatcher {
+	d.parent = sm
+	return d
+}
+
+// dispatch places every warp of tb, in order, on an SM unit. A thread
+// block without warps is a no-op.
+//
+// It panics when a warp has to be placed but the SM has no SM units: the
+// scan could never succeed and would otherwise spin forever.
+func (d *defaultDispatcher) dispatch(tb *nvidia.ThreadBlock) {
+	for _, warp := range tb.Warps {
+		if len(d.parent.smUnits) == 0 {
+			panic("no SM unit available to dispatch warp")
+		}
+		d.dispatchWarp(warp)
+	}
+}
+
+// dispatchWarp scans the SM units in order and runs warp on the first one
+// that reports free, rescanning until one does.
+func (d *defaultDispatcher) dispatchWarp(warp *nvidia.Warp) {
+	for {
+		for _, smUnit := range d.parent.smUnits {
+			if smUnit.IsFree() {
+				smUnit.Execute(warp)
+				return
+			}
+		}
+	}
+}
diff --git a/accelsim_tracing/sm/dispacther.go b/accelsim_tracing/sm/dispacther.go
new file mode 100644
index 00000000..6d2a2a61
--- /dev/null
+++ b/accelsim_tracing/sm/dispacther.go
@@ -0,0 +1,21 @@
+package sm
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// smDispatcher decides how an SM hands a thread block to its SM units.
+type smDispatcher interface {
+	// withParent binds the dispatcher to sm and returns it.
+	withParent(sm *SM) smDispatcher
+	// dispatch schedules tb within the parent SM.
+	dispatch(tb *nvidia.ThreadBlock)
+}
+
+// buildDispatcher installs the dispatcher named by the SM strategy.
+// Only "default" is known; any other value panics.
+func (s *SM) buildDispatcher() {
+	if s.meta.smStrategy != "default" {
+		panic("Unknown dispatch strategy")
+	}
+
+	s.dispatcher = newDefaultDispatcher().withParent(s)
+}
diff --git a/accelsim_tracing/sm/doc.go b/accelsim_tracing/sm/doc.go
new file mode 100644
index 00000000..e53f0941
--- /dev/null
+++ b/accelsim_tracing/sm/doc.go
@@ -0,0 +1,2 @@
+// Package sm implements the simulation components for the SM level.
+package sm
diff --git a/accelsim_tracing/smunit/builder.go b/accelsim_tracing/smunit/builder.go
new file mode 100644
index 00000000..ced59820
--- /dev/null
+++ b/accelsim_tracing/smunit/builder.go
@@ -0,0 +1,98 @@
+package smunit
+
+import (
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/alu"
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+)
+
+// SMUnit groups a dispatcher, a register file and the ALU groups that
+// Build creates from the stored configuration.
+type SMUnit struct {
+	meta         *smUnitMetaData
+	dispatcher   smUnitDispatcher
+	registerFile *RegisterFile
+	aluGroup     []*alu.ALUGroup
+}
+
+// aluSpec is one WithALU request: an ALU type and how many to create.
+type aluSpec struct {
+	aluType string
+	aluNum  int32
+}
+
+// smUnitMetaData collects the builder settings that Build consumes.
+type smUnitMetaData struct {
+	smUnitStrategy string
+
+	l0CacheSize int32
+
+	registerFileSize int32
+	laneSize         int32
+
+	alus []aluSpec
+}
+
+// NewSMUnit returns an SM unit with the "default" strategy and zero
+// sizes; dispatcher, register file and ALU groups are nil until Build.
+func NewSMUnit() *SMUnit {
+	return &SMUnit{
+		meta: &smUnitMetaData{
+			smUnitStrategy: "default",
+		},
+	}
+}
+
+// WithSMUnitStrategy sets the strategy name read by buildDispatcher.
+func (s *SMUnit) WithSMUnitStrategy(strategy string) *SMUnit {
+	s.meta.smUnitStrategy = strategy
+	return s
+}
+
+// WithL0CacheSize stores the L0 cache size; Build does not read it.
+func (s *SMUnit) WithL0CacheSize(size int32) *SMUnit {
+	s.meta.l0CacheSize = size
+	return s
+}
+
+// WithRegisterFileSize sets the size Build passes to buildRegisterFile.
+func (s *SMUnit) WithRegisterFileSize(size int32) *SMUnit {
+	s.meta.registerFileSize = size
+	return s
+}
+
+// WithLaneSize sets the per-lane size Build passes to buildRegisterFile.
+func (s *SMUnit) WithLaneSize(size int32) *SMUnit {
+	s.meta.laneSize = size
+	return s
+}
+
+// WithALU appends an ALU request; Build creates one ALU group per request.
+func (s *SMUnit) WithALU(aluType string, num int32) *SMUnit {
+	s.meta.alus = append(s.meta.alus, aluSpec{aluType: aluType, aluNum: num})
+	return s
+}
+
+// Build creates the dispatcher and the register file, then constructs
+// and builds one ALU group for every WithALU request, in request order.
+// Each group receives the requested type and count.
+func (s *SMUnit) Build() {
+	s.buildDispatcher()
+	s.buildRegisterFile(s.meta.registerFileSize, s.meta.laneSize)
+	s.aluGroup = make([]*alu.ALUGroup, len(s.meta.alus))
+	for i, spec := range s.meta.alus {
+		s.aluGroup[i] = alu.NewALUGroup().
+			WithALUType(spec.aluType).
+			WithALUNum(spec.aluNum)
+		s.aluGroup[i].Build()
+	}
+}
+
+// IsFree always reports true in this implementation.
+func (s *SMUnit) IsFree() bool {
+	return true
+}
+
+// Execute hands warp to the SM unit's dispatcher.
+func (s *SMUnit) Execute(warp *nvidia.Warp) {
+	s.dispatcher.dispatch(warp)
+}
diff --git a/accelsim_tracing/smunit/default.go b/accelsim_tracing/smunit/default.go
new file mode 100644
index 00000000..220fd191
--- /dev/null
+++ b/accelsim_tracing/smunit/default.go
@@ -0,0 +1,23 @@
+package smunit
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// defaultDispatcher is the dispatcher selected by the "default" strategy.
+type defaultDispatcher struct {
+	parent *SMUnit
+}
+
+// newDefaultDispatcher returns a dispatcher that has no parent SM unit yet.
+func newDefaultDispatcher() *defaultDispatcher {
+	return new(defaultDispatcher)
+}
+
+// withParent stores the owning SM unit and returns the dispatcher.
+func (d *defaultDispatcher) withParent(sm *SMUnit) smUnitDispatcher {
+	d.parent = sm
+	return d
+}
+
+// dispatch is a placeholder: it currently does nothing with warp.
+func (d *defaultDispatcher) dispatch(warp *nvidia.Warp) {
+}
diff --git a/accelsim_tracing/smunit/dispatcher.go b/accelsim_tracing/smunit/dispatcher.go
new file mode 100644
index 00000000..125be691
--- /dev/null
+++ b/accelsim_tracing/smunit/dispatcher.go
@@ -0,0 +1,21 @@
+package smunit
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// smUnitDispatcher decides how an SM unit handles a warp.
+type smUnitDispatcher interface {
+	// withParent binds the dispatcher to sm and returns it.
+	withParent(sm *SMUnit) smUnitDispatcher
+	// dispatch schedules the warp within the parent SM unit.
+	dispatch(tb *nvidia.Warp)
+}
+
+// buildDispatcher installs the dispatcher named by the SM unit strategy.
+// Only "default" is known; any other value panics.
+func (s *SMUnit) buildDispatcher() {
+	if s.meta.smUnitStrategy != "default" {
+		panic("Unknown dispatch strategy")
+	}
+
+	s.dispatcher = newDefaultDispatcher().withParent(s)
+}
diff --git a/accelsim_tracing/smunit/doc.go b/accelsim_tracing/smunit/doc.go
new file mode 100644
index 00000000..18798c1f
--- /dev/null
+++ b/accelsim_tracing/smunit/doc.go
@@ -0,0 +1,2 @@
+// Package smunit implements the simulation components for the SM Unit level.
+package smunit
\ No newline at end of file
diff --git a/accelsim_tracing/smunit/registerfile.go b/accelsim_tracing/smunit/registerfile.go
new file mode 100644
index 00000000..a99a97be
--- /dev/null
+++ b/accelsim_tracing/smunit/registerfile.go
@@ -0,0 +1,29 @@
+package smunit
+
+// RegisterFile is the register storage that an SMUnit builds for itself.
+type RegisterFile struct {
+	RfSize          int32  // size passed to buildRegisterFile; equals len(buf)
+	rfLaneSize      int32  // NOTE(review): never assigned in this file — confirm intent
+	buf             []byte // backing storage allocated by buildRegisterFile
+	byteSizePerLane int32  // sizePerLane passed to buildRegisterFile
+}
+
+// Read is a placeholder; it currently ignores offset and width.
+func (r *RegisterFile) Read(offset int32, width int32) {
+}
+
+// Write is a placeholder; it currently ignores offset and width.
+func (r *RegisterFile) Write(offset int32, width int32) {
+}
+
+// buildRegisterFile replaces the unit's register file with a zeroed
+// buffer of size bytes and records both size and sizePerLane.
+// RfSize is filled in so the exported field matches the allocated buffer
+// instead of staying at zero.
+func (s *SMUnit) buildRegisterFile(size int32, sizePerLane int32) {
+	s.registerFile = &RegisterFile{
+		RfSize:          size,
+		buf:             make([]byte, size),
+		byteSizePerLane: sizePerLane,
+	}
+}
diff --git a/accelsim_tracing/trace/convert.go b/accelsim_tracing/trace/convert.go
new file mode 100644
index 00000000..aa1fdca2
--- /dev/null
+++ b/accelsim_tracing/trace/convert.go
@@ -0,0 +1,44 @@
+package trace
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+
+// generateNVThreadBlock converts the parsed thread block into its nvidia
+// counterpart, converting every warp in order.
+func (tb *threadBlock) generateNVThreadBlock() *nvidia.ThreadBlock {
+	out := new(nvidia.ThreadBlock)
+	out.WarpNum = len(tb.warps)
+	for i := range tb.warps {
+		out.Warps = append(out.Warps, tb.warps[i].generateNVWarp())
+	}
+	return out
+}
+
+// generateNVWarp converts the parsed warp into its nvidia counterpart,
+// converting every instruction in order.
+func (wp *warp) generateNVWarp() *nvidia.Warp {
+	out := new(nvidia.Warp)
+	out.InstNum = len(wp.instructions)
+	for i := range wp.instructions {
+		out.Insts = append(out.Insts, wp.instructions[i].generateNVInst())
+	}
+	return out
+}
+
+// generateNVInst copies every field of the parsed instruction into a new
+// nvidia.Instruction. Slice and pointer fields are shared, not cloned.
+func (inst *instruction) generateNVInst() *nvidia.Instruction {
+	return &nvidia.Instruction{
+		PC:                inst.PC,
+		Mask:              inst.Mask,
+		DestNum:           inst.DestNum,
+		DestRegs:          inst.DestRegs,
+		OpCode:            inst.OpCode,
+		SrcNum:            inst.SrcNum,
+		SrcRegs:           inst.SrcRegs,
+		MemWidth:          inst.MemWidth,
+		AddressCompress:   inst.AddressCompress,
+		MemAddress:        inst.MemAddress,
+		MemAddressSuffix1: inst.MemAddressSuffix1,
+		MemAddressSuffix2: inst.MemAddressSuffix2,
+	}
+}
diff --git a/accelsim_tracing/trace/doc.go b/accelsim_tracing/trace/doc.go
new file mode 100644
index 00000000..984da148
--- /dev/null
+++ b/accelsim_tracing/trace/doc.go
@@ -0,0 +1,2 @@
+// Package trace deals with trace parsing.
+package trace
diff --git a/accelsim_tracing/trace/kernel.go b/accelsim_tracing/trace/kernel.go
new file mode 100644
index 00000000..86ace4ec
--- /dev/null
+++ b/accelsim_tracing/trace/kernel.go
@@ -0,0 +1,31 @@
+package trace
+
+import (
+	"path/filepath"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/gpu"
+)
+
+// kernel is the traceExecs entry for one kernel trace file.
+type kernel struct { // trace execs interface
+	parent *Trace
+
+	rawText    string
+	filePath   string
+	traceGroup *traceGroup
+}
+
+// Type identifies this entry as a kernel.
+func (te *kernel) Type() string {
+	return "kernel"
+}
+
+// Execute builds a trace group for the kernel's file, which is resolved
+// relative to the parent trace directory, and runs it on gpu.
+// filepath.Join is used because the result names a file on disk.
+func (te *kernel) Execute(gpu *gpu.GPU) error {
+	tracePath := filepath.Join(te.parent.traceDirPath, te.filePath)
+	tg := NewTraceGroup().WithFilePath(tracePath)
+	tg.Build()
+	return tg.Exec(gpu)
+}
diff --git a/accelsim_tracing/trace/memory_copy.go b/accelsim_tracing/trace/memory_copy.go
new file mode 100644
index 00000000..2acf02ad
--- /dev/null
+++ b/accelsim_tracing/trace/memory_copy.go
@@ -0,0 +1,24 @@
+package trace
+
+import "github.com/sarchlab/mgpusim/v3/accelsim_tracing/gpu"
+
+type memCopy struct { // memCopy is the traceExecs entry for a "Memcpy" kernels-list line.
+	parent *Trace
+
+	rawText   string // unparsed kernels-list line
+	h2d       bool   // true when the line's first field contains "HtoD"
+	startAddr uint64 // parsed from the second comma-separated field
+	length    uint64 // parsed from the third comma-separated field
+}
+
+type memCopyParent struct { // NOTE(review): not referenced anywhere in this diff — confirm it is needed
+	trace *Trace
+}
+
+func (te *memCopy) Type() string { // Type identifies this entry as a memory copy.
+	return "memcopy"
+}
+
+func (te *memCopy) Execute(gpu *gpu.GPU) error { // placeholder: performs no copy and always returns nil
+	return nil
+}
diff --git a/accelsim_tracing/trace/thread_block.go b/accelsim_tracing/trace/thread_block.go
new file mode 100644
index 00000000..9c96537c
--- /dev/null
+++ b/accelsim_tracing/trace/thread_block.go
@@ -0,0 +1,64 @@
+package trace
+
+import (
+	"fmt"
+	"log"
+	"strings"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+)
+
+// threadBlock is the parsed form of one thread block of a trace: its
+// dimension and the warps found in its lines.
+type threadBlock struct {
+	parent     *traceGroup
+	rawContext struct {
+		blockDim string
+	}
+
+	threadBlockDim nvidia.Dim3
+	warps          []*warp
+}
+
+// parseThreadBlocks builds a threadBlock from lines: it reads the block
+// dimension and then parses one warp starting at every line that begins
+// with "warp".
+func parseThreadBlocks(lines []string) *threadBlock {
+	tb := &threadBlock{threadBlockDim: *parseThreadBlockDim(lines)}
+	for i := range lines {
+		if !strings.HasPrefix(lines[i], "warp") {
+			continue
+		}
+
+		wp := parseWarp(lines[i:]) // [todo] too many copies
+		wp.parent = tb
+		tb.warps = append(tb.warps, wp)
+	}
+	return tb
+}
+
+// parseThreadBlockDim returns the three comma-separated integers that
+// follow "=" on the first line starting with "thread block". It panics
+// (via the log package) when no such line exists or the line is malformed.
+func parseThreadBlockDim(lines []string) *nvidia.Dim3 {
+	for _, line := range lines {
+		if !strings.HasPrefix(line, "thread block") {
+			continue
+		}
+
+		parts := strings.Split(line, "=")
+		if len(parts) != 2 {
+			log.Panicf("Invalid thread block dim line: %s", line)
+		}
+
+		dim := new(nvidia.Dim3)
+		text := strings.TrimSpace(parts[1])
+		if _, err := fmt.Sscanf(text, "%d,%d,%d", &dim[0], &dim[1], &dim[2]); err != nil {
+			log.Panicf("Invalid thread block dim value: %s", text)
+		}
+		return dim
+	}
+
+	log.Panic("Cannot find thread block dim")
+	return nil
+}
diff --git a/accelsim_tracing/trace/trace.go b/accelsim_tracing/trace/trace.go
new file mode 100644
index 00000000..da8a7516
--- /dev/null
+++ b/accelsim_tracing/trace/trace.go
@@ -0,0 +1,78 @@
+package trace
+
+import (
+	"bufio"
+	"log"
+	"os"
+	"path/filepath"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/gpu"
+)
+
+// Trace is the ordered list of executable entries read from the
+// kernelslist.g file of a trace directory.
+type Trace struct {
+	traceDirPath string
+	traceExecs   []traceExecs
+}
+
+// NewTrace returns an empty trace with no directory configured.
+func NewTrace() *Trace {
+	return &Trace{
+		traceDirPath: "",
+		traceExecs:   nil,
+	}
+}
+
+// WithTraceDirPath sets the directory that holds the trace files and
+// returns the trace.
+func (t *Trace) WithTraceDirPath(path string) *Trace {
+	t.traceDirPath = path
+	return t
+}
+
+// Build parses the kernels list of the configured directory.
+func (t *Trace) Build() {
+	t.parseKernelsList()
+}
+
+// Exec executes every parsed entry on gpu in file order and stops at the
+// first error, which it returns.
+func (t *Trace) Exec(gpu *gpu.GPU) error {
+	for _, te := range t.traceExecs {
+		if err := te.Execute(gpu); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// parseKernelsList reads kernelslist.g from the trace directory and
+// appends one traceExecs entry per non-empty line.
+//
+// It panics (via log.Panic) when the file cannot be opened or when the
+// scanner stops on a read error, so a truncated read is never mistaken
+// for a complete list.
+func (t *Trace) parseKernelsList() {
+	filePath := filepath.Join(t.traceDirPath, "kernelslist.g")
+	file, err := os.Open(filePath)
+	if err != nil {
+		log.Panic(err)
+	}
+	defer file.Close()
+
+	var lines []string
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		if line := scanner.Text(); line != "" {
+			lines = append(lines, line)
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		log.Panic(err)
+	}
+
+	for _, line := range lines {
+		t.traceExecs = append(t.traceExecs, parseTraceExecs(line, t))
+	}
+}
diff --git a/accelsim_tracing/trace/trace_execs.go b/accelsim_tracing/trace/trace_execs.go
new file mode 100644
index 00000000..a2c81c15
--- /dev/null
+++ b/accelsim_tracing/trace/trace_execs.go
@@ -0,0 +1,45 @@
+package trace
+
+import (
+	"fmt"
+	"log"
+	"strings"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/gpu"
+)
+
+// traceExecs is one executable entry of a kernelslist.g file. parseTraceExecs
+// creates the concrete values (*memCopy and *kernel) and Trace.Exec calls
+// Execute on each of them.
+type traceExecs interface {
+	// Type returns a string for the entry.
+	// NOTE(review): presumably the kind of entry (memcpy or kernel); the
+	// implementations are not visible here, so confirm.
+	Type() string
+	// Execute runs the entry on gpu and reports any failure as an error.
+	Execute(gpu *gpu.GPU) error
+}
+
+// parseTraceExecs converts one line of kernelslist.g into a trace entry.
+//
+// A line starting with "Memcpy" yields a *memCopy whose direction, start
+// address and length come from the line's comma-separated fields. A line
+// starting with "kernel" yields a *kernel whose filePath is the line itself.
+// Every entry keeps trace as its parent and rawText as its original text.
+//
+// It panics on a line with any other prefix, and on a Memcpy line that has
+// fewer than three fields or whose start address or length cannot be scanned.
+func parseTraceExecs(rawText string, trace *Trace) traceExecs {
+	switch {
+	case strings.HasPrefix(rawText, "Memcpy"):
+		/*
+			format  : Memcpy<direction>,start,length
+			          (a direction containing "HtoD" marks a host-to-device copy)
+			example : MemcpyHtoD,0x7f0,0x1000
+		*/
+		res := strings.Split(rawText, ",")
+		if len(res) < 3 {
+			log.Panicf("Invalid memcpy line: %s", rawText)
+		}
+
+		m := &memCopy{
+			parent:  trace,
+			rawText: rawText,
+			h2d:     strings.Contains(res[0], "HtoD"),
+		}
+		// A failed scan would otherwise leave the field at its zero value
+		// without any diagnostic.
+		if _, err := fmt.Sscanf(res[1], "%v", &m.startAddr); err != nil {
+			log.Panicf("Invalid memcpy start address in line: %s", rawText)
+		}
+		if _, err := fmt.Sscanf(res[2], "%v", &m.length); err != nil {
+			log.Panicf("Invalid memcpy length in line: %s", rawText)
+		}
+		return m
+	case strings.HasPrefix(rawText, "kernel"):
+		/*
+			format  : kernel name
+			example : kernel_0
+		*/
+		return &kernel{
+			parent:   trace,
+			rawText:  rawText,
+			filePath: rawText,
+		}
+	}
+
+	log.Panicf("Unknown trace group rawText: %s", rawText)
+	return nil // not reached: log.Panicf does not return
+}
diff --git a/accelsim_tracing/trace/trace_group.go b/accelsim_tracing/trace/trace_group.go
new file mode 100644
index 00000000..edd930a1
--- /dev/null
+++ b/accelsim_tracing/trace/trace_group.go
@@ -0,0 +1,97 @@
+package trace
+
+import (
+	"bufio"
+	"container/list"
+	"log"
+	"os"
+	"strings"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/gpu"
+)
+
+// traceGroup reads one trace file: first its header lines, then its thread
+// blocks.
+//
+// file and scanner are set by buildFileScanner; the file stays open until
+// Exec closes it. traceHeader and hasParsedTraceHeader are set by
+// parseTraceHeader. threadBlockQueue receives one *threadBlock per
+// "#BEGIN_TB" ... "#END_TB" section, in file order.
+type traceGroup struct {
+	filePath             string
+	file                 *os.File
+	scanner              *bufio.Scanner
+	hasParsedTraceHeader bool
+	traceHeader          *traceHeader
+	threadBlockQueue     *list.List
+}
+
+// NewTraceGroup returns a traceGroup with an empty file path and an empty
+// thread-block queue. Configure it with WithFilePath and then call Build.
+func NewTraceGroup() *traceGroup {
+	tg := new(traceGroup)
+	tg.filePath = ""
+	tg.threadBlockQueue = list.New()
+	return tg
+}
+
+// WithFilePath sets the path of the trace file that Build opens and returns
+// tg so that calls can be chained.
+func (tg *traceGroup) WithFilePath(path string) *traceGroup {
+	tg.filePath = path
+	return tg
+}
+
+// Build opens the trace file at filePath and parses its header lines.
+// It panics if the file cannot be opened.
+func (tg *traceGroup) Build() {
+	tg.buildFileScanner()
+	tg.parseTraceHeader()
+}
+
+// Exec parses all thread blocks that remain in the trace file, runs each of
+// them on gpu in file order, and closes the trace file. It always returns
+// nil.
+//
+// The header must have been parsed first (see Build); parseThreadBlocks
+// panics otherwise.
+func (tg *traceGroup) Exec(gpu *gpu.GPU) error {
+	// Deferred so the file opened by buildFileScanner is released even when
+	// parsing or running a thread block panics.
+	defer tg.file.Close()
+
+	// [todo] threadblocks can be parallelized to save memory
+	tg.parseThreadBlocks()
+
+	for it := tg.threadBlockQueue.Front(); it != nil; it = it.Next() {
+		gpu.RunThreadBlock(it.Value.(*threadBlock).generateNVThreadBlock())
+	}
+
+	return nil
+}
+
+// buildFileScanner opens the file at tg.filePath and stores both the handle
+// and a line scanner over it on tg. It panics if the file cannot be opened.
+func (tg *traceGroup) buildFileScanner() {
+	f, err := os.Open(tg.filePath)
+	if err != nil {
+		log.Panic(err)
+	}
+
+	// [note] the handle stays open for the later scans; Exec closes it
+	tg.scanner = bufio.NewScanner(f)
+	tg.file = f
+}
+
+// parseTraceHeader reads the leading header lines of the trace file and
+// stores the parsed header on tg. Calling it again after a successful parse
+// does nothing.
+//
+// Header lines are the lines starting with "-"; empty lines between them are
+// skipped. The first non-empty line without that prefix ends the header.
+// That line has already been consumed from the scanner, so later scans
+// start on the line after it.
+func (tg *traceGroup) parseTraceHeader() {
+	if tg.hasParsedTraceHeader {
+		return
+	}
+
+	var headerLines []string
+	for tg.scanner.Scan() {
+		text := tg.scanner.Text()
+		if strings.HasPrefix(text, "-") {
+			headerLines = append(headerLines, text)
+			continue
+		}
+		if text != "" {
+			break
+		}
+	}
+
+	header := parseHeaderParam(headerLines)
+	header.parent = tg
+	tg.traceHeader = header
+	tg.hasParsedTraceHeader = true
+}
+
+// parseThreadBlocks scans the rest of the trace file and appends one parsed
+// thread block to tg.threadBlockQueue for every section delimited by a
+// "#BEGIN_TB" line and the next "#END_TB" line (surrounding whitespace on
+// the marker lines is ignored). Lines outside such sections are skipped, and
+// a section that is still open at the end of the input is dropped.
+//
+// It panics if the trace header has not been parsed yet.
+func (tg *traceGroup) parseThreadBlocks() {
+	if !tg.hasParsedTraceHeader {
+		log.Panic("Trace header has not been parsed")
+	}
+
+	var (
+		inBlock bool
+		pending []string // [note] whole lines of the thread block being read
+	)
+	for tg.scanner.Scan() {
+		text := tg.scanner.Text()
+		marker := strings.TrimSpace(text)
+
+		switch {
+		case !inBlock && marker == "#BEGIN_TB":
+			inBlock = true
+			pending = make([]string, 0)
+		case inBlock && marker == "#END_TB":
+			tb := parseThreadBlocks(pending)
+			tb.parent = tg
+			tg.threadBlockQueue.PushBack(tb)
+			inBlock = false
+		case inBlock:
+			pending = append(pending, text)
+		}
+	}
+}
diff --git a/accelsim_tracing/trace/trace_header.go b/accelsim_tracing/trace/trace_header.go
new file mode 100644
index 00000000..d43b7462
--- /dev/null
+++ b/accelsim_tracing/trace/trace_header.go
@@ -0,0 +1,106 @@
+package trace
+
+import (
+	"fmt"
+	"log"
+	"strings"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+)
+
+// traceHeader holds the attributes found in the header lines of a trace
+// file. Each attribute is kept twice: rawContext stores the complete header
+// line it came from, and the field of the same name outside rawContext
+// stores the value parsed by updateParam.
+//
+// parent is the traceGroup that parsed this header.
+type traceHeader struct {
+	parent     *traceGroup
+	rawContext struct {
+		kernelName            string
+		kernelID              string
+		gridDim               string
+		blockDim              string
+		shmem                 string
+		nregs                 string
+		binaryVersion         string
+		cudaStreamID          string
+		shmemBaseAddr         string
+		localMemBaseAddr      string
+		nvbitVersion          string
+		accelsimTracerVersion string
+	}
+
+	// Parsed values. The two version strings and the kernel name are stored
+	// as written; the remaining fields are scanned into numbers.
+	kernelName            string
+	kernelID              int32
+	gridDim               nvidia.Dim3
+	blockDim              nvidia.Dim3
+	shmem                 int32
+	nregs                 int32
+	binaryVersion         int32
+	cudaStreamID          int32
+	shmemBaseAddr         int64
+	localMemBaseAddr      int64
+	nvbitVersion          string
+	accelsimTracerVersion string
+}
+
+// parseHeaderParam builds a traceHeader from header lines of the form
+// "-<key> = <value>". Each line is split at its first "=", both halves are
+// trimmed, a leading "-" is dropped from the key, and the pair is handed to
+// updateParam together with the untouched line.
+//
+// It panics on a line that contains no "=".
+func parseHeaderParam(lines []string) *traceHeader {
+	th := &traceHeader{}
+
+	for _, line := range lines {
+		// Split only at the first "=" so that a value which itself contains
+		// "=" is kept whole instead of being rejected.
+		elems := strings.SplitN(line, "=", 2)
+		if len(elems) != 2 {
+			log.Panicf("Invalid trace header line: %s", line)
+		}
+		// TrimPrefix cannot slice out of range when the key is empty.
+		key := strings.TrimPrefix(strings.TrimSpace(elems[0]), "-")
+		value := strings.TrimSpace(elems[1])
+
+		th.updateParam(key, value, line)
+	}
+	return th
+}
+
+// updateParam records one header attribute on th. The untouched header line
+// (rawText) is stored in the rawContext field that belongs to key, and value
+// is stored in the parsed field of the same name: as-is for the string
+// fields, scanned with fmt.Sscanf for the numeric and Dim3 fields.
+//
+// An unrecognized key is logged and otherwise ignored. A value that cannot
+// be scanned causes a panic.
+//
+// Shaoyu: Maybe we can parse the attrs in order and avoid using switch-case here
+//
+//nolint:funlen,gocyclo
+func (th *traceHeader) updateParam(key string, value string, rawText string) {
+	var err error
+	raw := &th.rawContext
+
+	// Scan helpers shared by the cases below; each one leaves its result in err.
+	scanInt32 := func(dst *int32) { _, err = fmt.Sscanf(value, "%d", dst) }
+	scanInt64 := func(dst *int64) { _, err = fmt.Sscanf(value, "%v", dst) }
+	scanDim3 := func(dst *nvidia.Dim3) {
+		_, err = fmt.Sscanf(value, "(%d,%d,%d)", &dst[0], &dst[1], &dst[2])
+	}
+
+	switch key {
+	case "kernel name":
+		raw.kernelName, th.kernelName = rawText, value
+	case "kernel id":
+		raw.kernelID = rawText
+		scanInt32(&th.kernelID)
+	case "grid dim":
+		raw.gridDim = rawText
+		scanDim3(&th.gridDim)
+	case "block dim":
+		raw.blockDim = rawText
+		scanDim3(&th.blockDim)
+	case "shmem":
+		raw.shmem = rawText
+		scanInt32(&th.shmem)
+	case "nregs":
+		raw.nregs = rawText
+		scanInt32(&th.nregs)
+	case "binary version":
+		raw.binaryVersion = rawText
+		scanInt32(&th.binaryVersion)
+	case "cuda stream id":
+		raw.cudaStreamID = rawText
+		scanInt32(&th.cudaStreamID)
+	case "shmem base_addr":
+		raw.shmemBaseAddr = rawText
+		scanInt64(&th.shmemBaseAddr)
+	case "local mem base_addr":
+		raw.localMemBaseAddr = rawText
+		scanInt64(&th.localMemBaseAddr)
+	case "nvbit version":
+		raw.nvbitVersion, th.nvbitVersion = rawText, value
+	case "accelsim tracer version":
+		raw.accelsimTracerVersion, th.accelsimTracerVersion = rawText, value
+	default:
+		log.Printf("Unknown trace header key: %s", key)
+	}
+
+	if err != nil {
+		log.Panicf("Invalid trace header value for [%s]: %s", key, value)
+	}
+}
diff --git a/accelsim_tracing/trace/warp.go b/accelsim_tracing/trace/warp.go
new file mode 100644
index 00000000..a95515dc
--- /dev/null
+++ b/accelsim_tracing/trace/warp.go
@@ -0,0 +1,95 @@
+package trace
+
+import (
+	"fmt"
+	"log"
+	"strconv"
+	"strings"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+)
+
+// warp is one warp section of a thread block as built by parseWarp.
+//
+// rawContext keeps the two untouched header lines from which warpID and
+// instsCount were scanned. instructions holds the parsed instruction lines
+// in input order, each with its parent set to this warp.
+type warp struct {
+	parent     *threadBlock
+	rawContext struct {
+		warpID     string
+		instsCount string
+	}
+
+	warpID       int32
+	instsCount   int32
+	instructions []instruction
+}
+
+// instruction is one instruction line of a warp.
+//
+// parseInst fills PC, Mask, SrcNum, SrcRegs, DestNum and DestRegs, and
+// parseMemory fills MemWidth, AddressCompress, MemAddress and the two
+// MemAddressSuffix fields. The parsing code in this file never assigns
+// OpCode or rawText.
+// NOTE(review): check whether OpCode and rawText are set elsewhere.
+type instruction struct {
+	parent  *warp
+	rawText string
+
+	PC                int32
+	Mask              int64
+	DestNum           int32
+	DestRegs          []*nvidia.Register
+	OpCode            *nvidia.Opcode
+	SrcNum            int32
+	SrcRegs           []*nvidia.Register
+	MemWidth          int32
+	AddressCompress   int32
+	MemAddress        int64
+	MemAddressSuffix1 int32
+	MemAddressSuffix2 []int32
+}
+
+// parseWarp builds a warp from the lines of one warp section. lines[0]
+// carries the warp ID after its "=", lines[1] carries the instruction count
+// after its "=", and the following <count> lines are parsed as instructions.
+//
+// It panics when fewer than two lines are given, when a header line does not
+// contain exactly one "=" or its value is not an integer, and when fewer
+// instruction lines follow than the header announces.
+func parseWarp(lines []string) *warp {
+	if len(lines) < 2 {
+		log.Panicf("Invalid warp header: expected 2 header lines, got %d", len(lines))
+	}
+
+	wp := &warp{}
+	elems0 := strings.Split(lines[0], "=")
+	elems1 := strings.Split(lines[1], "=")
+	if len(elems0) != 2 || len(elems1) != 2 {
+		log.Panicf("Invalid warp header: %s, %s", lines[0], lines[1])
+	}
+	wp.rawContext.warpID = lines[0]
+	wp.rawContext.instsCount = lines[1]
+	_, err0 := fmt.Sscanf(strings.TrimSpace(elems0[1]), "%d", &wp.warpID)
+	_, err1 := fmt.Sscanf(strings.TrimSpace(elems1[1]), "%d", &wp.instsCount)
+	if err0 != nil || err1 != nil {
+		log.Panicf("Invalid warp header: %s, %s", lines[0], lines[1])
+	}
+
+	// Report a short section with a readable message instead of running
+	// into an index-out-of-range panic inside the loop.
+	count := int(wp.instsCount)
+	if count > len(lines)-2 {
+		log.Panicf("Invalid warp %d: header announces %d instructions but only %d lines follow",
+			wp.warpID, count, len(lines)-2)
+	}
+	if count > 0 {
+		wp.instructions = make([]instruction, 0, count)
+	}
+	for i := 2; i < 2+count; i++ {
+		inst := parseInst(lines[i])
+		inst.parent = wp
+		wp.instructions = append(wp.instructions, inst)
+	}
+	return wp
+}
+
+// parseInst decodes one whitespace-separated instruction line. The fields
+// are read in this order: PC (hex), mask (hex), source-register count, that
+// many source registers, destination-register count, that many destination
+// registers, and finally the memory fields handled by parseMemory.
+//
+// It panics when the three leading fields cannot be scanned or when the line
+// has fewer fields than the register counts require.
+//
+// NOTE(review): Accel-Sim's published trace layout lists the destination
+// registers and the opcode before the source registers; confirm that the
+// field order read here matches the traces this tool is fed. A token that is
+// not a number in the destination-count position leaves DestNum at 0 rather
+// than panicking, so such lines are still accepted.
+func parseInst(line string) instruction {
+	inst := &instruction{}
+	elems := strings.Fields(line)
+	if len(elems) < 4 {
+		log.Panicf("Invalid instruction line: %s", line)
+	}
+
+	// The three leading fields must stay separated while they are scanned.
+	// Gluing them together lets the first %x swallow the hex digits of the
+	// mask as well, so none of the fields would be read.
+	header := elems[0] + " " + elems[1] + " " + elems[2]
+	if _, err := fmt.Sscanf(header, "%x %x %d", &inst.PC, &inst.Mask, &inst.SrcNum); err != nil {
+		log.Panicf("Invalid instruction line: %s", line)
+	}
+
+	next := 3
+	srcNum := int(inst.SrcNum)
+	if srcNum < 0 || next+srcNum >= len(elems) {
+		log.Panicf("Invalid instruction line (source register count %d): %s", srcNum, line)
+	}
+	for _, name := range elems[next : next+srcNum] {
+		inst.SrcRegs = append(inst.SrcRegs, nvidia.NewRegister(name))
+	}
+	next += srcNum
+
+	// Deliberately tolerant, see the NOTE(review) above.
+	_, _ = fmt.Sscanf(elems[next], "%d", &inst.DestNum)
+	next++
+
+	// At least one field has to remain after the registers for parseMemory.
+	destNum := int(inst.DestNum)
+	if destNum < 0 || next+destNum >= len(elems) {
+		log.Panicf("Invalid instruction line (destination register count %d): %s", destNum, line)
+	}
+	for _, name := range elems[next : next+destNum] {
+		inst.DestRegs = append(inst.DestRegs, nvidia.NewRegister(name))
+	}
+	next += destNum
+
+	inst.parseMemory(elems[next:])
+	return *inst
+}
+
+// parseMemory fills the memory fields of inst from elems, the fields of an
+// instruction line that follow the register lists:
+//
+//	elems[0]   memory width; 0 means there is nothing more to read
+//	elems[1]   address-compression mode
+//	elems[2]   address, written with a "0x" prefix
+//	elems[3:]  mode 1: the first value is stored in MemAddressSuffix1
+//	           mode 2: all values are stored in MemAddressSuffix2
+//
+// An empty elems leaves inst untouched. Tokens that cannot be scanned leave
+// the corresponding field at 0, as before. It panics when the width is
+// non-zero but the mode or address field is missing.
+//
+// [todo]: understand memory format
+// NOTE(review): in Accel-Sim traces the mode-1 value is presumably a stride
+// and the mode-2 values per-thread deltas, and mode 0 lists one address per
+// thread (only the first is kept here) - confirm against the tracer docs.
+func (inst *instruction) parseMemory(elems []string) {
+	if len(elems) == 0 {
+		return
+	}
+
+	_, _ = fmt.Sscanf(elems[0], "%d", &inst.MemWidth)
+	if inst.MemWidth == 0 {
+		return
+	}
+	if len(elems) < 3 {
+		log.Panicf("Invalid memory fields: %v", elems)
+	}
+
+	// The mode and the address are scanned from their own tokens. Gluing
+	// them together ("1" + "0x7f00" = "10x7f00") makes %d read "10" and the
+	// literal "0x" then fails to match, so neither field would be set.
+	_, _ = fmt.Sscanf(elems[1], "%d", &inst.AddressCompress)
+	_, _ = fmt.Sscanf(elems[2], "0x%x", &inst.MemAddress)
+
+	// elems[2] is the address consumed above, so the suffix values start at
+	// elems[3].
+	suffixes := elems[3:]
+	switch inst.AddressCompress {
+	case 1:
+		if len(suffixes) > 0 {
+			_, _ = fmt.Sscanf(suffixes[0], "%d", &inst.MemAddressSuffix1)
+		}
+	case 2:
+		for _, s := range suffixes {
+			// A token that is not an integer is recorded as 0.
+			s32, _ := strconv.Atoi(s)
+			inst.MemAddressSuffix2 = append(inst.MemAddressSuffix2, int32(s32))
+		}
+	}
+}
diff --git a/accelsim_tracing/tracer.go b/accelsim_tracing/tracer.go
new file mode 100644
index 00000000..cb0df615
--- /dev/null
+++ b/accelsim_tracing/tracer.go
@@ -0,0 +1,62 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/benchmark"
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/gpu"
+	"github.com/sarchlab/mgpusim/v3/accelsim_tracing/nvidia"
+)
+
+// inputArguments holds the values taken from the command line.
+// inputTraceDir is the first positional argument; main passes it on as the
+// trace directory path.
+type inputArguments struct {
+	inputTraceDir string
+	// deparse        bool
+	// outputTraceDir string
+}
+
+// getInputArguments parses the command line and returns the first
+// positional argument as inputTraceDir. It installs a custom usage message
+// and, when no positional argument is given, prints that message and panics.
+func getInputArguments() *inputArguments {
+	flag.Usage = func() {
+		fmt.Println("Usage: ./as_trace_parser [options] trace")
+		flag.PrintDefaults()
+	}
+	flag.Parse()
+
+	if flag.NArg() < 1 {
+		flag.Usage()
+		log.Panic("Error: should specify an input trace")
+	}
+
+	return &inputArguments{inputTraceDir: flag.Arg(0)}
+}
+
+// buildAmpereGPU configures a GPU with the fixed parameters below (unit
+// counts, "default" strategies, cache and register-file sizes, lane size and
+// 16 "int32" ALUs), builds it, and returns it.
+func buildAmpereGPU() *gpu.GPU {
+	// Named g rather than gpu so the package name stays usable in this scope.
+	g := gpu.NewGPU().
+		WithGPUStrategy("default").
+		WithGPCNum(8).
+		WithSMNum(16).
+		WithSMUnitNum(4).
+		WithGPCStrategy("default").
+		WithSMStrategy("default").
+		WithSMUnitStrategy("default").
+		WithL2CacheSize(4*1024*1024*nvidia.BYTE).
+		WithL1CacheSize(192*1024*nvidia.BYTE).
+		WithL0CacheSize(16*1024*nvidia.BYTE).
+		WithRegisterFileSize(256*1024*nvidia.BYTE).
+		WithLaneSize(4*nvidia.BYTE).
+		WithALU("int32", 16)
+
+	g.Build()
+	return g
+}
+
+// main reads the trace directory from the command line, builds the GPU,
+// builds a benchmark for that directory, and executes the benchmark on the
+// GPU.
+func main() {
+	args := getInputArguments()
+	device := buildAmpereGPU()
+
+	bench := benchmark.NewBenchMark().WithTraceDirPath(args.inputTraceDir)
+	bench.Build()
+	bench.Exec(device)
+}
diff --git a/samples/runner/r9nanobuilder.go b/samples/runner/r9nanobuilder.go
index f76b015a..a0d3a23d 100644
--- a/samples/runner/r9nanobuilder.go
+++ b/samples/runner/r9nanobuilder.go
@@ -745,12 +745,6 @@ func (b *R9NanoGPUBuilder) populateInstMemoryHierarchy(sa *shaderArray) {
 }
 
 func (b *R9NanoGPUBuilder) buildRDMAEngine() {
-	// b.rdmaEngine = rdma.NewEngine(
-	// 	fmt.Sprintf("%s.RDMA", b.gpuName),
-	// 	b.engine,
-	// 	b.lowModuleFinderForL1,
-	// 	nil,
-	// )
 	name := fmt.Sprintf("%s.RDMA", b.gpuName)
 	b.rdmaEngine = rdma.MakeBuilder().
 		WithEngine(b.engine).
@@ -762,6 +756,10 @@ func (b *R9NanoGPUBuilder) buildRDMAEngine() {
 	if b.monitor != nil {
 		b.monitor.RegisterComponent(b.rdmaEngine)
 	}
+
+	if b.enableVisTracing {
+		tracing.CollectTrace(b.rdmaEngine, b.visTracer)
+	}
 }
 
 func (b *R9NanoGPUBuilder) buildPageMigrationController() {
diff --git a/timing/rdma/comp.go b/timing/rdma/comp.go
index ed95fa6b..0b8683dd 100644
--- a/timing/rdma/comp.go
+++ b/timing/rdma/comp.go
@@ -214,8 +214,7 @@ func (c *Comp) processReqFromL1(
 	if err == nil {
 		c.ToL1.Retrieve(now)
 
-		tracing.TraceReqReceive(req, c)
-		tracing.TraceReqInitiate(cloned, c, tracing.MsgIDAtReceiver(req, c))
+		c.traceInsideOutStart(req, cloned)
 
 		//fmt.Printf("%s req inside %s -> outside %s\n",
 		//e.Name(), req.GetID(), cloned.GetID())
@@ -247,8 +246,7 @@ func (c *Comp) processReqFromOutside(
 	if err == nil {
 		c.ToOutside.Retrieve(now)
 
-		tracing.TraceReqReceive(req, c)
-		tracing.TraceReqInitiate(cloned, c, tracing.MsgIDAtReceiver(req, c))
+		c.traceOutsideInStart(req, cloned)
 
 		//fmt.Printf("%s req outside %s -> inside %s\n",
 		//e.Name(), req.GetID(), cloned.GetID())
@@ -284,8 +282,7 @@ func (c *Comp) processRspFromL2(
 		//fmt.Printf("%s rsp inside %s -> outside %s\n",
 		//e.Name(), rsp.GetID(), rspToOutside.GetID())
 
-		tracing.TraceReqFinalize(trans.toInside, c)
-		tracing.TraceReqComplete(trans.fromOutside, c)
+		c.traceOutsideInEnd(trans)
 
 		c.transactionsFromOutside =
 			append(c.transactionsFromOutside[:transactionIndex],
@@ -312,8 +309,7 @@ func (c *Comp) processRspFromOutside(
 	if err == nil {
 		c.ToOutside.Retrieve(now)
 
-		tracing.TraceReqFinalize(trans.toOutside, c)
-		tracing.TraceReqComplete(trans.fromInside, c)
+		c.traceInsideOutEnd(trans)
 
 		//fmt.Printf("%s rsp outside %s -> inside %s\n",
 		//e.Name(), rsp.GetID(), rspToInside.GetID())
@@ -404,3 +400,69 @@ func (c *Comp) cloneRsp(origin mem.AccessRsp, rspTo string) mem.AccessRsp {
 func (c *Comp) SetFreq(freq sim.Freq) {
 	c.TickingComponent.Freq = freq
 }
+
+// traceInsideOutStart starts two tracing tasks for a request that passes
+// through c from the inside to the outside: a "req_in" task for req, and a
+// "req_out" task for cloned that passes the first task's ID as its second
+// argument. Both tasks use the location c.Name()+".InsideOut". It does
+// nothing when c has no hooks.
+// NOTE(review): the second argument of StartTaskWithSpecificLocation is
+// presumably the parent task ID - confirm against the tracing API.
+func (c *Comp) traceInsideOutStart(req mem.AccessReq, cloned mem.AccessReq) {
+	if len(c.Hooks()) == 0 {
+		return
+	}
+
+	var (
+		incomingID = tracing.MsgIDAtReceiver(req, c)
+		what       = reflect.TypeOf(req).String()
+		location   = c.Name() + ".InsideOut"
+	)
+
+	// Task for the request as it arrives at this component.
+	tracing.StartTaskWithSpecificLocation(
+		incomingID,
+		req.Meta().ID+"_req_out",
+		c, "req_in", what, location, req,
+	)
+
+	// Task for the clone this component sends on.
+	tracing.StartTaskWithSpecificLocation(
+		cloned.Meta().ID+"_req_out",
+		incomingID,
+		c, "req_out", what, location, cloned,
+	)
+}
+
+// traceOutsideInStart starts two tracing tasks for a request that passes
+// through c from the outside to the inside: a "req_in" task for req, and a
+// "req_out" task for cloned that passes the first task's ID as its second
+// argument. Both tasks use the location c.Name()+".OutsideIn". It does
+// nothing when c has no hooks.
+// NOTE(review): the second argument of StartTaskWithSpecificLocation is
+// presumably the parent task ID - confirm against the tracing API.
+func (c *Comp) traceOutsideInStart(req mem.AccessReq, cloned mem.AccessReq) {
+	if len(c.Hooks()) == 0 {
+		return
+	}
+
+	var (
+		incomingID = tracing.MsgIDAtReceiver(req, c)
+		what       = reflect.TypeOf(req).String()
+		location   = c.Name() + ".OutsideIn"
+	)
+
+	// Task for the request as it arrives at this component.
+	tracing.StartTaskWithSpecificLocation(
+		incomingID,
+		req.Meta().ID+"_req_out",
+		c, "req_in", what, location, req,
+	)
+
+	// Task for the clone this component sends on.
+	tracing.StartTaskWithSpecificLocation(
+		cloned.Meta().ID+"_req_out",
+		incomingID,
+		c, "req_out", what, location, cloned,
+	)
+}
+
+// traceInsideOutEnd closes the tracing of an inside-to-outside transaction:
+// it finalizes the request that was sent outside and completes the request
+// that came from inside. It does nothing when c has no hooks.
+func (c *Comp) traceInsideOutEnd(trans transaction) {
+	if len(c.Hooks()) > 0 {
+		tracing.TraceReqFinalize(trans.toOutside, c)
+		tracing.TraceReqComplete(trans.fromInside, c)
+	}
+}
+
+// traceOutsideInEnd closes the tracing of an outside-to-inside transaction:
+// it finalizes the request that was sent inside and completes the request
+// that came from outside. It does nothing when c has no hooks.
+func (c *Comp) traceOutsideInEnd(trans transaction) {
+	// Same early return as the other trace helpers of Comp, so that all four
+	// skip the tracing calls when nothing is listening.
+	if len(c.Hooks()) == 0 {
+		return
+	}
+
+	tracing.TraceReqFinalize(trans.toInside, c)
+	tracing.TraceReqComplete(trans.fromOutside, c)
+}