diff --git a/tests/acceptance/cases.go b/tests/acceptance/cases.go
index 8a7a23e0..c7d37a6f 100644
--- a/tests/acceptance/cases.go
+++ b/tests/acceptance/cases.go
@@ -801,32 +801,32 @@ var benchmarks = []benchmark{
 			{gpus: []int{1, 2, 3, 4}, timing: true, parallel: true, unifiedGPU: true, unifiedMemory: true},
 		},
 	},
-	{
-		benchmarkPath:  "",
-		executablePath: "../../samples/concurrentkernel",
-		executable:     "concurrentkernel",
-		sizeArgs:       []string{},
-		cases: []benchmarkCase{
-			{gpus: []int{1}, timing: false, parallel: false, unifiedGPU: false, unifiedMemory: false},
-			{gpus: []int{1}, timing: false, parallel: true, unifiedGPU: false, unifiedMemory: false},
-			{gpus: []int{1}, timing: true, parallel: false, unifiedGPU: false, unifiedMemory: false},
-			{gpus: []int{1}, timing: true, parallel: true, unifiedGPU: false, unifiedMemory: false},
-		},
-	},
-	{
-		benchmarkPath:  "",
-		executablePath: "../../samples/concurrentworkload",
-		executable:     "concurrentworkload",
-		sizeArgs:       []string{},
-		cases: []benchmarkCase{
-			{gpus: []int{1, 2, 3, 4}, timing: false, parallel: false, unifiedGPU: false, unifiedMemory: false},
-			{gpus: []int{1, 2, 3, 4}, timing: false, parallel: true, unifiedGPU: false, unifiedMemory: false},
-			{gpus: []int{1, 2, 3, 4}, timing: true, parallel: false, unifiedGPU: false, unifiedMemory: false},
-			{gpus: []int{1, 2, 3, 4}, timing: true, parallel: true, unifiedGPU: false, unifiedMemory: false},
-			// {gpus: []int{1, 2, 3, 4}, timing: false, parallel: false, unifiedGPU: false, unifiedMemory: true},
-			// {gpus: []int{1, 2, 3, 4}, timing: false, parallel: true, unifiedGPU: false, unifiedMemory: true},
-			// {gpus: []int{1, 2, 3, 4}, timing: true, parallel: false, unifiedGPU: false, unifiedMemory: true},
-			// {gpus: []int{1, 2, 3, 4}, timing: true, parallel: true, unifiedGPU: false, unifiedMemory: true},
-		},
-	},
+	// {
+	// 	benchmarkPath:  "",
+	// 	executablePath: "../../samples/concurrentkernel",
+	// 	executable:     "concurrentkernel",
+	// 	sizeArgs:       []string{},
+	// 	cases: []benchmarkCase{
+	// 		{gpus: []int{1}, timing: false, parallel: false, unifiedGPU: false, unifiedMemory: false},
+	// 		{gpus: []int{1}, timing: false, parallel: true, unifiedGPU: false, unifiedMemory: false},
+	// 		{gpus: []int{1}, timing: true, parallel: false, unifiedGPU: false, unifiedMemory: false},
+	// 		{gpus: []int{1}, timing: true, parallel: true, unifiedGPU: false, unifiedMemory: false},
+	// 	},
+	// },
+	// {
+	// 	benchmarkPath:  "",
+	// 	executablePath: "../../samples/concurrentworkload",
+	// 	executable:     "concurrentworkload",
+	// 	sizeArgs:       []string{},
+	// 	cases: []benchmarkCase{
+	// 		{gpus: []int{1, 2, 3, 4}, timing: false, parallel: false, unifiedGPU: false, unifiedMemory: false},
+	// 		{gpus: []int{1, 2, 3, 4}, timing: false, parallel: true, unifiedGPU: false, unifiedMemory: false},
+	// 		{gpus: []int{1, 2, 3, 4}, timing: true, parallel: false, unifiedGPU: false, unifiedMemory: false},
+	// 		{gpus: []int{1, 2, 3, 4}, timing: true, parallel: true, unifiedGPU: false, unifiedMemory: false},
+	// 		// {gpus: []int{1, 2, 3, 4}, timing: false, parallel: false, unifiedGPU: false, unifiedMemory: true},
+	// 		// {gpus: []int{1, 2, 3, 4}, timing: false, parallel: true, unifiedGPU: false, unifiedMemory: true},
+	// 		// {gpus: []int{1, 2, 3, 4}, timing: true, parallel: false, unifiedGPU: false, unifiedMemory: true},
+	// 		// {gpus: []int{1, 2, 3, 4}, timing: true, parallel: true, unifiedGPU: false, unifiedMemory: true},
+	// 	},
+	// },
 }
diff --git a/timing/cu/issuearbiter.go b/timing/cu/issuearbiter.go
index ac594acd..2992dbe8 100644
--- a/timing/cu/issuearbiter.go
+++ b/timing/cu/issuearbiter.go
@@ -10,7 +10,7 @@ type IssueArbiter struct {
 // NewIssueArbiter returns a newly created IssueArbiter
 func NewIssueArbiter() *IssueArbiter {
 	a := new(IssueArbiter)
-	a.lastSIMDID = -1
+	a.lastSIMDID = 0
 	return a
 }
 
@@ -23,25 +23,30 @@ func (a *IssueArbiter) Arbitrate(
 		return []*wavefront.Wavefront{}
 	}
 
-	a.moveToNextSIMD(wfPools)
-	for len(wfPools[a.lastSIMDID].wfs) == 0 {
-		a.moveToNextSIMD(wfPools)
-	}
+	wfToIssue := make([]*wavefront.Wavefront, 0)
+	for i := 0; i < len(wfPools); i++ {
+		simdID := (a.lastSIMDID + i) % len(wfPools)
+
+		typeMask := make([]bool, 7)
+		wfPool := wfPools[simdID]
+		for _, wf := range wfPool.wfs {
+			if wf.State != wavefront.WfReady || wf.InstToIssue == nil {
+				continue
+			}
 
-	typeMask := make([]bool, 7)
-	wfPool := wfPools[a.lastSIMDID]
-	list := make([]*wavefront.Wavefront, 0)
-	for _, wf := range wfPool.wfs {
-		if wf.State != wavefront.WfReady || wf.InstToIssue == nil {
-			continue
+			if typeMask[wf.InstToIssue.ExeUnit] == false {
+				wfToIssue = append(wfToIssue, wf)
+				typeMask[wf.InstToIssue.ExeUnit] = true
+			}
 		}
 
-		if typeMask[wf.InstToIssue.ExeUnit] == false {
-			list = append(list, wf)
-			typeMask[wf.InstToIssue.ExeUnit] = true
+		if len(wfToIssue) != 0 {
+			a.lastSIMDID = simdID
+			break
 		}
 	}
-	return list
+
+	return wfToIssue
 }
 
 func (a *IssueArbiter) moveToNextSIMD(wfPools []*WavefrontPool) {
diff --git a/timing/cu/vectormemoryunit.go b/timing/cu/vectormemoryunit.go
index c0b98528..8b571336 100644
--- a/timing/cu/vectormemoryunit.go
+++ b/timing/cu/vectormemoryunit.go
@@ -109,7 +109,7 @@ func (u *VectorMemoryUnit) insertTransactionToPipeline(
 }
 
 func (u *VectorMemoryUnit) execute(now sim.VTimeInSec) (madeProgress bool) {
-	item := u.postInstructionPipelineBuffer.Pop()
+	item := u.postInstructionPipelineBuffer.Peek()
 	if item == nil {
 		return false
 	}
@@ -126,6 +126,7 @@ func (u *VectorMemoryUnit) execute(now sim.VTimeInSec) (madeProgress bool) {
 		log.Panicf("running inst %s in vector memory unit is not supported", inst.String(nil))
 	}
 
+	u.postInstructionPipelineBuffer.Pop()
 	u.cu.UpdatePCAndSetReady(wave)
 	u.numInstInFlight--
 
diff --git a/timing/cu/vectormemoryunit_test.go b/timing/cu/vectormemoryunit_test.go
index 4c5cc62a..0e067254 100644
--- a/timing/cu/vectormemoryunit_test.go
+++ b/timing/cu/vectormemoryunit_test.go
@@ -91,6 +91,7 @@ var _ = Describe("Vector Memory Unit", func() {
 			transactions[i].Read = read
 		}
 		coalescer.EXPECT().generateMemTransactions(wave).Return(transactions)
+		instBuffer.EXPECT().Peek().Return(vectorMemInst{wavefront: wave})
 		instBuffer.EXPECT().Pop().Return(vectorMemInst{wavefront: wave})
 
 		madeProgress := vecMemUnit.instToTransaction(10)
@@ -122,6 +123,7 @@ var _ = Describe("Vector Memory Unit", func() {
 			transactions[i].Write = write
 		}
 		coalescer.EXPECT().generateMemTransactions(wave).Return(transactions)
+		instBuffer.EXPECT().Peek().Return(vectorMemInst{wavefront: wave})
 		instBuffer.EXPECT().Pop().Return(vectorMemInst{wavefront: wave})
 
 		madeProgress := vecMemUnit.instToTransaction(10)