switch the prefetch order back for now

At least for now, we will follow the order: 1. Load B Matrix Tile 2. Load A matrix Tile 3. Prefetch Next B Matrix Tile 4. Prefetch Next A Matrix Tile 5. Compute 6. Loop back to 1.
bashbaug · Mar 6, 2024 · 2ea4d56 · 2ea4d56
1 parent ef205e3
commit 2ea4d56
Showing 1 changed file with 11 additions and 16 deletions.
diff --git a/samples/99_matrixexperiments/matrix_kernel_tiled.cl b/samples/99_matrixexperiments/matrix_kernel_tiled.cl
@@ -605,8 +605,8 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN
 
     int prefetch_k = 0;
     for (int p = 0; p < PREFETCH_DISTANCE; p++) {
-        HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(A, tM, M, K, m, prefetch_k);
         HELPER_NAME(btile_block_prefetch_rowmajor, MM, NN)(B, tN, K, N, prefetch_k, n);
+        HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(A, tM, M, K, m, prefetch_k);
         prefetch_k += tK * KK;
     }
 
@@ -620,25 +620,22 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_rowmajor_tiled, 8, 16, MM, NN
     split_barrier_arrive();
 
     for (int k = 0; k < K; k += tK * KK) {
-        // TODO: skip prefetch on the last iterations.
-        HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(A, tM, M, K, m, prefetch_k);
-
         int8    bData[NN][KK];
         HELPER_NAME(btile_load_blockread_rowmajor, MM, NN)(B, tN, K, N, k, n, bData);
 
         short8  aData[KK][MM];
         HELPER_NAME(atile_load_blockread_rowmajor, MM, NN)(A, tM, M, K, m, k, aData);
 
+        HELPER_NAME(btile_block_prefetch_rowmajor, MM, NN)(B, tN, K, N, prefetch_k, n);
+        HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(A, tM, M, K, m, prefetch_k);
+        prefetch_k += tK * KK;
+
         for (int kk = 0; kk < KK; kk++) {
             for (int nn = 0; nn < NN; nn++) {
                 for (int mm = 0; mm < MM; mm++) {
                     sum[mm][nn] = mat_mul_sg16(aData[kk][mm], bData[nn][kk], sum[mm][nn]);
                 }
             }
-            if (kk == 0) {
-                HELPER_NAME(btile_block_prefetch_rowmajor, MM, NN)(B, tN, K, N, prefetch_k, n);
-                prefetch_k += tK * KK;
-            }
         }
 
         split_barrier_wait();
@@ -667,8 +664,8 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(gl
 
     int prefetch_k = 0;
     for (int p = 0; p < PREFETCH_DISTANCE; p++) {
-        HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(A, tM, M, K, m, prefetch_k);
         HELPER_NAME(btile_block_prefetch_vnni, MM, NN)(B, tN, K, N, prefetch_k, n);
+        HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(A, tM, M, K, m, prefetch_k);
         prefetch_k += tK * KK;
     }
 
@@ -682,25 +679,23 @@ kernel void MM_KERNEL_NAME(bfloat16_dpas_blockread_vnni_tiled, 8, 16, MM, NN)(gl
     split_barrier_arrive();
 
     for (int k = 0; k < K; k += tK * KK) {
-        // TODO: skip prefetch on the last iterations.
-        HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(A, tM, M, K, m, prefetch_k);
-
         int8    bData[NN][KK];
         HELPER_NAME(btile_load_blockread_vnni, MM, NN)(B, tN, K, N, k, n, bData);
 
         short8  aData[KK][MM];
         HELPER_NAME(atile_load_blockread_rowmajor, MM, NN)(A, tM, M, K, m, k, aData);
 
+        // TODO: skip prefetch on the last iterations.
+        HELPER_NAME(btile_block_prefetch_vnni, MM, NN)(B, tN, K, N, prefetch_k, n);
+        HELPER_NAME(atile_block_prefetch_rowmajor, MM, NN)(A, tM, M, K, m, prefetch_k);
+        prefetch_k += tK * KK;
+
         for (int kk = 0; kk < KK; kk++) {
             for (int nn = 0; nn < NN; nn++) {
                 for (int mm = 0; mm < MM; mm++) {
                     sum[mm][nn] = mat_mul_sg16(aData[kk][mm], bData[nn][kk], sum[mm][nn]);
                 }
             }
-            if (kk == 0) {
-                HELPER_NAME(btile_block_prefetch_vnni, MM, NN)(B, tN, K, N, prefetch_k, n);
-                prefetch_k += tK * KK;
-            }
         }
 
         split_barrier_wait();