From 6bfd95f3087d9768771522ba9d31d3a7aaa463da Mon Sep 17 00:00:00 2001
From: zhumakhan
Date: Sat, 3 Aug 2024 02:11:11 +0500
Subject: [PATCH] workaround for gpt-j (#395)

Some models initialize tensors during the first forward pass and reuse
them in subsequent iterations. This causes the model to recompile. One
temporary solution is to run the torch model once before compilation.

Related issue: https://github.com/CentML/hidet/issues/291

Co-authored-by: Zhumakhan
---
 tests/benchmarks/bench_transformer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/benchmarks/bench_transformer.py b/tests/benchmarks/bench_transformer.py
index 0e2d5340e..4839c03e9 100644
--- a/tests/benchmarks/bench_transformer.py
+++ b/tests/benchmarks/bench_transformer.py
@@ -59,6 +59,11 @@ def bench_causal_lm(model_name, bs, genlen, dtype, backend, mode):
     inputs = tokenizer(input_string_batch, return_tensors='pt')['input_ids'].cuda()
 
     with torch.no_grad(), torch.autocast("cuda"):
+        # Temporary workaround for gpt-j:
+        # gpt-j initializes tensors during the first forward pass,
+        # which causes recompilation during the second forward pass.
+        if model_name == 'EleutherAI/gpt-j-6B':
+            model(inputs)
         model = comp_backend.compile(model)
         latency = bench_gen_model(model, tokenizer, inputs, bs=bs, genlen=genlen)
         del model
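
The pattern the patch relies on, shown outside the benchmark harness: a minimal sketch that warms the eager model up once before compiling, so lazily created tensors exist before the graph is captured. It uses plain torch.compile as a stand-in for comp_backend.compile(model); the model name and prompt are illustrative only, not part of the patch.

# Sketch (assumes torch >= 2.0 with CUDA and transformers installed)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = 'EleutherAI/gpt-j-6B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).cuda().eval()

inputs = tokenizer(["Hello world"], return_tensors='pt')['input_ids'].cuda()

with torch.no_grad(), torch.autocast("cuda"):
    # Eager warm-up: lets the model allocate the tensors it creates lazily
    # on the first forward pass, so the compiled graph does not have to be
    # recompiled on the second call.
    model(inputs)

    compiled = torch.compile(model)  # stand-in for comp_backend.compile(model)
    compiled(inputs)                 # compiles against the now-stable state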