From c66716bcfce2270de3bd50f0b7d05590a96b6fc1 Mon Sep 17 00:00:00 2001
From: cranechu <1340390339@qq.com>
Date: Thu, 12 Sep 2024 16:17:45 +0800
Subject: [PATCH 1/2] fix: add npu to ppl

---
 .../benchmark/harness/lm-evaluation-harness  |  1 +
 .../dev/benchmark/perplexity/run_wikitext.py | 20 +++++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)
 create mode 160000 python/llm/dev/benchmark/harness/lm-evaluation-harness

diff --git a/python/llm/dev/benchmark/harness/lm-evaluation-harness b/python/llm/dev/benchmark/harness/lm-evaluation-harness
new file mode 160000
index 00000000000..b281b0921b6
--- /dev/null
+++ b/python/llm/dev/benchmark/harness/lm-evaluation-harness
@@ -0,0 +1 @@
+Subproject commit b281b0921b636bc36ad05c0b0b0763bd6dd43463
diff --git a/python/llm/dev/benchmark/perplexity/run_wikitext.py b/python/llm/dev/benchmark/perplexity/run_wikitext.py
index 061c87babb6..245bc037d35 100644
--- a/python/llm/dev/benchmark/perplexity/run_wikitext.py
+++ b/python/llm/dev/benchmark/perplexity/run_wikitext.py
@@ -36,7 +36,18 @@
 parser.add_argument("--mixed_precision", action="store_true")
 args = parser.parse_args()
 
-if args.precision == "fp16":  # ipex fp16
+if args.device == "npu":
+    from ipex_llm.transformers.npu_model import AutoModelForCausalLM
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_path,
+        trust_remote_code=True,
+        torch_dtype=torch.float16,
+        max_output_len=4096,
+        max_prompt_len=4096,
+        load_in_low_bit=args.precision,
+        attn_implementation="eager"
+    )
+elif args.precision == "fp16":  # ipex fp16
     from transformers import AutoModelForCausalLM
     model = AutoModelForCausalLM.from_pretrained(args.model_path,
                                                  use_cache=args.use_cache,
@@ -57,7 +68,7 @@
                                                  trust_remote_code=True,
                                                  mixed_precision=args.mixed_precision)
     model = model.half()
-model = model.to(args.device)
+
 model = model.eval()
 
 from transformers import AutoTokenizer
@@ -98,7 +109,7 @@ def parse_kwargs(kwstr):
     else:
         end_loc = begin_loc + stride
         trg_len = -stride//2
-    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(args.device)
+    input_ids = encodings.input_ids[:, begin_loc:end_loc]
     if args.stride == 0: input_ids[:, 0] = tokenizer.bos_token_id
     target_ids = input_ids.clone()
     target_ids[:, :-trg_len] = -100
@@ -110,6 +121,7 @@ def parse_kwargs(kwstr):
         # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
         # to the left by 1.
         neg_log_likelihood = outputs.loss
+        print(neg_log_likelihood)
     nlls.append(neg_log_likelihood)
     if "xpu" in args.device:
         torch.xpu.synchronize()
@@ -118,6 +130,6 @@ def parse_kwargs(kwstr):
     prev_end_loc = end_loc
     if end_loc == seq_len:
         break
-
+print(neg_log_likelihood)
 ppl = torch.exp(torch.stack(nlls).mean())
 print("Final ppl estimate: {}".format(ppl.item()))

From 70ee2e70191e87d84d9560448ed19ac41e31b5bf Mon Sep 17 00:00:00 2001
From: cranechu <1340390339@qq.com>
Date: Thu, 12 Sep 2024 16:51:27 +0800
Subject: [PATCH 2/2] fix: remove lm_eval

---
 python/llm/dev/benchmark/harness/lm-evaluation-harness | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 python/llm/dev/benchmark/harness/lm-evaluation-harness

diff --git a/python/llm/dev/benchmark/harness/lm-evaluation-harness b/python/llm/dev/benchmark/harness/lm-evaluation-harness
deleted file mode 160000
index b281b0921b6..00000000000
--- a/python/llm/dev/benchmark/harness/lm-evaluation-harness
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit b281b0921b636bc36ad05c0b0b0763bd6dd43463
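
For anyone trying the NPU path from PATCH 1/2 in isolation: the new branch loads the model through ipex_llm's NPU-specific AutoModelForCausalLM instead of moving it with .to(args.device), which is why the patch also drops the device-move calls. A minimal standalone sketch of that path, using only the keyword arguments that appear in the diff; the model path and low-bit precision below are illustrative assumptions, not values from the patch:

```python
import torch
# NPU loading goes through ipex_llm's NPU-specific class, mirroring the
# branch added in PATCH 1/2; the model is handled by the NPU runtime, so
# no explicit model.to(device) call follows.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",  # illustrative model path (assumption)
    trust_remote_code=True,
    torch_dtype=torch.float16,
    max_output_len=4096,              # NPU sequence limits, as in the patch
    max_prompt_len=4096,
    load_in_low_bit="sym_int4",       # illustrative precision (assumption)
    attn_implementation="eager",
)
model = model.eval()
```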
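As background on what run_wikitext.py ultimately reports: the loop collects one negative log-likelihood (outputs.loss) per sliding window over the tokenized corpus, and the final perplexity is the exponential of their mean. A toy sketch of that closing reduction, with made-up loss values standing in for the per-window outputs.loss:

```python
import torch

# Each entry stands in for one window's outputs.loss (mean NLL per token);
# these values are made up purely for illustration.
nlls = [torch.tensor(2.31), torch.tensor(2.27), torch.tensor(2.40)]

# Final reduction, exactly as in run_wikitext.py: perplexity is the
# exponential of the average negative log-likelihood over all windows.
ppl = torch.exp(torch.stack(nlls).mean())
print("Final ppl estimate: {}".format(ppl.item()))  # ~10.2 for these values
```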