diff --git a/python/llm/dev/benchmark/perplexity/run_wikitext.py b/python/llm/dev/benchmark/perplexity/run_wikitext.py
index 061c87babb6..245bc037d35 100644
--- a/python/llm/dev/benchmark/perplexity/run_wikitext.py
+++ b/python/llm/dev/benchmark/perplexity/run_wikitext.py
@@ -36,7 +36,18 @@
 parser.add_argument("--mixed_precision", action="store_true")
 args = parser.parse_args()
 
-if args.precision == "fp16":  # ipex fp16
+if args.device == "npu":
+    from ipex_llm.transformers.npu_model import AutoModelForCausalLM
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_path,
+        trust_remote_code=True,
+        torch_dtype=torch.float16,
+        max_output_len=4096,
+        max_prompt_len=4096,
+        load_in_low_bit=args.precision,
+        attn_implementation="eager"
+    )
+elif args.precision == "fp16":  # ipex fp16
     from transformers import AutoModelForCausalLM
     model = AutoModelForCausalLM.from_pretrained(args.model_path,
                                                  use_cache=args.use_cache,
@@ -57,7 +68,9 @@
                                                  trust_remote_code=True,
                                                  mixed_precision=args.mixed_precision)
     model = model.half()
-model = model.to(args.device)
+if args.device != "npu":
+    # the NPU-optimized model is already placed on the device at load time and cannot be moved with .to()
+    model = model.to(args.device)
 model = model.eval()
 
 from transformers import AutoTokenizer
@@ -98,8 +111,10 @@ def parse_kwargs(kwstr):
     else:
         end_loc = begin_loc + stride
         trg_len = -stride//2
-    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(args.device)
+    input_ids = encodings.input_ids[:, begin_loc:end_loc]
+    if args.device != "npu":
+        input_ids = input_ids.to(args.device)
     if args.stride == 0:
         input_ids[:, 0] = tokenizer.bos_token_id
     target_ids = input_ids.clone()
     target_ids[:, :-trg_len] = -100
@@ -110,6 +125,7 @@ def parse_kwargs(kwstr):
         # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
         # to the left by 1.
         neg_log_likelihood = outputs.loss
+        print(neg_log_likelihood)
         nlls.append(neg_log_likelihood)
 
     if "xpu" in args.device:
@@ -118,6 +134,6 @@ def parse_kwargs(kwstr):
     prev_end_loc = end_loc
     if end_loc == seq_len:
         break
-
+print("Mean NLL: {}".format(torch.stack(nlls).mean().item()))
 ppl = torch.exp(torch.stack(nlls).mean())
 print("Final ppl estimate: {}".format(ppl.item()))
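
Note on the stride bookkeeping in the evaluation loop: with `trg_len = -stride//2`, the slice `target_ids[:, :-trg_len]` masks the first `stride//2` positions of each window with -100, so loss is only scored on the second half of the window. A toy illustration of that masking (the window length of 8 is a made-up value):

    import torch

    stride = 8
    input_ids = torch.arange(stride).unsqueeze(0)  # one window of 8 token ids

    trg_len = -stride // 2                         # (-8)//2 == -4
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100                # :-(-4) == :4 -> mask first half

    print(target_ids)  # tensor([[-100, -100, -100, -100, 4, 5, 6, 7]])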
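The final number the script reports is the standard sliding-window perplexity: the exponential of the mean negative log-likelihood over all evaluated chunks. A self-contained sketch of that aggregation, with made-up per-chunk losses standing in for the `outputs.loss` values collected in `nlls`:

    import torch

    # hypothetical per-chunk negative log-likelihoods
    nlls = [torch.tensor(2.31), torch.tensor(2.18), torch.tensor(2.45)]

    ppl = torch.exp(torch.stack(nlls).mean())  # exp(mean NLL)
    print("Final ppl estimate: {}".format(ppl.item()))  # roughly 10.1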
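Assuming the script's argparse flags match the `args.*` attributes used above (e.g. `--model_path`, `--device`, `--precision`; only `--mixed_precision` is visible in this diff), an NPU run would look something like `python run_wikitext.py --model_path <model> --device npu --precision sym_int4`, where `sym_int4` is one example of a low-bit format accepted by ipex-llm's `load_in_low_bit`.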