# phi2_demo.py
# Import necessary libraries from Flask for creating the web app and handling requests,
# and from transformers for interacting with the Phi-2 model.
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
# Initialize the Flask app.
app = Flask(__name__)
# Print statements for console feedback when loading starts and completes.
# Helpful for debugging and to know when the app is ready.
print("Starting to load the tokenizer and model...")
# Load the tokenizer and model using Hugging Face's Transformers library.
# The tokenizer turns text inputs into a format the model can understand,
# and the model is what generates the text based on those tokenized inputs.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
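# Quick illustration of the tokenizer's role (a hypothetical snippet, not part
# of the request flow): text maps to a list of integer token ids and back.
#   ids = tokenizer.encode("Hello world")  # -> a short list of integer ids
#   tokenizer.decode(ids)                  # -> "Hello world"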
# Pad token: the token used to fill shorter sequences so that every sequence
# in a batch has the same length. Models consume batches as fixed-shape
# tensors, so each shorter sequence is padded up to the length of the longest
# one; that uniform length is what lets the model process the batch correctly.
# Here we check whether the tokenizer already defines a pad token and, if not,
# register '[PAD]' as a dedicated special token for padding.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Note: for generative decoder-only models such as GPT (and Phi-2), a common
# alternative is to reuse the eos_token (end-of-sequence token) as the pad
# token, since it already marks where a generated sequence ends. Registering
# a distinct '[PAD]' token instead keeps padded positions unambiguously
# separate from real sequence content, which matters for tasks that need to
# tell the two apart.
# tokenizer.pad_token = tokenizer.eos_token  # Alternative for GPT-style models.
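# Illustration (hypothetical, not executed by the app): once a pad token is
# set, prompts of different lengths can be batched together.
#   batch = tokenizer(["Hi", "Hi there, how are you?"],
#                     padding=True, return_tensors='pt')
#   batch['input_ids']       # shorter row is filled with the pad token id
#   batch['attention_mask']  # 1 for real tokens, 0 for padding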
config = AutoConfig.from_pretrained("microsoft/phi-2")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", config=config)
# If '[PAD]' was added above, the embedding matrix must be resized to cover the
# enlarged vocabulary; otherwise the new pad token id would index past the end
# of the embedding table. (This is a no-op when the vocabulary is unchanged.)
model.resize_token_embeddings(len(tokenizer))
# Run on a GPU when one is available; Phi-2 (~2.7B parameters) also works on
# CPU, just much more slowly.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
print("Model and tokenizer loaded successfully.")
# Define a route for the generate endpoint, allowing POST requests.
# This is the main interface of the API, where users can send prompts for text generation.
@app.route('/generate', methods=['POST'])
def generate_text():
    # Extract the 'prompt' field from the incoming JSON request.
    # get_json(silent=True) returns None instead of raising on a missing or
    # malformed JSON body. (The route only accepts POST, so no explicit
    # method check is needed here.)
    data = request.get_json(silent=True) or {}
    prompt = data.get('prompt', '')
    # Check that a prompt was provided.
    if prompt:
        # Encode the prompt into token ids, truncating overly long inputs.
        # With a single prompt (batch size 1) no padding is needed; padding
        # only matters when batching sequences of different lengths, and
        # right-padding is discouraged for decoder-only models at generation
        # time anyway.
        input_ids = tokenizer.encode(prompt, return_tensors='pt',
                                     truncation=True, max_length=512).to(device)
        # Build an attention mask so the model knows which parts of the input
        # are real tokens versus padding.
        # Attention mask: a binary tensor marking padded positions so the
        # model does not attend to them; 1 where a token is real, 0 where it
        # is padding.
        attention_mask = input_ids.ne(tokenizer.pad_token_id).int()
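        # For example (hypothetical ids, with 0 standing in for the pad id):
        #   input_ids      [[ 314, 588, 0, 0 ]]
        #   attention_mask [[   1,   1, 0, 0 ]]
        # With a single unpadded prompt, as here, the mask is simply all ones.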
        # Generate text from the encoded input and the sampling parameters below.
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,  # Explicit pad token id for generation.
            temperature=0.7,    # Lower values are more deterministic; adjust for desired creativity.
            top_k=50,           # Consider tuning.
            top_p=0.95,         # Consider tuning.
            do_sample=True,     # Enable sampling-based generation.
            max_new_tokens=50   # Controls the length of the generated output.
            # max_length=200,   # Omitted in favor of max_new_tokens.
        )
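        # Background on how these sampling knobs interact (general Hugging Face
        # generate() behavior, not specific to Phi-2): temperature rescales the
        # logits before the softmax, top_k keeps only the 50 highest-probability
        # tokens, and top_p then restricts sampling to the smallest set of
        # tokens whose cumulative probability reaches 0.95.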
        # Decode the generated tokens to a string and return it in a JSON
        # response. Note that the output tensor contains the prompt followed
        # by the newly generated tokens, so the prompt is echoed back.
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        return jsonify({'generated_text': generated_text})
    else:
        # Return an error if no prompt was provided in the request.
        return jsonify({'error': 'No prompt provided'}), 400
# Start the Flask app on host 0.0.0.0 (reachable from other machines) and port 9001.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=9001)
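# Example request once the server is running (hypothetical prompt):
#   curl -X POST http://localhost:9001/generate \
#        -H 'Content-Type: application/json' \
#        -d '{"prompt": "Explain what an attention mask is."}'
# The response is JSON of the form {"generated_text": "..."}.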