-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
103 lines (81 loc) · 4.13 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import re

import ngram as ngram
import spacy
import torch
from flask import Flask, render_template, request
from rouge import Rouge
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Flask application object that the @app.route view functions register on.
app = Flask(__name__)

# Single source of truth for the checkpoint name so the tokenizer and the
# model are always loaded from the same pretrained weights.
_T5_CHECKPOINT = 't5-base'

# Load the T5 tokenizer and model once, at import time, so that requests
# reuse the same instances instead of reloading them.
t5_tokenizer = T5Tokenizer.from_pretrained(_T5_CHECKPOINT)
t5_model = T5ForConditionalGeneration.from_pretrained(_T5_CHECKPOINT)

# spaCy English pipeline used for POS tagging and named-entity recognition.
nlp = spacy.load("en_core_web_sm")
# Articles are skipped when looking for repeated words: they repeat in almost
# every paragraph and carry no ambiguity worth disambiguating.
_IGNORED_WORDS = frozenset({"a", "an", "the"})


def _collect_repeated_words(doc):
    """Return the words of *doc* that occur more than once.

    Tokens are grouped by their lowercased text. Punctuation, whitespace
    tokens and the articles in ``_IGNORED_WORDS`` are ignored, and words whose
    every occurrence is tagged ``AUX`` (auxiliary verbs) are dropped.

    Returns:
        A dict mapping each repeated lowercased word to the list of its
        occurrences as ``(text, pos, entity_type)`` tuples, in document order.
    """
    occurrences = {}
    for token in doc:
        # Whitespace tokens (runs of spaces / newlines) are not punctuation,
        # so they must be excluded explicitly or they show up as "words".
        if token.is_punct or token.is_space:
            continue
        key = token.text.lower()
        if key in _IGNORED_WORDS:
            continue
        occurrences.setdefault(key, []).append(
            (token.text, token.pos_, token.ent_type_)
        )

    repeated = {}
    for word, taggings in occurrences.items():
        pos_tags = [pos for _, pos, _ in taggings]
        if len(taggings) > 1 and not all(pos == "AUX" for pos in pos_tags):
            print(f"{word}: {pos_tags}")
            repeated[word] = taggings
    return repeated


def _mark_target_word(doc, word):
    """Return the model input for *word*: a header plus the text with each
    occurrence of *word* rewritten as ``[word]``."""
    lowered = word.lower()
    parts = [f"Potential target word: {word}\n\n"]
    for token in doc:
        if token.text.lower() == lowered:
            # Keep the token's trailing whitespace so the bracketed marker is
            # not glued to the following token.
            parts.append(f"[{word}]{token.whitespace_}")
        else:
            parts.append(token.text_with_ws)
    return "".join(parts)


def _summarize_for_target(doc, word, taggings):
    """Generate a T5 summary with *word* marked, then put the n-gram model's
    preferred surface form of *word* back into the summary."""
    input_seq = _mark_target_word(doc, word)
    input_ids = t5_tokenizer.encode(input_seq, return_tensors='pt')
    output = t5_model.generate(
        input_ids,
        max_length=10000,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    summary = t5_tokenizer.decode(output[0], skip_special_tokens=True)

    # Rank the surface forms seen in the paragraph against the lowercased
    # word; search() yields (item, similarity) pairs, best match first.
    surface_forms = [text for text, _, _ in taggings]
    matches = ngram.NGram(surface_forms).search(word.lower(), threshold=0.5)
    # If the n-gram model finds nothing, keep the original word.
    top_prediction = matches[0][0] if matches else word

    if top_prediction == word:
        return summary
    # Replace whole words only: a plain str.replace would also rewrite the
    # target when it merely appears inside a longer word.
    whole_word = re.compile(rf"(?<!\w){re.escape(word)}(?!\w)")
    return whole_word.sub(lambda _match: top_prediction, summary)


def _pick_best_summary(summaries, reference):
    """Return ``(summary, score)`` for the summary with the highest ROUGE-1
    F-score against *reference*, or ``("", 0)`` if none can be scored."""
    best_summary = ""
    best_score = 0
    rouge = Rouge()
    for number, summary in enumerate(summaries, start=1):
        try:
            score = rouge.get_scores(summary, reference)[0]['rouge-1']['f']
        except ValueError:
            # rouge rejects an empty hypothesis or reference; one unusable
            # candidate should not fail the whole request.
            print(f"Summary {number}: skipped, cannot be scored: {summary!r}")
            continue
        if score > best_score:
            best_summary = summary
            best_score = score
        print(f"Summary {number}: {summary}")
        print(f"Rouge score: {score:.2f}\n")
    return best_summary, best_score


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the form (GET) or summarize the submitted paragraph (POST).

    On POST the ``paragraph`` form field is tagged with spaCy, every repeated
    word becomes a candidate target, one T5 summary is generated per target,
    and the summary with the best ROUGE-1 F-score against the paragraph is
    rendered together with that score. When the paragraph has no repeated
    words the template receives an empty summary and a score of 0.

    Returns:
        The rendered ``index.html`` template.
    """
    if request.method != 'POST':
        return render_template('index.html')

    paragraph = request.form['paragraph']
    doc = nlp(paragraph)

    # Dump POS tags and entity types of every token to the console.
    for token in doc:
        print(f"{token.text}: {token.pos_}, {token.ent_type_}")

    repeated_words = _collect_repeated_words(doc)
    summaries = [
        _summarize_for_target(doc, word, taggings)
        for word, taggings in repeated_words.items()
    ]
    best_summary, best_rouge_score = _pick_best_summary(summaries, paragraph)

    return render_template(
        'index.html',
        paragraph=paragraph,
        summary=best_summary,
        rouge_score=best_rouge_score,
    )
# Script entry point: only start the built-in Flask server when this file is
# executed directly, not when it is imported by another module or server.
if __name__ == "__main__":
    # debug=True turns on Flask's debug mode, intended for local development.
    app.run(debug=True)