-
Notifications
You must be signed in to change notification settings - Fork 4
/
TextSummarization_Preprocess.py
74 lines (56 loc) · 2.58 KB
/
TextSummarization_Preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Import necessary libraries
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.probability import FreqDist
from nltk.cluster.util import cosine_distance
# Sample input: the passage to be summarized
text = (
    "Apple Inc. is an American multinational technology company "
    "headquartered in Cupertino, California that designs, develops, and "
    "sells consumer electronics, computer software, and online services. "
    "The company's hardware products include the iPhone smartphone, the "
    "iPad tablet computer, the Mac personal computer, the iPod portable "
    "media player, the Apple smartwatch, and the Apple TV digital media "
    "player."
)
# Split the passage into sentences; each one is preprocessed separately later
sentences = sent_tokenize(text)
# English stopwords, kept in a set so membership tests are cheap
stop_words = set(stopwords.words("english"))
def preprocess_text(sentence):
    """Turn one sentence into a list of stemmed, stopword-free tokens.

    Every character other than an ASCII letter or digit is replaced with a
    space before tokenizing. Tokens whose lowercase form appears in the
    module-level ``stop_words`` set are dropped, and the remaining tokens
    are stemmed with a Porter stemmer.

    Args:
        sentence: The raw sentence text.

    Returns:
        A list of stemmed tokens, in the order they appeared.
    """
    # Blank out anything that is not a letter or digit
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", sentence)
    porter = PorterStemmer()
    # Tokenize, filter stopwords and stem in a single pass
    return [
        porter.stem(token)
        for token in word_tokenize(cleaned)
        if token.lower() not in stop_words
    ]
# Run every sentence through the preprocessing pipeline
preprocessed_sentences = list(map(preprocess_text, sentences))
# Collapse the per-sentence token lists into one flat token list
flat_preprocessed_words = [
    token for tokens in preprocessed_sentences for token in tokens
]
# Count how often each stemmed token occurs across the whole text
word_freq = FreqDist(flat_preprocessed_words)
# Score sentences based on word frequency
def score_sentences(sentences, word_freq):
    """Score each sentence by summing the frequencies of its known words.

    Args:
        sentences: An iterable of token sequences, one per sentence.
        word_freq: A mapping from token to its frequency.

    Returns:
        A dict mapping sentence index to its score. A sentence gets an
        entry only if at least one of its tokens is a key of ``word_freq``;
        sentences with no known tokens are omitted entirely.
    """
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        # Repeated tokens count once per occurrence
        known = [word_freq[word] for word in sentence if word in word_freq]
        # Only record sentences that contributed at least one known token
        if known:
            sentence_scores[i] = sum(known)
    return sentence_scores
sentence_scores = score_sentences(preprocessed_sentences, word_freq)

# Generate a summary by selecting the highest-scoring sentences
SUMMARY_SENTENCE_COUNT = 3  # maximum number of sentences kept in the summary
sorted_scores = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)
top_sentences = sorted_scores[:SUMMARY_SENTENCE_COUNT]
# Selection is by score, but the chosen sentences are emitted in their
# original document order so the summary reads coherently instead of being
# shuffled by rank.
summary_sentences = [
    sentences[index] for index in sorted(index for index, _ in top_sentences)
]

# Join the summary sentences to create the final summary
summary = ' '.join(summary_sentences)

# Print the summary
print("\nSummary:")
print(summary)