# -*- coding: utf-8 -*-
"""
Created on Sat Feb 22 23:53:51 2020
@author: Raghad Alshaikh, Ghaidaa Aflah, Nada Alamouadi
"""
import gensim
import re
import numpy as np
from nltk.stem.isri import ISRIStemmer
from urllib import request
from bs4 import BeautifulSoup as bs
import nltk
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from sklearn import cluster
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.cluster import KMeans
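# NOTE: nltk.word_tokenize / nltk.sent_tokenize and the Arabic stopword list
# require the corresponding NLTK data packages. A one-time setup sketch
# (uncomment if the resources are not already installed):
# nltk.download('punkt')
# nltk.download('stopwords')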
# ============================
# ====== N-Grams Models ======
#Get the pretrained model
t_model = gensim.models.Word2Vec.load('full_grams_sg_100_wiki/full_grams_sg_100_wiki.mdl')
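# Quick sanity check of the loaded embeddings (a hedged sketch, assuming the
# path above points at the AraVec skip-gram Wikipedia model and gensim 3.x,
# whose API exposes wv.vocab / wv.most_similar; the probe word is hypothetical):
# if 'قمر' in t_model.wv.vocab:
#     print(t_model.wv.most_similar('قمر', topn=5))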
# ============================
# ====== Load input ======
#Store the article URL
url = "https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D9%83%D9%88%D9%86"
allparagraphContent = ""
#Access the article URL
htmlArticle = request.urlopen(url)
#Parse the page HTML and collect the paragraph tags
Soup = bs(htmlArticle, 'html.parser')
paragraphContents = Soup.findAll('p')
#Get the text
for paragraphContent in paragraphContents:
    allparagraphContent += paragraphContent.text
# ============================
# ====== Clean input: remove citation markers and normalize whitespace ======
allparagraphContent_Cleaned = re.sub(r'\[[0-9]*\]', ' ', allparagraphContent)
allparagraphContent_Cleaned = re.sub(r'\s+', ' ', allparagraphContent_Cleaned)
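# Illustration of the citation-marker cleanup above:
# re.sub(r'\[[0-9]*\]', ' ', 'text[12] more') returns 'text  more', i.e. inline
# Wikipedia reference markers such as [12] are blanked out before tokenization.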
# ============================
# ====== PreProcessing: Arabic Stemmer ======
st = ISRIStemmer()
# ============================
# ====== PreProcessing: Tokenization======
words_tokens = nltk.word_tokenize(allparagraphContent_Cleaned)
# ====== Remove out-of-vocabulary words ======
# Keep only tokens whose ISRI stem appears in the embedding model's vocabulary
words_tokens = [word for word in words_tokens if st.stem(word) in t_model.wv.vocab]
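# NOTE: wv.vocab is the gensim 3.x vocabulary API. Under gensim 4.x the
# equivalent membership test would be (a sketch, not used above):
# words_tokens = [word for word in words_tokens if st.stem(word) in t_model.wv.key_to_index]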
# ====== PreProcessing: Sentence splitting======
sentences_tokens = nltk.sent_tokenize(allparagraphContent_Cleaned)
print("sentences number: ",len(sentences_tokens))
print("\nOriginal input sentences: ")
for sen in sentences_tokens:
print("\n", sen)
# ====== PreProcessing: StopWords defined======
stopwords_list = stopwords.words('arabic')
# ====== PreProcessing: Stemming======
# Stem the tokens and drop Arabic stopwords
words_stemm = [st.stem(word) for word in words_tokens if st.stem(word) not in stopwords_list]
# ====== From word vectors to sentence vector using Sum&Average======
# ====== Purpose: Get the sentences vectors to process======
senCount = 0
sentenceVec = {}      # running index -> sentence vector
usedSentences = []    # sentences that produced a vector, aligned with sentenceVec keys
tsne = TSNE(n_components=2)   # instantiated here but not used further below
for sen in sentences_tokens:
    # Stem the sentence and keep only stems that exist in the embedding vocabulary
    sen_stems = [st.stem(word) for word in nltk.word_tokenize(sen) if st.stem(word) in t_model.wv.vocab]
    if len(sen_stems) >= 1:
        # Sentence vector = mean (sum & average) of its word vectors
        sentenceVec[senCount] = np.mean(t_model.wv[sen_stems], axis=0)
        usedSentences.append(sen)
        senCount += 1
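# The "sum & average" sentence embedding above is just the centroid of the word
# vectors; e.g. for two made-up 3-dimensional word vectors:
# np.mean(np.array([[1.0, 2.0, 3.0],
#                   [3.0, 4.0, 5.0]]), axis=0)   # -> array([2., 3., 4.])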
NUM_CLUSTERS=2
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
newSenVec = []
for i in sentenceVec:
    newSenVec.append(sentenceVec[i])
#print(len(newSenVec))
print("\n\n")
#=======================
#=======================
# Set the number of clusters (the commented-out line is a square-root-of-N heuristic)
#n_clusters = int(np.ceil(len(newSenVec)**0.5))
n_clusters = NUM_CLUSTERS
# Create the KMeans model
kmeans2 = KMeans(n_clusters=n_clusters, random_state=0)
#=======================
#Fit the sentences to each cluster
#=======================
kmeans2 = kmeans2.fit(newSenVec)
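# Summary strategy 2 (built below): for every cluster, pick the sentence whose
# vector lies nearest to the cluster centroid. pairwise_distances_argmin_min(C, X)
# returns, for each row of C (the centroids), the index of the closest row of X
# (the sentence vectors) and that distance; the chosen sentences are then joined
# in document order, using the average position of each cluster's sentences.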
avg2 = []
for j in range(n_clusters):
    # Average document position of the sentences assigned to cluster j
    idx = np.where(kmeans2.labels_ == j)[0]
    avg2.append(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(kmeans2.cluster_centers_, newSenVec)
ordering = sorted(range(n_clusters), key=lambda k: avg2[k])
summary2 = '\n\n'.join([usedSentences[closest[idx]] for idx in ordering])
#=======================
#Second clustering run (the kmeans model created earlier), used for the
#first-appearance summary
#=======================
kmeans.fit(newSenVec)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
n_clusters1 = n_clusters
avg1 = []
for j in range(n_clusters1):
    idx1 = np.where(kmeans.labels_ == j)[0]
    avg1.append(np.mean(idx1))
ordering = sorted(range(n_clusters1), key=lambda k: avg1[k])
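# Summary strategy 1 (printed below): walk through the sentences in document
# order and print the first sentence encountered from each cluster, so every
# cluster contributes its earliest-appearing sentence to the summary.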
#Print Summary based on first appearance
print('Summary based on the first appearance of each cluster')
appearedLabels = []
index = 0
for i in labels:
    if i not in appearedLabels:
        print(usedSentences[index], "\n")
        appearedLabels.append(i)
    index = index + 1
#Print Summary based on the sentence closest to the center of each cluster
print("Summary based on the sentence closest to the center of each cluster")
print(summary2)