-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
47 lines (19 loc) · 1.79 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def clean(sentence):
"""
Wikipedia veriseti için preprocess işlemlerini yapan fonksiyon
Input
sentence [string] : XML dosyasından gelen cümle
Output
sentence [string] : Temizlenmiş cümle
"""
table = str.maketrans("ABCÇDEFGĞHİIJKLMNOÖPRSŞTUÜVYZWXQ","abcçdefgğhiıjklmnoöprsştuüvyzwxq")
# Check special char
if "==" not in sentence and "|" not in sentence and "!" in sentence and "ISBN" not in sentence and ". Bölüm" not in sentence and "≈" not in sentence and "=" not in sentence and "http" not in sentence:
sentence_ = sentence.strip().split()
if sentence_[0] != "InDesign" and sentence_[0] != "Dosya" and sentence_[0] != "Image" and sentence_[0] != "YÖNLENDİRME" and sentence_[0] != "bar:" and sentence_[0] != "TextData=" and sentence_[0] != "fontsize:" and sentence_[0] != "id" and sentence_[0] != "ImageSize" and sentence_[0] != "PlotArea" and sentence_[0] != "DateFormat" and sentence_[0] != "Period" and sentence_[0] != "TimeAxis" and sentence_[0] != "AlignBars" and sentence_[0] != "ScaleMajor" and sentence_[0] != "ScaleMinor" and sentence_[0] != "BackgroundColors" and sentence_[0] != "BarData" and sentence_[0] != "REDIRECT" and sentence_[0] != "@":
if len(sentence.split()) > 4:
# Removing special characters
sentence = sentence.replace("\n"," ").replace("\t"," ").replace("Hicrî","Hicrî ").replace("Rumi","Rumi ")
# Lowercase
sentence = sentence.translate(table)
return sentence