-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleansing.py
67 lines (58 loc) · 2.18 KB
/
cleansing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build one Sastrawi stemmer when the module is imported; stemming() below
# reuses this shared instance instead of creating a new one per call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
def remove_unnecessary_char(text):
    """Strip tweet boilerplate: links, retweet/mention markers, stray punctuation.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with URLs, newlines, literal ``+n`` sequences, the standalone
        tokens ``rt``/``RT``/``user``/``USER`` and the characters ``:``, ``;``
        and ``+`` each replaced by a space, and every run of spaces collapsed
        to a single space. Leading and trailing spaces are not stripped.
    """
    # Links go first: removing "rt"/"user" beforehand could cut a URL in two
    # and leave its tail behind in the output.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    # NOTE(review): this matches a literal "+n" (plus sign followed by "n"),
    # exactly as the previous pattern did. If the intent was to drop an
    # escaped backslash-n sequence, the pattern needs changing - confirm.
    text = re.sub(r'\+n', ' ', text)
    text = re.sub(r'\n', ' ', text)
    # Word boundaries keep ordinary words that merely contain "rt" or "user"
    # (e.g. "partai", "supporter") intact; only standalone markers are dropped.
    text = re.sub(r'\b(?:rt|RT|user|USER)\b', ' ', text)
    text = re.sub(r'[:;+]', ' ', text)
    # Collapse the runs of spaces produced by the substitutions above.
    text = re.sub(r' +', ' ', text)
    return text
def remove_nonaplhanumeric(text):
    """Replace each run of characters outside 0-9, a-z and A-Z with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)
# Parsed slang dictionaries keyed by CSV path, so the file is read only once
# per process instead of once per normalize_alay() call.
_ALAY_DICT_CACHE = {}


def _load_alay_dict(path):
    """Return the original -> replacement mapping stored in the CSV at ``path``."""
    mapping = _ALAY_DICT_CACHE.get(path)
    if mapping is None:
        frame = pd.read_csv(path, names=['original', 'replacement'], encoding='latin-1')
        mapping = dict(zip(frame['original'], frame['replacement']))
        _ALAY_DICT_CACHE[path] = mapping
    return mapping


def normalize_alay(text, alay_dict_map=None):
    """Replace every word of ``text`` that has an entry in the slang dictionary.

    Args:
        text: Value to normalize; it is converted with ``str()`` and split on
            single spaces.
        alay_dict_map: Optional mapping of original word -> replacement. When
            omitted, the mapping is loaded from ``./docs/new_kamusalay.csv``
            (relative to the current working directory) on first use and then
            cached, so later edits to the file are not picked up until the
            process restarts.

    Returns:
        The words joined back with single spaces, each one swapped for its
        replacement when the mapping has one.
    """
    if alay_dict_map is None:
        alay_dict_map = _load_alay_dict('./docs/new_kamusalay.csv')
    return ' '.join(alay_dict_map.get(word, word) for word in str(text).split(' '))
# Stopword sets keyed by CSV path, so the file is read only once per process
# instead of once per remove_stopword() call.
_STOPWORD_CACHE = {}


def _load_stopwords(path):
    """Return the set of stopwords listed in the single-column CSV at ``path``."""
    words = _STOPWORD_CACHE.get(path)
    if words is None:
        frame = pd.read_csv(path, header=None, names=['stopword'], encoding='latin-1')
        # A set gives constant-time membership tests, unlike scanning the
        # column's array for every word.
        words = set(frame['stopword'])
        _STOPWORD_CACHE[path] = words
    return words


def remove_stopword(text, stopwords=None):
    """Drop stopwords from ``text`` and tidy the remaining whitespace.

    Args:
        text: String to filter; it is split on single spaces.
        stopwords: Optional iterable of words to remove. When omitted, the
            list is loaded from ``./docs/stopwordbahasa.csv`` (relative to the
            current working directory) on first use and then cached.

    Returns:
        The remaining words joined with single spaces, with runs of spaces
        collapsed and surrounding whitespace stripped.
    """
    if stopwords is None:
        stopwords = _load_stopwords('./docs/stopwordbahasa.csv')
    else:
        stopwords = set(stopwords)
    text = ' '.join(word for word in text.split(' ') if word not in stopwords)
    text = re.sub(' +', ' ', text)
    return text.strip()
def remove_emoticon_byte(text):
    """Remove escaped byte tokens (an "x" plus two hex digits) from ``text``.

    Args:
        text: Value to clean; it is converted with ``str()`` first.

    Returns:
        The text with every backslash turned into a space, every token-initial
        ``x`` followed by two hexadecimal digits replaced by a space, and each
        isolated `` n `` replaced by a single space. Repeated spaces are not
        collapsed here.
    """
    text = str(text).replace("\\", " ")
    # Only an "x" that starts a token and is followed by two hex digits is
    # treated as an escaped byte. The former pattern 'x..' removed any "x"
    # plus the next two characters and so mangled ordinary words ("taxi").
    text = re.sub(r'\bx[0-9a-fA-F]{2}', ' ', text)
    # An isolated "n" is what remains of a backslash-n sequence once the
    # backslash has been replaced above.
    text = re.sub(r' n ', ' ', text)
    return text
def remove_early_space(text):
    """Remove a single leading space from ``text``, if there is one.

    Args:
        text: The string to trim.

    Returns:
        ``text`` without its first character when that character is a space,
        otherwise ``text`` unchanged. An empty string is returned as is.
    """
    # Slicing instead of indexing keeps an empty input from raising IndexError.
    if text[:1] == ' ':
        return text[1:]
    return text
def stemming(text):
    """Return ``text`` after passing it through the module-level ``stemmer``."""
    stemmed_text = stemmer.stem(text)
    return stemmed_text
def cleanse_text(text):
    """Run the complete cleansing pipeline on one piece of text.

    The steps are applied in this order: lower-casing, dropping one leading
    space, removing links/retweet/mention markers, replacing non-alphanumeric
    characters, removing escaped emoticon bytes, slang normalization, stopword
    removal and stemming.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text as returned by the final stemming step.
    """
    text = lowercase(text)
    text = remove_early_space(text)
    # remove_unnecessary_char must see the raw punctuation: its URL pattern
    # needs "://" and ".", which remove_nonaplhanumeric turns into spaces.
    # Running it afterwards (the previous order) meant links were never
    # removed and their fragments stayed in the text.
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)
    text = remove_emoticon_byte(text)
    text = normalize_alay(text)
    text = remove_stopword(text)
    text = stemming(text)
    return text