-
Notifications
You must be signed in to change notification settings - Fork 0
/
dirty_words.py
38 lines (31 loc) · 1.25 KB
/
dirty_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import json
import re
import time
import praw
def write_words():
    """Extract the English entries of the word list into a plain-text file.

    Reads ``assets/DirtyWords.json``, takes every record under its top-level
    ``"RECORDS"`` key whose ``"language"`` field equals ``"en"``, and writes
    each such record's ``"word"`` field to ``assets/DirtyWords_en.txt``, one
    word per line with no trailing newline.

    Raises:
        OSError: if either file cannot be opened.
        KeyError: if ``"RECORDS"``, ``"language"`` or ``"word"`` is missing.
    """
    with open("assets/DirtyWords.json", encoding="utf8") as f:
        records = json.load(f)["RECORDS"]

    english_words = [
        record["word"] for record in records if record["language"] == "en"
    ]

    # Write with the same explicit encoding used for reading; relying on the
    # platform default can fail (or silently re-encode) for non-ASCII words.
    with open("assets/DirtyWords_en.txt", "w", encoding="utf8") as f:
        f.write("\n".join(english_words))
if __name__ == "__main__":
    # Benchmark two ways of counting how many comments contain at least one
    # listed word: a plain space-delimited substring test versus one compiled
    # regular expression per word.
    with open("assets/DirtyWords_en.txt", encoding="utf8") as f:
        # Skip blank lines: an empty "word" would match almost every comment
        # in both strategies and distort the counts.
        words = [line for line in f.read().splitlines() if line.strip()]

    # Escape each word so regex metacharacters in it (e.g. "*", "+", "$") are
    # matched literally instead of failing to compile or matching other text.
    # The lookarounds act like \b for ordinary words but also work when a word
    # starts or ends with a non-word character, where \b could never match.
    word_patterns = [
        re.compile(r"(?<!\w){0}(?!\w)".format(re.escape(w)), flags=re.IGNORECASE)
        for w in words
    ]

    # NOTE(review): "LCB" is presumably a site section in praw.ini -- confirm.
    reddit = praw.Reddit("LCB")
    texts = [
        comment.body
        for comment in reddit.subreddit("gtaonline").comments(limit=None)
    ]
    print(f"{len(texts)} comments found.")

    # Strategy 1: substring search with the word and the text both padded by
    # spaces. This is case-sensitive and only treats spaces as delimiters, so
    # its count can differ from the regex strategy below.
    start_time = time.time()
    matches = sum(
        1 for text in texts if any(f" {word} " in f" {text} " for word in words)
    )
    print(f"{matches} matches, took {time.time() - start_time} seconds.")

    # Strategy 2: case-insensitive whole-word search with the compiled patterns.
    start_time = time.time()
    matches = sum(
        1
        for text in texts
        if any(pattern.search(text) for pattern in word_patterns)
    )
    print(f"{matches} matches, took {time.time() - start_time} seconds.")