Skip to content

Commit

Permalink
Merge branch 'more_cleaning'
Browse files Browse the repository at this point in the history
  • Loading branch information
bomanimc committed Aug 21, 2019
2 parents 419a56c + f395666 commit bd682a4
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 1 deletion.
File renamed without changes.
64 changes: 63 additions & 1 deletion cleaned.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,76 @@
import csv
import datetime
import re
from nltk import word_tokenize
from nltk.util import ngrams
import operator
import pprint

# Base name of the input data set; reused when naming the output file.
INPUT_CSV_NAME = 'black_news'
# CSV file that main() opens and reads sentences from.
INPUT_FILE_PATH = 'black_news.csv'
# Path prefix of the cleaned output file written by main().
OUTPUT_FILE_BASE = 'cleaned/cleaned_'

# Lower-cased, space-joined token bigrams that mark a sentence as off-topic.
# A sentence whose bigrams include any of these entries is rejected by
# should_select_sentence().
BAD_BIGRAMS = [
    'black tea',
    'africano para',
    'african bean',
    'black hills',
    'african plum',
    'african mango',
    'black beans',
    'black henna',
    'black-ish ,',
    'black box',
    'black pepper',
    'african forests',
]

def has_only_relevant_blacks(sent):
    """Return True when every 'black'-containing word in *sent* is allowed.

    The sentence is lower-cased and scanned for every run of word
    characters that contains the substring 'black'. Only the exact words
    'black', 'blacks' and 'nonblack' are accepted; the first other match
    (e.g. 'blackberry') is printed and the function returns False.
    A sentence with no 'black' word at all returns True.
    """
    allowed_words = ('black', 'blacks', 'nonblack')
    black_word_pattern = r'(\w*%s\w*)' % 'black'

    for word in re.findall(black_word_pattern, sent.lower()):
        if word in allowed_words:
            continue
        # Report the offending word, then reject on the first one found.
        print(word)
        return False

    return True

def should_select_sentence(sent, connected_bigrams):
    """Decide whether *sent* should be kept in the cleaned output.

    Args:
        sent: The candidate sentence text.
        connected_bigrams: Space-joined bigram strings computed for the
            sentence; each entry of BAD_BIGRAMS is looked up in it.

    Returns:
        True only when the sentence ends with a period, contains no
        newline, does not contain 'trypanosomiasis', passes
        has_only_relevant_blacks(), and none of its bigrams is listed in
        BAD_BIGRAMS. An empty sentence returns False.
    """
    # endswith() is simply False for an empty string, whereas indexing
    # sent[-1] would raise IndexError.
    ends_with_period = sent.endswith('.')
    no_newlines = "\n" not in sent
    not_about_trypanosomiasis = 'trypanosomiasis' not in sent
    # Evaluated unconditionally: the helper prints the offending word.
    only_relevant_black = has_only_relevant_blacks(sent)
    contains_bad_bigram = any(term in connected_bigrams for term in BAD_BIGRAMS)
    if contains_bad_bigram:
        # Log the rejected sentence so the bad-bigram list can be reviewed.
        print("Contains Bad Bigram", contains_bad_bigram)
        print(sent, "\n")

    return (
        ends_with_period
        and no_newlines
        and not_about_trypanosomiasis
        and only_relevant_black
        and not contains_bad_bigram
    )

def main():
sentences = []
bigrams_dict = {}
with open(INPUT_FILE_PATH, mode='r') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
invalid_count = 0
for row in csv_reader:
sent = row[1].strip()
if (sent[-1] == '.' and "\n" not in sent and 'trypanosomiasis' not in sent):

text = sent.lower()
token = word_tokenize(text)
bigrams = list(ngrams(token, 2))
connected_bigrams = []
for bigram in bigrams:
connected_bigram = ' '.join(bigram)
if (any(term in bigram[0] for term in ['african', 'black'])):
connected_bigrams.append(connected_bigram)
if (connected_bigram in bigrams_dict):
bigrams_dict[connected_bigram] += 1
else:
bigrams_dict[connected_bigram] = 1

if (should_select_sentence(sent, connected_bigrams)):
sentences.append(sent)
line_count += 1
else:
Expand All @@ -24,6 +81,11 @@ def main():
sentences = list(set(sentences))
print("Num Unique Sentence Lines: ", len(sentences))

sorted_bigrams_dict = sorted(bigrams_dict.items(), key=operator.itemgetter(1), reverse=True)
print("\n\n Bigram Counts in Descending Order:")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(sorted_bigrams_dict)

write_time = datetime.datetime.now()
output_file = OUTPUT_FILE_BASE + "_" + INPUT_CSV_NAME + "_" + str(write_time) + '.csv'
with open(output_file, mode='a') as out_file:
Expand Down

0 comments on commit bd682a4

Please sign in to comment.