# Converts the sentences generated by generate_sentences.py to raw text,
# while also tracking sentence lengths and building a dictionary of words.
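# Each line of the input JSON files is expected to hold one object with the
# keys used below; the values here are made-up examples:
#   {"Arg1Raw": "the market fell", "ConnectiveRaw": "because", "Arg2Raw": "rates rose"}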
import os
import nltk
import json
import random
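# nltk.word_tokenize relies on NLTK's 'punkt' tokenizer models; if they are
# missing, run nltk.download('punkt') once first.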
dictionary = {}          # word -> frequency, across all files
mapped_dictionary = {}   # word -> integer id
max_sentence_length = 0
most_frequent_word = ""
most_frequent_word_freq = 0
total_words = 0
def randomize_words_in_sentence(filename):
    """Shuffle the words of each sentence in the given JSON file and write
    the result to a text file, one shuffled sentence per line."""
    data = []
    with open("data/json/" + filename, 'r') as json_file:
        for line in json_file:
            data.append(json.loads(line))
    # Opening with 'w' truncates any previous contents of the file
    with open("data/txt/incoherent_sentences_randomized_words.txt", 'w') as randomized_file:
        for line in data:
            sentence = line['Arg1Raw'] + " " + line['ConnectiveRaw'] + " " + line['Arg2Raw'] + "\n"
            word_sentence = nltk.word_tokenize(sentence.lower())
            random.shuffle(word_sentence)
            for word in word_sentence:
                # Drop any non-ASCII characters from the token
                randomized_file.write(word.encode('ascii', 'ignore').decode('ascii') + " ")
            randomized_file.write("\n")
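# For example, an input sentence like "she left because it rained" might come
# out as "rained because left she it" (the order varies per run).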
# Stats output file; 'w' truncates any previous contents
stats_file = open("data/corpus_stats.txt", 'w')
# Build the dictionary and convert sentences to raw text
print("Converting to raw text")
for filename in os.listdir(os.getcwd() + "/data/json"):
    # Per-file statistics
    file_max_sentence_length = 0
    file_num_sentences = 0
    file_dict = {}
    # Import the data, one JSON object per line
    data = []
    with open("data/json/" + filename, 'r') as json_file:
        for line in json_file:
            data.append(json.loads(line))
            file_num_sentences += 1
    output_file = "data/txt/" + filename[:-5] + ".txt"
    with open(output_file, 'w') as out:
        for line in data:
            # Convert to raw text
            sentence = line['Arg1Raw'] + " " + line['ConnectiveRaw'] + " " + line['Arg2Raw'] + "\n"
            out.write(sentence.encode('ascii', 'ignore').decode('ascii'))
            # Tokenize the sentence and update the dictionary + corpus stats
            word_sentence = nltk.word_tokenize(sentence.lower())
            # Track the maximum sentence length, globally and per file
            if len(word_sentence) > max_sentence_length:
                max_sentence_length = len(word_sentence)
            if len(word_sentence) > file_max_sentence_length:
                file_max_sentence_length = len(word_sentence)
            # Build the dictionary
            for word in word_sentence:
                # Drop any non-ASCII characters from the token
                word = word.encode('ascii', 'ignore').decode('ascii')
                total_words += 1
                if word not in dictionary:
                    dictionary[word] = 1
                else:
                    dictionary[word] += 1
                if dictionary[word] > most_frequent_word_freq:
                    most_frequent_word = word
                    most_frequent_word_freq = dictionary[word]
                if word not in file_dict:
                    file_dict[word] = True
    # Produce one dataset with the words fully randomized
    if filename == 'incoherent_sentences_arg2_diff_sense.json':
        randomize_words_in_sentence(filename)
    # Output per-file stats
    stats_file.write(filename + " stats:\n")
    stats_file.write("# words: " + str(len(file_dict)) + "\n")
    stats_file.write("# sentences: " + str(file_num_sentences) + "\n")
    stats_file.write("Max sentence length: " + str(file_max_sentence_length) + "\n")
# Output the dictionary and create a mapping of terms to integers
index = 1
with open("data/dictionary.txt", 'w') as dict_file:
    # Id 0 is reserved for padding sentences to max_sentence_length
    mapped_dictionary["<pad>"] = 0
    dict_file.write("0 <pad> -1\n")
    # Write every term as "index word frequency" and record its integer id
    for key in sorted(dictionary.keys()):
        mapped_dictionary[key.lower()] = index
        dict_file.write(str(index) + " " + key + " " + str(dictionary[key]) + "\n")
        index += 1
# Output corpus stats
stats_file.write("Total words: " + str(total_words) + "\n")
stats_file.write("Most frequent word: " + str(most_frequent_word) + "\n")
stats_file.write("Most frequent word frequency: " + str(most_frequent_word_freq) + "\n")
stats_file.write("Unique terms in dictionary: " + str(len(dictionary.keys())) + "\n")
stats_file.write("Max sentence length: " + str(max_sentence_length) + "\n")