Update make_datafiles.py #25

Open · wants to merge 1 commit into master
36 changes: 18 additions & 18 deletions make_datafiles.py
@@ -59,32 +59,32 @@ def chunk_all():
     os.mkdir(chunks_dir)
   # Chunk the data
   for set_name in ['train', 'val', 'test']:
-    print "Splitting %s data into chunks..." % set_name
+    print("Splitting %s data into chunks..." % set_name)
     chunk_file(set_name)
-  print "Saved chunked data in %s" % chunks_dir
+  print("Saved chunked data in %s" % chunks_dir)


 def tokenize_stories(stories_dir, tokenized_stories_dir):
   """Maps a whole directory of .story files to a tokenized version using Stanford CoreNLP Tokenizer"""
-  print "Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir)
+  print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir))
   stories = os.listdir(stories_dir)
   # make IO list file
-  print "Making list of files to tokenize..."
+  print("Making list of files to tokenize...")
   with open("mapping.txt", "w") as f:
     for s in stories:
       f.write("%s \t %s\n" % (os.path.join(stories_dir, s), os.path.join(tokenized_stories_dir, s)))
   command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-ioFileList', '-preserveLines', 'mapping.txt']
-  print "Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir)
+  print("Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir))
   subprocess.call(command)
-  print "Stanford CoreNLP Tokenizer has finished."
+  print("Stanford CoreNLP Tokenizer has finished.")
   os.remove("mapping.txt")

   # Check that the tokenized stories directory contains the same number of files as the original directory
   num_orig = len(os.listdir(stories_dir))
   num_tokenized = len(os.listdir(tokenized_stories_dir))
   if num_orig != num_tokenized:
     raise Exception("The tokenized stories directory %s contains %i files, but it should contain the same number as %s (which has %i files). Was there an error during tokenization?" % (tokenized_stories_dir, num_tokenized, stories_dir, num_orig))
-  print "Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir)
+  print("Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir))


 def read_text_file(text_file):
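Note on the print conversions in this file: the PR switches Python 2 print statements to the Python 3 function form. If the script also still needs to run under Python 2, one compatible alternative (an editor's suggestion, not part of this diff) is a __future__ import at the top of make_datafiles.py; a minimal sketch:

```python
# Hypothetical compatibility shim, not in this PR: with this import at the top of
# the file, print(...) behaves as a function on both Python 2 and Python 3.
from __future__ import print_function

print("Splitting %s data into chunks..." % "train")  # same output on either version
```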
@@ -98,7 +98,7 @@ def read_text_file(text_file):
 def hashhex(s):
   """Returns a heximal formated SHA1 hash of the input string."""
   h = hashlib.sha1()
-  h.update(s)
+  h.update(s.encode())
   return h.hexdigest()
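
The hashhex change is required because hashlib in Python 3 only accepts bytes, not str, so the URL string has to be encoded first (str.encode() defaults to UTF-8). A small self-contained check, using a made-up URL rather than one from the real url_lists files:

```python
import hashlib

def hashhex(s):
  """Python 3 version of the helper from this PR: hex-formatted SHA1 of a string."""
  h = hashlib.sha1()
  h.update(s.encode())  # update() needs bytes in Python 3; encode() uses UTF-8 by default
  return h.hexdigest()

print(hashhex("http://example.com/some-article"))  # hypothetical input URL
```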


@@ -149,7 +149,7 @@ def get_art_abs(story_file):

 def write_to_bin(url_file, out_file, makevocab=False):
   """Reads the tokenized .story files corresponding to the urls listed in the url_file and writes them to a out_file."""
-  print "Making bin file for URLs listed in %s..." % url_file
+  print("Making bin file for URLs listed in %s..." % url_file)
   url_list = read_text_file(url_file)
   url_hashes = get_url_hashes(url_list)
   story_fnames = [s+".story" for s in url_hashes]
@@ -161,17 +161,17 @@ def write_to_bin(url_file, out_file, makevocab=False):
   with open(out_file, 'wb') as writer:
     for idx,s in enumerate(story_fnames):
       if idx % 1000 == 0:
-        print "Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories))
+        print("Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories)))

       # Look in the tokenized story dirs to find the .story file corresponding to this url
       if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)):
         story_file = os.path.join(cnn_tokenized_stories_dir, s)
       elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)):
         story_file = os.path.join(dm_tokenized_stories_dir, s)
       else:
-        print "Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir)
+        print("Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir))
         # Check again if tokenized stories directories contain correct number of files
-        print "Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir)
+        print("Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir))
         check_num_stories(cnn_tokenized_stories_dir, num_expected_cnn_stories)
         check_num_stories(dm_tokenized_stories_dir, num_expected_dm_stories)
         raise Exception("Tokenized stories directories %s and %s contain correct number of files but story file %s found in neither." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir, s))
@@ -181,8 +181,8 @@ def write_to_bin(url_file, out_file, makevocab=False):

       # Write to tf.Example
       tf_example = example_pb2.Example()
-      tf_example.features.feature['article'].bytes_list.value.extend([article])
-      tf_example.features.feature['abstract'].bytes_list.value.extend([abstract])
+      tf_example.features.feature['article'].bytes_list.value.extend([article.encode()])
+      tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()])
       tf_example_str = tf_example.SerializeToString()
       str_len = len(tf_example_str)
       writer.write(struct.pack('q', str_len))
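
The .encode() calls are needed here because a tf.Example bytes_list only accepts bytes under Python 3, while article and abstract are str. For context, each record in the output file is an 8-byte length prefix (struct format 'q', native byte order) followed by the serialized Example; the write of the serialized string itself is collapsed in this view. A hedged sketch of reading such a .bin file back, assuming that layout and the same example_pb2 module the script imports:

```python
import struct
from tensorflow.core.example import example_pb2  # assumed to match the script's import

def iter_bin_file(path):
  """Yield tf.Example protos from a length-prefixed .bin file as written by write_to_bin."""
  with open(path, 'rb') as reader:
    while True:
      len_bytes = reader.read(8)          # struct.pack('q', ...) produced 8 bytes
      if not len_bytes:
        break                             # end of file
      str_len = struct.unpack('q', len_bytes)[0]
      example_str = reader.read(str_len)  # the serialized Example follows the length prefix
      yield example_pb2.Example.FromString(example_str)

# Usage with a hypothetical output path:
for ex in iter_bin_file("finished_files/test.bin"):
  print(ex.features.feature['article'].bytes_list.value[0].decode())
  break
```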
@@ -198,15 +198,15 @@ def write_to_bin(url_file, out_file, makevocab=False):
         tokens = [t for t in tokens if t!=""] # remove empty
         vocab_counter.update(tokens)

-  print "Finished writing file %s\n" % out_file
+  print("Finished writing file %s\n" % out_file)

   # write vocab to file
   if makevocab:
-    print "Writing vocab file..."
+    print("Writing vocab file...")
     with open(os.path.join(finished_files_dir, "vocab"), 'w') as writer:
       for word, count in vocab_counter.most_common(VOCAB_SIZE):
         writer.write(word + ' ' + str(count) + '\n')
-    print "Finished writing vocab file"
+    print("Finished writing vocab file")


 def check_num_stories(stories_dir, num_expected):
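
One possible follow-up to the vocab-writing block above (an editor's suggestion, not something this PR touches): in Python 3, open(..., 'w') falls back to the locale's default encoding, so pinning it to UTF-8 avoids surprises if a token contains non-ASCII characters. A standalone sketch with made-up data; in make_datafiles.py the names finished_files_dir, vocab_counter and VOCAB_SIZE already exist at module level:

```python
import os
from collections import Counter

finished_files_dir = "finished_files"            # placeholder for the script's module-level constant
VOCAB_SIZE = 200000                              # placeholder value for illustration
vocab_counter = Counter({"the": 12, "café": 3})  # non-ASCII token shows why the encoding matters

os.makedirs(finished_files_dir, exist_ok=True)
with open(os.path.join(finished_files_dir, "vocab"), 'w', encoding='utf-8') as writer:
  for word, count in vocab_counter.most_common(VOCAB_SIZE):
    writer.write(word + ' ' + str(count) + '\n')
```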
@@ -217,7 +217,7 @@ def check_num_stories(stories_dir, num_expected):

 if __name__ == '__main__':
   if len(sys.argv) != 3:
-    print "USAGE: python make_datafiles.py <cnn_stories_dir> <dailymail_stories_dir>"
+    print("USAGE: python make_datafiles.py <cnn_stories_dir> <dailymail_stories_dir>")
     sys.exit()
   cnn_stories_dir = sys.argv[1]
   dm_stories_dir = sys.argv[2]