Conversion code to python3 #14

Open
wants to merge 1 commit into base: master from
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
+cnn_stories_tokenized/
+dm_stories_tokenized/
+finished_files/
36 changes: 18 additions & 18 deletions make_datafiles.py
@@ -59,32 +59,32 @@ def chunk_all():
     os.mkdir(chunks_dir)
   # Chunk the data
   for set_name in ['train', 'val', 'test']:
-    print "Splitting %s data into chunks..." % set_name
+    print("Splitting %s data into chunks..." % set_name)
     chunk_file(set_name)
-  print "Saved chunked data in %s" % chunks_dir
+  print("Saved chunked data in %s" % chunks_dir)


 def tokenize_stories(stories_dir, tokenized_stories_dir):
   """Maps a whole directory of .story files to a tokenized version using Stanford CoreNLP Tokenizer"""
-  print "Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir)
+  print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir))
   stories = os.listdir(stories_dir)
   # make IO list file
-  print "Making list of files to tokenize..."
+  print("Making list of files to tokenize...")
   with open("mapping.txt", "w") as f:
     for s in stories:
       f.write("%s \t %s\n" % (os.path.join(stories_dir, s), os.path.join(tokenized_stories_dir, s)))
   command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-ioFileList', '-preserveLines', 'mapping.txt']
-  print "Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir)
+  print("Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir))
   subprocess.call(command)
-  print "Stanford CoreNLP Tokenizer has finished."
+  print("Stanford CoreNLP Tokenizer has finished.")
   os.remove("mapping.txt")

   # Check that the tokenized stories directory contains the same number of files as the original directory
   num_orig = len(os.listdir(stories_dir))
   num_tokenized = len(os.listdir(tokenized_stories_dir))
   if num_orig != num_tokenized:
     raise Exception("The tokenized stories directory %s contains %i files, but it should contain the same number as %s (which has %i files). Was there an error during tokenization?" % (tokenized_stories_dir, num_tokenized, stories_dir, num_orig))
-  print "Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir)
+  print("Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir))


 def read_text_file(text_file):
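
Side note on the print conversions in this hunk (and the hunks below): the parenthesized form is plain Python 3 syntax. If the script ever needs to keep running under Python 2.7 as well, the same converted lines work unchanged once the print function is imported from `__future__`; a minimal sketch, not part of this diff:

```python
# Minimal sketch, not part of this diff: with this future import at the top of
# make_datafiles.py, print(...) behaves as a function under Python 2.7 too,
# so the converted calls run identically on both interpreters.
from __future__ import print_function

set_name = "train"  # hypothetical stand-in value, for illustration only
print("Splitting %s data into chunks..." % set_name)
```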
@@ -98,7 +98,7 @@ def read_text_file(text_file):
 def hashhex(s):
   """Returns a heximal formated SHA1 hash of the input string."""
   h = hashlib.sha1()
-  h.update(s)
+  h.update(s.encode())
   return h.hexdigest()
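
The added `.encode()` is needed because hashlib digests consume bytes in Python 3; calling `update()` with a `str` raises a `TypeError`. A small sketch of the converted helper in use (the URL is invented for illustration):

```python
import hashlib

def hashhex(s):
    # Mirrors the converted function above: update() only accepts bytes-like
    # input under Python 3, hence the str -> bytes conversion via encode().
    h = hashlib.sha1()
    h.update(s.encode())
    return h.hexdigest()

# Invented example URL, not taken from the dataset's url_lists:
print(hashhex("http://www.example.com/2015/06/some-article"))  # 40-char hex digest
```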


@@ -149,7 +149,7 @@ def get_art_abs(story_file):

 def write_to_bin(url_file, out_file, makevocab=False):
   """Reads the tokenized .story files corresponding to the urls listed in the url_file and writes them to a out_file."""
-  print "Making bin file for URLs listed in %s..." % url_file
+  print("Making bin file for URLs listed in %s..." % url_file)
   url_list = read_text_file(url_file)
   url_hashes = get_url_hashes(url_list)
   story_fnames = [s+".story" for s in url_hashes]
@@ -161,17 +161,17 @@ def write_to_bin(url_file, out_file, makevocab=False):
   with open(out_file, 'wb') as writer:
     for idx,s in enumerate(story_fnames):
       if idx % 1000 == 0:
-        print "Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories))
+        print("Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories)))

       # Look in the tokenized story dirs to find the .story file corresponding to this url
       if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)):
         story_file = os.path.join(cnn_tokenized_stories_dir, s)
       elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)):
         story_file = os.path.join(dm_tokenized_stories_dir, s)
       else:
-        print "Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir)
+        print("Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir))
         # Check again if tokenized stories directories contain correct number of files
-        print "Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir)
+        print("Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir))
         check_num_stories(cnn_tokenized_stories_dir, num_expected_cnn_stories)
         check_num_stories(dm_tokenized_stories_dir, num_expected_dm_stories)
         raise Exception("Tokenized stories directories %s and %s contain correct number of files but story file %s found in neither." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir, s))
@@ -181,8 +181,8 @@ def write_to_bin(url_file, out_file, makevocab=False):

       # Write to tf.Example
       tf_example = example_pb2.Example()
-      tf_example.features.feature['article'].bytes_list.value.extend([article])
-      tf_example.features.feature['abstract'].bytes_list.value.extend([abstract])
+      tf_example.features.feature['article'].bytes_list.value.extend([article.encode()])
+      tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()])
       tf_example_str = tf_example.SerializeToString()
       str_len = len(tf_example_str)
       writer.write(struct.pack('q', str_len))
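
The two `.encode()` calls here are required because a protobuf `bytes_list` holds bytes, not `str`, under Python 3; the record format itself is unchanged: an 8-byte length from `struct.pack('q', ...)` followed by the serialized Example. A hedged sketch of reading the first record back, with the `.bin` path purely illustrative, and the stored bytes decoded back to `str`:

```python
import struct

from tensorflow.core.example import example_pb2

# Sketch only: read the first record of a .bin file produced by write_to_bin.
# Each record is an 8-byte native-endian length followed by a serialized
# tf.Example whose 'article'/'abstract' features hold bytes.
with open("finished_files/test.bin", "rb") as reader:  # illustrative path
    len_bytes = reader.read(8)
    str_len = struct.unpack('q', len_bytes)[0]
    example_str = reader.read(str_len)
    tf_example = example_pb2.Example.FromString(example_str)
    article = tf_example.features.feature['article'].bytes_list.value[0].decode()
    print(article[:100])
```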
@@ -198,15 +198,15 @@ def write_to_bin(url_file, out_file, makevocab=False):
         tokens = [t for t in tokens if t!=""] # remove empty
         vocab_counter.update(tokens)

-  print "Finished writing file %s\n" % out_file
+  print("Finished writing file %s\n" % out_file)

   # write vocab to file
   if makevocab:
-    print "Writing vocab file..."
+    print("Writing vocab file...")
     with open(os.path.join(finished_files_dir, "vocab"), 'w') as writer:
       for word, count in vocab_counter.most_common(VOCAB_SIZE):
         writer.write(word + ' ' + str(count) + '\n')
-    print "Finished writing vocab file"
+    print("Finished writing vocab file")


 def check_num_stories(stories_dir, num_expected):
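
The vocab file written in the hunk above is plain text with one "word count" pair per line, most frequent first, capped at VOCAB_SIZE entries. A minimal read-back sketch, assuming the default finished_files/vocab location used above:

```python
# Minimal sketch, assuming the vocab file written above sits at
# finished_files/vocab and holds "word count" pairs, one per line.
vocab_counts = {}
with open("finished_files/vocab") as f:
    for line in f:
        word, count = line.rsplit(" ", 1)  # split on the last space only
        vocab_counts[word] = int(count)

print(len(vocab_counts))  # at most VOCAB_SIZE entries
```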
@@ -217,7 +217,7 @@ def check_num_stories(stories_dir, num_expected):

 if __name__ == '__main__':
   if len(sys.argv) != 3:
-    print "USAGE: python make_datafiles.py <cnn_stories_dir> <dailymail_stories_dir>"
+    print("USAGE: python make_datafiles.py <cnn_stories_dir> <dailymail_stories_dir>")
     sys.exit()
   cnn_stories_dir = sys.argv[1]
   dm_stories_dir = sys.argv[2]