From 6861dabe64ec15cf1e665de8257a0b0155b55960 Mon Sep 17 00:00:00 2001
From: interviewBubble <39003714+interviewBubble@users.noreply.github.com>
Date: Tue, 27 Nov 2018 15:51:38 +0530
Subject: [PATCH] Update make_datafiles.py

Fixes the following errors when running make_datafiles.py under Python 3:

ERROR 1:

Traceback (most recent call last):
  File "make_datafiles.py", line 239, in <module>
    write_to_bin(all_test_urls, os.path.join(finished_files_dir, "test.bin"))
  File "make_datafiles.py", line 154, in write_to_bin
    url_hashes = get_url_hashes(url_list)
  File "make_datafiles.py", line 106, in get_url_hashes
    return [hashhex(url) for url in url_list]
  File "make_datafiles.py", line 106, in <listcomp>
    return [hashhex(url) for url in url_list]
  File "make_datafiles.py", line 101, in hashhex
    h.update(s)
TypeError: Unicode-objects must be encoded before hashing

ERROR 2:

PTBTokenizer tokenized 203071165 tokens at 1811476.32 tokens per second.
Stanford CoreNLP Tokenizer has finished.
Successfully finished tokenizing dailymail/stories/ to dm_stories_tokenized.
Making bin file for URLs listed in url_lists/all_test.txt...
Writing story 0 of 11490; 0.00 percent done
Traceback (most recent call last):
  File "make_datafiles.py", line 239, in <module>
    write_to_bin(all_test_urls, os.path.join(finished_files_dir, "test.bin"))
  File "make_datafiles.py", line 184, in write_to_bin
    tf_example.features.feature['article'].bytes_list.value.extend([article])
TypeError: "marseille , france -lrb- cnn -rrb- the french prosecutor leading an investigation into the crash of has type str, but expected one of: bytes
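Both errors come down to Python 3's str/bytes distinction: hashlib and the protobuf bytes_list only accept bytes, so str values are now encoded first. A minimal standalone sketch of the two fixes (illustrative only, not part of the patch; the article/abstract strings are placeholders, and it assumes the script's existing tensorflow example_pb2 import):

    import hashlib
    from tensorflow.core.example import example_pb2

    def hashhex(s):
      # hashlib.sha1() only accepts bytes in Python 3, so encode the str first
      h = hashlib.sha1()
      h.update(s.encode())
      return h.hexdigest()

    article = "placeholder article text"
    abstract = "placeholder abstract text"

    tf_example = example_pb2.Example()
    # bytes_list only stores bytes, so the str values are encoded as well
    tf_example.features.feature['article'].bytes_list.value.extend([article.encode()])
    tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()])
    print(hashhex(article))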
+ print("Stanford CoreNLP Tokenizer has finished.") os.remove("mapping.txt") # Check that the tokenized stories directory contains the same number of files as the original directory @@ -84,7 +84,7 @@ def tokenize_stories(stories_dir, tokenized_stories_dir): num_tokenized = len(os.listdir(tokenized_stories_dir)) if num_orig != num_tokenized: raise Exception("The tokenized stories directory %s contains %i files, but it should contain the same number as %s (which has %i files). Was there an error during tokenization?" % (tokenized_stories_dir, num_tokenized, stories_dir, num_orig)) - print "Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir) + print("Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir)) def read_text_file(text_file): @@ -98,7 +98,7 @@ def read_text_file(text_file): def hashhex(s): """Returns a heximal formated SHA1 hash of the input string.""" h = hashlib.sha1() - h.update(s) + h.update(s.encode()) return h.hexdigest() @@ -149,7 +149,7 @@ def get_art_abs(story_file): def write_to_bin(url_file, out_file, makevocab=False): """Reads the tokenized .story files corresponding to the urls listed in the url_file and writes them to a out_file.""" - print "Making bin file for URLs listed in %s..." % url_file + print("Making bin file for URLs listed in %s..." % url_file) url_list = read_text_file(url_file) url_hashes = get_url_hashes(url_list) story_fnames = [s+".story" for s in url_hashes] @@ -161,7 +161,7 @@ def write_to_bin(url_file, out_file, makevocab=False): with open(out_file, 'wb') as writer: for idx,s in enumerate(story_fnames): if idx % 1000 == 0: - print "Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories)) + print("Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories))) # Look in the tokenized story dirs to find the .story file corresponding to this url if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)): @@ -169,9 +169,9 @@ def write_to_bin(url_file, out_file, makevocab=False): elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)): story_file = os.path.join(dm_tokenized_stories_dir, s) else: - print "Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir) + print("Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir)) # Check again if tokenized stories directories contain correct number of files - print "Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir) + print("Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir)) check_num_stories(cnn_tokenized_stories_dir, num_expected_cnn_stories) check_num_stories(dm_tokenized_stories_dir, num_expected_dm_stories) raise Exception("Tokenized stories directories %s and %s contain correct number of files but story file %s found in neither." 
% (cnn_tokenized_stories_dir, dm_tokenized_stories_dir, s)) @@ -181,8 +181,8 @@ def write_to_bin(url_file, out_file, makevocab=False): # Write to tf.Example tf_example = example_pb2.Example() - tf_example.features.feature['article'].bytes_list.value.extend([article]) - tf_example.features.feature['abstract'].bytes_list.value.extend([abstract]) + tf_example.features.feature['article'].bytes_list.value.extend([article.encode()]) + tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()]) tf_example_str = tf_example.SerializeToString() str_len = len(tf_example_str) writer.write(struct.pack('q', str_len)) @@ -198,15 +198,15 @@ def write_to_bin(url_file, out_file, makevocab=False): tokens = [t for t in tokens if t!=""] # remove empty vocab_counter.update(tokens) - print "Finished writing file %s\n" % out_file + print("Finished writing file %s\n" % out_file) # write vocab to file if makevocab: - print "Writing vocab file..." + print("Writing vocab file...") with open(os.path.join(finished_files_dir, "vocab"), 'w') as writer: for word, count in vocab_counter.most_common(VOCAB_SIZE): writer.write(word + ' ' + str(count) + '\n') - print "Finished writing vocab file" + print("Finished writing vocab file") def check_num_stories(stories_dir, num_expected): @@ -217,7 +217,7 @@ def check_num_stories(stories_dir, num_expected): if __name__ == '__main__': if len(sys.argv) != 3: - print "USAGE: python make_datafiles.py " + print("USAGE: python make_datafiles.py ") sys.exit() cnn_stories_dir = sys.argv[1] dm_stories_dir = sys.argv[2]
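
For reference, records written by write_to_bin() can be read back under Python 3 roughly as follows; this is a minimal sketch, not part of the patch, assuming the record layout shown above (a native-endian 8-byte length from struct.pack('q', ...) followed by the serialized tf.Example) and the finished_files/test.bin output path from the script:

    import struct
    from tensorflow.core.example import example_pb2

    with open("finished_files/test.bin", "rb") as reader:
      while True:
        len_bytes = reader.read(8)
        if not len_bytes:
          break  # end of file
        str_len = struct.unpack('q', len_bytes)[0]
        tf_example = example_pb2.Example.FromString(reader.read(str_len))
        # article/abstract were written as encoded bytes, so decode back to str
        article = tf_example.features.feature['article'].bytes_list.value[0].decode()
        abstract = tf_example.features.feature['abstract'].bytes_list.value[0].decode()
        print(len(article.split()), len(abstract.split()))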