# decode.py

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
# Modifications Copyright 2017 Abigail See
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This file contains code to run beam search decoding, including running ROUGE evaluation and producing JSON datafiles for the in-browser attention visualizer, which can be found here https://github.com/abisee/attn_vis"""

import os
import time
import tensorflow as tf
import beam_search
import data
import json
#import pyrouge
import util
import logging
import numpy as np

# TensorFlow flags object. This module reads single_pass, log_root,
# pointer_gen, data_path, max_enc_steps, beam_size, min_dec_steps and
# max_dec_steps from it.
FLAGS = tf.app.flags.FLAGS

# Once decode() has used the same checkpoint for longer than this many
# seconds (checked only when not in single_pass mode), it reloads the
# checkpoint via util.load_ckpt.
SECS_UNTIL_NEW_CKPT = 60  # max number of seconds before loading new checkpoint


class BeamSearchDecoder(object):
    """Beam search decoder.

    Builds the model graph, restores a checkpoint into a new session and then
    decodes batches from the batcher with beam search. In single_pass mode the
    results are appended to a result file inside a checkpoint-specific decode
    directory; otherwise results are logged, dumped for the attention
    visualizer, and the checkpoint is periodically reloaded.
    """

    def __init__(self, model, batcher, vocab):
        """Initialize decoder.

        Args:
            model: a Seq2SeqAttentionModel object.
            batcher: a Batcher object.
            vocab: Vocabulary object

        Raises:
            Exception: in single_pass mode, if the decode directory for this
                checkpoint already exists.
        """
        self._model = model
        self._model.build_graph()
        self._batcher = batcher
        self._vocab = vocab
        self._saver = tf.train.Saver()
        self._sess = tf.Session(config=util.get_config())

        # Load an initial checkpoint to use for decoding
        ckpt_path = util.load_ckpt(self._saver, self._sess)

        if FLAGS.single_pass:
            # Make a descriptive decode directory name. The checkpoint part is
            # "ckpt-" plus whatever follows the last '-' in the checkpoint
            # path, i.e. something of the form "ckpt-123456".
            ckpt_name = "ckpt-" + ckpt_path.split('-')[-1]
            self._decode_dir = os.path.join(FLAGS.log_root,
                                            get_decode_dir_name(ckpt_name))
            if os.path.exists(self._decode_dir):
                raise Exception(
                    "single_pass decode directory %s should not already exist"
                    % self._decode_dir)
        else:  # Generic decode dir name
            self._decode_dir = os.path.join(FLAGS.log_root, "decode")

        # Make the decode dir if necessary
        if not os.path.exists(self._decode_dir):
            os.mkdir(self._decode_dir)

        if FLAGS.single_pass:
            # Make the dirs to contain output written in the correct format
            # for pyrouge. These attributes only exist in single_pass mode.
            self._rouge_ref_dir = os.path.join(self._decode_dir, "reference")
            if not os.path.exists(self._rouge_ref_dir):
                os.mkdir(self._rouge_ref_dir)
            self._rouge_dec_dir = os.path.join(self._decode_dir, "decoded")
            if not os.path.exists(self._rouge_dec_dir):
                os.mkdir(self._rouge_dec_dir)

    def decode(self):
        """
        Decode examples until data is exhausted (if FLAGS.single_pass) and return,
        or decode indefinitely, loading latest checkpoint at regular intervals
        """
        t0 = time.time()
        start_time = t0
        counter = 0
        while True:
            # 1 example repeated across batch
            batch = self._batcher.next_batch()
            if batch is None:  # finished decoding dataset in single_pass mode
                assert FLAGS.single_pass, "Dataset exhausted, but we are not in single_pass mode"
                tf.logging.info(
                    "Decoder has finished reading dataset for single_pass, using %d seconds.",
                    time.time() - start_time)
                tf.logging.info(
                    "Output has been saved in %s and %s. Now starting ROUGE eval...",
                    self._rouge_ref_dir, self._rouge_dec_dir)
                # TODO: ROUGE evaluation is currently disabled and needs to be
                # updated; it would call rouge_eval(self._rouge_ref_dir,
                # self._rouge_dec_dir) followed by
                # rouge_log(results_dict, self._decode_dir).
                return

            original_context = batch.original_contexts[0]  # string
            original_summarization = batch.original_summarizations[0]  # string

            # Run beam search to get best Hypothesis
            best_hyp = beam_search.run_beam_search(self._sess, self._model,
                                                   self._vocab, batch)

            # Extract the output ids from the hypothesis (the first token is
            # skipped) and convert back to words
            output_ids = [int(t) for t in best_hyp.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self._vocab, (batch.art_oovs[0]
                                          if FLAGS.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                # index of the (first) [STOP] symbol
                fst_stop_idx = decoded_words.index(data.MARK_EOS)
            except ValueError:
                # No [STOP] symbol was produced: keep every decoded word.
                pass
            else:
                decoded_words = decoded_words[:fst_stop_idx]
            decoded_output = ''.join(decoded_words)  # single string

            if FLAGS.single_pass:
                # TODO: needs checking. Append the context, the reference
                # summary and the decoded summary to the result file.
                self.write_result(original_context, original_summarization,
                                  decoded_words, counter)
                counter += 1  # this is how many examples we've decoded
            else:
                # The OOV-annotated variants of the context/reference are not
                # computed in this file, so the raw strings are reported.
                # log output to screen
                print_results(original_context, original_summarization,
                              decoded_output)
                # write info to .json file for visualization tool
                self.write_for_attnvis(original_context,
                                       original_summarization, decoded_words,
                                       best_hyp.attn_dists)

                # Check if SECS_UNTIL_NEW_CKPT has elapsed;
                # if so reload the checkpoint and restart the timer
                t1 = time.time()
                if t1 - t0 > SECS_UNTIL_NEW_CKPT:
                    tf.logging.info(
                        'We\'ve been decoding with same checkpoint for %i seconds. Time to load new checkpoint',
                        t1 - t0)
                    _ = util.load_ckpt(self._saver, self._sess)
                    t0 = time.time()

    def _write_rouge_files(self, reference_summarization, decoded_words,
                           ex_index):
        """Write one reference file and one decoded file for example ex_index.

        Shared implementation of write_for_eval and write_for_rouge.
        """
        # pyrouge calls a perl script that puts the data into HTML files.
        # Therefore we need to make our output HTML safe.
        summarization = make_html_safe(''.join(decoded_words))
        reference_summarization = make_html_safe(reference_summarization)

        # Write to file
        ref_file = os.path.join(self._rouge_ref_dir,
                                "%06d_reference.txt" % ex_index)
        decoded_file = os.path.join(self._rouge_dec_dir,
                                    "%06d_decoded.txt" % ex_index)

        # NOTE(review): concatenating the encoded text with "\n" relies on
        # Python 2 string semantics; confirm before running on Python 3.
        with open(ref_file, "w") as f:
            f.write(reference_summarization.encode('utf-8') + "\n")
        with open(decoded_file, "w") as f:
            f.write(summarization.encode('utf-8') + "\n")

        tf.logging.info("Wrote example %i to file", ex_index)

    def write_for_eval(self, reference_summarization, decoded_words, ex_index):
        """
        Write output to file in correct format for evaluation.
        The content is meant to be ids, since pyrouge cannot deal with Chinese
        characters. This is called in single_pass mode.

        NOTE(review): the items of decoded_words are joined with '' and must
        therefore be strings -- confirm what callers pass here.

        Args:
            reference_summarization: string, the reference summary
            decoded_words: list of strings
            ex_index: int, the index with which to label the files
        """
        self._write_rouge_files(reference_summarization, decoded_words,
                                ex_index)

    def write_result(self, original_context, reference_summarization,
                     decoded_words, ex_index):
        """
        Append one line to result.txt in the decode directory, holding the
        context, the reference summary and the decoded summary separated by
        two tab characters.

        Args:
            original_context: string, the source text
            reference_summarization: string, the reference summary
            decoded_words: list of strings
            ex_index: int, the example index; only used for progress logging
        """
        summarization = ''.join(decoded_words)

        # Write to file
        result_file = os.path.join(self._decode_dir, "result.txt")

        # NOTE(review): concatenating the encoded text with str separators
        # relies on Python 2 string semantics; confirm before running on
        # Python 3.
        with open(result_file, 'a') as f:
            f.write(
                original_context.encode('utf-8') + '\t\t' +
                reference_summarization.encode('utf-8') + '\t\t' +
                summarization.encode('utf-8') + "\n")

        # Only log every tenth example to keep the log readable.
        if ex_index % 10 == 0:
            tf.logging.info("Wrote example %i to file", ex_index)

    def write_for_rouge(self, reference_summarization, decoded_words,
                        ex_index):
        """
        Write output to file in correct format for eval with pyrouge.
        This is called in single_pass mode.

        Args:
            reference_summarization: string, the reference summary
            decoded_words: list of strings
            ex_index: int, the index with which to label the files
        """
        self._write_rouge_files(reference_summarization, decoded_words,
                                ex_index)

    def write_for_attnvis(self, article, abstract, decoded_words, attn_dists):
        """
        Write some data to json file, which can be read into the in-browser attention visualizer tool:
        https://github.com/abisee/attn_vis
        The file attn_vis_data.json in the decode directory is overwritten on
        every call.

        Args:
            article: The original article string.
            abstract: The human (correct) abstract string.
            decoded_words: List of strings; the words of the generated summary.
            attn_dists: The attention distributions; must be JSON serializable.
        """
        to_write = {
            'origin_context': make_html_safe(article),
            'decoded_sum': make_html_safe(''.join(decoded_words)),
            'summarization': make_html_safe(abstract),
            'attn_dists': attn_dists
        }
        output_fname = os.path.join(self._decode_dir, 'attn_vis_data.json')
        with open(output_fname, 'w') as output_file:
            json.dump(to_write, output_file)
        tf.logging.info('Wrote visualization data to %s', output_fname)


def print_results(article, abstract, decoded_output):
    """Log the article, the reference summary and the decoded summary.

    An empty line is printed to stdout before and after the three log
    records to set them apart visually.

    Args:
        article: the source text to report.
        abstract: the reference summary to report.
        decoded_output: the generated summary to report.
    """
    # print("") emits the same empty line as the Python 2 statement
    # `print ""`, but is also valid syntax on Python 3.
    print("")
    tf.logging.info('ARTICLE:  %s', article)
    tf.logging.info('REFERENCE SUMMARY: %s', abstract)
    tf.logging.info('GENERATED SUMMARY: %s', decoded_output)
    print("")


def make_html_safe(s):
    """Replace any angled brackets in string s to avoid interfering with HTML attention visualizer.

    Args:
        s: the string to escape.

    Returns:
        A copy of s with every "<" replaced by "&lt;" and every ">" replaced
        by "&gt;".
    """
    # Strings are immutable: str.replace returns a new string, so the result
    # has to be kept rather than discarded.
    return s.replace("<", "&lt;").replace(">", "&gt;")


def rouge_eval(ref_dir, dec_dir):
    """Evaluate the files in ref_dir and dec_dir with pyrouge, returning results_dict

    Args:
        ref_dir: directory holding the reference files, named
            "<id>_reference.txt".
        dec_dir: directory holding the decoded files, named
            "<id>_decoded.txt".

    Returns:
        The dictionary produced by pyrouge's output_to_dict.

    Raises:
        ImportError: if pyrouge is not installed.
    """
    # Imported lazily so that this module can be loaded without pyrouge
    # installed; only ROUGE evaluation needs it.
    import pyrouge

    r = pyrouge.Rouge155()
    r.model_filename_pattern = '#ID#_reference.txt'
    # Raw string so that \d reaches the regex engine as a digit class instead
    # of being treated as an (invalid) string escape sequence.
    r.system_filename_pattern = r'(\d+)_decoded.txt'
    r.model_dir = ref_dir
    r.system_dir = dec_dir
    logging.getLogger('global').setLevel(
        logging.WARNING)  # silence pyrouge logging
    rouge_results = r.convert_and_evaluate()
    return r.output_to_dict(rouge_results)


def rouge_log(results_dict, dir_to_write):
    """Report ROUGE scores in the log and save them to ROUGE_results.txt.

    For ROUGE-1, ROUGE-2 and ROUGE-l, the f_score, recall and precision are
    listed together with the bounds of their confidence interval.

    Args:
        results_dict: the dictionary returned by pyrouge
        dir_to_write: the directory where we will write the results to
    """
    pieces = []
    for variant in ("1", "2", "l"):
        pieces.append("\nROUGE-%s:\n" % variant)
        for metric in ("f_score", "recall", "precision"):
            key = "rouge_%s_%s" % (variant, metric)
            score = results_dict[key]
            lower = results_dict[key + "_cb"]
            upper = results_dict[key + "_ce"]
            pieces.append(
                "%s: %.4f with confidence interval (%.4f, %.4f)\n" %
                (key, score, lower, upper))
    report = "".join(pieces)

    tf.logging.info(report)  # log to screen
    target = os.path.join(dir_to_write, "ROUGE_results.txt")
    tf.logging.info("Writing final ROUGE results to %s...", target)
    with open(target, "w") as out:
        out.write(report)


def get_decode_dir_name(ckpt_name):
    """Build the name of the decode directory used in single_pass mode.

    The name records which dataset split is decoded (the first of "train",
    "val", "test" that occurs in FLAGS.data_path), the encoder/decoder step
    limits and the beam size, plus the checkpoint name when one is given.

    Args:
        ckpt_name: name of the checkpoint to append, or None to omit it.

    Returns:
        The directory name as a string.

    Raises:
        ValueError: if FLAGS.data_path contains none of train, val or test.
    """
    for candidate in ("train", "val", "test"):
        if candidate in FLAGS.data_path:
            dataset = candidate
            break
    else:
        raise ValueError(
            "FLAGS.data_path %s should contain one of train, val or test" %
            FLAGS.data_path)

    settings = (dataset, FLAGS.max_enc_steps, FLAGS.beam_size,
                FLAGS.min_dec_steps, FLAGS.max_dec_steps)
    dirname = "decode_%s_%imaxenc_%ibeam_%imindec_%imaxdec" % settings
    if ckpt_name is None:
        return dirname
    return dirname + "_%s" % ckpt_name