# renderSideBySide.py

#!/usr/bin/python
"""
CS276 Programming Assignment 3: Ranking

Given two experiment outputs, render a collection of web pages that compare the
two experiment results. The rendered pages are laid out in the following hierarchy:
    - index.html: an index page that shows the NDCG differences.
    - queries/q<n>.html: one page per query that shows a detailed comparison for that query.
The experiment outputs should be generated by the Java code in the skeleton code;
otherwise we may not be able to parse the format correctly.

Usage:  renderSideBySide.py [-h] exp1_name exp1_file exp2_name exp2_file
Args:
    exp1_name   A name for experiment 1
    exp1_file   Experiment 1 output
    exp2_name   A name for experiment 2
    exp2_file   Experiment 2 output

Note: This script is relatively new, and was tested for macOS only. 
If you find it to be buggy, please let us know on Piazza.
"""
import os
import sys
import shutil
import argparse
import math

# Output directory for all rendered pages, relative to the current working
# directory. Paths are built by plain string concatenation, so the trailing
# slash is required.
ROOT_DIR = './sideBySide/'
# File name of the index page, written directly under ROOT_DIR.
INDEX_PAGE = 'index.html'
# Sub-directory for the per-query pages. It is appended to ROOT_DIR on disk and
# also used as the relative link prefix inside the index page, so the trailing
# slash is required here as well.
QUERY_DIR = 'queries/'

class URL(object):
    """One ranked result of a query, together with what is needed to render it.

    Attributes:
        url: The full URL as given by the caller; used as the link target.
        rating: Numeric rating of the result; rendered with ``%g``.
        title: Page title; used as the link text.
        debugstr: Free-form debug text shown below the rating.
        clean_url: ``url`` without its leading ``scheme://`` part, for display.
    """

    def __init__(self, url, rating, title, debugstr):
        self.url = url
        self.rating = rating
        self.title = title
        self.debugstr = debugstr
        # Split on the first '://' only and keep the remainder: a URL without
        # a scheme is kept whole instead of raising IndexError, and a URL that
        # embeds another '://' (e.g. in a query string) is not truncated.
        self.clean_url = url.split('://', 1)[-1]

    def render_url_info(self, idx=None):
        """Return the HTML ``<td>`` cell describing this result.

        Args:
            idx: Optional position number. When given, the cell is labelled
                ``P<idx>``; when ``None`` it is labelled ``Page``.

        Returns:
            A string holding one complete ``<td>...</td>`` element.
        """
        label = 'Page' if idx is None else 'P%d' % idx
        pieces = [
            '<td>\n',
            ' %s: <a href="%s">%s</a> <br>' % (label, self.url, self.title),
            ' <small style="color:green">%s</small> <br>' % self.clean_url,
            ' Rating: <b>%g</b> <br>' % self.rating,
            ' Debug: %s <br>' % self.debugstr,
            '</td>',
        ]
        return ''.join(pieces)

def sign(v):
    """Return 1 when ``v`` is greater than or equal to zero, otherwise -1."""
    if v >= 0:
        return 1
    return -1

def read_experiment_file(fname):
    """Parse an experiment output file.

    The file is read line by line; blank lines are ignored and every line is
    stripped. A record starts at a ``query:`` line, is followed by an ``ndcg:``
    line, and then by zero or more groups of four lines beginning with
    ``url:``, ``rating:``, ``title:`` and ``debug:`` (in that order). The value
    of each field is whatever follows the prefix and one separator character.

    Args:
        fname: Path of the experiment file to read.

    Returns:
        A tuple ``(queries, query2ndcg, query2urls)`` where ``queries`` lists
        the query strings in file order, ``query2ndcg`` maps each query to its
        NDCG as a float, and ``query2urls`` maps each query to a list of
        ``URL`` objects in file order. If a query occurs more than once it is
        listed more than once and the dictionaries keep its last record.

    Raises:
        AssertionError: If the file does not follow the format above. This is
            raised explicitly (not via ``assert``) so that the checks are still
            performed when Python runs with ``-O``.
    """
    queries = []
    query2ndcg = dict()
    query2urls = dict()

    def require(condition, message):
        # Explicit raise instead of `assert`: the caller relies on
        # AssertionError to report malformed files, even under -O.
        if not condition:
            raise AssertionError(message)

    def to_float(text, what):
        # Report an unparsable number as a format error like any other.
        try:
            return float(text)
        except ValueError:
            raise AssertionError('Cannot parse %s value: %r' % (what, text))

    def process_cache(cache):
        # A truncated record would otherwise fail with IndexError below.
        require(len(cache) >= 2, 'Incomplete record: %r' % (cache,))
        require(cache[0].startswith('query:'), 'Expected "query:" line, got: %r' % cache[0])
        require(cache[1].startswith('ndcg:'), 'Expected "ndcg:" line, got: %r' % cache[1])
        q = cache[0][7:]
        ndcg = to_float(cache[1][6:], 'ndcg')

        fields = cache[2:]
        require(len(fields) % 4 == 0,
                'URL entries of query %r are not groups of four lines' % q)
        urls = []
        for start in range(0, len(fields), 4):
            url_line, rating_line, title_line, debug_line = fields[start:start + 4]
            require(url_line.startswith('url:'), 'Expected "url:" line, got: %r' % url_line)
            require(rating_line.startswith('rating:'), 'Expected "rating:" line, got: %r' % rating_line)
            require(title_line.startswith('title:'), 'Expected "title:" line, got: %r' % title_line)
            require(debug_line.startswith('debug:'), 'Expected "debug:" line, got: %r' % debug_line)
            urls.append(URL(url_line[5:],
                            to_float(rating_line[8:], 'rating'),
                            title_line[7:],
                            debug_line[7:]))
        queries.append(q)
        query2ndcg[q] = ndcg
        query2urls[q] = urls

    with open(fname) as infile:
        cache = []
        for line in infile:
            line = line.strip()
            if not line:
                continue
            # A new "query:" line closes the record collected so far.
            if line.startswith('query:') and cache:
                process_cache(cache)
                cache = []
            cache.append(line)
        # Flush the last record of the file.
        if cache:
            process_cache(cache)
    return queries, query2ndcg, query2urls

def get_color_str(num):
    """Map a number to a CSS color name.

    Positive values give "green", zero gives "black", and everything else
    gives "red".
    """
    if num > 0:
        return "green"
    return "black" if num == 0 else "red"

def get_link(url, title):
    """Return an HTML anchor element that points to ``url`` and shows ``title``."""
    anchor_template = '<a href="%(href)s">%(text)s</a>'
    return anchor_template % {'href': url, 'text': title}

def write_index_page(outfile, queries, ndcg1, ndcg2, name1, name2):
    """Write the index HTML page to ``outfile``.

    The page shows the difference of the mean NDCG values (experiment 2 minus
    experiment 1) and a table with one row per query: a running number, a link
    to ``QUERY_DIR + 'q<number>.html'`` and the per-query NDCG difference.

    The page text says the queries are sorted by NDCG diff, but rows are
    written in the order given; the caller is expected to pass them pre-sorted.

    Args:
        outfile: Writable text file object.
        queries: Query strings, one per table row.
        ndcg1: NDCG values of experiment 1, aligned with ``queries``.
        ndcg2: NDCG values of experiment 2, aligned with ``queries``.
        name1: Display name of experiment 1.
        name2: Display name of experiment 2.
    """
    def emit(line):
        # One line of output followed by a newline. Works on both Python 2
        # and Python 3 file objects, unlike the ``print >> file`` statement.
        outfile.write(line + '\n')

    # With no values there is no meaningful mean; use 0 instead of dividing
    # by zero.
    mean_ndcg1 = 1.0 * sum(ndcg1) / len(ndcg1) if ndcg1 else 0.0
    mean_ndcg2 = 1.0 * sum(ndcg2) / len(ndcg2) if ndcg2 else 0.0
    mean_diff = mean_ndcg2 - mean_ndcg1

    index_header = (
        '<!DOCTYPE html>\n<html>\n<head>\n<style>\n'
        '        table, th, td { border: 1px solid black; border-collapse: collapse; }\n'
        '        th, td { padding: 5px }\n</style>\n</head>\n')
    table_header = '<tr>\n<th></th>\n<th>Query</th>\n<th>NDCG diff</th>\n</tr>'

    emit(index_header)
    emit('<body>')
    emit('<p>Experiment 1: %s <br>' % name1)
    emit('Experiment 2: %s</p>' % name2)
    emit('<p>Average NDCG diff [%s - %s]: <b style="color:%s">%g</b></p>' %
         (name2, name1, get_color_str(mean_diff), mean_diff))
    emit('<p>Queries are sorted by NDCG diff.</p>')
    emit('<table>')
    emit(table_header)
    for i, (q, n1, n2) in enumerate(zip(queries, ndcg1, ndcg2)):
        idx = i + 1
        link = QUERY_DIR + 'q%d.html' % idx
        query_diff = n2 - n1
        emit(' <tr>')
        emit(' <td>%d</td>' % idx)
        emit(' <td>%s</td>' % get_link(link, q))
        emit(' <td><b style="color:%s">%g</b></td>' % (get_color_str(query_diff), query_diff))
        emit(' </tr>')
    emit("</table>")
    emit("</body>")
    emit("</html>")

def write_query_page(outfile, q, n1, n2, urls1, urls2, name1, name2):
    """Write the side-by-side HTML page of one query to ``outfile``.

    The page shows the query, the NDCG difference (experiment 2 minus
    experiment 1) and a two-column table. Row ``k`` holds the ``k``-th result
    of each experiment. Every result is labelled with its 1-based position in
    ``urls1``, so the right column shows where experiment 1 placed that page.

    Args:
        outfile: Writable text file object.
        q: The query string.
        n1: NDCG of experiment 1 for this query.
        n2: NDCG of experiment 2 for this query.
        urls1: Results of experiment 1; objects with a ``url`` attribute and a
            ``render_url_info(idx=...)`` method.
        urls2: Results of experiment 2, same kind of objects.
        name1: Display name of experiment 1.
        name2: Display name of experiment 2.

    Note:
        Rows are produced with ``zip``, so if the two lists differ in length
        the extra results of the longer one are not shown.
    """
    def emit(line):
        # One line of output followed by a newline. Works on both Python 2
        # and Python 3 file objects, unlike the ``print >> file`` statement.
        outfile.write(line + '\n')

    ndcg_diff = n2 - n1
    query_header = (
        '<!DOCTYPE html>\n<html>\n<head>\n<style>\n'
        '        table, th, td { border: 1px solid black; border-collapse: collapse;}\n'
        '        th, td {padding: 10px}\n'
        '        .box { float: left; width: 20px; height: 20px; margin: 5px; border: 1px solid rgba(0, 0, 0, .2);}\n'
        '        </style>\n</head>')

    emit(query_header)
    emit('<body>')
    emit('<p><b>Query: %s</b></p>' % q)
    emit('<p>NDCG diff [%s - %s]: <b style="color:%s">%g</b></p>' %
         (name2, name1, get_color_str(ndcg_diff), ndcg_diff))
    emit('<table style="width:100%">')
    emit('<tr>\n<th>Experiment 1: %s <br> NDCG = %g </th>\n<th>Experiment 2: %s <br> NDCG = %g </th>\n</tr>' %
         (name1, n1, name2, n2))

    # 1-based position of every URL in experiment 1.
    url2id = dict((u.url, pos + 1) for pos, u in enumerate(urls1))
    for u1, u2 in zip(urls1, urls2):
        emit('<tr>')
        # .get() yields None for a URL that experiment 1 does not contain;
        # render_url_info then uses its unnumbered label instead of this
        # function failing with KeyError.
        emit(u1.render_url_info(idx=url2id.get(u1.url)))
        emit(u2.render_url_info(idx=url2id.get(u2.url)))
        emit('</tr>')
    emit('</table>')
    emit('</body>')
    emit('</html>')

def main(args):
    """Render the side-by-side comparison pages for two experiment files.

    Steps, in order:
      1. Check that both input files exist, parse them and verify that they
         cover the same queries and, per query, the same set of URLs.
      2. Sort the queries by NDCG difference (experiment 2 minus experiment 1).
      3. Prepare the output directory ``ROOT_DIR``, asking before deleting an
         existing one.
      4. Write the index page and one page per query.
      5. On macOS, open the index page with the ``open`` command.

    All input validation happens before the output directory is touched, so a
    bad invocation never deletes previously rendered pages.

    Args:
        args: Parsed command-line arguments with the attributes ``exp1_name``,
            ``exp1_file``, ``exp2_name`` and ``exp2_file``.

    Raises:
        Exception: If an input file is missing, the files contain no queries,
            or the two experiments differ in their queries or per-query URLs.
        SystemExit: If the input files cannot be parsed, or the user declines
            to delete an existing output directory.
    """
    # The line-reading prompt is called raw_input on Python 2 and input on
    # Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input

    # digest the two output files
    if not os.path.exists(args.exp1_file) or not os.path.exists(args.exp2_file):
        raise Exception("At least one of the input files does not exist. Please check!")

    print("> Reading input files...")
    try:
        queries1, query2ndcg1, query2urls1 = read_experiment_file(args.exp1_file)
        queries2, query2ndcg2, query2urls2 = read_experiment_file(args.exp2_file)
    except (AssertionError, IndexError, ValueError):
        # Any of these indicates that a file does not follow the expected
        # format (failed check, truncated record, unparsable number).
        print("The program encountered errors while parsing the input files.")
        print("Please check the format of your input files.\nProgram Exits.")
        sys.exit(-1)
    print("Done.")

    # compare queries to check for errors
    print("> Checking if the input files are valid...")
    if set(queries1) != set(queries2):
        raise Exception('Experiment 1 and experiment 2 contain different queries. ' +
            'Pages cannot be rendered. Please check your files.')
    if not queries1:
        raise Exception('The input files contain no queries. Pages cannot be rendered.')

    # sort queries based on ndcg diffs; ties are ordered by query text
    queries = sorted(queries1, key=lambda q: (query2ndcg2[q] - query2ndcg1[q], q))
    ndcg1 = [query2ndcg1[q] for q in queries]
    ndcg2 = [query2ndcg2[q] for q in queries]
    urls1 = [query2urls1[q] for q in queries]
    urls2 = [query2urls2[q] for q in queries]

    for u1, u2, q in zip(urls1, urls2, queries):
        u1_set = set(u.url for u in u1)
        u2_set = set(u.url for u in u2)
        if u1_set != u2_set:
            raise Exception('Pages cannot be rendered. ' +
                'Experiment 1 and experiment 2 contain different URLs for query: ' + q)
    print("Done.")

    # check output dir, and ask user whether to delete it if it already exists
    if os.path.exists(ROOT_DIR):
        inp = ask("> Directory %s already exists. Delete and generate new pages (y/n)? " % ROOT_DIR)
        # inp[:1] is '' for an empty answer, which counts as "no".
        if inp[:1].lower() == 'y':
            shutil.rmtree(ROOT_DIR)
        else:
            print("> Do not delete old directory. Program exit.")
            sys.exit()
    os.makedirs(ROOT_DIR)

    # generate index file
    print("> Generating pages...")
    with open(ROOT_DIR + INDEX_PAGE, 'w') as outfile:
        write_index_page(outfile, queries, ndcg1, ndcg2, args.exp1_name, args.exp2_name)

    # generate all query files; ROOT_DIR was created just above, so the query
    # directory cannot exist yet
    query_dir = ROOT_DIR + QUERY_DIR
    os.makedirs(query_dir)
    for i, (q, n1, n2, u1, u2) in enumerate(zip(queries, ndcg1, ndcg2, urls1, urls2)):
        idx = i + 1
        fname = query_dir + 'q%d.html' % idx
        with open(fname, 'w') as outfile:
            write_query_page(outfile, q, n1, n2, u1, u2, args.exp1_name, args.exp2_name)
    print("All done.")

    # open browser, only works for macOS
    if sys.platform == 'darwin':
        os.system('open %s' % (ROOT_DIR + INDEX_PAGE))
    print("Please open the index page at: %s" % (ROOT_DIR + INDEX_PAGE))

if __name__ == '__main__':
    # Script entry point: declare the four positional arguments, parse the
    # command line and hand the result to main().
    parser = argparse.ArgumentParser(
        description='Render the side by side output, given two experiment outputs.')
    positional_args = (
        ('exp1_name', 'A name for experiment 1'),
        ('exp1_file', 'Experiment 1 output'),
        ('exp2_name', 'A name for experiment 2'),
        ('exp2_file', 'Experiment 2 output'),
    )
    for arg_name, arg_help in positional_args:
        parser.add_argument(arg_name, help=arg_help)

    args = parser.parse_args()
    main(args)