candle_db

#!/usr/bin/env python

# CANDLE DB

from __future__ import print_function

import sys
import argparse
import pysolr

import logging

# Change the level to logging.INFO for more verbosity, logging.DEBUG for testing
# Add filename="candle_db.log" to log to a file
logging.basicConfig(level=logging.WARNING)

# Currently the only way to modify these is to edit them here
# solr listens on port 8983 by default
HOSTPORT = "localhost:8983"
# Timeout handed to pysolr.Solr when connecting to a core
TIMEOUT = 10
# Return up to 1B results (default is 10):
MAX_ROWS = 1000*1000*1000

# =============================================================================
# DEFAULT_VALUES: keys are the allowed cores in solr
# each key indexes a dictionary of default values for that core
# must match each core's schema.xml file; must edit if schema changes
# =============================================================================

# Placeholder for string fields whose value was not supplied
UNK = "unknown"
# default values to be used with each allowed core
DEFAULT_VALUES = {
    "run" : dict(
        run_id = None,
        parameters = "",
        benchmark_id  = UNK,
        dataset_id    = UNK,
        experiment_id = UNK,
        start_time    = None,
        end_time      = None,
        runtime_hours = None,
        status        = "SUCCESS",
        run_progress  = None,
        training_accuracy = None,
        training_loss = None,
        validation_accuracy = None,
        validation_loss = None,
        model_checkpoint_file = None,
        model_description_file = None,
        model_weight_file = None,
        model_result_files = None
    ),
    "experiment" : dict(
        run_id = None,
        experiment_id = None,
        benchmark_id  = UNK,
        dataset_id    = UNK,
        experiment_title = "untitled",
        description = "_blank",
        optimization_package_name = UNK,
        optimization_package_version = UNK,
        objective_function = UNK,
        search_space = UNK,
        search_strategy = UNK,
        max_runs = 1000000,
        status = None,
        start_time = None,
        end_time = None,
        system_description = UNK,
        keys = None
    )
}

# =============================================================================
# For each core, gives the set of fields which the schema defines as float
# =============================================================================
SCHEMA_FLOAT_FIELDS = {
    "run" : set(("training_accuracy", "training_loss",
                 "validation_accuracy", "validation_loss")),
    "experiment" : set()
}

# =============================================================================
# For each core, gives the set of fields which the schema defines as int
# =============================================================================
SCHEMA_INT_FIELDS = {
    "run" : set(),
    "experiment" : set(("max_runs", ))
}

#==============================================================================
# TODO: find a better mechanism than global variables to embed into CandleDB
#==============================================================================

def abort(msg):
    """Log msg at CRITICAL level (prefixed with "candle_db: ") and exit.

    Terminates the process via sys.exit(1), i.e. raises SystemExit with
    exit code 1; this function never returns.
    """
    logging.critical("candle_db: " + msg)
    sys.exit(1)


#==============================================================================
# # Keyword arguments in Python:
#
# def f(**kwargs):
#     # kwargs is a dictionary
#     # ... do something useful with kwargs ...
#
# Calling f(a=1, b=2, c=3) is equivalent to:
#
# D = {"a":1, "b":2, "c":3}
# f(**D)
#
# # D can be accessed within the body of f as kwargs
#==============================================================================

class CandleDB:
    """Encapsulate communication with a single Solr core.

    The core name must be one of the keys of DEFAULT_VALUES; records added
    through this class are filled in with that core's default values.
    """
    url_template = "http://%s/solr/%s"

    def __init__(self, core, hostport=HOSTPORT, timeout=TIMEOUT, max_rows=MAX_ROWS):
        """Setup a Solr instance for the given core.

        core:     name of the solr core; aborts the program if it is not
                  a key of DEFAULT_VALUES
        hostport: "host:port" of the solr server
        timeout:  optional, passed through to pysolr.Solr
        max_rows: upper bound on the number of rows requested by query()
        """
        if core not in DEFAULT_VALUES:
            abort("unknown core name: %s" % core)

        self.core = core
        self.max_rows = max_rows

        core_url = CandleDB.url_template % (hostport, core)
        logging.debug("CandleDB: {}".format(core_url))

        self.solr = pysolr.Solr(core_url, timeout=timeout)

    def add(self, **kwargs):
        """Add or update record in solr

        The record starts as a copy of the core's default values and is
        then overridden by the (validated) keyword arguments.
        """
        args = DEFAULT_VALUES[self.core].copy()
        kwargs = self._validate_fields(kwargs)
        args.update(kwargs)
        logging.debug("Arguments:  {}".format(args))
        self.solr.add([args])

    def delete(self, query="*:*", id_=None):
        """delete records from solr, defaults to all records ("*:*")

        records can be specified by query, or by unique key
        (solr uses id which is reserved in Python, hence id_ is used here)

        A truthy id_ takes precedence over query.  If id_ is falsy and
        query is None, nothing is deleted and a warning is logged."""
        if id_:
            # TODO: probably need to enforce str or list of string...
            self.solr.delete(id=id_)
        elif query is not None:
            self.solr.delete(q=query)
        else:
            logging.warning("Solr delete: must specify either id or q")

    def _coerce_fields(self, kwargs, fields, convert, message):
        """Convert kwargs[field] in place with convert() for each field.

        A field whose value cannot be converted is logged (using message,
        formatted with the field name) and removed from kwargs.
        """
        # intersection() builds a new set, so deleting from kwargs while
        # looping over it is safe
        for field in fields.intersection(kwargs):
            try:
                kwargs[field] = convert(kwargs[field])
            except (TypeError, ValueError, OverflowError):
                logging.debug(message.format(field))
                # remove the offending field from the update
                del kwargs[field]

    def _validate_fields(self, kwargs):
        """Enforce validity of inputs according to schema.

        Fields listed in SCHEMA_FLOAT_FIELDS / SCHEMA_INT_FIELDS for this
        core are converted to float / int.  Invalid values are removed
        from kwargs, so that add() falls back to the defaults for them.

        kwargs is modified in place and also returned."""
        self._coerce_fields(kwargs, SCHEMA_FLOAT_FIELDS[self.core],
                            float, "{} requires a float")
        self._coerce_fields(kwargs, SCHEMA_INT_FIELDS[self.core],
                            int, "{} requires an integer")
        return kwargs

    def query(self, q="*:*"):
        """Search the core with query string q and return pysolr's results.

        The default, "*:*", is solr's match-all query (the same default
        delete() uses).  Up to self.max_rows rows are requested."""
        # Return up to 1B results (default is 10):
        results = self.solr.search(q=q, rows=self.max_rows)
        return results

def update_run(**kwargs):
    """Add or update a record in the "run" core.

    Required keyword arguments:
      run_id:     must be present and not None
      parameters: non-empty string of ","-separated values, e.g. "N1=1,NE=6"

    All other keyword arguments are passed through to CandleDB.add(),
    which enforces the field types (see CandleDB._validate_fields).

    Raises AssertionError if a required argument is missing or empty.
    """
    # .get() so that a missing key reports the same message as a None value
    assert kwargs.get("run_id") is not None, "run_id is required"
    assert kwargs.get("parameters"), "parameters is required"

    # Special handling for parameters
    # split parameters on "," into a list of strings
    # e.g. ["N1=1", "NE=6"]
    # the keyword arguments dictionary will be updated with this list
    kwargs["parameters"] = kwargs["parameters"].split(",")

    db = CandleDB(core="run")
    db.add(**kwargs)

def experiment_insert(**kwargs):
    """Add or update a record in the "experiment" core.

    Required keyword argument:
      experiment_id: must be present and not None

    Optional keyword argument:
      keys: list of "K=V" strings; each pair is unpacked into its own
            keyword argument and "keys" itself is not sent to solr

    All remaining keyword arguments are passed through to CandleDB.add(),
    which enforces the field types (see CandleDB._validate_fields).

    Raises AssertionError if experiment_id is missing or None.
    """
    # .get() so that a missing key reports the same message as a None value
    assert kwargs.get("experiment_id") is not None, "experiment_id is required!"

    # Special handling for keys:
    # Expecting a list of "K=V" which can be unpacked into a dictionary
    # Values for K should be recognized by the schema or solr will object
    keys = kwargs.pop("keys", None)
    if keys is not None:
        kwargs.update(kv2dict(keys))

    db = CandleDB(core="experiment")
    db.add(**kwargs)

def params2string(N1, NE):
    """Render N1 and NE as the parameter string "N1=<N1>,NE=<NE>".

    Both values are formatted with %i, i.e. as integers.
    """
    template = "N1=%i,NE=%i"
    values = (N1, NE)
    return template % values

def kv2dict(L):
    """ Convert list L of [ K=V... ] to dict { K:V ... }

    Each item is split on its first "=" only, so a value may itself
    contain "=" characters.  An item without "=" maps to "".
    If a key occurs more than once, the last occurrence wins.
    """
    logging.debug("kv2dict called with: %s", L)
    result = {}
    for kv in L:
        # partition() yields ("K", "", "") when there is no "="
        key, _, value = kv.partition('=')
        result[key] = value
    return result

def update(remainder):
    """Handle the "update" subcommand.

    remainder is [core, "K=V", ...].  Aborts if the core name is missing.
    The K=V pairs are only written to solr when the core is "run".
    """
    if len(remainder) == 0:
        abort("update: requires core name!")
    core, pairs = remainder[0], remainder[1:]
    db = CandleDB(core=core)
    fields = kv2dict(pairs)
    if core == "run":
        db.add(**fields)

def query(args):
    """Run a query against a core and return the results.

    args is [core] or [core, q].  Aborts if the core name is missing.
    The query string is taken from args only when exactly two arguments
    are given; otherwise "*:*" is used.
    """
    if len(args) < 1:
        abort("query: requires core name!")
    db = CandleDB(core=args[0])
    # Default to "*:*" unless a query string was supplied
    q = args[1] if len(args) == 2 else "*:*"
    return db.query(q=q)

def query_print(args):
    """Handle the "query" subcommand: run the query and print every hit.

    Prints the number of results first, then each result as a table
    preceded by a "----" separator line.
    """
    results = query(args)
    hit_count = len(results.docs)
    print("results: " + str(hit_count))
    for doc in results:
        print("----")
        print_result(doc)

def print_result(result):
    """Print one result as a table, leaving out its "_version_" entry.

    Works on a copy, so the caller's result is not modified.
    """
    visible = result.copy()
    del visible["_version_"]
    print_table(visible)

def print_table(D):
    """Print the dict D as one "key = value" line per entry.

    Keys are left-justified and padded to the length of the longest key
    so that the "=" signs line up.  An empty dict prints nothing.
    """
    if not D:
        # max() of an empty sequence would raise ValueError
        return
    n = max(len(k) for k in D) # Length of longest key
    for k, v in D.items():
        # negative width => left-justify the key
        print("%*s = %s" % (-n, k, v))

def delete(args):
    """Handle the "delete" subcommand.

    args must be exactly [core]; otherwise the program aborts.
    Calls CandleDB.delete() with its defaults on that core.
    """
    if len(args) != 1:
        abort("delete: requires core name!")
    core = args[0]
    CandleDB(core=core).delete()

def ls(args):
    """Handle the "ls" subcommand.

    args is [-c|--no-count] core [q].  Aborts if the core name is
    missing.  Prints the number of results unless --no-count is given,
    then lists the results in a core-specific format ("experiment" and
    "run" are supported; other cores produce no listing).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-c', '--no-count', default=False, action='store_true')
    cli.add_argument("remainder", nargs="*")
    parsed = cli.parse_args(args)

    positional = parsed.remainder
    if len(positional) < 1:
        abort("ls: requires core name!")
    core = positional[0]
    extra = positional[1:]

    results = query(positional)
    show_count = not parsed.no_count
    if show_count:
        print("results: " + str(len(results.docs)))

    if core == "experiment":
        ls_experiment(extra, results)
    elif core == "run":
        ls_run(extra, results)

def ls_experiment(args, results):
    """Print the experiment_id of every record in results, one per line.

    args is accepted for symmetry with ls_run but is not used.
    """
    for record in results:
        experiment_id = record["experiment_id"]
        print(experiment_id)

def ls_run(args, results):
    """Print a table mapping each run_id to its first parameters entry.

    args is accepted for symmetry with ls_experiment but is not used.
    """
    first_param_by_run = {}
    for record in results:
        # only the first element of the parameters list is shown
        first_param = record["parameters"][0]
        first_param_by_run[record["run_id"]] = first_param
    print_table(first_param_by_run)

if __name__ == "__main__":
    # Command line: candle_db <subcommand> [arguments...]
    if len(sys.argv) < 2:
        logging.critical("Requires subcommand arguments...")
        print("Requires: subcommand arguments...")
        sys.exit(1)

    subcommand = sys.argv[1]
    remainder  = sys.argv[2:]

    # Map each subcommand name to the function that implements it
    handlers = {
        "delete": delete,
        "update": update,
        "query": query_print,
        "ls": ls,
    }
    handler = handlers.get(subcommand)
    if handler is None:
        abort("unknown subcommand: " + subcommand)
    else:
        handler(remainder)