From 66eade1a19234182f43a5262a827fc884cb9566c Mon Sep 17 00:00:00 2001 From: Sera Riddhi Date: Tue, 7 Jun 2022 16:26:00 -0400 Subject: [PATCH 1/5] Aparna's changes --- segway/gmtk/input_master.py | 639 ++++++++++++++++++++++ segway/input_master.py | 1017 +++++++++++++++-------------------- 2 files changed, 1072 insertions(+), 584 deletions(-) create mode 100644 segway/gmtk/input_master.py diff --git a/segway/gmtk/input_master.py b/segway/gmtk/input_master.py new file mode 100644 index 00000000..27b58e0d --- /dev/null +++ b/segway/gmtk/input_master.py @@ -0,0 +1,639 @@ +from __future__ import division, print_function +from collections import OrderedDict +import numpy as np +from numpy import array, ndarray + +COMPONENT_TYPE_DIAG_GAUSSIAN = 0 + + +def array2text(a): + """ + Convert multi-dimensional array to text. + :param a: array + :return: + """ + ndim = a.ndim + if ndim == 1: + return " ".join(map(str, a)) + else: + delimiter = "\n" * (ndim - 1) + return delimiter.join(array2text(row) for row in a) + +def add_extra_line(l): + """ + :param: l: list[str] or str + Return copy of l with "\n" concatenated with the last item. + (l must have at least one item.) + For example: + l = ["a", "b", "c"] + new_l = add_extra_line(l) + new_l + ["a", "b", "c\n"] + """ + if isinstance(l, str): + return "{}\n".format(l) + + last_element = "{}\n".format(l[-1]) + new_l = l[:] + new_l[-1] = last_element + return new_l + + +class Object(str): + def __new__(cls, _name, content, _kind): + return str.__new__(cls, content) + + def __init__(self, name, content, kind): + self.kind = kind + self.name = name + + +class Array(ndarray): + def __new__(cls, *args): + """ + :param input_array: ndarray + :return: + """ + input_array = array(args) + obj = np.asarray(input_array).view(cls) + return obj + + +class Section(OrderedDict): + """ + Contains GMTK objects of a single type and supports writing them to file. + Key: name of GMTK object + Value: GMTK object + """ + def __init__(self): + """ + Initialize an empty Section object. + """ + super(Section, self).__init__() + + def __getattr__(self, name): + if not name.startswith('_'): + return self[name] + OrderedDict.__getattr__(self, name) + + def __setattr__(self, name, value): + if not name.startswith('_'): + if not self.kind() == value.kind: + raise ValueError("Object has incorrect type.") + self[name] = value + else: + OrderedDict.__setattr__(self, name, value) + + def kind(self): + """ + Return string attribute kind of all GMTK objects in this Section object. + :return: str: type of all GMTK objects in this Section object + """ + section_kind = None + for obj in self.values(): + if not section_kind: + section_kind = obj.kind + else: + assert section_kind == obj.kind, "Objects must be of same type." + return section_kind + + +class InlineSection(Section): + + def __str__(self): + """ + Returns inline string representation of this Section object by calling + the individual GMTK object's __str__(). + :return: + """ + # if no gmtk objects + if len(self) == 0: + return "" + # MC, MX are also one line types but have their own subclasses of InlineSection + one_line_types = ["MEAN", "COVAR", "DPMF"] # TODO: DPMF dim > 1? + + lines = ["{}_IN_FILE inline".format(self.kind())] + lines.append(str(len(self)) + "\n") # total number of gmtk objects + for i in range(len(self)): + obj_header = [] + obj_header.append(str(i)) # index of gmtk object + obj_header.append(list(self)[i]) # name of gmtk object + + # special formatting for some GMTK types: + if self.kind() == "NAME_COLLECTION": + # size of this NameCollection object + obj_header.append(str(len(list(self.values())[i]))) + if self.kind() == "DENSE_CPT": + # num_parents and cardinality of this DenseCPT + obj_header.append(list(self.values())[i].get_header_info()) + + if self.kind() in one_line_types: + # string representation of gmtk object + obj_header.append(list(self.values())[i].__str__()) + lines.append(" ".join(obj_header)) + else: + lines.append(" ".join(obj_header)) + # string representation of gmtk object + lines.append(list(self.values())[i].__str__()) + + return "\n".join(add_extra_line(lines)) + + +class InlineMCSection(InlineSection): + """ + Special InlineSection subclass which contains MC objects. + Attributes: + mean: InlineSection object which point to InputMaster.mean + covar: InlineSection object which point to InputMaster.covar + """ + def __init__(self, mean, covar): + """ + :param mean: InlineSection: InlineSection object which point to + InputMaster.mean + :param covar: InlineSection: InlineSection object which point to + InputMaster.covar + """ + super(InlineMCSection, self).__init__() + self.mean = mean + self.covar = covar + + def __setattr__(self, key, value): + OrderedDict.__setattr__(self, key, value) + + def __str__(self): + """ + Returns string representation of all MC objects contained in this + InlineMCSection by calling the individual MC object's __str__(). + :return: + """ + if len(self) == 0: + return "" + else: + lines = ["{}_IN_FILE inline".format(self.kind())] + lines.append("{}\n".format(str(len(self)))) # total number of MC objects + for i in range(len(self)): + obj_line = [] + obj_line.append(str(i)) # index of MC object + # check if dimension of Mean and Covar of this MC are the same + obj = list(self.values())[i] + mean_name = obj.mean + covar_name = obj.covar + if not self.mean[mean_name].get_dimension() == self.covar[covar_name].get_dimension(): + # TODO delete MC? redefine? + raise ValueError("Inconsistent dimensions of mean and covar associated to MC.") + else: + obj_line.append(str(self.mean[mean_name].get_dimension())) + # dimension of MC + obj_line.append(str(obj.component_type)) # component type + obj_line.append(list(self)[i]) # name of MC + obj_line.append(obj.__str__()) # string representation of MC obj + lines.append(" ".join(obj_line)) + + return "\n".join(add_extra_line(lines)) + + +class InlineMXSection(InlineSection): + """ + Special InlineSection subclass which contains MX objects. + Attributes: + dpmf: InlineSection object which point to InputMaster.dpmf + components: InlineSection object which point to InputMaster.mc + """ + + def __init__(self, dpmf, mc): + """ + :param dpmf: InlineSection: InlineSection object which point to + InputMaster.dpmf + :param components: InlineSection: InlineSection object which point to + InputMaster.mc + """ + super(InlineMXSection, self).__init__() + self.dpmf = dpmf + self.mc = mc + + def __setattr__(self, key, value): + OrderedDict.__setattr__(self, key, value) + + def __str__(self): + """ + Returns string representation of all MX objects contained in this + InlineMXSection by calling the individual MX object's __str__. + :return: + """ + if len(self) == 0: + return [] + else: + lines = ["{}_IN_FILE inline".format(self.kind())] + lines.append("{}\n".format(str(len(self)))) # total number of MX objects + for i in range(len(self)): + obj_line = [] + obj_line.append(str(i)) # index of MX object + # check if number of components is equal to length of DPMF + obj = list(self.values())[i] + dpmf_name = obj.dpmf + components = obj.components + dpmf_length = self.dpmf[dpmf_name].get_length() + if not dpmf_length == len(components): + raise ValueError( + "Dimension of DPMF must be equal to number of components associated with this MX object.") + else: + obj_line.append(str(dpmf_length)) # dimension of MX + obj_line.append(list(self)[i]) # name of MX + obj_line.append(obj.__str__()) + # string representation of this MX object + lines.append(" ".join(obj_line)) + + + return "\n".join(add_extra_line(lines)) + + +class InputMaster: + """ + Master class which contains all GMTK objects present in the input + master and is responsible for creating their string representation. + Attributes: + mean: InlineSection: contains all Mean objects in input master + covar: InlineSection: contains all Covar objects in input master + dpmf: InlineSection: contains all DPMF objects in input master + dense_cpt: InlineSection: contains all DenseCPT objects in input master + deterministic_cpt: InlineSection: contains all DeterministicCPT objects + in input master + mc: InlineMCSection: contains all MC objects in input master + mx: InlineMXSection: contains all MX objects in input master + name_collection: InlineSection: contains all NameCollection objects in + input master + """ + + def __init__(self): + """ + Initialize InputMaster instance with empty attributes (InlineSection + and its subclasses). + """ + self.mean = InlineSection() + self.covar = InlineSection() + self.dpmf = InlineSection() + self.dense_cpt = InlineSection() + self.deterministic_cpt = InlineSection() + self.mc = InlineMCSection(mean=self.mean, covar=self.covar) + self.mx = InlineMXSection(dpmf=self.dpmf, mc=self.mc) + self.name_collection = InlineSection() + + def __str__(self): + """ + Return string representation of all the attributes (GMTK types) by + calling the attributes' (InlineSection and its subclasses) __str__(). + :return: + """ + attrs = [self.deterministic_cpt, self.name_collection, self.mean, + self.covar, self.dense_cpt, self.dpmf, self.mc, self.mx] + + s = [] + for obj in attrs: + s.append("".join(obj.__str__())) + + return "".join(s) + + def save(self, filename, traindir='segway_output/traindir'): + """ + Opens filename for writing and writes out the contents of its attributes. + :param: filename: str: path to input master file + :param: traindir: str: path to traindir + (default assumes path to traindir is 'segway_output/traindir') + :return: None + """ + with open(filename, 'w') as filename: + print('# include "' + traindir + '/auxiliary/segway.inc"', file=filename) + print(self, file=filename) + + +class DenseCPT(Array): + """ + A single DenseCPT object. + """ + kind = "DENSE_CPT" + + # todo check if sums to 1.0 + + def __str__(self): + """ + Return string representation of this DenseCPT object. + :return: + """ + line = [] + if 'extra_rows' not in self.__dict__.keys(): + extra_rows = [] + else: + extra_rows = self.extra_rows + line.extend(extra_rows) + line.append("{}\n".format(array2text(self))) + + return "\n".join(line) + + def __setattr__(self, key, value): + if key == 'extra_rows': + if key not in self.__dict__.keys(): + super(DenseCPT, self).__setattr__(key, value) + else: + raise ValueError("Attribute not allowed.") + + def get_header_info(self): + """ + Return number of parents, cardinality line (called by Section.__str__()). + """ + line = [] + line.append(str(len(self.shape) - 1)) # number of parents + cardinality_line = map(str, self.shape) + line.append(" ".join(cardinality_line)) # cardinalities + return " ".join(line) + + @classmethod + def uniform_from_shape(cls, *shape, **kwargs): + """ + :param: shape: int: shape of DenseCPT + :param: kwargs: float: optional value for diagonal entry of DenseCPT (default is 0.0) + :return: DenseCPT with uniform probabilities and given shape + """ + diag_value = kwargs.pop('self', 0.0) # set default value for diagonal entry to 0 + + if len(shape) == 1: + a = np.empty(shape) + cpt = DenseCPT(a) + cpt = np.squeeze(cpt, axis=0) + cpt.fill(1.0 / shape[-1]) # number of columns + return cpt + + else: + a = np.empty(shape) + div = shape[-1] - 1 + # if num_subsegs = 1 + if div == 0: + a.fill(1.0) + else: + value = (1.0 - diag_value) / div + a.fill(value) + # len(shape) = 2 => seg_seg => square matrix + if len(shape) == 2: + # set diagonal elements to 0.0 + diag_index = range(shape[0]) + a[diag_index, diag_index] = diag_value + + # len(shape) = 3 => seg_subseg_subseg + # => num_segs x square matrix + if len(shape) == 3: + # "diag_indices" to be set to 0: + # range(seg), diag_index, diag_index + diag_index = [] + for s in range(shape[-1]): + diag_index.append([s] * len(shape[1:])) + final_indices = [] + for i in range(shape[0]): + for item in diag_index: + index = [i] + index.extend(item) + final_indices.append(tuple(index)) + + for index in final_indices: + a[index] = diag_value + + cpt = DenseCPT(a) + + return np.squeeze(cpt, axis=0) + + +class NameCollection(list): + """ + A single NameCollection object. + """ + kind = "NAME_COLLECTION" + + def __init__(self, *args): + """ + Initialize a single NameCollection object. + :param args: str: names in this NameCollection + """ + if isinstance(args[0], list): # names in NameCollection have been given in a single list + super(NameCollection, self).__init__([]) + self.extend(args[0]) + else: + super(NameCollection, self).__init__(list(args)) + + def __str__(self): + """ + Returns string format of NameCollection object to be printed into the + input.master file (new lines to be added) + """ + if len(self) == 0: + return "" + line = [] + line.extend(self) # names + return "\n".join(line) + + +class Mean(Array): + """ + A single Mean object. + """ + kind = "MEAN" + + def __array_finalize__(self, obj): + if obj is None: return + + def __str__(self): + """ + Returns the string format of the Mean object to be printed into the + input.master file (new lines to be added). + :return: + """ + line = [] + line.append(str(self.get_dimension())) # dimension + line.append(array2text(self)) + return " ".join(line) + + def get_dimension(self): + """ + Return dimension of this Mean object. + :return: int: dimension of this Mean object + """ + return len(self) + + +class Covar(Array): + """ + A single Covar object. + """ + kind = "COVAR" + + def __str__(self): + """ + Return string representation of single Covar object. + :return: + """ + line = [str(self.get_dimension())] # dimension + line.append(array2text(self)) # covar values + return " ".join(line) + + def get_dimension(self): + """ + Return dimension of this Covar object. + :return: int: dimension of this Covar object + """ + return len(self) + + +class DPMF(Array): + """ + A single DPMF object. + """ + kind = "DPMF" + + # todo check if sums to 1.0 + + def __str__(self): + """ + Return string representation of this DPMF. + :return: + """ + line = [str(self.get_length())] # dpmf length + line.append(array2text(self)) # dpmf values + return " ".join(line) + + def get_length(self): + return len(self) + + @classmethod + def uniform_from_shape(cls, *shape): + """ + :param: shape: int: shape of DPMF + :return: DPMF with uniform probabilities and given shape + """ + a = np.empty(shape) + if len(shape) != 1: + raise ValueError("DPMF must be one-dimensional.") + else: + value = 1.0 / shape[0] + a.fill(value) + + return DPMF(a).squeeze(axis=0) + + +class MC: + """ + A single MC object. + Attributes: + component_type: int: type of MC + """ + kind = "MC" + + def __init__(self, component_type): + """ + Initialize a single MC object. + :param component_type: int: type of MC + """ + self.component_type = component_type + + +class DiagGaussianMC(MC, object): + """ + Attributes: + component_type = 0 + mean: str: name of Mean object associated to this MC + covar: str: name of Covar obejct associated to this MC + """ + def __init__(self, mean, covar): + """ + Initialize a single DiagGaussianMC object. + :param mean: name of Mean object associated to this MC + :param covar: name of Covar obejct associated to this MC + """ + # more component types? + super(DiagGaussianMC, self).__init__("COMPONENT_TYPE_DIAG_GAUSSIAN") + self.mean = mean + self.covar = covar + + def __str__(self): + """ + Return string representation of this MC object. + :return: + """ + return " ".join([self.mean, self.covar]) + + +class MX: + """ + A single MX object. + Attributes: + dpmf: str: name of DPMF object associated with MX + components: list[str]: names of components associated with this MX + """ + kind = "MX" + + def __init__(self, dpmf, components): + """ + Initialize a single MX object. + :param dpmf: str: name of DPMF object associated with this MX + :param components: str or list[str]: names of components associated with + this MX + """ + self.dpmf = dpmf + if isinstance(components, str): + self.components = [components] + elif isinstance(components, list): + for name in components: + if not isinstance(name, str): + raise ValueError("All component names must be strings.") + self.components = components + else: # not allowed types + raise ValueError("Incorrect format of component names.") + + def __str__(self): + """ + Return string representation of this MX. + :return: + """ + line = [str(len(self.components))] # number of components + line.append(self.dpmf) # dpmf name + line.append(" ".join(self.components)) # component names + return " ".join(line) + + +class DeterministicCPT: + """ + A single DeterministicCPT object. + Attributes: + parent_cardinality: tuple[int]: cardinality of parents + cardinality: int: cardinality of self + dt: str: name existing Decision Tree (DT) associated with this + DeterministicCPT + """ + kind = "DETERMINISTIC_CPT" + + def __init__(self, cardinality_parents, cardinality, dt): + """ + Initialize a single DeterministicCPT object. + :param cardinality_parents: tuple[int]: cardinality of parents + (if empty, then number of parents = 0 + :param cardinality: int: cardinality of self + :param dt: name existing Decision Tree (DT) associated with this + DeterministicCPT + """ + if not isinstance(cardinality_parents, tuple): + self.cardinality_parents = (cardinality_parents, ) + else: + self.cardinality_parents = cardinality_parents + self.cardinality = cardinality + self.dt = dt + + def __str__(self): + """ + Return string representation of this DeterministicCPT. + :return: + """ + line = [] + num_parents = len(self.cardinality_parents) + line.append(str(num_parents)) # number of parents + cardinalities = [] + cardinalities.extend(self.cardinality_parents) + cardinalities.append(self.cardinality) + line.append(" ".join(map(str, cardinalities))) # cardinalities of parent and self + line.append("{}\n".format(self.dt)) + return "\n".join(line) \ No newline at end of file diff --git a/segway/input_master.py b/segway/input_master.py index 371118c2..cffd6936 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -8,16 +8,14 @@ ## Copyright 2012, 2013 Michael M. Hoffman -from math import frexp, ldexp -from string import Template import sys from genomedata._util import fill_array -from numpy import (array, empty, float32, outer, set_printoptions, sqrt, tile, - vectorize, where, zeros) +from numpy import (array, empty, set_printoptions, sqrt, tile, where) +import numpy as np from six.moves import map, range -from ._util import (copy_attrs, data_string, DISTRIBUTION_GAMMA, +from ._util import (copy_attrs, data_string, DISTRIBUTION_NORM, DISTRIBUTION_ASINH_NORMAL, OFFSET_END, OFFSET_START, OFFSET_STEP, resource_substitute, Saver, SEGWAY_ENCODING, @@ -26,6 +24,9 @@ SUPERVISION_SUPERVISED, USE_MFSDG, VIRTUAL_EVIDENCE_LIST_FILENAME) +from .gmtk.input_master import (InputMaster, NameCollection, DenseCPT, + DPMF, MX, Covar, Mean, DiagGaussianMC) + # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of # scalars and vectors in the parameter output. By default in newer (> 1.14) # versions printed output "giv[es] the shortest unique representation". @@ -38,8 +39,8 @@ except TypeError: # Otherwise ignore the attempt pass - -if USE_MFSDG: + +if USE_MFSDG: # because tying not implemented yet COVAR_TIED = False else: @@ -69,6 +70,8 @@ # TODO[PY2-EOL]: remove ROUND_NDIGITS = 12 +input_master = InputMaster() + def vstack_tile(array_like, *reps): reps = list(reps) + [1] @@ -84,7 +87,6 @@ def array2text(a): delimiter = "\n" * (ndim - 1) return delimiter.join(array2text(row) for row in a) - def make_spec(name, iterable): """ name: str, name of GMTK object type @@ -99,7 +101,7 @@ def make_spec(name, iterable): all_lines = header_lines + indexed_items - # In Python 2, convert from unicode to bytes to prevent + # In Python 2, convert from unicode to bytes to prevent # __str__method from being called twice # Specifically in the string template standard library provided by Python # 2, there is a call to a string escape sequence + tuple, e.g.: @@ -114,7 +116,6 @@ def make_spec(name, iterable): return "\n".join(all_lines) + "\n" - def prob_transition_from_expected_len(length): # formula from Meta-MEME paper, Grundy WN et al. CABIOS 13:397 # see also Reynolds SM et al. PLoS Comput Biol 4:e1000213 @@ -139,24 +140,6 @@ def make_zero_diagonal_table(length): return res -def format_indexed_strs(fmt, num): - full_fmt = fmt + "%d" - return [full_fmt % index for index in range(num)] - - -def jitter_cell(cell, random_state): - """ - adds some random noise - """ - # get the binary exponent and subtract JITTER_ORDERS_MAGNITUDE - # e.g. 3 * 2**10 --> 1 * 2**5 - max_noise = ldexp(1, frexp(cell)[1] - JITTER_ORDERS_MAGNITUDE) - - return cell + random_state.uniform(-max_noise, max_noise) - -jitter = vectorize(jitter_cell) - - class ParamSpec(object): """ base class for parameter specifications used in input.master files @@ -164,43 +147,32 @@ class ParamSpec(object): type_name = None object_tmpl = None copy_attrs = ["distribution", "mins", "num_segs", "num_subsegs", - "num_track_groups", "track_groups", "num_mix_components"] + "num_track_groups", "track_groups", "num_mix_components", + "means", "vars", "num_mix_components", "random_state", + "tracks", "resolution", "card_seg_countdown", "seg_table", + "seg_countdowns_initial", "len_seg_strength", "num_bases"] + + jitter_std_bound = 0.2 + track_names = [] def __init__(self, saver): # copy all variables from saver that it copied from Runner # XXX: override in subclasses to only copy subset copy_attrs(saver, self, self.copy_attrs) + self.track_names = [] + for track in self.tracks: + self.track_names.append(track.name) - def get_track_lt_min(self, track_index): + def __str__(self): + return make_spec(self.type_name, self.generate_objects()) + + def make_data(self): """ - returns a value less than a minimum in a track + override this in subclasses + returns: container indexed by (seg_index, subseg_index, track_index) """ - # XXX: refactor into a new function - min_track = self.mins[track_index] - - # fudge the minimum by a very small amount. this is not - # continuous, but hopefully we won't get values where it - # matters - # XXX: restore this after GMTK issues fixed - # if min_track == 0.0: - # min_track_fudged = FUDGE_TINY - # else: - # min_track_fudged = min_track - ldexp(abs(min_track), FUDGE_EP) - - # this happens for really big numbers or really small - # numbers; you only have 7 orders of magnitude to play - # with on a float32 - min_track_f32 = float32(min_track) - - assert min_track_f32 - float32(ABSOLUTE_FUDGE) != min_track_f32 - return min_track - ABSOLUTE_FUDGE - - def make_segnames(self): - return format_indexed_strs("seg", self.num_segs) - - def make_subsegnames(self): - return format_indexed_strs("subseg", self.num_subsegs) - + return None + def get_template_component_suffix(self, component_number): """Returns the subsitution for the component suffix in the GMTK model template. Empty if there is only one component""" @@ -209,131 +181,319 @@ def get_template_component_suffix(self, component_number): else: return "_component{}".format(component_number) - def generate_tmpl_mappings(self): - # need segnames because in the tied covariance case, the - # segnames are replaced by "any" (see .make_covar_spec()), - # and only one mapping is produced - num_subsegs = self.num_subsegs - - track_groups = self.track_groups - num_track_groups = self.num_track_groups - - for seg_index, segname in enumerate(self.make_segnames()): - seg_offset = num_track_groups * num_subsegs * seg_index - - for subseg_index, subsegname in enumerate(self.make_subsegnames()): - subseg_offset = seg_offset + (num_track_groups * subseg_index) - - for track_group_index, track_group in enumerate(track_groups): - track_offset = subseg_offset + track_group_index - head_trackname = track_group[0].name - - # XXX: change name of index to track_offset in templates - # XXX: change name of track_index to track_group_index - yield dict(seg=segname, subseg=subsegname, - track=head_trackname, seg_index=seg_index, - subseg_index=subseg_index, - track_index=track_group_index, - index=track_offset, - distribution=self.distribution) - - def make_data(self): + def get_head_track_names(self): """ - override this in subclasses - - returns: container indexed by (seg_index, subseg_index, track_index) + Returns list containing the first track name in each track group. """ - return None + head_track_names = [] + for group in self.track_groups: + head_track_names.append(group[0].name) + return head_track_names - def generate_objects(self): + + def generate_collection_names(self): """ - returns: iterable of strs containing GMTK parameter objects starting - with names + Generate names of NameCollection objects. """ - substitute = Template(self.object_tmpl).substitute - - data = self.make_data() - - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - - yield substitute(mapping) - - def __str__(self): - return make_spec(self.type_name, self.generate_objects()) - - -class DTParamSpec(ParamSpec): - type_name = "DT" - copy_attrs = ParamSpec.copy_attrs + ["seg_countdowns_initial", - "supervision_type"] - - def make_segCountDown_tree_spec(self, resourcename): # noqa + COLLECTION_NAME_FORMAT_STRING = "collection_seg_{track_name}" + head_track_names = self.get_head_track_names() + names = [] + for name in head_track_names: + names.append(COLLECTION_NAME_FORMAT_STRING.format(track_name=name)) + return names + + def generate_name_collection_entries(self): + """ + Generate entries in NameCollection objects. + """ + COLLECTION_ENTRY_FORMAT_STRING = \ + "mx_seg{seg_index}_subseg{subseg_index}_{track_name}" + head_track_names = self.get_head_track_names() + names = [] + for name in head_track_names: + for i in range(self.num_segs): + for j in range(self.num_subsegs): + names.append(COLLECTION_ENTRY_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=name)) + return names + + def generate_tied_covar_object_names(self): + """ + Generate tied Covar object names. + """ + TIED_COVAR_FORMAT_STRING = "covar_{track_name}{suffix}" + head_track_names = self.get_head_track_names() + names = [] + for component_number in range(self.num_mix_components): + for track_name in head_track_names: + component_suffix = self.get_template_component_suffix(component_number) + names.append(TIED_COVAR_FORMAT_STRING.format(track_name=track_name, + suffix=component_suffix)) + return names + + def generate_covar_object_names(self): + """ + Generate tied Covar object names. + """ + COVAR_FORMAT_STRING = "covar_seg{seg_index}_subseg{subseg_index}_{track_name}{suffix}" + head_track_names = self.get_head_track_names() + names = [] + for component_number in range(self.num_mix_components): + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + component_suffix = self.get_template_component_suffix(component_number) + names.append(COVAR_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=track_name, + suffix=component_suffix)) + + return names + + def generate_mean_object_names(self): + """ + Generate Mean object names. + """ + MEAN_FORMAT_STRING = "mean_seg{seg_index}_subseg{subseg_index}_{track_name}{suffix}" + head_track_names = self.get_head_track_names() + names = [] + for component_number in range(self.num_mix_components): + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + component_suffix = self.get_template_component_suffix(component_number) + names.append(MEAN_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=track_name, + suffix=component_suffix)) + + return names + + def generate_mx_object_names(self): + """ + Generate MX object names. + """ + MX_FORMAT_STRING = "mx_seg{seg_index}_subseg{subseg_index}_{track_name}" + head_track_names = self.get_head_track_names() + names = [] + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + names.append(MX_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=track_name)) + + return names + + def generate_diag_gaussian_mc_object_names(self): + """ + Generate DiagGaussianMC object names. + """ + DIAG_GAUSSIAN_FORMAT_STRING = \ + "mc_{distribution}_seg{seg_index}_subseg{subseg_index}_{track_name}{suffix}" + head_track_names = self.get_head_track_names() + names = [] + for component_number in range(self.num_mix_components): + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + component_suffix = self.get_template_component_suffix(component_number) + names.append(DIAG_GAUSSIAN_FORMAT_STRING.format(distribution=self.distribution, + seg_index=i, + subseg_index=j, + track_name=track_name, + suffix=component_suffix)) + return names + + def generate_gamma_mc_object_names(self): + GAMMA_MC_FORMAT_STRING = \ + "mc_gamma_seg{seg_index}_subseg{subseg_index}_{track_name}" + names = [] + head_track_names = self.get_head_track_names() + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + names.append(GAMMA_MC_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=track_name)) + return names + + def generate_dpmf_object_names(self): + """ + Generate DPMF object names. + """ + # to do for num_mix_components > 1: + names = [] + if self.num_mix_components == 1: + names.append("dpmf_always") + else: + # TODO (with dirichlet extra rows) + names.append("") + + return names + + def generate_gmtk_object_names(self, gmtk_object_type): + """ + Generate GMTK object names for the types: + name of NameCollection: "collection_names" + entries in NameCollection: "collection_entries" + Covar: "covar", "tied_covar" + Mean: "mean" + MX: "mx" + MC: "diag_gaussian_mc" + DPMF: "dpmf" + :param gmtk_object_type: str: type of gmtk object for which names must be generated + :return: list[str]: list of GMTK object names + """ + GMTK_OBJECT_NAME_GENERATORS = {'mx': self.generate_mx_object_names, + 'diag_gaussian_mc': self.generate_diag_gaussian_mc_object_names, + 'mean': self.generate_mean_object_names, + 'covar': self.generate_covar_object_names, + 'tied_covar': self.generate_tied_covar_object_names, + 'collection_names': self.generate_collection_names, + 'collection_entries': self.generate_name_collection_entries, + 'dpmf': self.generate_dpmf_object_names} + + return GMTK_OBJECT_NAME_GENERATORS[gmtk_object_type]() + + def generate_name_collection(self): + """ + Generate string representation of NameCollection objects in input master. + """ + # generate list of collection names + # num_track_groups (i.e. one for each head track) number + # of collection names generated + collection_names = self.generate_gmtk_object_names("collection_names") + # generate list of all names in NameCollections + # (num_segs * num_subsegs) number of names generated + names = self.generate_gmtk_object_names("collection_entries") + num_track_groups = self.num_track_groups + len_name_group = int(len(names) / num_track_groups) + # names grouped by collection + name_groups = [names[i:i + len_name_group] for i in + range(0, len(names), len_name_group)] + # create NameCollection objects and add to input master + for group_index in range(len(name_groups)): + input_master.name_collection[collection_names[group_index]] = \ + NameCollection(name_groups[group_index]) + + return str(input_master.name_collection) + + def make_mean_data(self): num_segs = self.num_segs - seg_countdowns_initial = self.seg_countdowns_initial - - header = ([str(num_segs)] + - [str(num_seg) for num_seg in range(num_segs - 1)] + - ["default"]) - - lines = [" ".join(header)] - - for seg, seg_countdown_initial in enumerate(seg_countdowns_initial): - lines.append(" -1 %d" % seg_countdown_initial) - - tree = "\n".join(lines) + num_subsegs = self.num_subsegs + means = self.means # indexed by track_index - return resource_substitute(resourcename)(tree=tree) + # maximum likelihood, adjusted by no more than 0.2*sd + stds = sqrt(self.vars) - def make_map_seg_segCountDown_dt_spec(self): # noqa - return self.make_segCountDown_tree_spec("map_seg_segCountDown.dt.tmpl") + # tile the means of each track (num_segs, num_subsegs times) + means_tiled = vstack_tile(means, num_segs, num_subsegs) + stds_tiled = vstack_tile(stds, num_segs, num_subsegs) - def make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec(self): # noqa - template_name = \ - "map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl" - return self.make_segCountDown_tree_spec(template_name) + jitter_std_bound = self.jitter_std_bound + noise = self.random_state.uniform(-jitter_std_bound, + jitter_std_bound, stds_tiled.shape) - def generate_objects(self): - yield data_string("map_frameIndex_ruler.dt.txt") - yield self.make_map_seg_segCountDown_dt_spec() - yield self.make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec() # noqa - yield data_string("map_seg_subseg_obs.dt.txt") + return means_tiled + (stds_tiled * noise) - supervision_type = self.supervision_type - if supervision_type == SUPERVISION_SEMISUPERVISED: - yield data_string("map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt") # noqa - elif supervision_type == SUPERVISION_SUPERVISED: - # XXX: does not exist yet - yield data_string("map_supervisionLabel_seg_alwaysTrue_supervised.dt.txt") # noqa + def generate_mean_objects(self): + """ + Generate string representation of Mean objects in input master. + """ + # generate list of names of Mean objects + names = self.generate_gmtk_object_names("mean") + means = self.make_mean_data() # array + num_track_groups = self.num_track_groups # number of head track names + # dimensions of means: num_segs x num_subsegs x num_head_tracks + # create Mean objects + names_array = array(names).reshape((self.num_segs, self.num_subsegs, len(self.track_groups))) + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for k in range(num_track_groups): + input_master.mean[names_array[i, j, k]] = Mean(means[i, j, k]) + + return str(input_master.mean) + + def generate_covar_objects(self): + """ + Generate string representation of Covar objects in input master. + """ + if COVAR_TIED: + names = self.generate_gmtk_object_names("tied_covar") else: - assert supervision_type == SUPERVISION_UNSUPERVISED - - -class TableParamSpec(ParamSpec): - copy_attrs = ParamSpec.copy_attrs \ - + ["resolution", "card_seg_countdown", "seg_table", - "seg_countdowns_initial"] + names = self.generate_gmtk_object_names("covar") + covar_values = self.vars # array of variance values + # creating Covar objects and adding them to input master + covar_objects = map(Covar, covar_values) + input_master.covar.update(dict(zip(names, covar_objects))) - # see Segway paper - probs_force_transition = array([0.0, 0.0, 1.0]) + return str(input_master.covar) - def make_table_spec(self, name, table, ndim, extra_rows=[]): - header_rows = [name, ndim] - header_rows.extend(table.shape) - - rows = [" ".join(map(str, header_rows))] - rows.extend(extra_rows) - rows.extend([array2text(table), ""]) - - return "\n".join(rows) + def generate_mc_objects(self): + """ + Generate string representation of MC objects in input master. + """ + # if distribution is norm or asinh_norm, TODO for missing, gamma + if self.distribution in DISTRIBUTIONS_LIKE_NORM: + if USE_MFSDG: + # TODO + option = "missing_mc" + else: + option = "diag_gaussian_mc" + # generate MC object names + names = self.generate_gmtk_object_names(option) + + # replicate covar names for iteration + covar_names = list(input_master.mc.covar) * ( + self.num_segs * self.num_subsegs) + # list of all mean names + mean_names = list(input_master.mc.mean) + + # create MC objects and add them to input master + mc_objects = [] + for mean_name, covar_name in zip(mean_names, covar_names): + mc_objects.append(DiagGaussianMC(mean=mean_name, covar=covar_name)) + input_master.mc.update(dict(zip(names, mc_objects))) + + return str(input_master.mc) + + def generate_mx_objects(self): + """Generate string representation of MX objects in input master. + """ + # generate list of MX names + names = self.generate_gmtk_object_names("mx") + mc_names = list(input_master.mc) # list of all mc names + dpmf_names = list(input_master.dpmf) # list of all dpmf names + multiple = int(len(names) / len(dpmf_names)) + dpmf_names *= multiple # replicate dpmf names for iteration + mx_objects = [] + # parameters required for creating MX object: names of mc, dpmf + for mc_name, dpmf_name in zip(mc_names, dpmf_names): + mx_objects.append(MX(dpmf=dpmf_name, components=mc_name)) + + # adding MX object to input master + input_master.mx.update(dict(zip(names, mx_objects))) + return input_master.mx.__str__() + + def generate_dpmf_objects(self): + """Generate string representation of DPMF objects in input master. + """ + # generate a list of dpmf names + names = self.generate_gmtk_object_names("dpmf") + # if single dpmf + if self.num_mix_components == 1: + input_master.dpmf[names[0]] = DPMF(1.0) + else: + # uniform probabilities + dpmf_values = str(round(1.0 / self.num_mix_components, + ROUND_NDIGITS)) + # creating DPMF objects and adding them to input master + dpmf_objects = map(DPMF, dpmf_values) + input_master.dpmf.update(dict(zip(names, dpmf_objects))) + + return str(input_master.dpmf) def calc_prob_transition(self, length): """Calculate probability transition from scaled expected length. @@ -348,6 +508,10 @@ def calc_prob_transition(self, length): def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # first values are the ones where segCountDown = 0 therefore # the transitions to segTransition = 2 occur early on + + # see Segway paper + probs_force_transition = array([0.0, 0.0, 1.0]) + card_seg_countdown = self.card_seg_countdown # by default, when segCountDown is high, never transition @@ -386,8 +550,8 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # labels with a maximum seg_countdowns_initial = self.seg_countdowns_initial - - res[0, labels_with_maximum] = self.probs_force_transition + res[0, labels_with_maximum] = probs_force_transition + # res[0, labels_with_maximum] = self.probs_force_transition for label in labels_with_maximum: seg_countdown_initial = seg_countdowns_initial[label] minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] @@ -399,109 +563,38 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa return res - - @staticmethod - def make_dirichlet_name(name): - return "dirichlet_%s" % name - - -class DenseCPTParamSpec(TableParamSpec): - type_name = "DENSE_CPT" - copy_attrs = TableParamSpec.copy_attrs \ - + ["random_state", "len_seg_strength", "use_dinucleotide"] - - def make_table_spec(self, name, table, dirichlet=False): - """ - if dirichlet is True, this table has a corresponding DirichletTable - automatically generated name - """ - ndim = table.ndim - 1 # don't include output dim - - if dirichlet: - extra_rows = ["DirichletTable %s" % self.make_dirichlet_name(name)] - else: - extra_rows = [] - - return TableParamSpec.make_table_spec(self, name, table, ndim, - extra_rows) - - def make_empty_cpt(self): + def generate_dense_cpt_objects(self): + # names of dense cpts + names = ["start_seg", "seg_subseg", "seg_seg", "seg_subseg_subseg", + "segCountDown_seg_segTransition"] num_segs = self.num_segs - - return zeros((num_segs, num_segs)) - - def make_dense_cpt_start_seg_spec(self): - num_segs = self.num_segs - cpt = fill_array(1.0 / num_segs, num_segs) - - return self.make_table_spec("start_seg", cpt) - - def make_dense_cpt_seg_subseg_spec(self): num_subsegs = self.num_subsegs - cpt = fill_array(1.0 / num_subsegs, (self.num_segs, num_subsegs)) - - return self.make_table_spec("seg_subseg", cpt) - - def make_dense_cpt_seg_seg_spec(self): - cpt = make_zero_diagonal_table(self.num_segs) - - return self.make_table_spec("seg_seg", cpt) - - def make_dense_cpt_seg_subseg_subseg_spec(self): - cpt_seg = make_zero_diagonal_table(self.num_subsegs) - cpt = vstack_tile(cpt_seg, self.num_segs, 1) - return self.make_table_spec("seg_subseg_subseg", cpt) + # create required probability tables + start_seg = fill_array(1.0 / num_segs, num_segs) + seg_subseg = fill_array(1.0 / num_subsegs, (num_segs, num_subsegs)) + seg_seg = make_zero_diagonal_table(num_segs) + cpt_seg = make_zero_diagonal_table(num_subsegs) + seg_subseg_subseg = (vstack_tile(cpt_seg, num_segs, 1)) + segCountDown = self.make_dense_cpt_segCountDown_seg_segTransition() + prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg, segCountDown] + # create DenseCPTs and add to input_master.dense_cpt: InlineSection + for i in range(len(names)): + input_master.dense_cpt[names[i]] = np.squeeze(DenseCPT(prob[i]), axis=0) + # adding dirichlet row if necessary + if self.len_seg_strength > 0: + dirichlet_row = ["DirichletTable %s" % self.make_dirichlet_name(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION)] + input_master.dense_cpt[NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION].extra_rows = dirichlet_row + return str(input_master.dense_cpt) def make_dinucleotide_table_row(self): - # simple one-parameter model - gc = self.random_state.uniform() - at = 1 - gc - - a = at / 2 - c = gc / 2 - g = gc - c - t = 1 - a - c - g - - acgt = array([a, c, g, t]) - - # shape: (16,) - return outer(acgt, acgt).ravel() - - def make_dense_cpt_seg_dinucleotide_spec(self): - table = [self.make_dinucleotide_table_row() - for seg_index in range(self.num_segs)] + pass - return self.make_table_spec("seg_dinucleotide", table) + def make_seg_dinucleotide(self): + pass - def make_dense_cpt_segCountDown_seg_segTransition_spec(self): # noqa - cpt = self.make_dense_cpt_segCountDown_seg_segTransition() - - return self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, cpt, - dirichlet=self.len_seg_strength > 0) - - def generate_objects(self): - yield self.make_dense_cpt_start_seg_spec() - yield self.make_dense_cpt_seg_subseg_spec() - yield self.make_dense_cpt_seg_seg_spec() - yield self.make_dense_cpt_seg_subseg_subseg_spec() - yield self.make_dense_cpt_segCountDown_seg_segTransition_spec() - - if self.use_dinucleotide: - yield self.make_dense_cpt_seg_dinucleotide_spec() - - -class DirichletTabParamSpec(TableParamSpec): - type_name = "DIRICHLET_TAB" - copy_attrs = TableParamSpec.copy_attrs \ - + ["len_seg_strength", "num_bases", "card_seg_countdown", - "num_mix_components"] - - def make_table_spec(self, name, table): - dirichlet_name = self.make_dirichlet_name(name) - - return TableParamSpec.make_table_spec(self, dirichlet_name, table, - table.ndim) + def make_dirichlet_name(self, name): + return "dirichlet_{}".format(name) def make_dirichlet_table(self): probs = self.make_dense_cpt_segCountDown_seg_segTransition() @@ -518,336 +611,111 @@ def make_dirichlet_table(self): return pseudocounts - def generate_objects(self): + def generate_dirichlet_objects(self): # XXX: these called functions have confusing/duplicative names if self.len_seg_strength > 0: + lines = ["DIRICHLET_TAB_IN_FILE inline"] + lines.append("1\n") # only one DirichletTab for segCountDown_seg_segTransition + row = ["0"] # index of dirichlet tab + row.append(self.make_dirichlet_name(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION)) + # name of dirichlet tab dirichlet_table = self.make_dirichlet_table() - yield self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, - dirichlet_table) - - -class NameCollectionParamSpec(ParamSpec): - type_name = "NAME_COLLECTION" - header_tmpl = "collection_seg_${track} ${fullnum_subsegs}" - row_tmpl = "mx_${seg}_${subseg}_${track}" - - def generate_objects(self): - num_segs = self.num_segs - num_subsegs = self.num_subsegs - track_groups = self.track_groups - - substitute_header = Template(self.header_tmpl).substitute - substitute_row = Template(self.row_tmpl).substitute - - fullnum_subsegs = num_segs * num_subsegs - - for track_group in track_groups: - head_trackname = track_group[0].name - - # XXX: rename in template: track -> head_trackname - mapping = dict(track=head_trackname, - fullnum_subsegs=fullnum_subsegs) - - rows = [substitute_header(mapping)] - for seg_index in range(num_segs): - seg = "seg%d" % seg_index - - for subseg_index in range(num_subsegs): - subseg = "subseg%d" % subseg_index - mapping = dict(seg=seg, subseg=subseg, - track=head_trackname) - - rows.append(substitute_row(mapping)) - - yield "\n".join(rows) - - -class MeanParamSpec(ParamSpec): - type_name = "MEAN" - object_tmpl = "mean_${seg}_${subseg}_${track}${component_suffix} 1 ${datum}" - jitter_std_bound = 0.2 - - copy_attrs = ParamSpec.copy_attrs + ["means", "num_mix_components", "random_state", "vars"] + dim_shape = [dirichlet_table.ndim] + dim_shape.extend(dirichlet_table.shape) + row.append(" ".join(map(str, dim_shape))) + lines.append(" ".join(row)) + value = array2text(dirichlet_table) + lines.append("{}\n\n".format(value)) + return "\n".join(lines) + else: + return "" + + +class DTParamSpec(ParamSpec): + type_name = "DT" + copy_attrs = ParamSpec.copy_attrs + ["seg_countdowns_initial", + "supervision_type"] - def make_data(self): + def make_segCountDown_tree_spec(self, resourcename): # noqa num_segs = self.num_segs - num_subsegs = self.num_subsegs - means = self.means # indexed by track_index - - # maximum likelihood, adjusted by no more than 0.2*sd - stds = sqrt(self.vars) - - # tile the means of each track (num_segs, num_subsegs times) - means_tiled = vstack_tile(means, num_segs, num_subsegs) - stds_tiled = vstack_tile(stds, num_segs, num_subsegs) - - jitter_std_bound = self.jitter_std_bound - noise = self.random_state.uniform(-jitter_std_bound, - jitter_std_bound, stds_tiled.shape) - - return means_tiled + (stds_tiled * noise) - - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - - for component in range(self.num_mix_components): - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) - - mapping["datum"] = mapping["datum"] - yield substitute(mapping) - - -class CovarParamSpec(ParamSpec): - type_name = "COVAR" - object_tmpl = "covar_${seg}_${subseg}_${track}${component_suffix} 1 ${datum}" - - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components", "vars"] - - def make_data(self): - return vstack_tile(self.vars, self.num_segs, self.num_subsegs) - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - for component in range(self.num_mix_components): - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) - - mapping["datum"] = mapping["datum"] - yield substitute(mapping) - - -class TiedCovarParamSpec(CovarParamSpec): - object_tmpl = "covar_${track}${component_suffix} 1 ${datum}" - - def make_segnames(self): - return ["any"] - - def make_subsegnames(self): - return ["any"] - - -class RealMatParamSpec(ParamSpec): - type_name = "REAL_MAT" - - def generate_objects(self): - yield "matrix_weightscale_1x1 1 1 1.0" - - -class GammaRealMatParamSpec(RealMatParamSpec): - scale_tmpl = "gammascale_${seg}_${subseg}_${track} 1 1 ${datum}" - shape_tmpl = "gammashape_${seg}_${subseg}_${track} 1 1 ${datum}" - - copy_attrs = ParamSpec.copy_attrs \ - + ["means", "random_state", "vars"] - - def generate_objects(self): - means = self.means - vars = self.vars - - substitute_scale = Template(self.scale_tmpl).substitute - substitute_shape = Template(self.shape_tmpl).substitute - - # random start values are equivalent to the random start - # values of a Gaussian: - # - # means = scales * shapes - # vars = shapes * scales**2 - # - # therefore: - scales = vars / means - shapes = (means ** 2) / vars - - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - - scale = jitter(scales[track_index], self.random_state) - yield substitute_scale(dict(datum=scale, **mapping)) - - shape = jitter(shapes[track_index], self.random_state) - yield substitute_shape(dict(datum=shape, **mapping)) - - -class MCParamSpec(ParamSpec): - type_name = "MC" - + seg_countdowns_initial = self.seg_countdowns_initial -class NormMCParamSpec(MCParamSpec): - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components"] + header = ([str(num_segs)] + + [str(num_seg) for num_seg in range(num_segs - 1)] + + ["default"]) - if USE_MFSDG: - # dimensionality component_type name mean covar weights - object_tmpl = "1 COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN" \ - " mc_${distribution}_${seg}_${subseg}_${track}" \ - " mean_${seg}_${subseg}_${track} covar_${seg}_${subseg}_${track}" \ - " matrix_weightscale_1x1" - else: - # dimensionality component_type name mean covar - object_tmpl = "1 COMPONENT_TYPE_DIAG_GAUSSIAN" \ - " mc_${distribution}_${seg}_${subseg}_${track}${component_suffix}" \ - " mean_${seg}_${subseg}_${track}${component_suffix} covar_${track}${component_suffix}" + lines = [" ".join(header)] - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - for component in range(self.num_mix_components): - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) + for seg, seg_countdown_initial in enumerate(seg_countdowns_initial): + lines.append(" -1 %d" % seg_countdown_initial) - yield substitute(mapping) + tree = "\n".join(lines) + return resource_substitute(resourcename)(tree=tree) -class GammaMCParamSpec(MCParamSpec): - object_tmpl = "1 COMPONENT_TYPE_GAMMA mc_gamma_${seg}_${subseg}_${track}" \ - " ${min_track} gammascale_${seg}_${subseg}_${track}" \ - " gammashape_${seg}_${subseg}_${track}" + def make_map_seg_segCountDown_dt_spec(self): # noqa + return self.make_segCountDown_tree_spec("map_seg_segCountDown.dt.tmpl") + def make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec( + self): # noqa + template_name = \ + "map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl" + return self.make_segCountDown_tree_spec(template_name) -class MXParamSpec(ParamSpec): - type_name = "MX" def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - object_tmpl = "1 mx_${seg}_${subseg}_${track} ${num_mix_components} " + yield data_string("map_frameIndex_ruler.dt.txt") + yield self.make_map_seg_segCountDown_dt_spec() + yield self.make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec() # noqa + yield data_string("map_seg_subseg_obs.dt.txt") - # If the number of mixture components is one - if self.num_mix_components == 1: - # Set the dense probabily mass function containing component - # responsibilites to be set to always 1 for 1 component - object_tmpl += "dpmf_always" - # Otherwise set the dense probability mass function based on number - # of components from the GMTK DPMF definition + supervision_type = self.supervision_type + if supervision_type == SUPERVISION_SEMISUPERVISED: + yield data_string( + "map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt") # noqa + elif supervision_type == SUPERVISION_SUPERVISED: + # XXX: does not exist yet + yield data_string( + "map_supervisionLabel_seg_alwaysTrue_supervised.dt.txt") # noqa else: - object_tmpl += "dpmf_${seg}_${subseg}_${track}" - - for component in range(self.num_mix_components): - add = " mc_${distribution}_${seg}_${subseg}_${track}%s" % ( - self.get_template_component_suffix(component)) - object_tmpl += add - substitute = Template(object_tmpl).substitute - - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - mapping["num_mix_components"] = self.num_mix_components - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - yield substitute(mapping) - -class DPMFParamSpec(DenseCPTParamSpec): - type_name = "DPMF" - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components"] + assert supervision_type == SUPERVISION_UNSUPERVISED - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - # If the number of mixture components is one - if self.num_mix_components == 1: - # Create a dense probability mass function of one value of 1 - # to fix the number of mixture components to one - yield "dpmf_always 1 1.0" - # Otherwise - else: - # Create a dense probability mass function of dirichlet constants - # with the same amount of mixture components - object_tmpl = "dpmf_${seg}_${subseg}_${track} ${num_mix_components} "\ - "DirichletConst %s ${weights}" % GAUSSIAN_MIXTURE_WEIGHTS_PSEUDOCOUNT - component_weight = str(round(1.0 / self.num_mix_components, - ROUND_NDIGITS)) - weights = (" " + component_weight) * self.num_mix_components - substitute = Template(object_tmpl).substitute - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - mapping["weights"] = weights - track_index = mapping["track_index"] - mapping["num_mix_components"] = self.num_mix_components - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - yield substitute(mapping) class VirtualEvidenceSpec(ParamSpec): type_name = "VE_CPT" # According to GMTK specification (tksrc/GMTK_VECPT.cc) - # this should be of the format: + # this should be of the format: # CPT_name num_par par_card self_card VE_CPT_FILE # nfs:nfloats nis:nints ... fmt:obsformat ... END object_tmpl = "seg_virtualEvidence 1 %s 2 %s nfs:%s nis:0 fmt:ascii END" copy_attrs = ParamSpec.copy_attrs + ["virtual_evidence", "num_segs"] def make_virtual_evidence_spec(self): - return self.object_tmpl % (self.num_segs, VIRTUAL_EVIDENCE_LIST_FILENAME, self.num_segs) + return self.object_tmpl % ( + self.num_segs, VIRTUAL_EVIDENCE_LIST_FILENAME, self.num_segs) def generate_objects(self): yield self.make_virtual_evidence_spec() - + class InputMasterSaver(Saver): resource_name = "input.master.tmpl" copy_attrs = ["num_bases", "num_segs", "num_subsegs", "num_track_groups", "card_seg_countdown", "seg_countdowns_initial", "seg_table", "distribution", - "len_seg_strength", "resolution", "random_state", "supervision_type", + "len_seg_strength", "resolution", "random_state", + "supervision_type", "use_dinucleotide", "mins", "means", "vars", "gmtk_include_filename_relative", "track_groups", - "num_mix_components", "virtual_evidence"] + "num_mix_components", "virtual_evidence", "tracks"] def make_mapping(self): # the locals of this function are used as the template mapping # use caution before deleting or renaming any variables # check that they are not used in the input.master template + param_spec = ParamSpec(self) num_free_params = 0 - + num_segs = self.num_segs num_subsegs = self.num_subsegs num_track_groups = self.num_track_groups @@ -856,58 +724,39 @@ def make_mapping(self): include_filename = self.gmtk_include_filename_relative dt_spec = DTParamSpec(self) - - if self.len_seg_strength > 0: - dirichlet_spec = DirichletTabParamSpec(self) - else: - dirichlet_spec = "" - - dense_cpt_spec = DenseCPTParamSpec(self) + dirichlet_spec = param_spec.generate_dirichlet_objects() + dense_cpt_spec = param_spec.generate_dense_cpt_objects() # seg_seg num_free_params += fullnum_subsegs * (fullnum_subsegs - 1) # segCountDown_seg_segTransition num_free_params += fullnum_subsegs + name_collection_spec = param_spec.generate_name_collection() distribution = self.distribution if distribution in DISTRIBUTIONS_LIKE_NORM: - mean_spec = MeanParamSpec(self) - if COVAR_TIED: - covar_spec = TiedCovarParamSpec(self) - else: - covar_spec = CovarParamSpec(self) + mean_spec = param_spec.generate_mean_objects() + covar_spec = param_spec.generate_covar_objects() - if USE_MFSDG: - real_mat_spec = RealMatParamSpec(self) - else: - real_mat_spec = "" - - mc_spec = NormMCParamSpec(self) + # TODO: class RealMatParamSpec + # for now this is sufficient because util.USE_MFSDG = False by default + real_mat_spec = "" + mc_spec = param_spec.generate_mc_objects() if COVAR_TIED: num_free_params += (fullnum_subsegs + 1) * num_track_groups else: num_free_params += (fullnum_subsegs * 2) * num_track_groups - elif distribution == DISTRIBUTION_GAMMA: - mean_spec = "" - covar_spec = "" + + # TODO: gamma distribution option - # XXX: another option is to calculate an ML estimate for - # the gamma distribution rather than the ML estimate for the - # mean and converting - real_mat_spec = GammaRealMatParamSpec(self) - mc_spec = GammaMCParamSpec(self) - - num_free_params += (fullnum_subsegs * 2) * num_track_groups else: raise ValueError("distribution %s not supported" % distribution) - mx_spec = MXParamSpec(self) - name_collection_spec = NameCollectionParamSpec(self) + dpmf_spec = param_spec.generate_dpmf_objects() + mx_spec = param_spec.generate_mx_objects() card_seg = num_segs - dpmf_spec = DPMFParamSpec(self) - ve_spec = VirtualEvidenceSpec(self) - return locals() # dict of vars set in this function + return locals() # dict of vars set in this function \ No newline at end of file From 6ee911288f77a55d5bb2506b8cdc65b1dfc5490a Mon Sep 17 00:00:00 2001 From: Sera Riddhi Date: Tue, 7 Jun 2022 17:26:40 -0400 Subject: [PATCH 2/5] changed str() and added citation --- segway/input_master.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index cffd6936..51be8d27 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -475,7 +475,7 @@ def generate_mx_objects(self): # adding MX object to input master input_master.mx.update(dict(zip(names, mx_objects))) - return input_master.mx.__str__() + return str(input_master.mx) def generate_dpmf_objects(self): """Generate string representation of DPMF objects in input master. @@ -509,7 +509,7 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # first values are the ones where segCountDown = 0 therefore # the transitions to segTransition = 2 occur early on - # see Segway paper + # see Segway paper (Hoffman, M., Buske, O., Wang, J. et al. Unsupervised pattern discovery in human chromatin structure through genomic segmentation. Nat Methods 9, 473–476 (2012). https://doi.org/10.1038/nmeth.1937) probs_force_transition = array([0.0, 0.0, 1.0]) card_seg_countdown = self.card_seg_countdown From 4f982d28118c5aa5d12de81fbbdfe0534f2cc8cc Mon Sep 17 00:00:00 2001 From: Sera Riddhi Date: Tue, 7 Jun 2022 17:41:55 -0400 Subject: [PATCH 3/5] removed dereferences --- segway/input_master.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 51be8d27..f4a3aff1 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -381,16 +381,12 @@ def generate_name_collection(self): return str(input_master.name_collection) def make_mean_data(self): - num_segs = self.num_segs - num_subsegs = self.num_subsegs - means = self.means # indexed by track_index - # maximum likelihood, adjusted by no more than 0.2*sd stds = sqrt(self.vars) # tile the means of each track (num_segs, num_subsegs times) - means_tiled = vstack_tile(means, num_segs, num_subsegs) - stds_tiled = vstack_tile(stds, num_segs, num_subsegs) + means_tiled = vstack_tile(self.means, self.num_segs, self.num_subsegs) + stds_tiled = vstack_tile(stds, self.num_segs, self.num_subsegs) jitter_std_bound = self.jitter_std_bound noise = self.random_state.uniform(-jitter_std_bound, @@ -567,15 +563,13 @@ def generate_dense_cpt_objects(self): # names of dense cpts names = ["start_seg", "seg_subseg", "seg_seg", "seg_subseg_subseg", "segCountDown_seg_segTransition"] - num_segs = self.num_segs - num_subsegs = self.num_subsegs # create required probability tables - start_seg = fill_array(1.0 / num_segs, num_segs) - seg_subseg = fill_array(1.0 / num_subsegs, (num_segs, num_subsegs)) - seg_seg = make_zero_diagonal_table(num_segs) - cpt_seg = make_zero_diagonal_table(num_subsegs) - seg_subseg_subseg = (vstack_tile(cpt_seg, num_segs, 1)) + start_seg = fill_array(1.0 / self.num_segs, self.num_segs) + seg_subseg = fill_array(1.0 / self.num_subsegs, (self.num_segs, self.num_subsegs)) + seg_seg = make_zero_diagonal_table(self.num_segs) + cpt_seg = make_zero_diagonal_table(self.num_subsegs) + seg_subseg_subseg = (vstack_tile(cpt_seg, self.num_segs, 1)) segCountDown = self.make_dense_cpt_segCountDown_seg_segTransition() prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg, segCountDown] # create DenseCPTs and add to input_master.dense_cpt: InlineSection @@ -716,10 +710,8 @@ def make_mapping(self): param_spec = ParamSpec(self) num_free_params = 0 - num_segs = self.num_segs - num_subsegs = self.num_subsegs num_track_groups = self.num_track_groups - fullnum_subsegs = num_segs * num_subsegs + fullnum_subsegs = self.num_segs * self.num_subsegs include_filename = self.gmtk_include_filename_relative @@ -756,7 +748,7 @@ def make_mapping(self): dpmf_spec = param_spec.generate_dpmf_objects() mx_spec = param_spec.generate_mx_objects() - card_seg = num_segs + card_seg = self.num_segs ve_spec = VirtualEvidenceSpec(self) return locals() # dict of vars set in this function \ No newline at end of file From 48509941e4e0d7127f7915ae827d614adab0bdc1 Mon Sep 17 00:00:00 2001 From: Sera Riddhi Date: Tue, 7 Jun 2022 17:43:40 -0400 Subject: [PATCH 4/5] removed dead code --- segway/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index f4a3aff1..e4e5f75b 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -547,7 +547,7 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # labels with a maximum seg_countdowns_initial = self.seg_countdowns_initial res[0, labels_with_maximum] = probs_force_transition - # res[0, labels_with_maximum] = self.probs_force_transition + for label in labels_with_maximum: seg_countdown_initial = seg_countdowns_initial[label] minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] From 3ff484e2df74713bdae2233e643dc41a2dbeba41 Mon Sep 17 00:00:00 2001 From: Riddhi Sera <62648836+riddhisera@users.noreply.github.com> Date: Tue, 7 Jun 2022 17:56:11 -0400 Subject: [PATCH 5/5] Create __init__.py --- segway/gmtk/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 segway/gmtk/__init__.py diff --git a/segway/gmtk/__init__.py b/segway/gmtk/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/segway/gmtk/__init__.py @@ -0,0 +1 @@ +