From c7c4ef2c01443bb155f85bb27162031e05d85cc1 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 30 Jul 2020 11:19:35 -0400 Subject: [PATCH 01/72] added gmtk.py --- segway/gmtk.py | 587 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 587 insertions(+) create mode 100644 segway/gmtk.py diff --git a/segway/gmtk.py b/segway/gmtk.py new file mode 100644 index 00000000..0294d245 --- /dev/null +++ b/segway/gmtk.py @@ -0,0 +1,587 @@ +from collections import OrderedDict + +MC_TYPE_DIAG = "COMPONENT_TYPE_DIAG_GAUSSIAN" +MC_TYPE_GAMMA = "COMPONENT_TYPE_GAMMA" +MC_TYPE_MISSING = "COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN" +COPY_PARENT = "internal:copyParent" + + +class DenseCPT: + """ + A single DenseCPT object. + Attributes: + parent_card + cardinality + prob + """ + def __init__(self, name, cardinality, prob, parent_card=-1): + """ + name: str + parent_card: str/int or list[str/int] + cardinality: str/int + prob: list[float] + """ + self.name = name + if parent_card != -1: + if not isinstance(parent_card, list): + self.parent_card = [parent_card] + else: + self.parent_card = parent_card + else: + self.parent_card = -1 + self.cardinality = cardinality + # TODO array + self.prob = prob + + def generate(self, index): + """ + Returns string format of DenseCPT to be printed into input.master + file (new lines to be added). 
+ index: int + index of the denseCPT + """ + lines = [] + line = [] + line.append(str(index)) + line.append(self.name) + if self.parent_card == -1: # no parents + num_parents = 0 + parent_card_str = [""] + else: + num_parents = len(self.parent_card) + parent_card_str = [] + for i in range(num_parents): + parent_card_str.append(str(self.parent_card[i])) + line.append(str(num_parents)) + if self.parent_card != -1: + line.extend(parent_card_str) + line.append(str(self.cardinality)) + lines.append(" ".join(line)) + lines.append(self.generate_prob(self.prob) + "\n") + lines.append("\n") + return "\n".join(lines) + + def generate_prob(self, prob): + """ + Generates format of probabilities for single DenseCPT. + :param prob: list[float] + probabilities of DenseCPT + :return: string format to be used by DenseCPT.generate() + """ + line = [] + if isinstance(prob[0], float): + prob_str = [] + for i in range(len(prob)): + prob_str.append(str(prob[i])) + return " ".join(prob_str) + else: + + for i in range(len(prob)): + line.append(self.generate_prob(prob[i])) + # TODO check if it works without that one line gap + return "\n".join(line) + + +class DeterministicCPT: + """ + A single DeterministicCPT objects. + Attributes: + parent_card: str/int or list[str/int] + cardinality: str/int + name_of_existing_DT: str + """ + + def __init__(self, name, parent_card, cardinality, dt): + """ + name: str + parent_card: str/int or list[str/int] + cardinality: str/int + dt: str + """ + self.name = name + if not isinstance(parent_card, list): + self.parent_card = [parent_card] + else: + self.parent_card = parent_card + + self.cardinality = cardinality + self.dt = dt + + def generate(self, index): + """ + :return: String format of DeterministicCPT to be printed into + input.master + file (new lines to be added). 
+ index: int + index of DeterministicCPT + """ + lines = [] + line = [] + line.append(str(index)) + line.append(self.name) + lines.append(" ".join(line)) + lines.append(str(len(self.parent_card))) + num_parents_cardinalities = [] + num_parents_cardinalities.extend(self.parent_card) + num_parents_cardinalities.append(self.cardinality) + lines.append(" ".join(num_parents_cardinalities)) + lines.append(self.dt) + lines.append("\n") + + return "\n".join(lines) + + +class NameCollection: + """ + A single NameCollection object. + Attributes: + names: list[str] or str + """ + + def __init__(self, name, *args): + """ + name: str + name of collection + :param args: str + name in name collection + """ + self.name = name + self.names_in_col = [] + for name in args: + if isinstance(name, list): + self.names_in_col.extend(name) + else: + self.names_in_col.append(name) + + def generate(self, index): + """ + Returns string format of NameCollection objects to be printed into the + input.master file (new lines to be added) + index: int + index of name collection + """ + line = [] + line.append(str(index)) + line.append(self.name) + line.append(str(len(self.names_in_col))) + + lines = [] + lines.append(" ".join(line)) + lines.append("\n".join(self.names_in_col)) + lines.append("\n") + + return "\n".join(lines) + + +class Mean: + """ + A single Mean object. + name: str + value: list[float] or float + Mean values of the Mean object. + """ + + def __init__(self, name, *args): + """ + name: str + name of mean object + :param args: float + mean values + """ + self.name = name + self.mean_values = [] + for val in args: + self.mean_values.append(val) + + def generate(self, index): + """ + Returns the string format of the Mean object to be printed into the + input.master file (new lines to be added). 
+ index: int + index of mean object + :return: + """ + line = [] + line.append(str(index)) + line.append(self.name) + line.append(str(len(self.mean_values))) + mean_str = [] + for i in self.mean_values: + mean_str.append(str(i)) + line.extend(mean_str) + line.append("\n") + + return " ".join(line) + + +class MC: + """ + A single all MC objects. + Value: list + mc = MC() + mc1 = [26, 0, "mean_0", "covar_0"] + = [, , , ] + """ + def __init__(self, name, dim, type, mean=Mean('sample_mean'), + covar=Mean('sample_covar'), weights=[], gamma_shape="", + gamma_scale=""): + """ + name: str + name of MC object + :param dim: str/int + dimensionality of mc + :param type: str/int + type of mc + :param mean: Mean + mean of mc + :param covar: Covar + covar of mc + """ + self.name = name + self.mean = mean + self.covar = covar + self.dim = dim + self.type = type + # TODO + self.weights = weights + self.gamma_shape = gamma_shape + self.gamma_scale = gamma_scale + + def generate(self, index): + """ + Returns string format of MC object to be printed into the input.master + file (new lines to be added). + index: int + index of mc object + :return: + """ + line = [] + line.append(str(index)) + line.append(str(self.dim)) + + if self.type == MC_TYPE_GAMMA: + # TODO min track + line.append(str(self.type)) + line.append(self.gamma_scale) + line.append(self.gamma_shape) + + elif self.type == MC_TYPE_MISSING: + line.append(str(self.type)) + line.append(self.name) + line.append(self.mean.name) + line.append(self.covar.name) + line.append("matrix_weightscale_1x1") + line.append("\n") + else: # default and for MC_TYPE_DIAG + # TODO component_suffix + line.append(str(self.type)) + line.append(self.name) + line.append(self.mean.name) + line.append(self.covar.name) + line.append("\n") + + return " ".join(line) + + +class MX: + """ + A single MX object. 
+ """ + def __init__(self, name, dim, dpmf, components): + """ + name: str + name of MX object + dimensionality: int + dpmf: DPMF + components: list[mc] or mc (component) + """ + self.name = name + self.dim = dim + if not isinstance(components, list): + components = [components] + if len(dpmf.dpmf_values) != len(components): + raise ValueError("Dimension of DPMF object must be equal " + + "to number of components of MX.") + self.comp = components + self.dpmf = dpmf + + def generate(self, index): + """ + Returns string format of MX object to be printed into the input.master + file (new lines to be added). + index: int + index of mx object + :return: + """ + line = [] + line.append(str(index)) + line.append(str(self.dim)) + line.append(self.name) + line.append(str(len(self.comp))) # num components + line.append(self.dpmf.name) + comp_names = [] + for comp in self.comp: + comp_names.append(comp.name) + line.extend(comp_names) + line.append("\n") + + return " ".join(line) + + +class Covar: + """ + A single Covar object. + """ + + def __init__(self, name, *args): + """ + name: str + name of MX object + :param args: covar values + """ + self.name = name + self.covar_values = [] + for val in args: + self.covar_values.append(val) + + def generate(self, index): + """ + Returns string format of Covar object to be printed into the + input.master + file (new lines to be added). + index: int + index of Covar object + :return: + """ + line = [] + line.append(str(index)) + line.append(self.name) + line.append(str(len(self.covar_values))) + covar_str = [] + for i in self.covar_values: + covar_str.append(str(i)) + line.extend(covar_str) + line.append("\n") + + return " ".join(line) + + +class DPMF: + """ + A single DPMF object. 
+ """ + + def __init__(self, name, *args): + """ + name: str + name of dpmf object + :param args: dpmf values summing to 1 + + """ + self.name = name + self.dpmf_values = [] + for val in args: + self.dpmf_values.append(val) + print("dpmf_val", self.dpmf_values) + if sum(self.dpmf_values) != 1.0: + self.dpmf_values = [] + raise ValueError("DPMF values must sum to 1.0.") + + def generate(self, index): + """ + Returns string format of DPMF object to be printed into the + input.master + file (new lines to be added). + :return: + """ + line = [] + line.append(str(index)) + line.append(self.name) + line.append(str(len(self.dpmf_values))) + dpmf_str = [] + for i in self.dpmf_values: + dpmf_str.append(str(i)) + line.extend(dpmf_str) + line.append("\n") + return " ".join(line) + + +class Object: + + def __new__(cls, _name, content, _kind): + pass + + def __init__(self, name, content, kind): + pass + + +class InputMaster: + """ + Main class to produce the input.master file. + Attributes: + mean: OrderedDict + covar: OrderedDict + dense: OrderedDict + deterministic: OrderedDict + dpmf: OrderedDict + mc: OrderedDict + mx: OrderedDict + name_collection: OrderedDict + key: name of object + value: GMTKObject instance + """ + + def __init__(self): + self.mean = OrderedDict() + self.covar = OrderedDict() + self.dense = OrderedDict() + self.deterministic = OrderedDict() + self.dpmf = OrderedDict() + self.mc = OrderedDict() + self.mx = OrderedDict() + self.name_collection = OrderedDict() + + def update(self, gmtk_obj): + """ + gmtk_obj: list or single gmtk object + List of GMTK objects + """ + if not isinstance(gmtk_obj, list): + gmtk_obj = [gmtk_obj] + for obj in gmtk_obj: + if not (isinstance(obj, Mean) or isinstance(obj, Covar) or + isinstance(obj, DeterministicCPT) or isinstance(obj, + DenseCPT) + or isinstance(obj, DPMF) or isinstance(obj, MC) + or isinstance(obj, MX) or isinstance(obj, NameCollection)): + + raise ValueError("Object is not an allowed GMTK type.") + + for obj in 
gmtk_obj: # all objects are of allowed types + + name = obj.name + + if isinstance(obj, Mean): + self.mean[name] = obj + if isinstance(obj, Covar): + self.covar[name] = obj + if isinstance(obj, DeterministicCPT): + self.deterministic[name] = obj + if isinstance(obj, DenseCPT): + self.dense[name] = obj + if isinstance(obj, DPMF): + self.dpmf[name] = obj + if isinstance(obj, MC): + self.mc[name] = obj + if isinstance(obj, MX): + self.mx[name] = obj + if isinstance(obj, NameCollection): + self.name_collection[name] = obj + + def generate_mean(self): + if len(self.mean) == 0: + return [] + + means = ["MEAN_IN_FILE inline"] + means.append(str(len(self.mean)) + "\n") + for key_index in range(len(list(self.mean))): + means.append( + self.mean[list(self.mean)[key_index]].generate(key_index)) + return "\n".join(means) + + def generate_covar(self): + if len(self.covar) == 0: + return [] + + covars = ["COVAR_IN_FILE inline"] + covars.append(str(len(self.covar)) + "\n") + + for key_index in range(len(list(self.covar))): + covars.append( + self.covar[list(self.covar)[key_index]].generate(key_index)) + return "\n".join(covars) + + def generate_dense(self): + if len(self.dense) == 0: + return [] + + dense_cpts = ["DENSE_CPT_IN_FILE inline"] + dense_cpts.append(str(len(self.dense)) + "\n") + + for key_index in range(len(list(self.dense))): + dense_cpts.append( + self.dense[list(self.dense)[key_index]].generate(key_index)) + return "\n".join(dense_cpts) + + def generate_deterministic(self): + if len(self.deterministic) == 0: + return [] + + det_cpts = ["DETERMINISTIC_CPT_IN_FILE inline"] + det_cpts.append(str(len(self.deterministic)) + "\n") + + for key_index in range(len(list(self.deterministic))): + det_cpts.append(self.deterministic[ + list(self.deterministic)[key_index]].generate( + key_index)) + return "\n".join(det_cpts) + + def generate_dpmf(self): + if len(self.dpmf) == 0: + return [] + + dpmfs = ["DPMF_IN_FILE inline"] + dpmfs.append(str(len(self.dpmf)) + "\n") + + for 
key_index in range(len(list(self.dpmf))): + dpmfs.append( + self.dpmf[list(self.dpmf)[key_index]].generate(key_index)) + return "\n".join(dpmfs) + + def generate_mc(self): + if len(self.mc) == 0: + return [] + + mcs = ["MC_IN_FILE inline"] + mcs.append(str(len(self.mc)) + "\n") + + for key_index in range(len(list(self.mc))): + mcs.append(self.mc[list(self.mc)[key_index]].generate(key_index)) + + return "\n".join(mcs) + + def generate_mx(self): + if len(self.mx) == 0: + return [] + + mxs = ["MX_IN_FILE inline"] + mxs.append(str(len(self.mx)) + "\n") + + for key_index in range(len(list(self.mx))): + mxs.append(self.mx[list(self.mx)[key_index]].generate(key_index)) + return "\n".join(mxs) + + def generate_name_col(self): + if len(self.name_collection) == 0: + return [] + + collections = ["NAME_COLLECTION_IN_FILE inline"] + collections.append(str(len(self.name_collection)) + "\n") + + for key_index in range(len(list(self.name_collection))): + collections.append(self.name_collection[list(self.name_collection)[ + key_index]].generate(key_index)) + + return "\n".join(collections) + + def __str__(self): + attrs_gen = [self.generate_name_col(), self.generate_deterministic(), + self.generate_dense(), self.generate_mean(), + self.generate_covar(), self.generate_dpmf(), + self.generate_mc(), self.generate_mx()] + s = [] + for obj in attrs_gen: + s.append("".join(obj)) + + return "".join(s) + From ccbe39e5ae192c2b775090346a553e3d962d95f1 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 30 Jul 2020 11:46:26 -0400 Subject: [PATCH 02/72] Update input_master.py --- segway/input_master.py | 839 +++++++++++++++++++++-------------------- 1 file changed, 438 insertions(+), 401 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 371118c2..a977042a 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python from __future__ import 
absolute_import, division - """input_master.py: write input master files """ @@ -26,6 +24,10 @@ SUPERVISION_SUPERVISED, USE_MFSDG, VIRTUAL_EVIDENCE_LIST_FILENAME) +from .gmtk import InputMaster, NameCollection, DenseCPT, \ + DeterministicCPT, DPMF, MC, MX, Covar, Mean + + # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of # scalars and vectors in the parameter output. By default in newer (> 1.14) # versions printed output "giv[es] the shortest unique representation". @@ -69,6 +71,7 @@ # TODO[PY2-EOL]: remove ROUND_NDIGITS = 12 +input_master = InputMaster() def vstack_tile(array_like, *reps): reps = list(reps) + [1] @@ -99,7 +102,7 @@ def make_spec(name, iterable): all_lines = header_lines + indexed_items - # In Python 2, convert from unicode to bytes to prevent + # In Python 2, convert from unicode to bytes to prevent # __str__method from being called twice # Specifically in the string template standard library provided by Python # 2, there is a call to a string escape sequence + tuple, e.g.: @@ -157,6 +160,82 @@ def jitter_cell(cell, random_state): jitter = vectorize(jitter_cell) +def generate_gmtk_obj_names(obj, track_names, num_segs, num_subsegs, + distribution, num_mix_components): + """ + Generate GMTK object names for the types: + NameCollection: "col" + entries in NameCollection: "mx_name" + Covar: "covar", "tied_covar" + Mean: "mean" + MX: "mx" + MC: "mc_diag", "mc_gamma", "mc_missing", "gammascale" + DPMF: "dpmf" + :param obj: str: type of gmtk object for which names must be generated + :param: track_names: list[str]: list of all track names + :param: num_segs: int: number of segs + :param: num_subsegs: int: number of subsegs + :param: distribution: str: distribution + :param: number of mixture components + :return: + """ + allowed_types = ["mx", "mc_diag", "mc_gamma", "mc_missing", "mean", + "covar", "col", "mx_name", "dpmf", "gammascale", + "gammashape", "tied_covar"] + if not obj in allowed_types: + raise 
ValueError("Undefined GMTK object type: {}".format(obj)) + + names = [] + if obj == "covar": + for name in track_names: + names.append("covar_{}".format(name)) + + + # todo check component suffix + elif obj == "tied_covar": + for name in track_names: + names.append("covar_{}".format(name)) + + elif obj == "col": + for name in track_names: + names.append("collection_seg_{}".format(name)) + + elif obj == "mx_name": + for name in track_names: + for i in range(num_segs): + for j in range(num_subsegs): + line = "mx_seg{}_subseg{}_{}".format(i, j, name) + names.append(line) + + elif obj == "dpmf" and num_mix_components == 1: + return ["dpmf_always"] + + else: + for i in range(num_segs): + for j in range(num_subsegs): + for name in track_names: + # TODO check component suffix diff + if obj == "mc_diag": + line = "mc_{}_seg{}_subseg{}_{}".format(distribution, + i, j, name) + # TODO + + # if obj == "mc_gamma": + # covered in general name generation + # line = "{}_{}_seg{}_subseg{}_{}".format(obj, + # distribution, i, j, name) + + # TODO + elif obj == "mc_missing": + line = "" + + else: + line = "{}_seg{}_subseg{}_{}".format(obj, i, j, name) + names.append(line) + + return names + + class ParamSpec(object): """ base class for parameter specifications used in input.master files @@ -164,12 +243,34 @@ class ParamSpec(object): type_name = None object_tmpl = None copy_attrs = ["distribution", "mins", "num_segs", "num_subsegs", - "num_track_groups", "track_groups", "num_mix_components"] + "num_track_groups", "track_groups", "num_mix_components", + "means", "vars", "num_mix_components", "random_state", "tracks"] + jitter_std_bound = 0.2 + track_names = [] def __init__(self, saver): # copy all variables from saver that it copied from Runner # XXX: override in subclasses to only copy subset copy_attrs(saver, self, self.copy_attrs) + self.track_names = [] + #print(self.tracks) + for track in self.tracks: + # print(track) + self.track_names.append(track.name) + #print("track_names", 
self.track_names) + + def make_segnames(self): + return format_indexed_strs("seg", self.num_segs) + + def make_subsegnames(self): + return format_indexed_strs("subseg", self.num_subsegs) + + def make_data(self): + """ + override this in subclasses + returns: container indexed by (seg_index, subseg_index, track_index) + """ + return None def get_track_lt_min(self, track_index): """ @@ -195,12 +296,6 @@ def get_track_lt_min(self, track_index): assert min_track_f32 - float32(ABSOLUTE_FUDGE) != min_track_f32 return min_track - ABSOLUTE_FUDGE - def make_segnames(self): - return format_indexed_strs("seg", self.num_segs) - - def make_subsegnames(self): - return format_indexed_strs("subseg", self.num_subsegs) - def get_template_component_suffix(self, component_number): """Returns the subsitution for the component suffix in the GMTK model template. Empty if there is only one component""" @@ -213,6 +308,7 @@ def generate_tmpl_mappings(self): # need segnames because in the tied covariance case, the # segnames are replaced by "any" (see .make_covar_spec()), # and only one mapping is produced + #print("gen tmpl mapping used") num_subsegs = self.num_subsegs track_groups = self.track_groups @@ -236,15 +332,7 @@ def generate_tmpl_mappings(self): track_index=track_group_index, index=track_offset, distribution=self.distribution) - - def make_data(self): - """ - override this in subclasses - - returns: container indexed by (seg_index, subseg_index, track_index) - """ - return None - + def generate_objects(self): """ returns: iterable of strs containing GMTK parameter objects starting @@ -270,6 +358,205 @@ def generate_objects(self): def __str__(self): return make_spec(self.type_name, self.generate_objects()) + def generate_name_collection(self): + # generate list of collection names + collection_names = generate_gmtk_obj_names(obj="col", + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + 
num_mix_components=self.num_mix_components) + # generate list of all names in NameCollections + names = generate_gmtk_obj_names("mx_name", + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + num_tracks = len(self.track_names) + len_name_group = int(len(names) / num_tracks) + # names grouped by collection + name_groups = [names[i:i + len_name_group] for i in range(0, len(names), len_name_group)] + # create NameCollection objects + for group_index in range(len(name_groups)): + name_col = NameCollection(collection_names[group_index], + name_groups[group_index]) + input_master.update(name_col) + + return input_master.generate_name_col() + + def make_mean_data(self): + num_segs = self.num_segs + num_subsegs = self.num_subsegs + means = self.means # indexed by track_index + + # maximum likelihood, adjusted by no more than 0.2*sd + stds = sqrt(self.vars) + + # tile the means of each track (num_segs, num_subsegs times) + means_tiled = vstack_tile(means, num_segs, num_subsegs) + stds_tiled = vstack_tile(stds, num_segs, num_subsegs) + + jitter_std_bound = self.jitter_std_bound + noise = self.random_state.uniform(-jitter_std_bound, + jitter_std_bound, stds_tiled.shape) + + return means_tiled + (stds_tiled * noise) + + def generate_mean_objects(self): + # generate list of names of Mean objects + names = generate_gmtk_obj_names("mean", + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + means = self.make_mean_data().tolist() + # TODO change array rep + # create Mean objects + for i in range(len(names)): + mean_obj = Mean(names[i], means[i]) + input_master.update(mean_obj) + + return input_master.generate_mean() + + def generate_covar_objects(self): + if COVAR_TIED: + names = generate_gmtk_obj_names("tied_covar", + track_names=self.track_names, 
num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + else: + names = generate_gmtk_obj_names("covar", + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + covars = self.vars.tolist() # list of variance values + # create Covar objects + for i in range(len(names)): + covar_obj = Covar(names[i], covars[i]) + input_master.update(covar_obj) + + return input_master.generate_covar() + + def generate_real_mat_objects(self): + pass + + def generate_mc_objects(self): + # if distribution is norm or asinh_norm + if self.distribution in DISTRIBUTIONS_LIKE_NORM: + if USE_MFSDG: + # TODO + option = "mc_missing" + else: + option = "mc_diag" + # generate MC object names + names = generate_gmtk_obj_names(option, + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + covars = list(input_master.covar.values())* (self.num_segs * self.num_subsegs) # replicate covar values + # create MC objects + for i in range(len(names)): + mc_obj = MC(name=names[i], dim=1, type="COMPONENT_TYPE_DIAG_GAUSSIAN", + mean=list(input_master.mean.values())[i], covar=covars[i]) + input_master.update(mc_obj) + + # if distribution is gamma + elif self.distribution == DISTRIBUTION_GAMMA: + option = "mc_gamma" + names = generate_gmtk_obj_names(option, + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + # generate gammashape and gammascale names for MC objects + gamma_scale = generate_gmtk_obj_names("gammascale", + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + + gamma_shape = 
generate_gmtk_obj_names("gammashape", + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + # create MC objects + for i in range(len(names)): + mc_obj = MC(name=names[i], dim=1, type="COMPONENT_TYPE_GAMMA", + gamma_shape=gamma_shape[i], gamma_scale=gamma_scale[i]) + input_master.update(mc_obj) + return input_master.generate_mc() + + def generate_mx_objects(self): + # generate list of MX names + names = generate_gmtk_obj_names("mx", + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + + mc_obj = list(input_master.mc.values()) + dpmf_obj = list(input_master.dpmf.values()) + multiple = int(len(names)/len(dpmf_obj)) + dpmf_obj *= multiple # replicate dpmf obj as MX obj components + # create MX objects + for i in range(len(names)): + mx_obj = MX(name=names[i], dim=1, dpmf=dpmf_obj[i], + components=mc_obj[i]) + input_master.update(mx_obj) + return input_master.generate_mx() + + def generate_dpmf_objects(self): + # generate a list of dpmf names + names = generate_gmtk_obj_names("dpmf", + track_names=self.track_names, num_segs=self.num_segs, + num_subsegs=self.num_subsegs, distribution=self.distribution, + num_mix_components=self.num_mix_components) + # if single dpmf + if self.num_mix_components == 1: + dpmf_obj = DPMF(names[0], 1.0) + input_master.update(dpmf_obj) + else: + # uniform probabilities + dpmf_values = str(round(1.0 / self.num_mix_components, + ROUND_NDIGITS)) + # create dpmf objects + for i in range(len(names)): + dpmf_obj = DPMF(names[i], dpmf_values[i]) + input_master.update(dpmf_obj) + return input_master.generate_dpmf() + + def generate_ve(self): + # TODO + pass + + def generate_dense_cpt_objects(self): + names = ["start_seg", "seg_subseg", "seg_seg", "seg_subseg_subseg"] + card = [self.num_segs, self.num_subsegs, self.num_segs, 
self.num_subsegs] + parent_card = [-1, self.num_segs, self.num_segs, [self.num_segs, + self.num_subsegs]] + start_seg = [1.0 / self.num_segs, self.num_segs] + seg_subseg = fill_array(1.0 / self.num_subsegs, (self.num_segs, + self.num_subsegs)).tolist() + seg_seg = make_zero_diagonal_table(self.num_segs) + cpt_seg = make_zero_diagonal_table(self.num_subsegs) + seg_subseg_subseg = (vstack_tile(cpt_seg, self.num_segs, 1)).tolist() + prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg] + # TODO last dense cpt segTransition + for i in range(len(names)): + dense_cpt = DenseCPT(name=names[i], parent_card=parent_card[i], + cardinality=card[i], prob=prob[i]) + input_master.update(dense_cpt) + return input_master.generate_dense() + + def make_dinucleotide_table_row(self): + pass + + def make_seg_dinucleotide(self): + pass + + def make_segCountDown_seg_segTransition(self): + name = "segCountDown_seg_segTransition" + # parent_card = + # card = + pass + + + def generate_objects(self): + pass + class DTParamSpec(ParamSpec): type_name = "DT" @@ -317,6 +604,24 @@ def generate_objects(self): assert supervision_type == SUPERVISION_UNSUPERVISED +class VirtualEvidenceSpec(ParamSpec): + type_name = "VE_CPT" + + # According to GMTK specification (tksrc/GMTK_VECPT.cc) + # this should be of the format: + # CPT_name num_par par_card self_card VE_CPT_FILE + # nfs:nfloats nis:nints ... fmt:obsformat ... 
END + object_tmpl = "seg_virtualEvidence 1 %s 2 %s nfs:%s nis:0 fmt:ascii END" + copy_attrs = ParamSpec.copy_attrs + ["virtual_evidence", "num_segs"] + + def make_virtual_evidence_spec(self): + return self.object_tmpl % (self.num_segs, VIRTUAL_EVIDENCE_LIST_FILENAME, self.num_segs) + + def generate_objects(self): + yield self.make_virtual_evidence_spec() + + + class TableParamSpec(ParamSpec): copy_attrs = ParamSpec.copy_attrs \ + ["resolution", "card_seg_countdown", "seg_table", @@ -405,257 +710,6 @@ def make_dirichlet_name(name): return "dirichlet_%s" % name -class DenseCPTParamSpec(TableParamSpec): - type_name = "DENSE_CPT" - copy_attrs = TableParamSpec.copy_attrs \ - + ["random_state", "len_seg_strength", "use_dinucleotide"] - - def make_table_spec(self, name, table, dirichlet=False): - """ - if dirichlet is True, this table has a corresponding DirichletTable - automatically generated name - """ - ndim = table.ndim - 1 # don't include output dim - - if dirichlet: - extra_rows = ["DirichletTable %s" % self.make_dirichlet_name(name)] - else: - extra_rows = [] - - return TableParamSpec.make_table_spec(self, name, table, ndim, - extra_rows) - - def make_empty_cpt(self): - num_segs = self.num_segs - - return zeros((num_segs, num_segs)) - - def make_dense_cpt_start_seg_spec(self): - num_segs = self.num_segs - cpt = fill_array(1.0 / num_segs, num_segs) - - return self.make_table_spec("start_seg", cpt) - - def make_dense_cpt_seg_subseg_spec(self): - num_subsegs = self.num_subsegs - cpt = fill_array(1.0 / num_subsegs, (self.num_segs, num_subsegs)) - - return self.make_table_spec("seg_subseg", cpt) - - def make_dense_cpt_seg_seg_spec(self): - cpt = make_zero_diagonal_table(self.num_segs) - - return self.make_table_spec("seg_seg", cpt) - - def make_dense_cpt_seg_subseg_subseg_spec(self): - cpt_seg = make_zero_diagonal_table(self.num_subsegs) - cpt = vstack_tile(cpt_seg, self.num_segs, 1) - - return self.make_table_spec("seg_subseg_subseg", cpt) - - def 
make_dinucleotide_table_row(self): - # simple one-parameter model - gc = self.random_state.uniform() - at = 1 - gc - - a = at / 2 - c = gc / 2 - g = gc - c - t = 1 - a - c - g - - acgt = array([a, c, g, t]) - - # shape: (16,) - return outer(acgt, acgt).ravel() - - def make_dense_cpt_seg_dinucleotide_spec(self): - table = [self.make_dinucleotide_table_row() - for seg_index in range(self.num_segs)] - - return self.make_table_spec("seg_dinucleotide", table) - - def make_dense_cpt_segCountDown_seg_segTransition_spec(self): # noqa - cpt = self.make_dense_cpt_segCountDown_seg_segTransition() - - return self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, cpt, - dirichlet=self.len_seg_strength > 0) - - def generate_objects(self): - yield self.make_dense_cpt_start_seg_spec() - yield self.make_dense_cpt_seg_subseg_spec() - yield self.make_dense_cpt_seg_seg_spec() - yield self.make_dense_cpt_seg_subseg_subseg_spec() - yield self.make_dense_cpt_segCountDown_seg_segTransition_spec() - - if self.use_dinucleotide: - yield self.make_dense_cpt_seg_dinucleotide_spec() - - -class DirichletTabParamSpec(TableParamSpec): - type_name = "DIRICHLET_TAB" - copy_attrs = TableParamSpec.copy_attrs \ - + ["len_seg_strength", "num_bases", "card_seg_countdown", - "num_mix_components"] - - def make_table_spec(self, name, table): - dirichlet_name = self.make_dirichlet_name(name) - - return TableParamSpec.make_table_spec(self, dirichlet_name, table, - table.ndim) - - def make_dirichlet_table(self): - probs = self.make_dense_cpt_segCountDown_seg_segTransition() - - # XXX: the ratio is not exact as num_bases is not the same as - # the number of base-base transitions. 
It is surely close - # enough, though - total_pseudocounts = self.len_seg_strength * self.num_bases - divisor = self.card_seg_countdown * self.num_segs - pseudocounts_per_row = total_pseudocounts / divisor - - # astype(int) means flooring the floats - pseudocounts = (probs * pseudocounts_per_row).astype(int) - - return pseudocounts - - def generate_objects(self): - # XXX: these called functions have confusing/duplicative names - if self.len_seg_strength > 0: - dirichlet_table = self.make_dirichlet_table() - yield self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, - dirichlet_table) - - -class NameCollectionParamSpec(ParamSpec): - type_name = "NAME_COLLECTION" - header_tmpl = "collection_seg_${track} ${fullnum_subsegs}" - row_tmpl = "mx_${seg}_${subseg}_${track}" - - def generate_objects(self): - num_segs = self.num_segs - num_subsegs = self.num_subsegs - track_groups = self.track_groups - - substitute_header = Template(self.header_tmpl).substitute - substitute_row = Template(self.row_tmpl).substitute - - fullnum_subsegs = num_segs * num_subsegs - - for track_group in track_groups: - head_trackname = track_group[0].name - - # XXX: rename in template: track -> head_trackname - mapping = dict(track=head_trackname, - fullnum_subsegs=fullnum_subsegs) - - rows = [substitute_header(mapping)] - for seg_index in range(num_segs): - seg = "seg%d" % seg_index - - for subseg_index in range(num_subsegs): - subseg = "subseg%d" % subseg_index - mapping = dict(seg=seg, subseg=subseg, - track=head_trackname) - - rows.append(substitute_row(mapping)) - - yield "\n".join(rows) - - -class MeanParamSpec(ParamSpec): - type_name = "MEAN" - object_tmpl = "mean_${seg}_${subseg}_${track}${component_suffix} 1 ${datum}" - jitter_std_bound = 0.2 - - copy_attrs = ParamSpec.copy_attrs + ["means", "num_mix_components", "random_state", "vars"] - - def make_data(self): - num_segs = self.num_segs - num_subsegs = self.num_subsegs - means = self.means # indexed by track_index - - # maximum 
likelihood, adjusted by no more than 0.2*sd - stds = sqrt(self.vars) - - # tile the means of each track (num_segs, num_subsegs times) - means_tiled = vstack_tile(means, num_segs, num_subsegs) - stds_tiled = vstack_tile(stds, num_segs, num_subsegs) - - jitter_std_bound = self.jitter_std_bound - noise = self.random_state.uniform(-jitter_std_bound, - jitter_std_bound, stds_tiled.shape) - - return means_tiled + (stds_tiled * noise) - - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - - for component in range(self.num_mix_components): - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) - - mapping["datum"] = mapping["datum"] - yield substitute(mapping) - - -class CovarParamSpec(ParamSpec): - type_name = "COVAR" - object_tmpl = "covar_${seg}_${subseg}_${track}${component_suffix} 1 ${datum}" - - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components", "vars"] - - def make_data(self): - return vstack_tile(self.vars, self.num_segs, self.num_subsegs) - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - for component in range(self.num_mix_components): - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if data 
is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) - - mapping["datum"] = mapping["datum"] - yield substitute(mapping) - - -class TiedCovarParamSpec(CovarParamSpec): - object_tmpl = "covar_${track}${component_suffix} 1 ${datum}" - - def make_segnames(self): - return ["any"] - - def make_subsegnames(self): - return ["any"] - - class RealMatParamSpec(ParamSpec): type_name = "REAL_MAT" @@ -696,141 +750,127 @@ def generate_objects(self): shape = jitter(shapes[track_index], self.random_state) yield substitute_shape(dict(datum=shape, **mapping)) +class TableParamSpec(ParamSpec): + copy_attrs = ParamSpec.copy_attrs \ + + ["resolution", "card_seg_countdown", "seg_table", + "seg_countdowns_initial"] -class MCParamSpec(ParamSpec): - type_name = "MC" + # see Segway paper + probs_force_transition = array([0.0, 0.0, 1.0]) + def make_table_spec(self, name, table, ndim, extra_rows=[]): + header_rows = [name, ndim] + header_rows.extend(table.shape) -class NormMCParamSpec(MCParamSpec): - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components"] + rows = [" ".join(map(str, header_rows))] + rows.extend(extra_rows) + rows.extend([array2text(table), ""]) - if USE_MFSDG: - # dimensionality component_type name mean covar weights - object_tmpl = "1 COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN" \ - " mc_${distribution}_${seg}_${subseg}_${track}" \ - " mean_${seg}_${subseg}_${track} covar_${seg}_${subseg}_${track}" \ - " matrix_weightscale_1x1" - else: - # dimensionality component_type name mean covar - object_tmpl = "1 COMPONENT_TYPE_DIAG_GAUSSIAN" \ - " mc_${distribution}_${seg}_${subseg}_${track}${component_suffix}" \ - " mean_${seg}_${subseg}_${track}${component_suffix} covar_${track}${component_suffix}" + return "\n".join(rows) - def generate_objects(self): - 
""" - returns: iterable of strs containing gmtk parameter objects starting - with names + def calc_prob_transition(self, length): + """Calculate probability transition from scaled expected length. """ - substitute = Template(self.object_tmpl).substitute - for component in range(self.num_mix_components): - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) + length_scaled = length // self.resolution - yield substitute(mapping) + prob_self_self = prob_transition_from_expected_len(length_scaled) + prob_self_other = 1.0 - prob_self_self + return prob_self_self, prob_self_other -class GammaMCParamSpec(MCParamSpec): - object_tmpl = "1 COMPONENT_TYPE_GAMMA mc_gamma_${seg}_${subseg}_${track}" \ - " ${min_track} gammascale_${seg}_${subseg}_${track}" \ - " gammashape_${seg}_${subseg}_${track}" + def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa + # first values are the ones where segCountDown = 0 therefore + # the transitions to segTransition = 2 occur early on + card_seg_countdown = self.card_seg_countdown + # by default, when segCountDown is high, never transition + res = empty((card_seg_countdown, self.num_segs, CARD_SEGTRANSITION)) -class MXParamSpec(ParamSpec): - type_name = "MX" - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - object_tmpl = "1 mx_${seg}_${subseg}_${track} ${num_mix_components} " + prob_seg_self_self, prob_seg_self_other = \ + self.calc_prob_transition(LEN_SEG_EXPECTED) - # If the number of mixture components is one - if self.num_mix_components == 1: - # Set the dense probabily mass function containing component - # responsibilites to be set to always 1 for 1 component - object_tmpl += "dpmf_always" - # 
Otherwise set the dense probability mass function based on number - # of components from the GMTK DPMF definition - else: - object_tmpl += "dpmf_${seg}_${subseg}_${track}" + prob_subseg_self_self, prob_subseg_self_other = \ + self.calc_prob_transition(LEN_SUBSEG_EXPECTED) - for component in range(self.num_mix_components): - add = " mc_${distribution}_${seg}_${subseg}_${track}%s" % ( - self.get_template_component_suffix(component)) - object_tmpl += add - substitute = Template(object_tmpl).substitute + # 0: no transition + # 1: subseg transition (no transition when CARD_SUBSEG == 1) + # 2: seg transition + probs_allow_transition = \ + array([prob_seg_self_self * prob_subseg_self_self, + prob_seg_self_self * prob_subseg_self_other, + prob_seg_self_other]) - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - mapping["num_mix_components"] = self.num_mix_components - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - yield substitute(mapping) + probs_prevent_transition = array([prob_subseg_self_self, + prob_subseg_self_other, + 0.0]) -class DPMFParamSpec(DenseCPTParamSpec): - type_name = "DPMF" - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components"] + # find the labels with maximum segment lengths and those without + table = self.seg_table + ends = table[:, OFFSET_END] + bitmap_without_maximum = ends == 0 - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - # If the number of mixture components is one - if self.num_mix_components == 1: - # Create a dense probability mass function of one value of 1 - # to fix the number of mixture components to one - yield "dpmf_always 1 1.0" - # Otherwise - else: - # Create a dense 
probability mass function of dirichlet constants - # with the same amount of mixture components - object_tmpl = "dpmf_${seg}_${subseg}_${track} ${num_mix_components} "\ - "DirichletConst %s ${weights}" % GAUSSIAN_MIXTURE_WEIGHTS_PSEUDOCOUNT - component_weight = str(round(1.0 / self.num_mix_components, - ROUND_NDIGITS)) - weights = (" " + component_weight) * self.num_mix_components - substitute = Template(object_tmpl).substitute - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - mapping["weights"] = weights - track_index = mapping["track_index"] - mapping["num_mix_components"] = self.num_mix_components - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - yield substitute(mapping) + # where() returns a tuple; this unpacks it + labels_with_maximum, = where(~bitmap_without_maximum) + labels_without_maximum, = where(bitmap_without_maximum) -class VirtualEvidenceSpec(ParamSpec): - type_name = "VE_CPT" + # labels without a maximum + res[0, labels_without_maximum] = probs_allow_transition + res[1:, labels_without_maximum] = probs_prevent_transition - # According to GMTK specification (tksrc/GMTK_VECPT.cc) - # this should be of the format: - # CPT_name num_par par_card self_card VE_CPT_FILE - # nfs:nfloats nis:nints ... fmt:obsformat ... 
END - object_tmpl = "seg_virtualEvidence 1 %s 2 %s nfs:%s nis:0 fmt:ascii END" - copy_attrs = ParamSpec.copy_attrs + ["virtual_evidence", "num_segs"] + # labels with a maximum + seg_countdowns_initial = self.seg_countdowns_initial - def make_virtual_evidence_spec(self): - return self.object_tmpl % (self.num_segs, VIRTUAL_EVIDENCE_LIST_FILENAME, self.num_segs) + res[0, labels_with_maximum] = self.probs_force_transition + for label in labels_with_maximum: + seg_countdown_initial = seg_countdowns_initial[label] + minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] - def generate_objects(self): - yield self.make_virtual_evidence_spec() + seg_countdown_allow = seg_countdown_initial - minimum + 1 + res[1:seg_countdown_allow, label] = probs_allow_transition + res[seg_countdown_allow:, label] = probs_prevent_transition + + return res + + + @staticmethod + def make_dirichlet_name(name): + return "dirichlet_%s" % name + + +class DirichletTabParamSpec(TableParamSpec): + type_name = "DIRICHLET_TAB" + copy_attrs = TableParamSpec.copy_attrs \ + + ["len_seg_strength", "num_bases", "card_seg_countdown", + "num_mix_components"] + + def make_table_spec(self, name, table): + dirichlet_name = self.make_dirichlet_name(name) + + return TableParamSpec.make_table_spec(self, dirichlet_name, table, + table.ndim) + + def make_dirichlet_table(self): + probs = self.make_dense_cpt_segCountDown_seg_segTransition() + + # XXX: the ratio is not exact as num_bases is not the same as + # the number of base-base transitions. 
It is surely close + # enough, though + total_pseudocounts = self.len_seg_strength * self.num_bases + divisor = self.card_seg_countdown * self.num_segs + pseudocounts_per_row = total_pseudocounts / divisor + + # astype(int) means flooring the floats + pseudocounts = (probs * pseudocounts_per_row).astype(int) + + return pseudocounts + + def generate_objects(self): + # XXX: these called functions have confusing/duplicative names + if self.len_seg_strength > 0: + dirichlet_table = self.make_dirichlet_table() + yield self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, + dirichlet_table) class InputMasterSaver(Saver): resource_name = "input.master.tmpl" @@ -840,12 +880,13 @@ class InputMasterSaver(Saver): "len_seg_strength", "resolution", "random_state", "supervision_type", "use_dinucleotide", "mins", "means", "vars", "gmtk_include_filename_relative", "track_groups", - "num_mix_components", "virtual_evidence"] + "num_mix_components", "virtual_evidence", "tracks"] def make_mapping(self): # the locals of this function are used as the template mapping # use caution before deleting or renaming any variables # check that they are not used in the input.master template + param_spec = ParamSpec(self) num_free_params = 0 num_segs = self.num_segs @@ -862,33 +903,30 @@ def make_mapping(self): else: dirichlet_spec = "" - dense_cpt_spec = DenseCPTParamSpec(self) + dense_cpt_spec = param_spec.generate_dense_cpt_objects() # seg_seg num_free_params += fullnum_subsegs * (fullnum_subsegs - 1) - + # segCountDown_seg_segTransition num_free_params += fullnum_subsegs - + name_collection_spec = param_spec.generate_name_collection() distribution = self.distribution if distribution in DISTRIBUTIONS_LIKE_NORM: - mean_spec = MeanParamSpec(self) - if COVAR_TIED: - covar_spec = TiedCovarParamSpec(self) - else: - covar_spec = CovarParamSpec(self) - + mean_spec = param_spec.generate_mean_objects() + covar_spec = param_spec.generate_covar_objects() if USE_MFSDG: real_mat_spec = 
RealMatParamSpec(self) else: real_mat_spec = "" - mc_spec = NormMCParamSpec(self) + mc_spec = param_spec.generate_mc_objects() if COVAR_TIED: num_free_params += (fullnum_subsegs + 1) * num_track_groups else: num_free_params += (fullnum_subsegs * 2) * num_track_groups + elif distribution == DISTRIBUTION_GAMMA: mean_spec = "" covar_spec = "" @@ -897,17 +935,16 @@ def make_mapping(self): # the gamma distribution rather than the ML estimate for the # mean and converting real_mat_spec = GammaRealMatParamSpec(self) - mc_spec = GammaMCParamSpec(self) + mc_spec = param_spec.generate_mc_objects() num_free_params += (fullnum_subsegs * 2) * num_track_groups else: raise ValueError("distribution %s not supported" % distribution) - - mx_spec = MXParamSpec(self) - name_collection_spec = NameCollectionParamSpec(self) + dpmf_spec = param_spec.generate_dpmf_objects() + mx_spec = param_spec.generate_mx_objects() card_seg = num_segs - dpmf_spec = DPMFParamSpec(self) - ve_spec = VirtualEvidenceSpec(self) return locals() # dict of vars set in this function + + From 44420abf21231cb0e788ff3c6e82242070b29f7e Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 4 Aug 2020 12:26:23 -0400 Subject: [PATCH 03/72] added InlineMX/MCSection, changes to GMTK types --- segway/gmtk.py | 884 +++++++++++++++++++++++-------------------------- 1 file changed, 419 insertions(+), 465 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 0294d245..3040bfdc 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -1,587 +1,541 @@ from collections import OrderedDict +import numpy as np +from numpy import array, ndarray -MC_TYPE_DIAG = "COMPONENT_TYPE_DIAG_GAUSSIAN" -MC_TYPE_GAMMA = "COMPONENT_TYPE_GAMMA" -MC_TYPE_MISSING = "COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN" -COPY_PARENT = "internal:copyParent" +COMPONENT_TYPE_DIAG_GAUSSIAN = 0 -class DenseCPT: +def array2text(a): """ - A single DenseCPT object. 
- Attributes: - parent_card - cardinality - prob + Convert multi-dimensional array to text. + :param a: array + :return: + """ + ndim = a.ndim + if ndim == 1: + return " ".join(map(str, a)) + else: + delimiter = "\n" * (ndim - 1) + return delimiter.join(array2text(row) for row in a) + + +class Array(ndarray): + # TODO + def __init__(self, *args): + array.__init__(self, list(args)) + + def __str__(self): + # 1 dimensional str representation covered here + # Multidimensional vary between kinds and will have to be specified + # in specific sub classes. + assert(len(self.shape) <= 1) + return " ".join([str(self.size), array2text(self)]) + + +class Section(OrderedDict): + """ + Contains GMTK objects of a single type and supports writing them to file. + Key: name of GMTK object + Value: GMTK object """ - def __init__(self, name, cardinality, prob, parent_card=-1): + def kind(self): """ - name: str - parent_card: str/int or list[str/int] - cardinality: str/int - prob: list[float] + Return string attribute kind of all GMTK objects in this Section object. + :return: str: type of all GMTK objects in this Section object """ - self.name = name - if parent_card != -1: - if not isinstance(parent_card, list): - self.parent_card = [parent_card] + section_kind = None + for obj in self.values(): + if not section_kind: + section_kind = obj.kind else: - self.parent_card = parent_card - else: - self.parent_card = -1 - self.cardinality = cardinality - # TODO array - self.prob = prob + assert section_kind == obj.kind, "Objects must be of same type." + return section_kind + - def generate(self, index): + def __setattr__(self, key, value): """ - Returns string format of DenseCPT to be printed into input.master - file (new lines to be added). - index: int - index of the denseCPT + Check if all the GMTK objects are of the same type. 
+ :param key: str: name of GMTK object + :param value: GMTK object + :return: + For now, single object + TODO, add multiple objects at once """ - lines = [] - line = [] - line.append(str(index)) - line.append(self.name) - if self.parent_card == -1: # no parents - num_parents = 0 - parent_card_str = [""] + if not self.kind() == value.kind: + raise ValueError("Object has incorrect type.") else: - num_parents = len(self.parent_card) - parent_card_str = [] - for i in range(num_parents): - parent_card_str.append(str(self.parent_card[i])) - line.append(str(num_parents)) - if self.parent_card != -1: - line.extend(parent_card_str) - line.append(str(self.cardinality)) - lines.append(" ".join(line)) - lines.append(self.generate_prob(self.prob) + "\n") - lines.append("\n") - return "\n".join(lines) + super(Section, self).__setattr__(key, value) - def generate_prob(self, prob): +class InlineSection(Section): + + def __str__(self): """ - Generates format of probabilities for single DenseCPT. - :param prob: list[float] - probabilities of DenseCPT - :return: string format to be used by DenseCPT.generate() + Returns inline string representation of this Section object by calling + the individual GMTK object's __str__(). 
+ :return: """ - line = [] - if isinstance(prob[0], float): - prob_str = [] - for i in range(len(prob)): - prob_str.append(str(prob[i])) - return " ".join(prob_str) - else: + # if no gmtk objects + if len(self) == 0: + return "" - for i in range(len(prob)): - line.append(self.generate_prob(prob[i])) - # TODO check if it works without that one line gap - return "\n".join(line) + lines = ["{}_IN_FILE inline".format(self.kind())] + lines.append(str(len(self)) + "\n") # total number of gmtk objects + for i in range(len(self)): + lines.append(str(i)) # index of gmtk object + lines.append(list(self)[i]) # name of gmtk object + lines.append(list(self.values())[i].__str__()) + # string representation of gmtk object + return "\n".join(lines) -class DeterministicCPT: + +class InlineMCSection(InlineSection): """ - A single DeterministicCPT objects. + Special InlineSection subclass which contains MC objects. Attributes: - parent_card: str/int or list[str/int] - cardinality: str/int - name_of_existing_DT: str + mean: InlineSection object which point to InputMaster.mean + covar: InlineSection object which point to InputMaster.covar """ + def __init__(self, mean, covar): + """ + :param mean: InlineSection: InlineSection object which point to + InputMaster.mean + :param covar: InlineSection: InlineSection object which point to + InputMaster.covar + """ + self.mean = mean + self.covar = covar + InlineSection.__init__(self) + + def __setattr__(self, key, value): + OrderedDict.__setattr__(self, key, value) - def __init__(self, name, parent_card, cardinality, dt): + + def __str__(self): """ - name: str - parent_card: str/int or list[str/int] - cardinality: str/int - dt: str + Returns string representation of all MC objects contained in this + InlineMCSection by calling the individual MC object's __str__(). 
+ :return: """ - self.name = name - if not isinstance(parent_card, list): - self.parent_card = [parent_card] + if len(self) == 0: + return "" else: - self.parent_card = parent_card - - self.cardinality = cardinality - self.dt = dt + lines = ["{}_IN_FILE inline".format(self.kind())] + lines.append(str(len(self)) + "\n") # total number of MC objects + for i in range(len(self)): + lines.append(str(i)) # index of MC object + # check if dimension of Mean and Covar of this MC are the same + obj = list(self.values())[i] + mean_name = obj.mean + covar_name = obj.covar + if not self.mean[mean_name].get_dimension() == self.covar[covar_name].get_dimension(): + # TODO delete MC? redefine? + raise ValueError("Inconsistent dimensions of mean and covar associated to MC.") + else: + lines.append(str(self.mean[mean_name].get_dimension())) + # dimension of MC + lines.append(str(obj.component_type)) # component type + lines.append(list(self)[i]) # name of MC + lines.append(obj.__str__()) # string representation of MC obj + + lines.append("\n") + return "\n".join(lines) + + +class InlineMXSection(InlineSection): + """ + Special InlineSection subclass which contains MX objects. + Attributes: + dpmf: InlineSection object which point to InputMaster.dpmf + components: InlineSection object which point to InputMaster.mc + """ - def generate(self, index): + def __init__(self, dpmf, components): """ - :return: String format of DeterministicCPT to be printed into - input.master - file (new lines to be added). 
- index: int - index of DeterministicCPT + :param dpmf: InlineSection: InlineSection object which point to + InputMaster.dpmf + :param components: InlineSection: InlineSection object which point to + InputMaster.mc """ - lines = [] - line = [] - line.append(str(index)) - line.append(self.name) - lines.append(" ".join(line)) - lines.append(str(len(self.parent_card))) - num_parents_cardinalities = [] - num_parents_cardinalities.extend(self.parent_card) - num_parents_cardinalities.append(self.cardinality) - lines.append(" ".join(num_parents_cardinalities)) - lines.append(self.dt) - lines.append("\n") + self.dpmf = dpmf + self.components = components + InlineSection.__init__(self) - return "\n".join(lines) + def __setattr__(self, key, value): + OrderedDict.__setattr__(self, key, value) + + def __str__(self): + """ + Returns string representation of all MX objects contained in this + InlineMXSection by calling the individual MX object's __str__. + :return: + """ + if len(self) == 0: + return [] + else: + lines = ["{}_IN_FILE inline".format(self.kind())] + lines.append(str(len(self)) + "\n") # total number of MX objects + for i in range(len(self)): + lines.append(str(i)) # index of MX object + # check if dimension of Mean and Covar of this MC are the same + obj = list(self.values())[i] + dpmf_name = obj.dpmf + components_name = obj.components + dpmf_length = self.dpmf[dpmf_name].get_length() + if not dpmf_length == len(components_name): + raise ValueError( + "Dimension of DPMF must be equal to number of components associated with this MX object.") + else: + lines.append(str(dpmf_length)) + # dimension of MX + lines.append(list(self)[i]) # name of MX + lines.append(obj.__str__()) + # string representation of this MX object + + lines.append("\n") + return "\n".join(lines) -class NameCollection: +class DenseCPT: """ - A single NameCollection object. - Attributes: - names: list[str] or str + A single DenseCPT object. 
""" + kind = "DENSE_CPT" - def __init__(self, name, *args): + def __init__(self, probabilites): """ - name: str - name of collection - :param args: str - name in name collection + todo check if probabilities sums to 1 + TODO temporary (make densecpt a subclass of Array) + :param probabilites: list[float]: probabilities """ - self.name = name - self.names_in_col = [] - for name in args: - if isinstance(name, list): - self.names_in_col.extend(name) - else: - self.names_in_col.append(name) + self.prob = array(probabilites) - def generate(self, index): + def __str__(self): """ - Returns string format of NameCollection objects to be printed into the - input.master file (new lines to be added) - index: int - index of name collection + Return string representation of this DenseCPT object. + :return: """ line = [] - line.append(str(index)) - line.append(self.name) - line.append(str(len(self.names_in_col))) + # num_parents = len(self.shape) - 1 + num_parents = len(self.prob.shape) - 1 + if not num_parents == 0: + line.append(str(num_parents)) # number of parents + cardinality_line = [] + if num_parents == 0: + parent_cardinality = 0 + cardinality_line.append(parent_cardinality) + # cardinality_line.extend(self.shape) + cardinality_line.extend(self.prob.shape) + cardinality_line = map(str, cardinality_line) + else: + # cardinality_line = map(str, self.shape) + cardinality_line = map(str, self.prob.shape) - lines = [] - lines.append(" ".join(line)) - lines.append("\n".join(self.names_in_col)) - lines.append("\n") + line.append(" ".join(cardinality_line)) # cardinalities + # line.append(array2text(self)) # probabilities + line.append(array2text(self.prob)) + line.append("\n") - return "\n".join(lines) + return "\n".join(line) -class Mean: +class NameCollection(list): """ - A single Mean object. - name: str - value: list[float] or float - Mean values of the Mean object. + A single NameCollection object. 
""" + kind = "NAME_COLLECTION" - def __init__(self, name, *args): + def __init__(self, *args): """ - name: str - name of mean object - :param args: float - mean values + Initialize a single NameCollection object. + :param args: str: names in this NameCollection """ - self.name = name - self.mean_values = [] - for val in args: - self.mean_values.append(val) + list.__init__(self, list(args)) - def generate(self, index): + def __str__(self): """ - Returns the string format of the Mean object to be printed into the - input.master file (new lines to be added). - index: int - index of mean object - :return: + Returns string format of NameCollection object to be printed into the + input.master file (new lines to be added) """ line = [] - line.append(str(index)) - line.append(self.name) - line.append(str(len(self.mean_values))) - mean_str = [] - for i in self.mean_values: - mean_str.append(str(i)) - line.extend(mean_str) + if len(self) == 0: + return line + else: + line.append(str(len(self))) + line.extend(self) line.append("\n") - return " ".join(line) + return "\n".join(line) -class MC: +class Mean: """ - A single all MC objects. - Value: list - mc = MC() - mc1 = [26, 0, "mean_0", "covar_0"] - = [, , , ] + TODO + A single Mean object. """ - def __init__(self, name, dim, type, mean=Mean('sample_mean'), - covar=Mean('sample_covar'), weights=[], gamma_shape="", - gamma_scale=""): - """ - name: str - name of MC object - :param dim: str/int - dimensionality of mc - :param type: str/int - type of mc - :param mean: Mean - mean of mc - :param covar: Covar - covar of mc - """ - self.name = name - self.mean = mean - self.covar = covar - self.dim = dim - self.type = type - # TODO - self.weights = weights - self.gamma_shape = gamma_shape - self.gamma_scale = gamma_scale - - def generate(self, index): - """ - Returns string format of MC object to be printed into the input.master - file (new lines to be added). 
- index: int - index of mc object - :return: - """ - line = [] - line.append(str(index)) - line.append(str(self.dim)) - - if self.type == MC_TYPE_GAMMA: - # TODO min track - line.append(str(self.type)) - line.append(self.gamma_scale) - line.append(self.gamma_shape) - - elif self.type == MC_TYPE_MISSING: - line.append(str(self.type)) - line.append(self.name) - line.append(self.mean.name) - line.append(self.covar.name) - line.append("matrix_weightscale_1x1") - line.append("\n") - else: # default and for MC_TYPE_DIAG - # TODO component_suffix - line.append(str(self.type)) - line.append(self.name) - line.append(self.mean.name) - line.append(self.covar.name) - line.append("\n") - - return " ".join(line) - + kind = "MEAN" -class MX: - """ - A single MX object. - """ - def __init__(self, name, dim, dpmf, components): - """ - name: str - name of MX object - dimensionality: int - dpmf: DPMF - components: list[mc] or mc (component) - """ - self.name = name - self.dim = dim - if not isinstance(components, list): - components = [components] - if len(dpmf.dpmf_values) != len(components): - raise ValueError("Dimension of DPMF object must be equal " + - "to number of components of MX.") - self.comp = components - self.dpmf = dpmf + def __init__(self, *args): + """ + :param args: float: mean values + """ + self.mean_values = array(args) - def generate(self, index): + def __str__(self): """ - Returns string format of MX object to be printed into the input.master - file (new lines to be added). - index: int - index of mx object + Returns the string format of the Mean object to be printed into the + input.master file (new lines to be added). 
:return: """ line = [] - line.append(str(index)) - line.append(str(self.dim)) - line.append(self.name) - line.append(str(len(self.comp))) # num components - line.append(self.dpmf.name) - comp_names = [] - for comp in self.comp: - comp_names.append(comp.name) - line.extend(comp_names) + #line.append(str(len(self))) # dimension of Mean + line.append(str(self.get_dimension())) + # line.extend(array2text(self)) + line.append(array2text(self.mean_values)) line.append("\n") + return "\n".join(line) - return " ".join(line) + def get_dimension(self): + """ + Return dimension of this Mean object. + :return: int: dimension of this Mean object + """ + # return + return len(self.mean_values) class Covar: """ A single Covar object. """ + kind = "COVAR" - def __init__(self, name, *args): + def __init__(self, *args): """ - name: str - name of MX object - :param args: covar values + :param args: float: covar values """ - self.name = name - self.covar_values = [] - for val in args: - self.covar_values.append(val) + self.covar_values = array(args) - def generate(self, index): + def __str__(self): """ - Returns string format of Covar object to be printed into the - input.master - file (new lines to be added). - index: int - index of Covar object + Return string representation of single Covar object. :return: """ - line = [] - line.append(str(index)) - line.append(self.name) - line.append(str(len(self.covar_values))) - covar_str = [] - for i in self.covar_values: - covar_str.append(str(i)) - line.extend(covar_str) + line = [str(self.get_dimension())] # dimension of covar object + line.append(array2text(self.covar_values)) # covar values line.append("\n") + return "\n".join(line) - return " ".join(line) + def get_dimension(self): + """ + Return dimension of this Covar object. + :return: int: dimension of this Covar object + """ + #return len(self) + # is len the best + return len(self.covar_values) class DPMF: """ A single DPMF object. 
""" + kind = "DPMF" - def __init__(self, name, *args): + def __init__(self, *args): """ - name: str - name of dpmf object - :param args: dpmf values summing to 1 - + Initialize a single DPMF object. + :param args: float: DPMF values """ - self.name = name - self.dpmf_values = [] - for val in args: - self.dpmf_values.append(val) - print("dpmf_val", self.dpmf_values) - if sum(self.dpmf_values) != 1.0: - self.dpmf_values = [] - raise ValueError("DPMF values must sum to 1.0.") + dpmf_val = array(args) + if np.sum(dpmf_val) != 1.0: + raise ValueError("Sum of DPMF values must be 1.0.") + else: + self.dpmf_val = dpmf_val - def generate(self, index): + def __str__(self): """ - Returns string format of DPMF object to be printed into the - input.master - file (new lines to be added). + Return string representation of this DPMF. :return: """ - line = [] - line.append(str(index)) - line.append(self.name) - line.append(str(len(self.dpmf_values))) - dpmf_str = [] - for i in self.dpmf_values: - dpmf_str.append(str(i)) - line.extend(dpmf_str) + line = [str(self.get_length())] # dpmf length + line.append(array2text(self.dpmf_val)) # dpmf values line.append("\n") - return " ".join(line) - - -class Object: - - def __new__(cls, _name, content, _kind): - pass + return "\n".join(line) - def __init__(self, name, content, kind): - pass + def get_length(self): + return len(self.dpmf_val) -class InputMaster: +class MC: """ - Main class to produce the input.master file. + A single MC object. 
Attributes: - mean: OrderedDict - covar: OrderedDict - dense: OrderedDict - deterministic: OrderedDict - dpmf: OrderedDict - mc: OrderedDict - mx: OrderedDict - name_collection: OrderedDict - key: name of object - value: GMTKObject instance + component_type: int: type of MC """ + kind = "MC" - def __init__(self): - self.mean = OrderedDict() - self.covar = OrderedDict() - self.dense = OrderedDict() - self.deterministic = OrderedDict() - self.dpmf = OrderedDict() - self.mc = OrderedDict() - self.mx = OrderedDict() - self.name_collection = OrderedDict() - - def update(self, gmtk_obj): - """ - gmtk_obj: list or single gmtk object - List of GMTK objects - """ - if not isinstance(gmtk_obj, list): - gmtk_obj = [gmtk_obj] - for obj in gmtk_obj: - if not (isinstance(obj, Mean) or isinstance(obj, Covar) or - isinstance(obj, DeterministicCPT) or isinstance(obj, - DenseCPT) - or isinstance(obj, DPMF) or isinstance(obj, MC) - or isinstance(obj, MX) or isinstance(obj, NameCollection)): - - raise ValueError("Object is not an allowed GMTK type.") - - for obj in gmtk_obj: # all objects are of allowed types - - name = obj.name - - if isinstance(obj, Mean): - self.mean[name] = obj - if isinstance(obj, Covar): - self.covar[name] = obj - if isinstance(obj, DeterministicCPT): - self.deterministic[name] = obj - if isinstance(obj, DenseCPT): - self.dense[name] = obj - if isinstance(obj, DPMF): - self.dpmf[name] = obj - if isinstance(obj, MC): - self.mc[name] = obj - if isinstance(obj, MX): - self.mx[name] = obj - if isinstance(obj, NameCollection): - self.name_collection[name] = obj - - def generate_mean(self): - if len(self.mean) == 0: - return [] - - means = ["MEAN_IN_FILE inline"] - means.append(str(len(self.mean)) + "\n") - for key_index in range(len(list(self.mean))): - means.append( - self.mean[list(self.mean)[key_index]].generate(key_index)) - return "\n".join(means) - - def generate_covar(self): - if len(self.covar) == 0: - return [] - - covars = ["COVAR_IN_FILE inline"] - 
covars.append(str(len(self.covar)) + "\n") - - for key_index in range(len(list(self.covar))): - covars.append( - self.covar[list(self.covar)[key_index]].generate(key_index)) - return "\n".join(covars) - - def generate_dense(self): - if len(self.dense) == 0: - return [] - - dense_cpts = ["DENSE_CPT_IN_FILE inline"] - dense_cpts.append(str(len(self.dense)) + "\n") - - for key_index in range(len(list(self.dense))): - dense_cpts.append( - self.dense[list(self.dense)[key_index]].generate(key_index)) - return "\n".join(dense_cpts) - - def generate_deterministic(self): - if len(self.deterministic) == 0: - return [] - - det_cpts = ["DETERMINISTIC_CPT_IN_FILE inline"] - det_cpts.append(str(len(self.deterministic)) + "\n") - - for key_index in range(len(list(self.deterministic))): - det_cpts.append(self.deterministic[ - list(self.deterministic)[key_index]].generate( - key_index)) - return "\n".join(det_cpts) - - def generate_dpmf(self): - if len(self.dpmf) == 0: - return [] + def __init__(self, component_type): + """ + Initialize a single MC object. + :param component_type: int: type of MC + """ + self.component_type = component_type - dpmfs = ["DPMF_IN_FILE inline"] - dpmfs.append(str(len(self.dpmf)) + "\n") - for key_index in range(len(list(self.dpmf))): - dpmfs.append( - self.dpmf[list(self.dpmf)[key_index]].generate(key_index)) - return "\n".join(dpmfs) +class DiagGaussianMC(MC): + """ + Attributes: + component_type = 0 + mean: str: name of Mean object associated to this MC + covar: str: name of Covar obejct associated to this MC + """ + def __init__(self, mean, covar): + """ + Initialize a single DiagGaussianMC object. + :param mean: name of Mean object associated to this MC + :param covar: name of Covar obejct associated to this MC + """ + # more component types? 
+ self.mean = mean + self.covar = covar + MC.__init__(self, COMPONENT_TYPE_DIAG_GAUSSIAN) - def generate_mc(self): - if len(self.mc) == 0: - return [] + def __str__(self): + """ + Return string representation of this MC object. + :return: + """ + return " ".join([self.mean, self.covar]) - mcs = ["MC_IN_FILE inline"] - mcs.append(str(len(self.mc)) + "\n") - for key_index in range(len(list(self.mc))): - mcs.append(self.mc[list(self.mc)[key_index]].generate(key_index)) +class MX: + """ + A single MX object. + Attributes: + dpmf: str: name of DPMF object associated with MX + components: list[str]: names of components associated with this MX + """ + kind = "MX" - return "\n".join(mcs) + def __init__(self, dpmf, components): + """ + Initialize a single MX object. + :param dpmf: str: name of DPMF object associated with this MX + :param components: str or list[str]: names of components associated with + this MX + """ + self.dpmf = dpmf + if isinstance(components, str): + self.components = [components] + elif isinstance(components, list): + for name in components: + if not isinstance(name, str): + raise ValueError("All component names must be strings.") + self.components = components + else: # not allowed types + raise ValueError("Incorrect format of component names.") - def generate_mx(self): - if len(self.mx) == 0: - return [] + def __str__(self): + """ + Return string representation of this MX. + :return: + """ + line = [str(len(self.components))] # number of components + line.append(self.dpmf) # dpmf name + line.append(" ".join(self.components)) # component names + return "\n".join(line) - mxs = ["MX_IN_FILE inline"] - mxs.append(str(len(self.mx)) + "\n") - for key_index in range(len(list(self.mx))): - mxs.append(self.mx[list(self.mx)[key_index]].generate(key_index)) - return "\n".join(mxs) +class DeterministicCPT: + """ + A single DeterministicCPT object. 
+ Attributes: + parent_cardinality: tuple[int]: cardinality of parents + cardinality: int: cardinality of self + dt: str: name existing Decision Tree (DT) associated with this + DeterministicCPT + """ + kind = "DETERMINISTIC_CPT" - def generate_name_col(self): - if len(self.name_collection) == 0: - return [] + def __init__(self, parent_cardinality, cardinality, dt): + """ + Initialize a single DeterministicCPT object. + :param parent_cardinality: tuple[int]: cardinality of parents + (if empty, then number of parents = 0; if no parents, then placeholder + for parent_cardinality = -1) + :param cardinality: int: cardinality of self + :param dt: name existing Decision Tree (DT) associated with this + DeterministicCPT + """ + if len(parent_cardinality) == 0: + self.parent_cardinality = -1 + self.parent_cardinality = parent_cardinality + self.cardinality = cardinality + self.dt = dt - collections = ["NAME_COLLECTION_IN_FILE inline"] - collections.append(str(len(self.name_collection)) + "\n") + def __str__(self): + """ + Return string representation of this DeterministicCPT. + :return: + """ + line = [] + if self.parent_cardinality == -1: + num_parents = 0 + else: + num_parents = len(self.parent_cardinality) + line.append(str(num_parents)) # number of parents + cardinalities = [] + cardinalities.extend(self.parent_cardinality) + cardinalities.append(self.cardinality) + line.append(" ".join(map(str, cardinalities))) # cardinalities of parent and self + line.append(self.dt) + line.append("\n") + return "\n".join(line) - for key_index in range(len(list(self.name_collection))): - collections.append(self.name_collection[list(self.name_collection)[ - key_index]].generate(key_index)) +class InputMaster: + """ + Master class which contains all GMTK objects present in the input + master and is responsible for creating their string representation. 
+ Attributes: + mean: InlineSection: contains all Mean objects in input master + covar: InlineSection: contains all Covar objects in input master + dpmf: InlineSection: contains all DPMF objects in input master + dense_cpt: InlineSection: contains all DenseCPT objects in input master + deterministic_cpt: InlineSection: contains all DeterministicCPT objects + in input master + mc: InlineMCSection: contains all MC objects in input master + mx: InlineMXSection: contains all MX objects in input master + name_collection: InlineSection: contains all NameCollection objects in + input master + """ - return "\n".join(collections) + def __init__(self): + """ + Initialize InputMaster instance with empty attributes (InlineSection + and its subclasses). + """ + self.mean = InlineSection() + self.covar = InlineSection() + self.dpmf = InlineSection() + self.dense_cpt = InlineSection() + self.deterministic_cpt = InlineSection() + # TODO fix error + self.mc = InlineMCSection(mean=self.mean, covar=self.covar) + self.mx = InlineMXSection(dpmf=self.dpmf, components=self.mc) + self.name_collection = InlineSection() def __str__(self): - attrs_gen = [self.generate_name_col(), self.generate_deterministic(), - self.generate_dense(), self.generate_mean(), - self.generate_covar(), self.generate_dpmf(), - self.generate_mc(), self.generate_mx()] + """ + Return string representation of all the attributes (GMTK types) by + calling the attributes' (InlineSection and its subclasses) __str__(). 
+ :return: + """ + attrs = [self.deterministic_cpt, self.name_collection, self.mean, self.covar, self.dense_cpt, + self.dpmf, self.mc, self.mx] + s = [] - for obj in attrs_gen: - s.append("".join(obj)) + for obj in attrs: + s.append("".join(obj.__str__())) return "".join(s) - From f95cb44def7ae88e8cd81f26ec1ad5c362e1d4e4 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 4 Aug 2020 13:56:52 -0400 Subject: [PATCH 04/72] array error fixed --- segway/gmtk.py | 194 ++++++++++++++++++++----------------------------- 1 file changed, 77 insertions(+), 117 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 3040bfdc..aeb2679d 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -20,17 +20,14 @@ def array2text(a): class Array(ndarray): - # TODO - def __init__(self, *args): - array.__init__(self, list(args)) - - def __str__(self): - # 1 dimensional str representation covered here - # Multidimensional vary between kinds and will have to be specified - # in specific sub classes. - assert(len(self.shape) <= 1) - return " ".join([str(self.size), array2text(self)]) - + def __new__(cls, *args): + """ + :param input_array: ndarray + :return: + """ + input_array = array(args) + obj = np.asarray(input_array).view(cls) + return obj class Section(OrderedDict): """ @@ -66,6 +63,7 @@ def __setattr__(self, key, value): else: super(Section, self).__setattr__(key, value) + class InlineSection(Section): def __str__(self): @@ -196,19 +194,61 @@ def __str__(self): return "\n".join(lines) -class DenseCPT: +class InputMaster: """ - A single DenseCPT object. + Master class which contains all GMTK objects present in the input + master and is responsible for creating their string representation. 
+ Attributes: + mean: InlineSection: contains all Mean objects in input master + covar: InlineSection: contains all Covar objects in input master + dpmf: InlineSection: contains all DPMF objects in input master + dense_cpt: InlineSection: contains all DenseCPT objects in input master + deterministic_cpt: InlineSection: contains all DeterministicCPT objects + in input master + mc: InlineMCSection: contains all MC objects in input master + mx: InlineMXSection: contains all MX objects in input master + name_collection: InlineSection: contains all NameCollection objects in + input master """ - kind = "DENSE_CPT" - def __init__(self, probabilites): + def __init__(self): + """ + Initialize InputMaster instance with empty attributes (InlineSection + and its subclasses). + """ + self.mean = InlineSection() + self.covar = InlineSection() + self.dpmf = InlineSection() + self.dense_cpt = InlineSection() + self.deterministic_cpt = InlineSection() + # TODO fix error + self.mc = InlineMCSection(mean=self.mean, covar=self.covar) + self.mx = InlineMXSection(dpmf=self.dpmf, components=self.mc) + self.name_collection = InlineSection() + + def __str__(self): """ - todo check if probabilities sums to 1 - TODO temporary (make densecpt a subclass of Array) - :param probabilites: list[float]: probabilities + Return string representation of all the attributes (GMTK types) by + calling the attributes' (InlineSection and its subclasses) __str__(). + :return: """ - self.prob = array(probabilites) + attrs = [self.deterministic_cpt, self.name_collection, self.mean, + self.covar, self.dense_cpt, self.dpmf, self.mc, self.mx] + + s = [] + for obj in attrs: + s.append("".join(obj.__str__())) + + return "".join(s) + + +class DenseCPT(Array): + """ + A single DenseCPT object. 
+ """ + kind = "DENSE_CPT" + + # todo check if sums to 1.0 def __str__(self): """ @@ -216,24 +256,11 @@ def __str__(self): :return: """ line = [] - # num_parents = len(self.shape) - 1 - num_parents = len(self.prob.shape) - 1 - if not num_parents == 0: - line.append(str(num_parents)) # number of parents - cardinality_line = [] - if num_parents == 0: - parent_cardinality = 0 - cardinality_line.append(parent_cardinality) - # cardinality_line.extend(self.shape) - cardinality_line.extend(self.prob.shape) - cardinality_line = map(str, cardinality_line) - else: - # cardinality_line = map(str, self.shape) - cardinality_line = map(str, self.prob.shape) - + num_parents = len(self.shape) - 1 + line.append(str(num_parents)) # number of parents + cardinality_line = map(str, self.shape) line.append(" ".join(cardinality_line)) # cardinalities - # line.append(array2text(self)) # probabilities - line.append(array2text(self.prob)) + line.append(array2text(self)) line.append("\n") return "\n".join(line) @@ -268,18 +295,15 @@ def __str__(self): return "\n".join(line) -class Mean: +class Mean(Array): """ TODO A single Mean object. """ kind = "MEAN" - def __init__(self, *args): - """ - :param args: float: mean values - """ - self.mean_values = array(args) + def __array_finalize__(self, obj): + if obj is None: return def __str__(self): """ @@ -288,10 +312,8 @@ def __str__(self): :return: """ line = [] - #line.append(str(len(self))) # dimension of Mean - line.append(str(self.get_dimension())) - # line.extend(array2text(self)) - line.append(array2text(self.mean_values)) + line.append(str(self.get_dimension())) # dimension + line.append(array2text(self)) line.append("\n") return "\n".join(line) @@ -301,28 +323,22 @@ def get_dimension(self): :return: int: dimension of this Mean object """ # return - return len(self.mean_values) + return len(self) -class Covar: +class Covar(Array): """ A single Covar object. 
""" kind = "COVAR" - def __init__(self, *args): - """ - :param args: float: covar values - """ - self.covar_values = array(args) - def __str__(self): """ Return string representation of single Covar object. :return: """ - line = [str(self.get_dimension())] # dimension of covar object - line.append(array2text(self.covar_values)) # covar values + line = [str(self.get_dimension())] # dimension + line.append(array2text(self)) # covar values line.append("\n") return "\n".join(line) @@ -333,25 +349,16 @@ def get_dimension(self): """ #return len(self) # is len the best - return len(self.covar_values) + return len(self) -class DPMF: +class DPMF(Array): """ A single DPMF object. """ kind = "DPMF" - def __init__(self, *args): - """ - Initialize a single DPMF object. - :param args: float: DPMF values - """ - dpmf_val = array(args) - if np.sum(dpmf_val) != 1.0: - raise ValueError("Sum of DPMF values must be 1.0.") - else: - self.dpmf_val = dpmf_val + # todo check if sums to 1.0 def __str__(self): """ @@ -359,12 +366,12 @@ def __str__(self): :return: """ line = [str(self.get_length())] # dpmf length - line.append(array2text(self.dpmf_val)) # dpmf values + line.append(array2text(self)) # dpmf values line.append("\n") return "\n".join(line) def get_length(self): - return len(self.dpmf_val) + return len(self) class MC: @@ -492,50 +499,3 @@ def __str__(self): line.append(self.dt) line.append("\n") return "\n".join(line) - -class InputMaster: - """ - Master class which contains all GMTK objects present in the input - master and is responsible for creating their string representation. 
- Attributes: - mean: InlineSection: contains all Mean objects in input master - covar: InlineSection: contains all Covar objects in input master - dpmf: InlineSection: contains all DPMF objects in input master - dense_cpt: InlineSection: contains all DenseCPT objects in input master - deterministic_cpt: InlineSection: contains all DeterministicCPT objects - in input master - mc: InlineMCSection: contains all MC objects in input master - mx: InlineMXSection: contains all MX objects in input master - name_collection: InlineSection: contains all NameCollection objects in - input master - """ - - def __init__(self): - """ - Initialize InputMaster instance with empty attributes (InlineSection - and its subclasses). - """ - self.mean = InlineSection() - self.covar = InlineSection() - self.dpmf = InlineSection() - self.dense_cpt = InlineSection() - self.deterministic_cpt = InlineSection() - # TODO fix error - self.mc = InlineMCSection(mean=self.mean, covar=self.covar) - self.mx = InlineMXSection(dpmf=self.dpmf, components=self.mc) - self.name_collection = InlineSection() - - def __str__(self): - """ - Return string representation of all the attributes (GMTK types) by - calling the attributes' (InlineSection and its subclasses) __str__(). 
- :return: - """ - attrs = [self.deterministic_cpt, self.name_collection, self.mean, self.covar, self.dense_cpt, - self.dpmf, self.mc, self.mx] - - s = [] - for obj in attrs: - s.append("".join(obj.__str__())) - - return "".join(s) From fcd8a3218daac9f81936c4d7d92d2c241f622af1 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 4 Aug 2020 16:58:34 -0400 Subject: [PATCH 05/72] updated DetCPT and MX param names --- segway/gmtk.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index aeb2679d..e37f96b9 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -148,7 +148,7 @@ class InlineMXSection(InlineSection): components: InlineSection object which point to InputMaster.mc """ - def __init__(self, dpmf, components): + def __init__(self, dpmf, mc): """ :param dpmf: InlineSection: InlineSection object which point to InputMaster.dpmf @@ -156,7 +156,7 @@ def __init__(self, dpmf, components): InputMaster.mc """ self.dpmf = dpmf - self.components = components + self.mc = mc InlineSection.__init__(self) def __setattr__(self, key, value): @@ -175,12 +175,12 @@ def __str__(self): lines.append(str(len(self)) + "\n") # total number of MX objects for i in range(len(self)): lines.append(str(i)) # index of MX object - # check if dimension of Mean and Covar of this MC are the same + # check if number of components is equal to length of DPMF obj = list(self.values())[i] dpmf_name = obj.dpmf - components_name = obj.components + components = obj.mc dpmf_length = self.dpmf[dpmf_name].get_length() - if not dpmf_length == len(components_name): + if not dpmf_length == len(components): raise ValueError( "Dimension of DPMF must be equal to number of components associated with this MX object.") else: @@ -465,7 +465,7 @@ class DeterministicCPT: """ kind = "DETERMINISTIC_CPT" - def __init__(self, parent_cardinality, cardinality, dt): + def __init__(self, 
cardinality_parents, cardinality, dt): """ Initialize a single DeterministicCPT object. :param parent_cardinality: tuple[int]: cardinality of parents @@ -477,7 +477,7 @@ def __init__(self, parent_cardinality, cardinality, dt): """ if len(parent_cardinality) == 0: self.parent_cardinality = -1 - self.parent_cardinality = parent_cardinality + self.parent_cardinality = cardinality_parents self.cardinality = cardinality self.dt = dt @@ -487,13 +487,13 @@ def __str__(self): :return: """ line = [] - if self.parent_cardinality == -1: + if self.cardinality_parents == -1: num_parents = 0 else: - num_parents = len(self.parent_cardinality) + num_parents = len(self.cardinality_parents) line.append(str(num_parents)) # number of parents cardinalities = [] - cardinalities.extend(self.parent_cardinality) + cardinalities.extend(self.cardinality_parents) cardinalities.append(self.cardinality) line.append(" ".join(map(str, cardinalities))) # cardinalities of parent and self line.append(self.dt) From 11a1c18332087742d537fdd0bc7c02ea971a1150 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 4 Aug 2020 18:13:20 -0400 Subject: [PATCH 06/72] updated dense_cpt cardinalities --- segway/gmtk.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index e37f96b9..b59d3f09 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -256,6 +256,11 @@ def __str__(self): :return: """ line = [] + new_shape = self.shape[1:] + if len(new_shape) == 1: + new_shape = (new_shape[0], ) + self.reshape((new_shape)) + num_parents = len(self.shape) - 1 line.append(str(num_parents)) # number of parents cardinality_line = map(str, self.shape) From 279c14b50048755817fc475f5ac072c52300e1f8 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 4 Aug 2020 18:38:03 -0400 Subject: [PATCH 07/72] fixed typos and added Object --- segway/gmtk.py | 49 
+++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index b59d3f09..60f5c54d 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -18,6 +18,13 @@ def array2text(a): delimiter = "\n" * (ndim - 1) return delimiter.join(array2text(row) for row in a) +class Object(str): + def __new__(cls, _name, content, _kind): + return str.__new__(cls, content) + + def __init__(self, name, content, kind): + self.kind = kind + self.name = name class Array(ndarray): def __new__(cls, *args): @@ -28,7 +35,7 @@ def __new__(cls, *args): input_array = array(args) obj = np.asarray(input_array).view(cls) return obj - + class Section(OrderedDict): """ Contains GMTK objects of a single type and supports writing them to file. @@ -63,7 +70,6 @@ def __setattr__(self, key, value): else: super(Section, self).__setattr__(key, value) - class InlineSection(Section): def __str__(self): @@ -107,8 +113,7 @@ def __init__(self, mean, covar): def __setattr__(self, key, value): OrderedDict.__setattr__(self, key, value) - - + def __str__(self): """ Returns string representation of all MC objects contained in this @@ -138,8 +143,8 @@ def __str__(self): lines.append("\n") return "\n".join(lines) - - + + class InlineMXSection(InlineSection): """ Special InlineSection subclass which contains MX objects. 
@@ -178,7 +183,7 @@ def __str__(self): # check if number of components is equal to length of DPMF obj = list(self.values())[i] dpmf_name = obj.dpmf - components = obj.mc + components = obj.components dpmf_length = self.dpmf[dpmf_name].get_length() if not dpmf_length == len(components): raise ValueError( @@ -193,7 +198,6 @@ def __str__(self): lines.append("\n") return "\n".join(lines) - class InputMaster: """ Master class which contains all GMTK objects present in the input @@ -223,7 +227,7 @@ def __init__(self): self.deterministic_cpt = InlineSection() # TODO fix error self.mc = InlineMCSection(mean=self.mean, covar=self.covar) - self.mx = InlineMXSection(dpmf=self.dpmf, components=self.mc) + self.mx = InlineMXSection(dpmf=self.dpmf, mc=self.mc) self.name_collection = InlineSection() def __str__(self): @@ -233,7 +237,7 @@ def __str__(self): :return: """ attrs = [self.deterministic_cpt, self.name_collection, self.mean, - self.covar, self.dense_cpt, self.dpmf, self.mc, self.mx] + self.covar, self.dense_cpt, self.dpmf, self.mc, self.mx] s = [] for obj in attrs: @@ -260,7 +264,7 @@ def __str__(self): if len(new_shape) == 1: new_shape = (new_shape[0], ) self.reshape((new_shape)) - + num_parents = len(self.shape) - 1 line.append(str(num_parents)) # number of parents cardinality_line = map(str, self.shape) @@ -270,7 +274,6 @@ def __str__(self): return "\n".join(line) - class NameCollection(list): """ A single NameCollection object. @@ -346,7 +349,7 @@ def __str__(self): line.append(array2text(self)) # covar values line.append("\n") return "\n".join(line) - + def get_dimension(self): """ Return dimension of this Covar object. @@ -378,7 +381,6 @@ def __str__(self): def get_length(self): return len(self) - class MC: """ A single MC object. 
@@ -393,8 +395,7 @@ def __init__(self, component_type): :param component_type: int: type of MC """ self.component_type = component_type - - + class DiagGaussianMC(MC): """ Attributes: @@ -458,7 +459,6 @@ def __str__(self): line.append(" ".join(self.components)) # component names return "\n".join(line) - class DeterministicCPT: """ A single DeterministicCPT object. @@ -474,15 +474,15 @@ def __init__(self, cardinality_parents, cardinality, dt): """ Initialize a single DeterministicCPT object. :param parent_cardinality: tuple[int]: cardinality of parents - (if empty, then number of parents = 0; if no parents, then placeholder - for parent_cardinality = -1) + (if empty, then number of parents = 0 :param cardinality: int: cardinality of self :param dt: name existing Decision Tree (DT) associated with this DeterministicCPT """ - if len(parent_cardinality) == 0: - self.parent_cardinality = -1 - self.parent_cardinality = cardinality_parents + if not isinstance(cardinality_parents, tuple): + self.cardinality_parents = (cardinality_parents, ) + else: + self.cardinality_parents = cardinality_parents self.cardinality = cardinality self.dt = dt @@ -492,10 +492,7 @@ def __str__(self): :return: """ line = [] - if self.cardinality_parents == -1: - num_parents = 0 - else: - num_parents = len(self.cardinality_parents) + num_parents = len(self.cardinality_parents) line.append(str(num_parents)) # number of parents cardinalities = [] cardinalities.extend(self.cardinality_parents) From 3c01e2cb232fff3a2b79447bbe33fbbe7d531b4c Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Wed, 5 Aug 2020 21:38:24 -0400 Subject: [PATCH 08/72] added InputMaster.save --- segway/gmtk.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index 60f5c54d..3edf5d01 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -244,6 +244,15 @@ def __str__(self): s.append("".join(obj.__str__())) return "".join(s) 
+ + def save(self, filename): + """ + Opens filename for writing and writes out the contents of its attributes. + :param filename: str + :return: None + """ + with open(filename, 'w') as file: + print(self, file=file) class DenseCPT(Array): From 103de8e394a14cd497c49656b9a7e0ca03c3ba37 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Wed, 5 Aug 2020 21:48:40 -0400 Subject: [PATCH 09/72] InputMaster save --- segway/gmtk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index 3edf5d01..77a4b2ab 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -252,6 +252,7 @@ def save(self, filename): :return: None """ with open(filename, 'w') as file: + print('# include "traindir/auxiliary/segway.inc"', file=file) print(self, file=file) From 3c439532e896fb446deb7cb8c8f33785f8a52c70 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 6 Aug 2020 16:12:55 -0400 Subject: [PATCH 10/72] input_master using new gmtk interface --- segway/input_master.py | 724 ++++++++++++++--------------------------- 1 file changed, 236 insertions(+), 488 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index a977042a..097dd546 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -1,4 +1,5 @@ from __future__ import absolute_import, division + """input_master.py: write input master files """ @@ -25,8 +26,7 @@ VIRTUAL_EVIDENCE_LIST_FILENAME) from .gmtk import InputMaster, NameCollection, DenseCPT, \ - DeterministicCPT, DPMF, MC, MX, Covar, Mean - + DeterministicCPT, DPMF, MC, MX, Covar, Mean, DiagGaussianMC # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of # scalars and vectors in the parameter output. 
By default in newer (> 1.14) @@ -73,6 +73,7 @@ input_master = InputMaster() + def vstack_tile(array_like, *reps): reps = list(reps) + [1] @@ -142,24 +143,6 @@ def make_zero_diagonal_table(length): return res -def format_indexed_strs(fmt, num): - full_fmt = fmt + "%d" - return [full_fmt % index for index in range(num)] - - -def jitter_cell(cell, random_state): - """ - adds some random noise - """ - # get the binary exponent and subtract JITTER_ORDERS_MAGNITUDE - # e.g. 3 * 2**10 --> 1 * 2**5 - max_noise = ldexp(1, frexp(cell)[1] - JITTER_ORDERS_MAGNITUDE) - - return cell + random_state.uniform(-max_noise, max_noise) - -jitter = vectorize(jitter_cell) - - def generate_gmtk_obj_names(obj, track_names, num_segs, num_subsegs, distribution, num_mix_components): """ @@ -178,19 +161,19 @@ def generate_gmtk_obj_names(obj, track_names, num_segs, num_subsegs, :param: distribution: str: distribution :param: number of mixture components :return: - """ + """ allowed_types = ["mx", "mc_diag", "mc_gamma", "mc_missing", "mean", "covar", "col", "mx_name", "dpmf", "gammascale", "gammashape", "tied_covar"] - if not obj in allowed_types: + if not obj in allowed_types: raise ValueError("Undefined GMTK object type: {}".format(obj)) - + names = [] if obj == "covar": for name in track_names: names.append("covar_{}".format(name)) - - + + # todo check component suffix elif obj == "tied_covar": for name in track_names: @@ -244,27 +227,24 @@ class ParamSpec(object): object_tmpl = None copy_attrs = ["distribution", "mins", "num_segs", "num_subsegs", "num_track_groups", "track_groups", "num_mix_components", - "means", "vars", "num_mix_components", "random_state", "tracks"] + "means", "vars", "num_mix_components", "random_state", + "tracks", "resolution", "card_seg_countdown", "seg_table", + "seg_countdowns_initial"] jitter_std_bound = 0.2 track_names = [] + def __init__(self, saver): # copy all variables from saver that it copied from Runner # XXX: override in subclasses to only copy subset 
copy_attrs(saver, self, self.copy_attrs) - self.track_names = [] - #print(self.tracks) + self.track_names = [] for track in self.tracks: - # print(track) self.track_names.append(track.name) - #print("track_names", self.track_names) - - def make_segnames(self): - return format_indexed_strs("seg", self.num_segs) - - def make_subsegnames(self): - return format_indexed_strs("subseg", self.num_subsegs) + def __str__(self): + return make_spec(self.type_name, self.generate_objects()) + def make_data(self): """ override this in subclasses @@ -272,114 +252,34 @@ def make_data(self): """ return None - def get_track_lt_min(self, track_index): - """ - returns a value less than a minimum in a track - """ - # XXX: refactor into a new function - min_track = self.mins[track_index] - - # fudge the minimum by a very small amount. this is not - # continuous, but hopefully we won't get values where it - # matters - # XXX: restore this after GMTK issues fixed - # if min_track == 0.0: - # min_track_fudged = FUDGE_TINY - # else: - # min_track_fudged = min_track - ldexp(abs(min_track), FUDGE_EP) - - # this happens for really big numbers or really small - # numbers; you only have 7 orders of magnitude to play - # with on a float32 - min_track_f32 = float32(min_track) - - assert min_track_f32 - float32(ABSOLUTE_FUDGE) != min_track_f32 - return min_track - ABSOLUTE_FUDGE - - def get_template_component_suffix(self, component_number): - """Returns the subsitution for the component suffix in the GMTK model - template. 
Empty if there is only one component""" - if self.num_mix_components == 1: - return "" - else: - return "_component{}".format(component_number) - - def generate_tmpl_mappings(self): - # need segnames because in the tied covariance case, the - # segnames are replaced by "any" (see .make_covar_spec()), - # and only one mapping is produced - #print("gen tmpl mapping used") - num_subsegs = self.num_subsegs - - track_groups = self.track_groups - num_track_groups = self.num_track_groups - - for seg_index, segname in enumerate(self.make_segnames()): - seg_offset = num_track_groups * num_subsegs * seg_index - - for subseg_index, subsegname in enumerate(self.make_subsegnames()): - subseg_offset = seg_offset + (num_track_groups * subseg_index) - - for track_group_index, track_group in enumerate(track_groups): - track_offset = subseg_offset + track_group_index - head_trackname = track_group[0].name - - # XXX: change name of index to track_offset in templates - # XXX: change name of track_index to track_group_index - yield dict(seg=segname, subseg=subsegname, - track=head_trackname, seg_index=seg_index, - subseg_index=subseg_index, - track_index=track_group_index, - index=track_offset, - distribution=self.distribution) - - def generate_objects(self): - """ - returns: iterable of strs containing GMTK parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - - data = self.make_data() - - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - - yield substitute(mapping) - - def __str__(self): - return make_spec(self.type_name, self.generate_objects()) - def generate_name_collection(self): - # generate list of collection names + # generate 
list of collection names collection_names = generate_gmtk_obj_names(obj="col", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) - # generate list of all names in NameCollections + track_names=self.track_names, + num_segs=self.num_segs, + num_subsegs=self.num_subsegs, + distribution=self.distribution, + num_mix_components=self.num_mix_components) + #  generate list of all names in NameCollections names = generate_gmtk_obj_names("mx_name", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) + track_names=self.track_names, + num_segs=self.num_segs, + num_subsegs=self.num_subsegs, + distribution=self.distribution, + num_mix_components=self.num_mix_components) num_tracks = len(self.track_names) len_name_group = int(len(names) / num_tracks) # names grouped by collection - name_groups = [names[i:i + len_name_group] for i in range(0, len(names), len_name_group)] - # create NameCollection objects + name_groups = [names[i:i + len_name_group] for i in + range(0, len(names), len_name_group)] + # create NameCollection objects and add to + # input_master.name_collection: InlineSection for group_index in range(len(name_groups)): - name_col = NameCollection(collection_names[group_index], - name_groups[group_index]) - input_master.update(name_col) + input_master.name_collection[collection_names[group_index]] = \ + NameCollection(name_groups[group_index]) + + return input_master.name_collection.__str__() - return input_master.generate_name_col() def make_mean_data(self): num_segs = self.num_segs @@ -395,43 +295,54 @@ def make_mean_data(self): jitter_std_bound = self.jitter_std_bound noise = self.random_state.uniform(-jitter_std_bound, - jitter_std_bound, stds_tiled.shape) + jitter_std_bound, stds_tiled.shape) return means_tiled + (stds_tiled * noise) def 
generate_mean_objects(self): - # generate list of names of Mean objects + # generate list of names of Mean objects names = generate_gmtk_obj_names("mean", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) - means = self.make_mean_data().tolist() - # TODO change array rep - # create Mean objects - for i in range(len(names)): - mean_obj = Mean(names[i], means[i]) - input_master.update(mean_obj) - - return input_master.generate_mean() + track_names=self.track_names, + num_segs=self.num_segs, + num_subsegs=self.num_subsegs, + distribution=self.distribution, + num_mix_components=self.num_mix_components) + means = self.make_mean_data() # array + # dimensions of means: num_segs x num_subsegs x num_tracks + # create Mean objects + num_segs, num_subsegs, num_tracks = means.shape + names_array = array(names).reshape(means.shape) + for i in range(num_segs): + for j in range(num_subsegs): + for k in range(num_tracks): + input_master.mean[names_array[i, j, k]] = Mean(means[i, j, k]) + + return input_master.mean.__str__() + def generate_covar_objects(self): if COVAR_TIED: names = generate_gmtk_obj_names("tied_covar", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) + track_names=self.track_names, + num_segs=self.num_segs, + num_subsegs=self.num_subsegs, + distribution=self.distribution, + num_mix_components=self.num_mix_components) else: names = generate_gmtk_obj_names("covar", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) - covars = self.vars.tolist() # list of variance values - # create Covar objects + track_names=self.track_names, + num_segs=self.num_segs, + num_subsegs=self.num_subsegs, + distribution=self.distribution, + 
num_mix_components=self.num_mix_components) + covars = self.vars # array of variance values + # create Covar objects for i in range(len(names)): - covar_obj = Covar(names[i], covars[i]) - input_master.update(covar_obj) + input_master.covar[names[i]] = Covar(covars[i]) # TODO index error + + return input_master.covar.__str__() + - return input_master.generate_covar() def generate_real_mat_objects(self): pass @@ -444,202 +355,103 @@ def generate_mc_objects(self): option = "mc_missing" else: option = "mc_diag" - # generate MC object names + # generate MC object names names = generate_gmtk_obj_names(option, - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) - covars = list(input_master.covar.values())* (self.num_segs * self.num_subsegs) # replicate covar values - # create MC objects + track_names=self.track_names, + num_segs=self.num_segs, + num_subsegs=self.num_subsegs, + distribution=self.distribution, + num_mix_components=self.num_mix_components) + + covar_names = list(input_master.mc.covar) * ( + self.num_segs * self.num_subsegs) + # replicate covar names for iteration + mean_names = list(input_master.mc.mean) + # list of all mean names + + # create MC objects for i in range(len(names)): - mc_obj = MC(name=names[i], dim=1, type="COMPONENT_TYPE_DIAG_GAUSSIAN", - mean=list(input_master.mean.values())[i], covar=covars[i]) - input_master.update(mc_obj) - - # if distribution is gamma - elif self.distribution == DISTRIBUTION_GAMMA: - option = "mc_gamma" - names = generate_gmtk_obj_names(option, - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) - # generate gammashape and gammascale names for MC objects - gamma_scale = generate_gmtk_obj_names("gammascale", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, 
distribution=self.distribution, - num_mix_components=self.num_mix_components) - - gamma_shape = generate_gmtk_obj_names("gammashape", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) - # create MC objects - for i in range(len(names)): - mc_obj = MC(name=names[i], dim=1, type="COMPONENT_TYPE_GAMMA", - gamma_shape=gamma_shape[i], gamma_scale=gamma_scale[i]) - input_master.update(mc_obj) - return input_master.generate_mc() + input_master.mc[names[i]] = DiagGaussianMC(mean=mean_names[i], + covar=covar_names[i]) + return input_master.mc.__str__() + + # # TODO if distribution is gamma + # elif self.distribution == DISTRIBUTION_GAMMA: + # option = "mc_gamma" + # names = generate_gmtk_obj_names(option, + # track_names=self.track_names, + # num_segs=self.num_segs, + # num_subsegs=self.num_subsegs, + # distribution=self.distribution, + # num_mix_components=self.num_mix_components) + # # generate gammashape and gammascale names for MC objects + # gamma_scale = generate_gmtk_obj_names("gammascale", + # track_names=self.track_names, + # num_segs=self.num_segs, + # num_subsegs=self.num_subsegs, + # distribution=self.distribution, + # num_mix_components=self.num_mix_components) + # + # gamma_shape = generate_gmtk_obj_names("gammashape", + # track_names=self.track_names, + # num_segs=self.num_segs, + # num_subsegs=self.num_subsegs, + # distribution=self.distribution, + # num_mix_components=self.num_mix_components) + # # create MC objects + # for i in range(len(names)): + # mc_obj = MC(name=names[i], dim=1, type="COMPONENT_TYPE_GAMMA", + # gamma_shape=gamma_shape[i], + # gamma_scale=gamma_scale[i]) + # input_master.update(mc_obj) def generate_mx_objects(self): - # generate list of MX names + # generate list of MX names names = generate_gmtk_obj_names("mx", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, 
distribution=self.distribution, - num_mix_components=self.num_mix_components) - - mc_obj = list(input_master.mc.values()) - dpmf_obj = list(input_master.dpmf.values()) - multiple = int(len(names)/len(dpmf_obj)) - dpmf_obj *= multiple # replicate dpmf obj as MX obj components - # create MX objects + track_names=self.track_names, + num_segs=self.num_segs, + num_subsegs=self.num_subsegs, + distribution=self.distribution, + num_mix_components=self.num_mix_components) + + mc_names = list(input_master.mc) # list of all mc names + dpmf_names = list(input_master.dpmf) # list of all dpmf names + multiple = int(len(names) / len(dpmf_names)) + dpmf_names *= multiple + # replicate dpmf names for iteration + + # create MX objects for i in range(len(names)): - mx_obj = MX(name=names[i], dim=1, dpmf=dpmf_obj[i], - components=mc_obj[i]) - input_master.update(mx_obj) - return input_master.generate_mx() + input_master.mx[names[i]] = MX(dpmf=dpmf_names[i], + components=mc_names[i]) + return input_master.mx.__str__() + def generate_dpmf_objects(self): - # generate a list of dpmf names + # generate a list of dpmf names names = generate_gmtk_obj_names("dpmf", - track_names=self.track_names, num_segs=self.num_segs, - num_subsegs=self.num_subsegs, distribution=self.distribution, - num_mix_components=self.num_mix_components) - # if single dpmf + track_names=self.track_names, + num_segs=self.num_segs, + num_subsegs=self.num_subsegs, + distribution=self.distribution, + num_mix_components=self.num_mix_components) + # if single dpmf if self.num_mix_components == 1: - dpmf_obj = DPMF(names[0], 1.0) - input_master.update(dpmf_obj) - else: - # uniform probabilities + input_master.dpmf[names[0]] = DPMF(1.0) + else: + # uniform probabilities dpmf_values = str(round(1.0 / self.num_mix_components, ROUND_NDIGITS)) - # create dpmf objects + # create dpmf objects for i in range(len(names)): - dpmf_obj = DPMF(names[i], dpmf_values[i]) - input_master.update(dpmf_obj) - return input_master.generate_dpmf() 
- - def generate_ve(self): - # TODO - pass - - def generate_dense_cpt_objects(self): - names = ["start_seg", "seg_subseg", "seg_seg", "seg_subseg_subseg"] - card = [self.num_segs, self.num_subsegs, self.num_segs, self.num_subsegs] - parent_card = [-1, self.num_segs, self.num_segs, [self.num_segs, - self.num_subsegs]] - start_seg = [1.0 / self.num_segs, self.num_segs] - seg_subseg = fill_array(1.0 / self.num_subsegs, (self.num_segs, - self.num_subsegs)).tolist() - seg_seg = make_zero_diagonal_table(self.num_segs) - cpt_seg = make_zero_diagonal_table(self.num_subsegs) - seg_subseg_subseg = (vstack_tile(cpt_seg, self.num_segs, 1)).tolist() - prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg] - # TODO last dense cpt segTransition - for i in range(len(names)): - dense_cpt = DenseCPT(name=names[i], parent_card=parent_card[i], - cardinality=card[i], prob=prob[i]) - input_master.update(dense_cpt) - return input_master.generate_dense() - - def make_dinucleotide_table_row(self): - pass + input_master.dpmf[names[i]] = DPMF(dpmf_values[i]) + return input_master.dpmf.__str__() - def make_seg_dinucleotide(self): - pass - - def make_segCountDown_seg_segTransition(self): - name = "segCountDown_seg_segTransition" - # parent_card = - # card = - pass - - def generate_objects(self): + def generate_ve(self): + #  TODO pass - -class DTParamSpec(ParamSpec): - type_name = "DT" - copy_attrs = ParamSpec.copy_attrs + ["seg_countdowns_initial", - "supervision_type"] - - def make_segCountDown_tree_spec(self, resourcename): # noqa - num_segs = self.num_segs - seg_countdowns_initial = self.seg_countdowns_initial - - header = ([str(num_segs)] + - [str(num_seg) for num_seg in range(num_segs - 1)] + - ["default"]) - - lines = [" ".join(header)] - - for seg, seg_countdown_initial in enumerate(seg_countdowns_initial): - lines.append(" -1 %d" % seg_countdown_initial) - - tree = "\n".join(lines) - - return resource_substitute(resourcename)(tree=tree) - - def 
make_map_seg_segCountDown_dt_spec(self): # noqa - return self.make_segCountDown_tree_spec("map_seg_segCountDown.dt.tmpl") - - def make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec(self): # noqa - template_name = \ - "map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl" - return self.make_segCountDown_tree_spec(template_name) - - def generate_objects(self): - yield data_string("map_frameIndex_ruler.dt.txt") - yield self.make_map_seg_segCountDown_dt_spec() - yield self.make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec() # noqa - yield data_string("map_seg_subseg_obs.dt.txt") - - supervision_type = self.supervision_type - if supervision_type == SUPERVISION_SEMISUPERVISED: - yield data_string("map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt") # noqa - elif supervision_type == SUPERVISION_SUPERVISED: - # XXX: does not exist yet - yield data_string("map_supervisionLabel_seg_alwaysTrue_supervised.dt.txt") # noqa - else: - assert supervision_type == SUPERVISION_UNSUPERVISED - - -class VirtualEvidenceSpec(ParamSpec): - type_name = "VE_CPT" - - # According to GMTK specification (tksrc/GMTK_VECPT.cc) - # this should be of the format: - # CPT_name num_par par_card self_card VE_CPT_FILE - # nfs:nfloats nis:nints ... fmt:obsformat ... 
END - object_tmpl = "seg_virtualEvidence 1 %s 2 %s nfs:%s nis:0 fmt:ascii END" - copy_attrs = ParamSpec.copy_attrs + ["virtual_evidence", "num_segs"] - - def make_virtual_evidence_spec(self): - return self.object_tmpl % (self.num_segs, VIRTUAL_EVIDENCE_LIST_FILENAME, self.num_segs) - - def generate_objects(self): - yield self.make_virtual_evidence_spec() - - - -class TableParamSpec(ParamSpec): - copy_attrs = ParamSpec.copy_attrs \ - + ["resolution", "card_seg_countdown", "seg_table", - "seg_countdowns_initial"] - - # see Segway paper - probs_force_transition = array([0.0, 0.0, 1.0]) - - def make_table_spec(self, name, table, ndim, extra_rows=[]): - header_rows = [name, ndim] - header_rows.extend(table.shape) - - rows = [" ".join(map(str, header_rows))] - rows.extend(extra_rows) - rows.extend([array2text(table), ""]) - - return "\n".join(rows) - def calc_prob_transition(self, length): """Calculate probability transition from scaled expected length. """ @@ -653,6 +465,10 @@ def calc_prob_transition(self, length): def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # first values are the ones where segCountDown = 0 therefore # the transitions to segTransition = 2 occur early on + + # see Segway paper + probs_force_transition = array([0.0, 0.0, 1.0]) + card_seg_countdown = self.card_seg_countdown # by default, when segCountDown is high, never transition @@ -691,8 +507,8 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # labels with a maximum seg_countdowns_initial = self.seg_countdowns_initial - - res[0, labels_with_maximum] = self.probs_force_transition + res[0, labels_with_maximum] = probs_force_transition + # res[0, labels_with_maximum] = self.probs_force_transition for label in labels_with_maximum: seg_countdown_initial = seg_countdowns_initial[label] minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] @@ -705,179 +521,110 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa return res - @staticmethod - def 
make_dirichlet_name(name): - return "dirichlet_%s" % name - - -class RealMatParamSpec(ParamSpec): - type_name = "REAL_MAT" - - def generate_objects(self): - yield "matrix_weightscale_1x1 1 1 1.0" - - -class GammaRealMatParamSpec(RealMatParamSpec): - scale_tmpl = "gammascale_${seg}_${subseg}_${track} 1 1 ${datum}" - shape_tmpl = "gammashape_${seg}_${subseg}_${track} 1 1 ${datum}" - - copy_attrs = ParamSpec.copy_attrs \ - + ["means", "random_state", "vars"] - - def generate_objects(self): - means = self.means - vars = self.vars - - substitute_scale = Template(self.scale_tmpl).substitute - substitute_shape = Template(self.shape_tmpl).substitute - - # random start values are equivalent to the random start - # values of a Gaussian: - # - # means = scales * shapes - # vars = shapes * scales**2 - # - # therefore: - scales = vars / means - shapes = (means ** 2) / vars - - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - - scale = jitter(scales[track_index], self.random_state) - yield substitute_scale(dict(datum=scale, **mapping)) - - shape = jitter(shapes[track_index], self.random_state) - yield substitute_shape(dict(datum=shape, **mapping)) - -class TableParamSpec(ParamSpec): - copy_attrs = ParamSpec.copy_attrs \ - + ["resolution", "card_seg_countdown", "seg_table", - "seg_countdowns_initial"] - - # see Segway paper - probs_force_transition = array([0.0, 0.0, 1.0]) - - def make_table_spec(self, name, table, ndim, extra_rows=[]): - header_rows = [name, ndim] - header_rows.extend(table.shape) - - rows = [" ".join(map(str, header_rows))] - rows.extend(extra_rows) - rows.extend([array2text(table), ""]) - - return "\n".join(rows) - - def calc_prob_transition(self, length): - """Calculate probability transition from scaled expected length. 
- """ - length_scaled = length // self.resolution - - prob_self_self = prob_transition_from_expected_len(length_scaled) - prob_self_other = 1.0 - prob_self_self - - return prob_self_self, prob_self_other - - def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa - # first values are the ones where segCountDown = 0 therefore - # the transitions to segTransition = 2 occur early on - card_seg_countdown = self.card_seg_countdown - - # by default, when segCountDown is high, never transition - res = empty((card_seg_countdown, self.num_segs, CARD_SEGTRANSITION)) - - prob_seg_self_self, prob_seg_self_other = \ - self.calc_prob_transition(LEN_SEG_EXPECTED) + def generate_dense_cpt_objects(self): + # names of dense cpts + names = ["start_seg", "seg_subseg", "seg_seg", "seg_subseg_subseg", + "segCountDown_seg_segTransition"] + num_segs = self.num_segs + num_subsegs = self.num_subsegs - prob_subseg_self_self, prob_subseg_self_other = \ - self.calc_prob_transition(LEN_SUBSEG_EXPECTED) + # create required probability tables + start_seg = fill_array(1.0 / num_segs, num_segs) + seg_subseg = fill_array(1.0 / num_subsegs, (num_segs, num_subsegs)) + seg_seg = make_zero_diagonal_table(num_segs) + cpt_seg = make_zero_diagonal_table(num_subsegs) + seg_subseg_subseg = (vstack_tile(cpt_seg, num_segs, 1)) + segCountDown = self.make_dense_cpt_segCountDown_seg_segTransition() + #rint(start_seg.shape, "\n", seg_subseg.shape, "\n", seg_seg.shape, "\n",seg_subseg_subseg.shape, "\n",segCountDown.shape) + prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg, segCountDown] + # create DenseCPTs and add to input_master.dense_cpt: InlineSection + # object + for i in range(len(names)): + input_master.dense_cpt[names[i]] = DenseCPT(prob[i]) - # 0: no transition - # 1: subseg transition (no transition when CARD_SUBSEG == 1) - # 2: seg transition - probs_allow_transition = \ - array([prob_seg_self_self * prob_subseg_self_self, - prob_seg_self_self * prob_subseg_self_other, - 
prob_seg_self_other]) + return input_master.dense_cpt.__str__() - probs_prevent_transition = array([prob_subseg_self_self, - prob_subseg_self_other, - 0.0]) + def make_dinucleotide_table_row(self): + pass - # find the labels with maximum segment lengths and those without - table = self.seg_table - ends = table[:, OFFSET_END] - bitmap_without_maximum = ends == 0 + def make_seg_dinucleotide(self): + pass - # where() returns a tuple; this unpacks it - labels_with_maximum, = where(~bitmap_without_maximum) - labels_without_maximum, = where(bitmap_without_maximum) - # labels without a maximum - res[0, labels_without_maximum] = probs_allow_transition - res[1:, labels_without_maximum] = probs_prevent_transition +class DTParamSpec(ParamSpec): + type_name = "DT" + copy_attrs = ParamSpec.copy_attrs + ["seg_countdowns_initial", + "supervision_type"] - # labels with a maximum + def make_segCountDown_tree_spec(self, resourcename): # noqa + num_segs = self.num_segs seg_countdowns_initial = self.seg_countdowns_initial - res[0, labels_with_maximum] = self.probs_force_transition - for label in labels_with_maximum: - seg_countdown_initial = seg_countdowns_initial[label] - minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] - - seg_countdown_allow = seg_countdown_initial - minimum + 1 + header = ([str(num_segs)] + + [str(num_seg) for num_seg in range(num_segs - 1)] + + ["default"]) - res[1:seg_countdown_allow, label] = probs_allow_transition - res[seg_countdown_allow:, label] = probs_prevent_transition + lines = [" ".join(header)] - return res + for seg, seg_countdown_initial in enumerate(seg_countdowns_initial): + lines.append(" -1 %d" % seg_countdown_initial) + tree = "\n".join(lines) - @staticmethod - def make_dirichlet_name(name): - return "dirichlet_%s" % name + return resource_substitute(resourcename)(tree=tree) + def make_map_seg_segCountDown_dt_spec(self): # noqa + return self.make_segCountDown_tree_spec("map_seg_segCountDown.dt.tmpl") -class 
DirichletTabParamSpec(TableParamSpec): - type_name = "DIRICHLET_TAB" - copy_attrs = TableParamSpec.copy_attrs \ - + ["len_seg_strength", "num_bases", "card_seg_countdown", - "num_mix_components"] + def make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec( + self): # noqa + template_name = \ + "map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl" + return self.make_segCountDown_tree_spec(template_name) - def make_table_spec(self, name, table): - dirichlet_name = self.make_dirichlet_name(name) + def generate_objects(self): + yield data_string("map_frameIndex_ruler.dt.txt") + yield self.make_map_seg_segCountDown_dt_spec() + yield self.make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec() # noqa + yield data_string("map_seg_subseg_obs.dt.txt") - return TableParamSpec.make_table_spec(self, dirichlet_name, table, - table.ndim) + supervision_type = self.supervision_type + if supervision_type == SUPERVISION_SEMISUPERVISED: + yield data_string( + "map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt") # noqa + elif supervision_type == SUPERVISION_SUPERVISED: + # XXX: does not exist yet + yield data_string( + "map_supervisionLabel_seg_alwaysTrue_supervised.dt.txt") # noqa + else: + assert supervision_type == SUPERVISION_UNSUPERVISED - def make_dirichlet_table(self): - probs = self.make_dense_cpt_segCountDown_seg_segTransition() - # XXX: the ratio is not exact as num_bases is not the same as - # the number of base-base transitions. 
It is surely close - # enough, though - total_pseudocounts = self.len_seg_strength * self.num_bases - divisor = self.card_seg_countdown * self.num_segs - pseudocounts_per_row = total_pseudocounts / divisor +class VirtualEvidenceSpec(ParamSpec): + type_name = "VE_CPT" - # astype(int) means flooring the floats - pseudocounts = (probs * pseudocounts_per_row).astype(int) + # According to GMTK specification (tksrc/GMTK_VECPT.cc) + # this should be of the format: + # CPT_name num_par par_card self_card VE_CPT_FILE + # nfs:nfloats nis:nints ... fmt:obsformat ... END + object_tmpl = "seg_virtualEvidence 1 %s 2 %s nfs:%s nis:0 fmt:ascii END" + copy_attrs = ParamSpec.copy_attrs + ["virtual_evidence", "num_segs"] - return pseudocounts + def make_virtual_evidence_spec(self): + return self.object_tmpl % ( + self.num_segs, VIRTUAL_EVIDENCE_LIST_FILENAME, self.num_segs) def generate_objects(self): - # XXX: these called functions have confusing/duplicative names - if self.len_seg_strength > 0: - dirichlet_table = self.make_dirichlet_table() - yield self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, - dirichlet_table) + yield self.make_virtual_evidence_spec() + class InputMasterSaver(Saver): resource_name = "input.master.tmpl" copy_attrs = ["num_bases", "num_segs", "num_subsegs", "num_track_groups", "card_seg_countdown", "seg_countdowns_initial", "seg_table", "distribution", - "len_seg_strength", "resolution", "random_state", "supervision_type", + "len_seg_strength", "resolution", "random_state", + "supervision_type", "use_dinucleotide", "mins", "means", "vars", "gmtk_include_filename_relative", "track_groups", "num_mix_components", "virtual_evidence", "tracks"] @@ -907,10 +654,12 @@ def make_mapping(self): # seg_seg num_free_params += fullnum_subsegs * (fullnum_subsegs - 1) - + # segCountDown_seg_segTransition num_free_params += fullnum_subsegs + name_collection_spec = param_spec.generate_name_collection() + distribution = self.distribution if distribution in 
DISTRIBUTIONS_LIKE_NORM: mean_spec = param_spec.generate_mean_objects() @@ -940,11 +689,10 @@ def make_mapping(self): num_free_params += (fullnum_subsegs * 2) * num_track_groups else: raise ValueError("distribution %s not supported" % distribution) + dpmf_spec = param_spec.generate_dpmf_objects() mx_spec = param_spec.generate_mx_objects() card_seg = num_segs ve_spec = VirtualEvidenceSpec(self) return locals() # dict of vars set in this function - - From 0c31df49a93916f346e22ef381f6297db8d15f9e Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 6 Aug 2020 16:13:58 -0400 Subject: [PATCH 11/72] Update gmtk.py --- segway/gmtk.py | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 77a4b2ab..1bbfb19d 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -35,7 +35,7 @@ def __new__(cls, *args): input_array = array(args) obj = np.asarray(input_array).view(cls) return obj - + class Section(OrderedDict): """ Contains GMTK objects of a single type and supports writing them to file. @@ -70,6 +70,7 @@ def __setattr__(self, key, value): else: super(Section, self).__setattr__(key, value) + class InlineSection(Section): def __str__(self): @@ -113,7 +114,8 @@ def __init__(self, mean, covar): def __setattr__(self, key, value): OrderedDict.__setattr__(self, key, value) - + + def __str__(self): """ Returns string representation of all MC objects contained in this @@ -143,8 +145,8 @@ def __str__(self): lines.append("\n") return "\n".join(lines) - - + + class InlineMXSection(InlineSection): """ Special InlineSection subclass which contains MX objects. 
@@ -198,6 +200,7 @@ def __str__(self): lines.append("\n") return "\n".join(lines) + class InputMaster: """ Master class which contains all GMTK objects present in the input @@ -237,23 +240,13 @@ def __str__(self): :return: """ attrs = [self.deterministic_cpt, self.name_collection, self.mean, - self.covar, self.dense_cpt, self.dpmf, self.mc, self.mx] + self.covar, self.dense_cpt, self.dpmf, self.mc, self.mx] s = [] for obj in attrs: s.append("".join(obj.__str__())) return "".join(s) - - def save(self, filename): - """ - Opens filename for writing and writes out the contents of its attributes. - :param filename: str - :return: None - """ - with open(filename, 'w') as file: - print('# include "traindir/auxiliary/segway.inc"', file=file) - print(self, file=file) class DenseCPT(Array): @@ -274,7 +267,7 @@ def __str__(self): if len(new_shape) == 1: new_shape = (new_shape[0], ) self.reshape((new_shape)) - + num_parents = len(self.shape) - 1 line.append(str(num_parents)) # number of parents cardinality_line = map(str, self.shape) @@ -284,6 +277,7 @@ def __str__(self): return "\n".join(line) + class NameCollection(list): """ A single NameCollection object. @@ -295,21 +289,25 @@ def __init__(self, *args): Initialize a single NameCollection object. 
:param args: str: names in this NameCollection """ - list.__init__(self, list(args)) + if isinstance(args[0], list): # names in NameCollection have been given in a single list + list.__init__(self, []) + self.extend(args[0]) + else: + list.__init__(self, list(args)) def __str__(self): """ Returns string format of NameCollection object to be printed into the input.master file (new lines to be added) """ - line = [] if len(self) == 0: - return line + return "" else: + line = [] line.append(str(len(self))) line.extend(self) line.append("\n") - + list.__str__(self) return "\n".join(line) @@ -359,7 +357,7 @@ def __str__(self): line.append(array2text(self)) # covar values line.append("\n") return "\n".join(line) - + def get_dimension(self): """ Return dimension of this Covar object. @@ -391,6 +389,7 @@ def __str__(self): def get_length(self): return len(self) + class MC: """ A single MC object. @@ -405,7 +404,8 @@ def __init__(self, component_type): :param component_type: int: type of MC """ self.component_type = component_type - + + class DiagGaussianMC(MC): """ Attributes: @@ -469,6 +469,7 @@ def __str__(self): line.append(" ".join(self.components)) # component names return "\n".join(line) + class DeterministicCPT: """ A single DeterministicCPT object. 
@@ -491,7 +492,7 @@ def __init__(self, cardinality_parents, cardinality, dt): """ if not isinstance(cardinality_parents, tuple): self.cardinality_parents = (cardinality_parents, ) - else: + else: self.cardinality_parents = cardinality_parents self.cardinality = cardinality self.dt = dt From 97417b63719f132c45fad8ca7c7a7e7ce75c152e Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 6 Aug 2020 16:17:52 -0400 Subject: [PATCH 12/72] added save --- segway/gmtk.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index 1bbfb19d..ed75e128 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -248,6 +248,16 @@ def __str__(self): return "".join(s) + def save(self, filename): + """ + Opens filename for writing and writes out the contents of its attributes. + :param filename: str + :return: + """ + with open(filename, 'w') as file: + print('# include "traindir/auxiliary/segway.inc"', file=file) + print(self, file=file) + class DenseCPT(Array): """ From a8293280228e017419df89df746f64c61280bdf7 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 6 Aug 2020 16:26:57 -0400 Subject: [PATCH 13/72] Update gmtk.py --- segway/gmtk.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index ed75e128..5aa074e6 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -254,10 +254,10 @@ def save(self, filename): :param filename: str :return: """ - with open(filename, 'w') as file: - print('# include "traindir/auxiliary/segway.inc"', file=file) - print(self, file=file) - + with open(filename, 'w') as filename: + print('# include "traindir/auxiliary/segway.inc"', file=filename) + print(self, file=filename) + class DenseCPT(Array): """ From 42e109c584cc100ed15599943f8549ddcc72d639 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan 
<48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 6 Aug 2020 16:29:35 -0400 Subject: [PATCH 14/72] Update gmtk.py --- segway/gmtk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 5aa074e6..9b423dd1 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -255,7 +255,7 @@ def save(self, filename): :return: """ with open(filename, 'w') as filename: - print('# include "traindir/auxiliary/segway.inc"', file=filename) + print('''# include "traindir/auxiliary/segway.inc"''', file=filename) print(self, file=filename) From a73b6920eaa60029614cbd98a55a55623e0e1b98 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 6 Aug 2020 16:32:23 -0400 Subject: [PATCH 15/72] Update gmtk.py --- segway/gmtk.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 9b423dd1..7dcdc40f 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -248,15 +248,15 @@ def __str__(self): return "".join(s) - def save(self, filename): - """ - Opens filename for writing and writes out the contents of its attributes. - :param filename: str - :return: - """ - with open(filename, 'w') as filename: - print('''# include "traindir/auxiliary/segway.inc"''', file=filename) - print(self, file=filename) +# def save(self, filename): +# """ +# Opens filename for writing and writes out the contents of its attributes. 
+# :param filename: str +# :return: +# """ +# with open(filename, 'w') as filename: +# print('''# include "traindir/auxiliary/segway.inc"''', file=filename) +# print(self, file=filename) class DenseCPT(Array): From 93165deedbd2dbb421a039deb887ba005abb2cc9 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 6 Aug 2020 16:40:58 -0400 Subject: [PATCH 16/72] Update gmtk.py --- segway/gmtk.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index 7dcdc40f..6d9f627c 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -42,6 +42,12 @@ class Section(OrderedDict): Key: name of GMTK object Value: GMTK object """ + def __init__(self): + """ + Initialize an empty Section object. + """ + super(Section, self).__init__() + def kind(self): """ Return string attribute kind of all GMTK objects in this Section object. From 2569be6e3c353fafea3f80afe639c8b692ef5275 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 7 Aug 2020 00:05:03 -0400 Subject: [PATCH 17/72] Update gmtk.py --- segway/gmtk.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 6d9f627c..6df7c014 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -114,9 +114,11 @@ def __init__(self, mean, covar): :param covar: InlineSection: InlineSection object which point to InputMaster.covar """ + super(InlineSection, self).__init__() self.mean = mean self.covar = covar - InlineSection.__init__(self) + + # InlineSection.__init__(self) def __setattr__(self, key, value): OrderedDict.__setattr__(self, key, value) @@ -168,9 +170,10 @@ def __init__(self, dpmf, mc): :param components: InlineSection: InlineSection object which point to InputMaster.mc """ + super(InlineMXSection, self).__init__() self.dpmf = dpmf self.mc = mc - InlineSection.__init__(self) + # InlineSection.__init__(self) def 
__setattr__(self, key, value): OrderedDict.__setattr__(self, key, value) @@ -306,10 +309,12 @@ def __init__(self, *args): :param args: str: names in this NameCollection """ if isinstance(args[0], list): # names in NameCollection have been given in a single list - list.__init__(self, []) + super(NameCollection, self).__init__([]) + #list.__init__(self, []) self.extend(args[0]) else: - list.__init__(self, list(args)) + super(NameCollection, self).__init__(list(args)) + # list.__init__(self, list(args)) def __str__(self): """ @@ -436,9 +441,10 @@ def __init__(self, mean, covar): :param covar: name of Covar obejct associated to this MC """ # more component types? + super(DiagGaussianMC, self).__init__(COMPONENT_TYPE_DIAG_GAUSSIAN) self.mean = mean self.covar = covar - MC.__init__(self, COMPONENT_TYPE_DIAG_GAUSSIAN) + #MC.__init__(self, COMPONENT_TYPE_DIAG_GAUSSIAN) def __str__(self): """ From 67da040da493bc723766bf8246eba89ad627e299 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 10 Aug 2020 11:06:36 -0400 Subject: [PATCH 18/72] added uniform_from_shape --- segway/gmtk.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index 6df7c014..64a265b6 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -35,6 +35,28 @@ def __new__(cls, *args): input_array = array(args) obj = np.asarray(input_array).view(cls) return obj + + @classmethod + def uniform_from_shape(cls, *shape): + """ + Instantiate Array of a specific shape with probabilities set uniformly + in each leaf. + :param shape: int + TODO assumptions about square matrix? 
+ :return: + """ + a = np.empty(shape) + + value = 1.0/shape[-1] # number of columns + a.fill(value) + + if len(shape) != 1: + # set diagonal elements to 0.0 + # TODO sq matrix assumption + diag_index = range(shape[0]) + a[diag_index, diag_index] = 0.0 + + return a class Section(OrderedDict): """ From 1f9496f5731a4af686f3cb7a89acf1635cdff62f Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Wed, 12 Aug 2020 10:45:33 -0400 Subject: [PATCH 19/72] InlineMCSection.__init__ --- segway/gmtk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 64a265b6..de9061c7 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -136,7 +136,7 @@ def __init__(self, mean, covar): :param covar: InlineSection: InlineSection object which point to InputMaster.covar """ - super(InlineSection, self).__init__() + super(InlineMCSection, self).__init__() self.mean = mean self.covar = covar From 506bb5c7217c8979cff8a295e5e9c00b0122ebde Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 17 Aug 2020 11:25:02 -0400 Subject: [PATCH 20/72] get and set attr in Section --- segway/gmtk.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index de9061c7..2f1f604a 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -70,6 +70,17 @@ def __init__(self): """ super(Section, self).__init__() + def __getattr__(self, name): + if not name.startswith('_'): + return self[name] + super(Section, self).__getattr__(name) + + def __setattr__(self, name, value): + if not name.startswith('_'): + self[name] = value + else: + super(Section, self).__setattr__(name, value) + def kind(self): """ Return string attribute kind of all GMTK objects in this Section object. 
From e92420d2bb9ee1eaeb6f87a85d69c0ff98136808 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 17 Aug 2020 11:30:36 -0400 Subject: [PATCH 21/72] Section.getattr and setattr --- segway/gmtk.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 2f1f604a..9b2a6f63 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -73,13 +73,15 @@ def __init__(self): def __getattr__(self, name): if not name.startswith('_'): return self[name] - super(Section, self).__getattr__(name) + OrderedDict.__getattr__(self, name) def __setattr__(self, name, value): if not name.startswith('_'): - self[name] = value + if not self.kind() == value.kind: + raise ValueError("Object has incorrect type.") +# self[name] = value else: - super(Section, self).__setattr__(name, value) + OrderedDict.__setattr__(self, name, value) def kind(self): """ @@ -95,19 +97,19 @@ def kind(self): return section_kind - def __setattr__(self, key, value): - """ - Check if all the GMTK objects are of the same type. - :param key: str: name of GMTK object - :param value: GMTK object - :return: - For now, single object - TODO, add multiple objects at once - """ - if not self.kind() == value.kind: - raise ValueError("Object has incorrect type.") - else: - super(Section, self).__setattr__(key, value) +# def __setattr__(self, key, value): +# """ +# Check if all the GMTK objects are of the same type. 
+# :param key: str: name of GMTK object +# :param value: GMTK object +# :return: +# For now, single object +# TODO, add multiple objects at once +# """ +# if not self.kind() == value.kind: +# raise ValueError("Object has incorrect type.") +# else: +# super(Section, self).__setattr__(key, value) class InlineSection(Section): From 0de65cf811a310bfcf57f31ced529d5f095f1112 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 17 Aug 2020 11:36:16 -0400 Subject: [PATCH 22/72] DiagGaussianMC subclass of object --- segway/gmtk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 9b2a6f63..1a9068fc 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -462,7 +462,7 @@ def __init__(self, component_type): self.component_type = component_type -class DiagGaussianMC(MC): +class DiagGaussianMC(MC, object): """ Attributes: component_type = 0 From 8ee8a97b7581d670158ab23a547c615ce9af1b8b Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 17 Aug 2020 11:39:43 -0400 Subject: [PATCH 23/72] Section.setattr --- segway/gmtk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 1a9068fc..037c908b 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -79,7 +79,7 @@ def __setattr__(self, name, value): if not name.startswith('_'): if not self.kind() == value.kind: raise ValueError("Object has incorrect type.") -# self[name] = value + self[name] = value else: OrderedDict.__setattr__(self, name, value) From a8c33864aebb9d3fded5f33c4f2a79c741df1eed Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 17 Aug 2020 11:49:52 -0400 Subject: [PATCH 24/72] generate_mean_obj --- segway/input_master.py | 1 + 1 file changed, 1 insertion(+) diff --git a/segway/input_master.py 
b/segway/input_master.py index 097dd546..f6d14f0b 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -308,6 +308,7 @@ def generate_mean_objects(self): distribution=self.distribution, num_mix_components=self.num_mix_components) means = self.make_mean_data() # array + print(means) # dimensions of means: num_segs x num_subsegs x num_tracks # create Mean objects num_segs, num_subsegs, num_tracks = means.shape From acd2fbe52b66094335191ac2773d7ee36204ab3b Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 17 Aug 2020 12:08:29 -0400 Subject: [PATCH 25/72] generate_mean_obj debug --- segway/input_master.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index f6d14f0b..5193e101 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -311,13 +311,12 @@ def generate_mean_objects(self): print(means) # dimensions of means: num_segs x num_subsegs x num_tracks # create Mean objects - num_segs, num_subsegs, num_tracks = means.shape - names_array = array(names).reshape(means.shape) - for i in range(num_segs): - for j in range(num_subsegs): - for k in range(num_tracks): - input_master.mean[names_array[i, j, k]] = Mean(means[i, j, k]) - + names_array = array(names).reshape((self.num_segs, self.num_subsegs, len(self.track_names))) + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for k in range(len(self.track_names)): + input_master.mean[names_array[i, j, k]] = Mean(means[k][i, 1, j]) + #Mean(means[i, j, k]) return input_master.mean.__str__() From 2b8cf943b7a277e98427b9497934dbd239e12ca5 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 17 Aug 2020 12:13:48 -0400 Subject: [PATCH 26/72] Update input_master.py --- segway/input_master.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/segway/input_master.py b/segway/input_master.py index 5193e101..6db727f6 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -308,14 +308,14 @@ def generate_mean_objects(self): distribution=self.distribution, num_mix_components=self.num_mix_components) means = self.make_mean_data() # array - print(means) + print("means", type(means), means) # dimensions of means: num_segs x num_subsegs x num_tracks # create Mean objects names_array = array(names).reshape((self.num_segs, self.num_subsegs, len(self.track_names))) for i in range(self.num_segs): for j in range(self.num_subsegs): for k in range(len(self.track_names)): - input_master.mean[names_array[i, j, k]] = Mean(means[k][i, 1, j]) + input_master.mean[names_array[i, j, k]] = Mean(means[k][i, :, j]) #Mean(means[i, j, k]) return input_master.mean.__str__() From 6fc3e2531eb32001f600b2de79e471db95184d12 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 18 Aug 2020 12:51:24 -0400 Subject: [PATCH 27/72] added head_track_names --- segway/input_master.py | 291 ++++++++++++++++++++--------------------- 1 file changed, 140 insertions(+), 151 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 6db727f6..bf42ed38 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -143,82 +143,6 @@ def make_zero_diagonal_table(length): return res -def generate_gmtk_obj_names(obj, track_names, num_segs, num_subsegs, - distribution, num_mix_components): - """ - Generate GMTK object names for the types: - NameCollection: "col" - entries in NameCollection: "mx_name" - Covar: "covar", "tied_covar" - Mean: "mean" - MX: "mx" - MC: "mc_diag", "mc_gamma", "mc_missing", "gammascale" - DPMF: "dpmf" - :param obj: str: type of gmtk object for which names must be generated - :param: track_names: list[str]: list of all track names - :param: num_segs: int: number of segs - :param: num_subsegs: int: number of subsegs - :param: 
distribution: str: distribution - :param: number of mixture components - :return: - """ - allowed_types = ["mx", "mc_diag", "mc_gamma", "mc_missing", "mean", - "covar", "col", "mx_name", "dpmf", "gammascale", - "gammashape", "tied_covar"] - if not obj in allowed_types: - raise ValueError("Undefined GMTK object type: {}".format(obj)) - - names = [] - if obj == "covar": - for name in track_names: - names.append("covar_{}".format(name)) - - - # todo check component suffix - elif obj == "tied_covar": - for name in track_names: - names.append("covar_{}".format(name)) - - elif obj == "col": - for name in track_names: - names.append("collection_seg_{}".format(name)) - - elif obj == "mx_name": - for name in track_names: - for i in range(num_segs): - for j in range(num_subsegs): - line = "mx_seg{}_subseg{}_{}".format(i, j, name) - names.append(line) - - elif obj == "dpmf" and num_mix_components == 1: - return ["dpmf_always"] - - else: - for i in range(num_segs): - for j in range(num_subsegs): - for name in track_names: - # TODO check component suffix diff - if obj == "mc_diag": - line = "mc_{}_seg{}_subseg{}_{}".format(distribution, - i, j, name) - # TODO - - # if obj == "mc_gamma": - # covered in general name generation - # line = "{}_{}_seg{}_subseg{}_{}".format(obj, - # distribution, i, j, name) - - # TODO - elif obj == "mc_missing": - line = "" - - else: - line = "{}_seg{}_subseg{}_{}".format(obj, i, j, name) - names.append(line) - - return names - - class ParamSpec(object): """ base class for parameter specifications used in input.master files @@ -252,22 +176,100 @@ def make_data(self): """ return None - def generate_name_collection(self): + def get_head_track_names(self): + """ + Return list of head track names. 
+ """ + print(self.track_groups, self.tracks) + head_track_names = [] + for group in self.track_groups: + head_track_names.append(group[0].name) + return head_track_names + + def generate_gmtk_obj_names(self, obj, track_names): + """ + Generate GMTK object names for the types: + NameCollection: "col" + entries in NameCollection: "mx_name" + Covar: "covar", "tied_covar" + Mean: "mean" + MX: "mx" + MC: "mc_diag", "mc_gamma", "mc_missing", "gammascale" + DPMF: "dpmf" + :param obj: str: type of gmtk object for which names must be generated + :param track_names: list[str]: list of track names + :return: list[str]: list of GMTK object names + """ + allowed_types = ["mx", "mc_diag", "mc_gamma", "mc_missing", "mean", + "covar", "col", "mx_name", "dpmf", "gammascale", + "gammashape", "tied_covar"] + if not obj in allowed_types: + raise ValueError("Undefined GMTK object type: {}".format(obj)) + num_segs = self.num_segs + num_subsegs = self.num_subsegs + distribution = self.distribution + num_mix_components = self.num_mix_components + names = [] + if obj == "covar": + for name in track_names: + names.append("covar_{}".format(name)) + # todo check component suffix + elif obj == "tied_covar": + for name in track_names: + names.append("covar_{}".format(name)) + + elif obj == "col": + for name in track_names: + names.append("collection_seg_{}".format(name)) + + elif obj == "mx_name": + for name in track_names: + for i in range(num_segs): + for j in range(num_subsegs): + line = "mx_seg{}_subseg{}_{}".format(i, j, name) + names.append(line) + + elif obj == "dpmf" and num_mix_components == 1: + return ["dpmf_always"] + + else: + for i in range(num_segs): + for j in range(num_subsegs): + for name in track_names: + # TODO check component suffix diff + if obj == "mc_diag": + line = "mc_{}_seg{}_subseg{}_{}".format(distribution, + i, j, name) + # TODO + + # if obj == "mc_gamma": + # covered in general name generation + # line = "{}_{}_seg{}_subseg{}_{}".format(obj, + # distribution, i, 
j, name) + + # TODO + elif obj == "mc_missing": + line = "" + + else: + line = "{}_seg{}_subseg{}_{}".format(obj, i, j, name) + names.append(line) + + return names + + + def generate_name_collection(self, track_names): + """ + Generate string representation of NameCollection objects in input master. + :param: track_names: list[str]: list of track names + """ # generate list of collection names - collection_names = generate_gmtk_obj_names(obj="col", - track_names=self.track_names, - num_segs=self.num_segs, - num_subsegs=self.num_subsegs, - distribution=self.distribution, - num_mix_components=self.num_mix_components) + collection_names = self.generate_gmtk_obj_names(obj="col", + track_names=track_names) #  generate list of all names in NameCollections - names = generate_gmtk_obj_names("mx_name", - track_names=self.track_names, - num_segs=self.num_segs, - num_subsegs=self.num_subsegs, - distribution=self.distribution, - num_mix_components=self.num_mix_components) - num_tracks = len(self.track_names) + names = self.generate_gmtk_obj_names("mx_name", + track_names=track_names) + num_tracks = len(track_names) len_name_group = int(len(names) / num_tracks) # names grouped by collection name_groups = [names[i:i + len_name_group] for i in @@ -299,42 +301,36 @@ def make_mean_data(self): return means_tiled + (stds_tiled * noise) - def generate_mean_objects(self): + def generate_mean_objects(self, track_names): + """ + Generate string representation of Mean objects in input master. 
+ :param: track_names: list[str]: list of track names + """ # generate list of names of Mean objects - names = generate_gmtk_obj_names("mean", - track_names=self.track_names, - num_segs=self.num_segs, - num_subsegs=self.num_subsegs, - distribution=self.distribution, - num_mix_components=self.num_mix_components) + names = self.generate_gmtk_obj_names("mean", + track_names=track_names) means = self.make_mean_data() # array - print("means", type(means), means) - # dimensions of means: num_segs x num_subsegs x num_tracks + # dimensions of means: num_segs x num_subsegs x num_head_tracks # create Mean objects - names_array = array(names).reshape((self.num_segs, self.num_subsegs, len(self.track_names))) + names_array = array(names).reshape((self.num_segs, self.num_subsegs, len(self.track_groups))) for i in range(self.num_segs): for j in range(self.num_subsegs): - for k in range(len(self.track_names)): - input_master.mean[names_array[i, j, k]] = Mean(means[k][i, :, j]) - #Mean(means[i, j, k]) + for k in range(len(self.track_groups)): + input_master.mean[names_array[i, j, k]] = Mean(means[i, j, k]) return input_master.mean.__str__() - def generate_covar_objects(self): + def generate_covar_objects(self, track_names): + """ + Generate string representation of Covar objects in input master. 
+ :param: track_names: list[str]: list of track names + """ if COVAR_TIED: - names = generate_gmtk_obj_names("tied_covar", - track_names=self.track_names, - num_segs=self.num_segs, - num_subsegs=self.num_subsegs, - distribution=self.distribution, - num_mix_components=self.num_mix_components) + names = self.generate_gmtk_obj_names("tied_covar", + track_names=track_names) else: - names = generate_gmtk_obj_names("covar", - track_names=self.track_names, - num_segs=self.num_segs, - num_subsegs=self.num_subsegs, - distribution=self.distribution, - num_mix_components=self.num_mix_components) + names = self.generate_gmtk_obj_names("covar", + track_names=track_names) covars = self.vars # array of variance values # create Covar objects for i in range(len(names)): @@ -342,12 +338,14 @@ def generate_covar_objects(self): return input_master.covar.__str__() - - def generate_real_mat_objects(self): pass - def generate_mc_objects(self): + def generate_mc_objects(self, track_names): + """ + Generate string representation of MC objects in input master. + :param: track_names: list[str]: list of track names + """ # if distribution is norm or asinh_norm if self.distribution in DISTRIBUTIONS_LIKE_NORM: if USE_MFSDG: @@ -356,12 +354,8 @@ def generate_mc_objects(self): else: option = "mc_diag" # generate MC object names - names = generate_gmtk_obj_names(option, - track_names=self.track_names, - num_segs=self.num_segs, - num_subsegs=self.num_subsegs, - distribution=self.distribution, - num_mix_components=self.num_mix_components) + names = self.generate_gmtk_obj_names(option, + track_names=track_names) covar_names = list(input_master.mc.covar) * ( self.num_segs * self.num_subsegs) @@ -405,20 +399,18 @@ def generate_mc_objects(self): # gamma_scale=gamma_scale[i]) # input_master.update(mc_obj) - def generate_mx_objects(self): + def generate_mx_objects(self, track_names): + """Generate string representation of MX objects in input master. 
+ :param: track_names: list[str]: list of track names + """ # generate list of MX names - names = generate_gmtk_obj_names("mx", - track_names=self.track_names, - num_segs=self.num_segs, - num_subsegs=self.num_subsegs, - distribution=self.distribution, - num_mix_components=self.num_mix_components) + names = self.generate_gmtk_obj_names("mx", + track_names=track_names) mc_names = list(input_master.mc) # list of all mc names dpmf_names = list(input_master.dpmf) # list of all dpmf names multiple = int(len(names) / len(dpmf_names)) - dpmf_names *= multiple - # replicate dpmf names for iteration + dpmf_names *= multiple # replicate dpmf names for iteration # create MX objects for i in range(len(names)): @@ -427,14 +419,13 @@ def generate_mx_objects(self): return input_master.mx.__str__() - def generate_dpmf_objects(self): + def generate_dpmf_objects(self, track_names): + """Generate string representation of DPMF objects in input master. + :param: track_names: list[str]: list of track names + """ # generate a list of dpmf names - names = generate_gmtk_obj_names("dpmf", - track_names=self.track_names, - num_segs=self.num_segs, - num_subsegs=self.num_subsegs, - distribution=self.distribution, - num_mix_components=self.num_mix_components) + names = self.generate_gmtk_obj_names("dpmf", + track_names=track_names) # if single dpmf if self.num_mix_components == 1: input_master.dpmf[names[0]] = DPMF(1.0) @@ -535,10 +526,8 @@ def generate_dense_cpt_objects(self): cpt_seg = make_zero_diagonal_table(num_subsegs) seg_subseg_subseg = (vstack_tile(cpt_seg, num_segs, 1)) segCountDown = self.make_dense_cpt_segCountDown_seg_segTransition() - #rint(start_seg.shape, "\n", seg_subseg.shape, "\n", seg_seg.shape, "\n",seg_subseg_subseg.shape, "\n",segCountDown.shape) prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg, segCountDown] # create DenseCPTs and add to input_master.dense_cpt: InlineSection - # object for i in range(len(names)): input_master.dense_cpt[names[i]] = 
DenseCPT(prob[i]) @@ -657,19 +646,19 @@ def make_mapping(self): # segCountDown_seg_segTransition num_free_params += fullnum_subsegs - - name_collection_spec = param_spec.generate_name_collection() + head_track_names = param_spec.get_head_track_names() + name_collection_spec = param_spec.generate_name_collection(head_track_names) distribution = self.distribution if distribution in DISTRIBUTIONS_LIKE_NORM: - mean_spec = param_spec.generate_mean_objects() - covar_spec = param_spec.generate_covar_objects() + mean_spec = param_spec.generate_mean_objects(head_track_names) + covar_spec = param_spec.generate_covar_objects(head_track_names) if USE_MFSDG: real_mat_spec = RealMatParamSpec(self) else: real_mat_spec = "" - mc_spec = param_spec.generate_mc_objects() + mc_spec = param_spec.generate_mc_objects(head_track_names) if COVAR_TIED: num_free_params += (fullnum_subsegs + 1) * num_track_groups @@ -684,14 +673,14 @@ def make_mapping(self): # the gamma distribution rather than the ML estimate for the # mean and converting real_mat_spec = GammaRealMatParamSpec(self) - mc_spec = param_spec.generate_mc_objects() + mc_spec = param_spec.generate_mc_objects(head_track_names) num_free_params += (fullnum_subsegs * 2) * num_track_groups else: raise ValueError("distribution %s not supported" % distribution) - dpmf_spec = param_spec.generate_dpmf_objects() - mx_spec = param_spec.generate_mx_objects() + dpmf_spec = param_spec.generate_dpmf_objects(head_track_names) + mx_spec = param_spec.generate_mx_objects(head_track_names) card_seg = num_segs ve_spec = VirtualEvidenceSpec(self) From 4bb6a360ddeef3e05b393f3468c025ef3dfc1fcb Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 18 Aug 2020 12:57:56 -0400 Subject: [PATCH 28/72] Update input_master.py --- segway/input_master.py | 1 - 1 file changed, 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index bf42ed38..2b22fa30 100644 --- 
a/segway/input_master.py +++ b/segway/input_master.py @@ -180,7 +180,6 @@ def get_head_track_names(self): """ Return list of head track names. """ - print(self.track_groups, self.tracks) head_track_names = [] for group in self.track_groups: head_track_names.append(group[0].name) From 928960de89a5d9e611a43f2ac78bac5cf641434c Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 18 Aug 2020 12:58:18 -0400 Subject: [PATCH 29/72] removed print statements From 107b47feeda5a48343145f56339448396db6e604 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Wed, 19 Aug 2020 14:31:27 -0400 Subject: [PATCH 30/72] Array.uniform_from_shape dim, InputMaster.save --- segway/gmtk.py | 141 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 45 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 037c908b..be11ebfe 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -37,27 +37,52 @@ def __new__(cls, *args): return obj @classmethod - def uniform_from_shape(cls, *shape): + def uniform_from_shape(cls, shape): """ Instantiate Array of a specific shape with probabilities set uniformly in each leaf. - :param shape: int + :param shape: Tuple[int]: shape of Array TODO assumptions about square matrix? 
:return: """ - a = np.empty(shape) - - value = 1.0/shape[-1] # number of columns - a.fill(value) - - if len(shape) != 1: - # set diagonal elements to 0.0 - # TODO sq matrix assumption - diag_index = range(shape[0]) - a[diag_index, diag_index] = 0.0 - + if len(shape) == 1: + a = np.squeeze(DenseCPT(np.empty(shape)), axis=0) + a.fill(1.0 / shape[-1]) #  number of columns + else: + a = np.empty(shape) + div = shape[-1] - 1 + # if num_subsegs = 1 + if div == 0: + a.fill(1.0) + else: + value = 1.0 / div + a.fill(value) + # len(shape) = 2 => seg_seg => square matrix + if len(shape) == 2: + # set diagonal elements to 0.0 + diag_index = range(shape[0]) + a[diag_index, diag_index] = 0.0 + + # len(shape) = 3 => seg_subseg_subseg + # => num_segs x square matrix + if len(shape) == 3: + # "diag_indices" to be set to 0: + # range(seg), diag_index, diag_index + diag_index = [] + for s in range(shape[-1]): + diag_index.append([s] * len(shape[1:])) + final_indices = [] + for i in range(shape[0]): + for item in diag_index: + index = [i] + index.extend(item) + final_indices.append(tuple(index)) + + for index in final_indices: + a[index] = 0.0 return a + class Section(OrderedDict): """ Contains GMTK objects of a single type and supports writing them to file. @@ -96,22 +121,6 @@ def kind(self): assert section_kind == obj.kind, "Objects must be of same type." return section_kind - -# def __setattr__(self, key, value): -# """ -# Check if all the GMTK objects are of the same type. 
-# :param key: str: name of GMTK object -# :param value: GMTK object -# :return: -# For now, single object -# TODO, add multiple objects at once -# """ -# if not self.kind() == value.kind: -# raise ValueError("Object has incorrect type.") -# else: -# super(Section, self).__setattr__(key, value) - - class InlineSection(Section): def __str__(self): @@ -292,15 +301,17 @@ def __str__(self): return "".join(s) -# def save(self, filename): -# """ -# Opens filename for writing and writes out the contents of its attributes. -# :param filename: str -# :return: -# """ -# with open(filename, 'w') as filename: -# print('''# include "traindir/auxiliary/segway.inc"''', file=filename) -# print(self, file=filename) + def save(self, filename, traindir='segway_output/traindir'): + """ + Opens filename for writing and writes out the contents of its attributes. + :param: filename: str: path to input master file + :param: traindir: str: path to traindir + (default assumes path to traindir is 'segway_output/traindir') + :return: None + """ + with open(filename, 'w') as filename: + print('''# include "''' + traindir + '''/auxiliary/segway.inc"''', file=filename) + print(self, file=filename) class DenseCPT(Array): @@ -330,7 +341,17 @@ def __str__(self): line.append("\n") return "\n".join(line) - + + @classmethod + def uniform_from_shape(cls, *shape): + """ + :param shape: int: shape of DenseCPT + :return: DenseCPT with uniform probabilities and given shape + """ + a = super(DenseCPT, cls).uniform_from_shape(shape) + if a.shape != (1,) and len(shape) != 1: + return np.squeeze(DenseCPT(a)) + return a class NameCollection(list): """ @@ -396,8 +417,19 @@ def get_dimension(self): """ # return return len(self) + + @classmethod + def uniform_from_shape(cls, *shape): + """ + :param shape: int: shape of DenseCPT + :return: DenseCPT with uniform probabilities and given shape + """ + a = super(Mean, cls).uniform_from_shape(shape) + if a.shape != (1,): + return np.squeeze(Mean(a)) + return Mean(a) - + 
class Covar(Array): """ A single Covar object. @@ -419,11 +451,20 @@ def get_dimension(self): Return dimension of this Covar object. :return: int: dimension of this Covar object """ - #return len(self) - # is len the best return len(self) - - + + @classmethod + def uniform_from_shape(cls, *shape): + """ + :param shape: int: shape of DenseCPT + :return: DenseCPT with uniform probabilities and given shape + """ + a = super(Covar, cls).uniform_from_shape(shape) + if a.shape != (1,): + return np.squeeze(Covar(a)) + return Covar(a) + + class DPMF(Array): """ A single DPMF object. @@ -444,7 +485,17 @@ def __str__(self): def get_length(self): return len(self) - + + @classmethod + def uniform_from_shape(cls, *shape): + """ + :param shape: int: shape of DenseCPT + :return: DenseCPT with uniform probabilities and given shape + """ + a = super(DPMF, cls).uniform_from_shape(shape) + if a.shape != (1,): + return np.squeeze(DPMF(a)) + return DPMF(a) class MC: """ From 176bc221ffdb9919ca1962e4599762bec49d2c9d Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Wed, 19 Aug 2020 15:04:45 -0400 Subject: [PATCH 31/72] added uniform_from_shape() with value --- segway/gmtk.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index be11ebfe..ed5a5707 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -37,11 +37,12 @@ def __new__(cls, *args): return obj @classmethod - def uniform_from_shape(cls, shape): + def uniform_from_shape(cls, shape, diag_value=0.0): """ Instantiate Array of a specific shape with probabilities set uniformly in each leaf. :param shape: Tuple[int]: shape of Array + :param: diag_value: float: optional value for the diagonal entry TODO assumptions about square matrix? 
:return: """ @@ -55,7 +56,7 @@ def uniform_from_shape(cls, shape): if div == 0: a.fill(1.0) else: - value = 1.0 / div + value = (1.0 - diag_value) / div a.fill(value) # len(shape) = 2 => seg_seg => square matrix if len(shape) == 2: @@ -343,12 +344,13 @@ def __str__(self): return "\n".join(line) @classmethod - def uniform_from_shape(cls, *shape): + def uniform_from_shape(cls, *shape, value=0.0): """ - :param shape: int: shape of DenseCPT + :param: shape: int: shape of DenseCPT + :param: value: float: optional value for diagonal entry of DenseCPT :return: DenseCPT with uniform probabilities and given shape """ - a = super(DenseCPT, cls).uniform_from_shape(shape) + a = super(DenseCPT, cls).uniform_from_shape(shape, value) if a.shape != (1,) and len(shape) != 1: return np.squeeze(DenseCPT(a)) return a @@ -419,12 +421,13 @@ def get_dimension(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape): + def uniform_from_shape(cls, *shape, value): """ - :param shape: int: shape of DenseCPT + :param: shape: int: shape of DenseCPT + :param: value: float: optional value for diagonal entry :return: DenseCPT with uniform probabilities and given shape """ - a = super(Mean, cls).uniform_from_shape(shape) + a = super(Mean, cls).uniform_from_shape(shape, value) if a.shape != (1,): return np.squeeze(Mean(a)) return Mean(a) @@ -454,12 +457,13 @@ def get_dimension(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape): + def uniform_from_shape(cls, *shape, value=0.0): """ :param shape: int: shape of DenseCPT + :param: value: float: optional value for diagonal entry :return: DenseCPT with uniform probabilities and given shape """ - a = super(Covar, cls).uniform_from_shape(shape) + a = super(Covar, cls).uniform_from_shape(shape, value) if a.shape != (1,): return np.squeeze(Covar(a)) return Covar(a) @@ -487,12 +491,13 @@ def get_length(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape): + def uniform_from_shape(cls, *shape, 
value): """ :param shape: int: shape of DenseCPT + :param: value: float: optional value for diagonal entry :return: DenseCPT with uniform probabilities and given shape """ - a = super(DPMF, cls).uniform_from_shape(shape) + a = super(DPMF, cls).uniform_from_shape(shape, value) if a.shape != (1,): return np.squeeze(DPMF(a)) return DPMF(a) From 72efcc86eedb46b0641b76e1ad20c123bea98efc Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 10:06:10 -0400 Subject: [PATCH 32/72] DenseCPT dim --- segway/input_master.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index 2b22fa30..5c3fbd5f 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -14,6 +14,7 @@ from genomedata._util import fill_array from numpy import (array, empty, float32, outer, set_printoptions, sqrt, tile, vectorize, where, zeros) +import numpy as np from six.moves import map, range from ._util import (copy_attrs, data_string, DISTRIBUTION_GAMMA, @@ -528,7 +529,7 @@ def generate_dense_cpt_objects(self): prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg, segCountDown] # create DenseCPTs and add to input_master.dense_cpt: InlineSection for i in range(len(names)): - input_master.dense_cpt[names[i]] = DenseCPT(prob[i]) + input_master.dense_cpt[names[i]] = np.squeeze(DenseCPT(prob[i]), axis=0) return input_master.dense_cpt.__str__() From f72b6f9d19ea9ed9e7fcbee74e53424c9e8a546e Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 10:06:46 -0400 Subject: [PATCH 33/72] denseCPT shape --- segway/gmtk.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index ed5a5707..2bfaf8a5 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -329,10 +329,6 @@ def __str__(self): :return: """ line = [] - new_shape = self.shape[1:] - if 
len(new_shape) == 1: - new_shape = (new_shape[0], ) - self.reshape((new_shape)) num_parents = len(self.shape) - 1 line.append(str(num_parents)) # number of parents From 046973a2ed22aac745cf561902bba2f96092d8c4 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 10:15:33 -0400 Subject: [PATCH 34/72] minor formatting --- segway/input_master.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 5c3fbd5f..6f001766 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -26,8 +26,8 @@ SUPERVISION_SUPERVISED, USE_MFSDG, VIRTUAL_EVIDENCE_LIST_FILENAME) -from .gmtk import InputMaster, NameCollection, DenseCPT, \ - DeterministicCPT, DPMF, MC, MX, Covar, Mean, DiagGaussianMC +from .gmtk import (InputMaster, NameCollection, DenseCPT, + DeterministicCPT, DPMF, MC, MX, Covar, Mean, DiagGaussianMC) # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of # scalars and vectors in the parameter output. 
By default in newer (> 1.14) From 686ceb617d523765150266484ae620ee5408af21 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 10:19:32 -0400 Subject: [PATCH 35/72] import division --- segway/gmtk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index 2bfaf8a5..facc228e 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -1,3 +1,4 @@ +from __future__ import division from collections import OrderedDict import numpy as np from numpy import array, ndarray From 8003cc98d55d3fa6678ad5859976d60e37762181 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 11:14:10 -0400 Subject: [PATCH 36/72] Array.diag_value --- segway/gmtk.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index facc228e..f624ad15 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -63,7 +63,7 @@ def uniform_from_shape(cls, shape, diag_value=0.0): if len(shape) == 2: # set diagonal elements to 0.0 diag_index = range(shape[0]) - a[diag_index, diag_index] = 0.0 + a[diag_index, diag_index] = diag_value # len(shape) = 3 => seg_subseg_subseg # => num_segs x square matrix @@ -81,7 +81,7 @@ def uniform_from_shape(cls, shape, diag_value=0.0): final_indices.append(tuple(index)) for index in final_indices: - a[index] = 0.0 + a[index] = diag_value return a @@ -418,11 +418,11 @@ def get_dimension(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape, value): + def uniform_from_shape(cls, *shape, value=0.0): """ - :param: shape: int: shape of DenseCPT + :param: shape: int: shape of Mean :param: value: float: optional value for diagonal entry - :return: DenseCPT with uniform probabilities and given shape + :return: Mean with uniform probabilities and given shape """ a = super(Mean, cls).uniform_from_shape(shape, value) if a.shape 
!= (1,): @@ -456,9 +456,9 @@ def get_dimension(self): @classmethod def uniform_from_shape(cls, *shape, value=0.0): """ - :param shape: int: shape of DenseCPT + :param shape: int: shape of Covar :param: value: float: optional value for diagonal entry - :return: DenseCPT with uniform probabilities and given shape + :return: Covar with uniform probabilities and given shape """ a = super(Covar, cls).uniform_from_shape(shape, value) if a.shape != (1,): @@ -488,11 +488,11 @@ def get_length(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape, value): + def uniform_from_shape(cls, *shape, value=0.0): """ - :param shape: int: shape of DenseCPT + :param shape: int: shape of DPMF :param: value: float: optional value for diagonal entry - :return: DenseCPT with uniform probabilities and given shape + :return: DPMF with uniform probabilities and given shape """ a = super(DPMF, cls).uniform_from_shape(shape, value) if a.shape != (1,): From 45c9039d5a5ab1647cbd08f455204cdf13d8aec9 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 11:17:17 -0400 Subject: [PATCH 37/72] encoding --- segway/gmtk.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index f624ad15..d4d0d5ef 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + from __future__ import division from collections import OrderedDict import numpy as np From 4097b116acd1b9b6422a9c1c9b00059bf67c4992 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 11:43:49 -0400 Subject: [PATCH 38/72] python version for InputMaster.save() --- segway/gmtk.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index d4d0d5ef..3aaabdec 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from 
__future__ import division +import sys from collections import OrderedDict import numpy as np from numpy import array, ndarray @@ -314,8 +315,12 @@ def save(self, filename, traindir='segway_output/traindir'): :return: None """ with open(filename, 'w') as filename: - print('''# include "''' + traindir + '''/auxiliary/segway.inc"''', file=filename) - print(self, file=filename) + if sys.version.startswith('2'): + print >> filename, '''# include "''' + traindir + '''/auxiliary/segway.inc"''' + print >> filename, self + else: + print('''# include "''' + traindir + '''/auxiliary/segway.inc"''', file=filename) + print(self, file=filename) class DenseCPT(Array): From ed28762f54743df8984c86bcdfde48b915f4092b Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 11:51:42 -0400 Subject: [PATCH 39/72] Update gmtk.py --- segway/gmtk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/segway/gmtk.py b/segway/gmtk.py index 3aaabdec..404d0482 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -315,6 +315,7 @@ def save(self, filename, traindir='segway_output/traindir'): :return: None """ with open(filename, 'w') as filename: + print(sys.version) if sys.version.startswith('2'): print >> filename, '''# include "''' + traindir + '''/auxiliary/segway.inc"''' print >> filename, self From 9021f033c111789fd33c2b90ed8af6b1318c34b1 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 11:58:22 -0400 Subject: [PATCH 40/72] Update gmtk.py --- segway/gmtk.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 404d0482..a4a4a06c 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -167,8 +167,6 @@ def __init__(self, mean, covar): self.mean = mean self.covar = covar - # InlineSection.__init__(self) - def __setattr__(self, key, value): OrderedDict.__setattr__(self, key, 
value) @@ -222,7 +220,6 @@ def __init__(self, dpmf, mc): super(InlineMXSection, self).__init__() self.dpmf = dpmf self.mc = mc - # InlineSection.__init__(self) def __setattr__(self, key, value): OrderedDict.__setattr__(self, key, value) @@ -315,13 +312,11 @@ def save(self, filename, traindir='segway_output/traindir'): :return: None """ with open(filename, 'w') as filename: - print(sys.version) - if sys.version.startswith('2'): - print >> filename, '''# include "''' + traindir + '''/auxiliary/segway.inc"''' - print >> filename, self - else: - print('''# include "''' + traindir + '''/auxiliary/segway.inc"''', file=filename) - print(self, file=filename) + print >> filename, '''# include "''' + traindir + '''/auxiliary/segway.inc"''' + print >> filename, self +# else: +# print('''# include "''' + traindir + '''/auxiliary/segway.inc"''', file=filename) +# print(self, file=filename) class DenseCPT(Array): From 31c03f93e1c5c277734290638c17bcf04fdcd914 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 12:19:23 -0400 Subject: [PATCH 41/72] uniform_from_shape(kwargs) - default value --- segway/gmtk.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index a4a4a06c..1e3a2634 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -344,13 +344,15 @@ def __str__(self): return "\n".join(line) @classmethod - def uniform_from_shape(cls, *shape, value=0.0): + def uniform_from_shape(cls, *shape, **kwargs): """ :param: shape: int: shape of DenseCPT - :param: value: float: optional value for diagonal entry of DenseCPT + :param: value: float: optional value for diagonal entry of DenseCPT (default is 0.0) :return: DenseCPT with uniform probabilities and given shape """ - a = super(DenseCPT, cls).uniform_from_shape(shape, value) + if 'value' not in kwargs.keys(): + kwargs['value'] = 0.0 + a = super(DenseCPT, 
cls).uniform_from_shape(shape, kwargs['value']) if a.shape != (1,) and len(shape) != 1: return np.squeeze(DenseCPT(a)) return a @@ -368,11 +370,9 @@ def __init__(self, *args): """ if isinstance(args[0], list): # names in NameCollection have been given in a single list super(NameCollection, self).__init__([]) - #list.__init__(self, []) self.extend(args[0]) else: super(NameCollection, self).__init__(list(args)) - # list.__init__(self, list(args)) def __str__(self): """ @@ -421,13 +421,15 @@ def get_dimension(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape, value=0.0): + def uniform_from_shape(cls, *shape, **kwargs): """ :param: shape: int: shape of Mean - :param: value: float: optional value for diagonal entry + :param: value: float: optional value for diagonal entry (default is 0.0) :return: Mean with uniform probabilities and given shape """ - a = super(Mean, cls).uniform_from_shape(shape, value) + if 'value' not in kwargs.keys(): + kwargs['value'] = 0.0 + a = super(Mean, cls).uniform_from_shape(shape, kwargs['value']) if a.shape != (1,): return np.squeeze(Mean(a)) return Mean(a) @@ -457,13 +459,15 @@ def get_dimension(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape, value=0.0): + def uniform_from_shape(cls, *shape, **kwargs): """ :param shape: int: shape of Covar - :param: value: float: optional value for diagonal entry + :param: value: float: optional value for diagonal entry (default is 0.0) :return: Covar with uniform probabilities and given shape """ - a = super(Covar, cls).uniform_from_shape(shape, value) + if 'value' not in kwargs.keys(): + kwargs['value'] = 0.0 + a = super(Covar, cls).uniform_from_shape(shape, kwargs['value']) if a.shape != (1,): return np.squeeze(Covar(a)) return Covar(a) @@ -491,13 +495,15 @@ def get_length(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape, value=0.0): + def uniform_from_shape(cls, *shape, **kwargs): """ :param shape: int: shape of DPMF - :param: 
value: float: optional value for diagonal entry + :param: value: float: optional value for diagonal entry (default is 0.0) :return: DPMF with uniform probabilities and given shape """ - a = super(DPMF, cls).uniform_from_shape(shape, value) + if 'value' not in kwargs.keys(): + kwargs['value'] = 0.0 + a = super(DPMF, cls).uniform_from_shape(shape, kwargs['value']) if a.shape != (1,): return np.squeeze(DPMF(a)) return DPMF(a) From 507645d328256f5af82f5174dedb5e6890e963ba Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 20 Aug 2020 14:17:56 -0400 Subject: [PATCH 42/72] Update gmtk.py --- segway/gmtk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/segway/gmtk.py b/segway/gmtk.py index 1e3a2634..f084781f 100644 --- a/segway/gmtk.py +++ b/segway/gmtk.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from __future__ import division import sys from collections import OrderedDict @@ -47,7 +45,9 @@ def uniform_from_shape(cls, shape, diag_value=0.0): in each leaf. :param shape: Tuple[int]: shape of Array :param: diag_value: float: optional value for the diagonal entry - TODO assumptions about square matrix? + Assuming that if len(shape) = 2, then this method is being called for + DenseCPT 'seg_seg' (which is a square matrix), if len(shape) = 3, + then this method is being called for DenseCPT 'seg_subseg_subseg'. 
:return: """ if len(shape) == 1: From 644782389cba0360c23d36713eeaad6c440b77a0 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 21 Aug 2020 00:55:00 -0400 Subject: [PATCH 43/72] Rename segway/gmtk.py to segway/gmtk/input_master.py --- segway/{gmtk.py => gmtk/input_master.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename segway/{gmtk.py => gmtk/input_master.py} (100%) diff --git a/segway/gmtk.py b/segway/gmtk/input_master.py similarity index 100% rename from segway/gmtk.py rename to segway/gmtk/input_master.py From 399b8592ea791fa558c80b6f48e9afa07ffebec6 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 21 Aug 2020 00:55:15 -0400 Subject: [PATCH 44/72] Create __init__.py --- segway/gmtk/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 segway/gmtk/__init__.py diff --git a/segway/gmtk/__init__.py b/segway/gmtk/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/segway/gmtk/__init__.py @@ -0,0 +1 @@ + From d06510f3ca2cd6ed4f61a7fcc47cbde9ea3d6c9d Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 21 Aug 2020 01:20:47 -0400 Subject: [PATCH 45/72] added DirichletTab, RealMatParamSpec --- segway/input_master.py | 189 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 173 insertions(+), 16 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 6f001766..2fe2d245 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -154,7 +154,8 @@ class ParamSpec(object): "num_track_groups", "track_groups", "num_mix_components", "means", "vars", "num_mix_components", "random_state", "tracks", "resolution", "card_seg_countdown", "seg_table", - "seg_countdowns_initial"] + "seg_countdowns_initial", "len_seg_strength", + "use_dinucleotide"] jitter_std_bound = 0.2 track_names = 
[] @@ -257,7 +258,6 @@ def generate_gmtk_obj_names(self, obj, track_names): return names - def generate_name_collection(self, track_names): """ Generate string representation of NameCollection objects in input master. @@ -282,7 +282,6 @@ def generate_name_collection(self, track_names): return input_master.name_collection.__str__() - def make_mean_data(self): num_segs = self.num_segs num_subsegs = self.num_subsegs @@ -338,9 +337,6 @@ def generate_covar_objects(self, track_names): return input_master.covar.__str__() - def generate_real_mat_objects(self): - pass - def generate_mc_objects(self, track_names): """ Generate string representation of MC objects in input master. @@ -418,7 +414,6 @@ def generate_mx_objects(self, track_names): components=mc_names[i]) return input_master.mx.__str__() - def generate_dpmf_objects(self, track_names): """Generate string representation of DPMF objects in input master. :param: track_names: list[str]: list of track names @@ -438,11 +433,6 @@ def generate_dpmf_objects(self, track_names): input_master.dpmf[names[i]] = DPMF(dpmf_values[i]) return input_master.dpmf.__str__() - - def generate_ve(self): - #  TODO - pass - def calc_prob_transition(self, length): """Calculate probability transition from scaled expected length. 
""" @@ -510,7 +500,21 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa res[seg_countdown_allow:, label] = probs_prevent_transition return res + + def make_table_spec(self, name, table, dirichlet=False): + """ + if dirichlet is True, this table has a corresponding DirichletTable + automatically generated name + """ + ndim = table.ndim - 1 # don't include output dim + + if dirichlet: + extra_rows = ["DirichletTable %s" % self.make_dirichlet_name(name)] + else: + extra_rows = [] + return TableParamSpec.make_table_spec(self, name, table, ndim, + extra_rows) def generate_dense_cpt_objects(self): # names of dense cpts @@ -527,17 +531,163 @@ def generate_dense_cpt_objects(self): seg_subseg_subseg = (vstack_tile(cpt_seg, num_segs, 1)) segCountDown = self.make_dense_cpt_segCountDown_seg_segTransition() prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg, segCountDown] + + # create corresponding DirichletTable generated name if necessary + for i in range(len(names[0:4])): + self.make_table_spec(names[i], prob[i]) + + # for DenseCPT segCountDown_seg_segTransition: + self.make_table_spec(names[4], prob[4], dirichlet=self.len_seg_strength > 0) + # create DenseCPTs and add to input_master.dense_cpt: InlineSection for i in range(len(names)): input_master.dense_cpt[names[i]] = np.squeeze(DenseCPT(prob[i]), axis=0) - + return input_master.dense_cpt.__str__() def make_dinucleotide_table_row(self): - pass + # simple one-parameter model + gc = self.random_state.uniform() + at = 1 - gc + + a = at / 2 + c = gc / 2 + g = gc - c + t = 1 - a - c - g + + acgt = array([a, c, g, t]) + + # shape: (16,) + return outer(acgt, acgt).ravel() + + def make_dense_cpt_seg_dinucleotide_spec(self): + table = [self.make_dinucleotide_table_row() + for seg_index in range(self.num_segs)] + + return self.make_table_spec("seg_dinucleotide", table) + + +class TableParamSpec(ParamSpec): + copy_attrs = ParamSpec.copy_attrs \ + + ["resolution", "card_seg_countdown", "seg_table", + 
"seg_countdowns_initial"] + + # see Segway paper + probs_force_transition = array([0.0, 0.0, 1.0]) + + def make_table_spec(self, name, table, ndim, extra_rows=[]): + header_rows = [name, ndim] + header_rows.extend(table.shape) + + rows = [" ".join(map(str, header_rows))] + rows.extend(extra_rows) + rows.extend([array2text(table), ""]) + + return "\n".join(rows) + + def calc_prob_transition(self, length): + """Calculate probability transition from scaled expected length. + """ + length_scaled = length // self.resolution + + prob_self_self = prob_transition_from_expected_len(length_scaled) + prob_self_other = 1.0 - prob_self_self + + return prob_self_self, prob_self_other + + def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa + # first values are the ones where segCountDown = 0 therefore + # the transitions to segTransition = 2 occur early on + card_seg_countdown = self.card_seg_countdown + + # by default, when segCountDown is high, never transition + res = empty((card_seg_countdown, self.num_segs, CARD_SEGTRANSITION)) + + prob_seg_self_self, prob_seg_self_other = \ + self.calc_prob_transition(LEN_SEG_EXPECTED) + + prob_subseg_self_self, prob_subseg_self_other = \ + self.calc_prob_transition(LEN_SUBSEG_EXPECTED) + + # 0: no transition + # 1: subseg transition (no transition when CARD_SUBSEG == 1) + # 2: seg transition + probs_allow_transition = \ + array([prob_seg_self_self * prob_subseg_self_self, + prob_seg_self_self * prob_subseg_self_other, + prob_seg_self_other]) + + probs_prevent_transition = array([prob_subseg_self_self, + prob_subseg_self_other, + 0.0]) + + # find the labels with maximum segment lengths and those without + table = self.seg_table + ends = table[:, OFFSET_END] + bitmap_without_maximum = ends == 0 + + # where() returns a tuple; this unpacks it + labels_with_maximum, = where(~bitmap_without_maximum) + labels_without_maximum, = where(bitmap_without_maximum) + + # labels without a maximum + res[0, labels_without_maximum] = 
probs_allow_transition + res[1:, labels_without_maximum] = probs_prevent_transition + + # labels with a maximum + seg_countdowns_initial = self.seg_countdowns_initial + + res[0, labels_with_maximum] = self.probs_force_transition + for label in labels_with_maximum: + seg_countdown_initial = seg_countdowns_initial[label] + minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] + + seg_countdown_allow = seg_countdown_initial - minimum + 1 + + res[1:seg_countdown_allow, label] = probs_allow_transition + res[seg_countdown_allow:, label] = probs_prevent_transition + + return res - def make_seg_dinucleotide(self): - pass + + @staticmethod + def make_dirichlet_name(name): + return "dirichlet_%s" % name + + +class DirichletTabParamSpec(TableParamSpec): + type_name = "DIRICHLET_TAB" + copy_attrs = TableParamSpec.copy_attrs \ + + ["len_seg_strength", "num_bases", "card_seg_countdown", + "num_mix_components"] + + def make_table_spec(self, name, table): + dirichlet_name = self.make_dirichlet_name(name) + + return TableParamSpec.make_table_spec(self, dirichlet_name, table, + table.ndim) + + def make_dirichlet_table(self): + probs = self.make_dense_cpt_segCountDown_seg_segTransition() + + # XXX: the ratio is not exact as num_bases is not the same as + # the number of base-base transitions. 
It is surely close + # enough, though + total_pseudocounts = self.len_seg_strength * self.num_bases + divisor = self.card_seg_countdown * self.num_segs + pseudocounts_per_row = total_pseudocounts / divisor + + # astype(int) means flooring the floats + pseudocounts = (probs * pseudocounts_per_row).astype(int) + + return pseudocounts + + def generate_objects(self): + # XXX: these called functions have confusing/duplicative names + if self.len_seg_strength > 0: + dirichlet_table = self.make_dirichlet_table() + yield self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, + dirichlet_table) class DTParamSpec(ParamSpec): @@ -589,6 +739,13 @@ def generate_objects(self): assert supervision_type == SUPERVISION_UNSUPERVISED +class RealMatParamSpec(ParamSpec): + type_name = "REAL_MAT" + + def generate_objects(self): + yield "matrix_weightscale_1x1 1 1 1.0" + + class VirtualEvidenceSpec(ParamSpec): type_name = "VE_CPT" From d143768e9becf16808de02f957fe30614fea0a45 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 21 Aug 2020 01:23:26 -0400 Subject: [PATCH 46/72] import gmtk.input_master --- segway/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index 2fe2d245..c579cb56 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -26,7 +26,7 @@ SUPERVISION_SUPERVISED, USE_MFSDG, VIRTUAL_EVIDENCE_LIST_FILENAME) -from .gmtk import (InputMaster, NameCollection, DenseCPT, +from gmtk.input_master import (InputMaster, NameCollection, DenseCPT, DeterministicCPT, DPMF, MC, MX, Covar, Mean, DiagGaussianMC) # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of From bd31503d4749b3f2238796ff1846e95a2c78f2a7 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 21 Aug 2020 01:27:34 -0400 Subject: [PATCH 47/72] Update input_master.py --- 
segway/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index c579cb56..84f7729c 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -26,7 +26,7 @@ SUPERVISION_SUPERVISED, USE_MFSDG, VIRTUAL_EVIDENCE_LIST_FILENAME) -from gmtk.input_master import (InputMaster, NameCollection, DenseCPT, +from .gmtk.input_master import (InputMaster, NameCollection, DenseCPT, DeterministicCPT, DPMF, MC, MX, Covar, Mean, DiagGaussianMC) # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of From 86729d2fadd1faba01d3e0dc28f6dd29b8242348 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 21 Aug 2020 12:14:09 -0400 Subject: [PATCH 48/72] ParamSpec.generate_dense to DenseCPTParamSpec --- segway/input_master.py | 210 ++++++++++++++++++----------------------- 1 file changed, 93 insertions(+), 117 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 84f7729c..797bdb58 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -432,6 +432,25 @@ def generate_dpmf_objects(self, track_names): for i in range(len(names)): input_master.dpmf[names[i]] = DPMF(dpmf_values[i]) return input_master.dpmf.__str__() + + +class TableParamSpec(ParamSpec): + copy_attrs = ParamSpec.copy_attrs \ + + ["resolution", "card_seg_countdown", "seg_table", + "seg_countdowns_initial"] + + # see Segway paper + probs_force_transition = array([0.0, 0.0, 1.0]) + + def make_table_spec(self, name, table, ndim, extra_rows=[]): + header_rows = [name, ndim] + header_rows.extend(table.shape) + + rows = [" ".join(map(str, header_rows))] + rows.extend(extra_rows) + rows.extend([array2text(table), ""]) + + return "\n".join(rows) def calc_prob_transition(self, length): """Calculate probability transition from scaled expected length. 
@@ -446,10 +465,6 @@ def calc_prob_transition(self, length): def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # first values are the ones where segCountDown = 0 therefore # the transitions to segTransition = 2 occur early on - - # see Segway paper - probs_force_transition = array([0.0, 0.0, 1.0]) - card_seg_countdown = self.card_seg_countdown # by default, when segCountDown is high, never transition @@ -488,8 +503,8 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # labels with a maximum seg_countdowns_initial = self.seg_countdowns_initial - res[0, labels_with_maximum] = probs_force_transition - # res[0, labels_with_maximum] = self.probs_force_transition + + res[0, labels_with_maximum] = self.probs_force_transition for label in labels_with_maximum: seg_countdown_initial = seg_countdowns_initial[label] minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] @@ -500,7 +515,18 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa res[seg_countdown_allow:, label] = probs_prevent_transition return res + + + @staticmethod + def make_dirichlet_name(name): + return "dirichlet_%s" % name + +class DenseCPTParamSpec(TableParamSpec): + type_name = "DENSE_CPT" + copy_attrs = TableParamSpec.copy_attrs \ + + ["random_state", "len_seg_strength", "use_dinucleotide"] + def make_table_spec(self, name, table, dirichlet=False): """ if dirichlet is True, this table has a corresponding DirichletTable @@ -516,6 +542,61 @@ def make_table_spec(self, name, table, dirichlet=False): return TableParamSpec.make_table_spec(self, name, table, ndim, extra_rows) + def make_empty_cpt(self): + num_segs = self.num_segs + + return zeros((num_segs, num_segs)) + + def make_dense_cpt_start_seg_spec(self): + num_segs = self.num_segs + cpt = fill_array(1.0 / num_segs, num_segs) + + return self.make_table_spec("start_seg", cpt) + + def make_dense_cpt_seg_subseg_spec(self): + num_subsegs = self.num_subsegs + cpt = fill_array(1.0 / num_subsegs, 
(self.num_segs, num_subsegs)) + + return self.make_table_spec("seg_subseg", cpt) + + def make_dense_cpt_seg_seg_spec(self): + cpt = make_zero_diagonal_table(self.num_segs) + + return self.make_table_spec("seg_seg", cpt) + + def make_dense_cpt_seg_subseg_subseg_spec(self): + cpt_seg = make_zero_diagonal_table(self.num_subsegs) + cpt = vstack_tile(cpt_seg, self.num_segs, 1) + + return self.make_table_spec("seg_subseg_subseg", cpt) + + def make_dinucleotide_table_row(self): + # simple one-parameter model + gc = self.random_state.uniform() + at = 1 - gc + + a = at / 2 + c = gc / 2 + g = gc - c + t = 1 - a - c - g + + acgt = array([a, c, g, t]) + + # shape: (16,) + return outer(acgt, acgt).ravel() + + def make_dense_cpt_seg_dinucleotide_spec(self): + table = [self.make_dinucleotide_table_row() + for seg_index in range(self.num_segs)] + + return self.make_table_spec("seg_dinucleotide", table) + + def make_dense_cpt_segCountDown_seg_segTransition_spec(self): # noqa + cpt = self.make_dense_cpt_segCountDown_seg_segTransition() + + return self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, cpt, + dirichlet=self.len_seg_strength > 0) + def generate_dense_cpt_objects(self): # names of dense cpts names = ["start_seg", "seg_subseg", "seg_seg", "seg_subseg_subseg", @@ -545,115 +626,10 @@ def generate_dense_cpt_objects(self): return input_master.dense_cpt.__str__() - def make_dinucleotide_table_row(self): - # simple one-parameter model - gc = self.random_state.uniform() - at = 1 - gc - - a = at / 2 - c = gc / 2 - g = gc - c - t = 1 - a - c - g - - acgt = array([a, c, g, t]) - - # shape: (16,) - return outer(acgt, acgt).ravel() - - def make_dense_cpt_seg_dinucleotide_spec(self): - table = [self.make_dinucleotide_table_row() - for seg_index in range(self.num_segs)] - - return self.make_table_spec("seg_dinucleotide", table) - - -class TableParamSpec(ParamSpec): - copy_attrs = ParamSpec.copy_attrs \ - + ["resolution", "card_seg_countdown", "seg_table", - 
"seg_countdowns_initial"] - - # see Segway paper - probs_force_transition = array([0.0, 0.0, 1.0]) - - def make_table_spec(self, name, table, ndim, extra_rows=[]): - header_rows = [name, ndim] - header_rows.extend(table.shape) - - rows = [" ".join(map(str, header_rows))] - rows.extend(extra_rows) - rows.extend([array2text(table), ""]) - - return "\n".join(rows) - - def calc_prob_transition(self, length): - """Calculate probability transition from scaled expected length. - """ - length_scaled = length // self.resolution - - prob_self_self = prob_transition_from_expected_len(length_scaled) - prob_self_other = 1.0 - prob_self_self - - return prob_self_self, prob_self_other - - def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa - # first values are the ones where segCountDown = 0 therefore - # the transitions to segTransition = 2 occur early on - card_seg_countdown = self.card_seg_countdown - - # by default, when segCountDown is high, never transition - res = empty((card_seg_countdown, self.num_segs, CARD_SEGTRANSITION)) - - prob_seg_self_self, prob_seg_self_other = \ - self.calc_prob_transition(LEN_SEG_EXPECTED) - - prob_subseg_self_self, prob_subseg_self_other = \ - self.calc_prob_transition(LEN_SUBSEG_EXPECTED) - - # 0: no transition - # 1: subseg transition (no transition when CARD_SUBSEG == 1) - # 2: seg transition - probs_allow_transition = \ - array([prob_seg_self_self * prob_subseg_self_self, - prob_seg_self_self * prob_subseg_self_other, - prob_seg_self_other]) - - probs_prevent_transition = array([prob_subseg_self_self, - prob_subseg_self_other, - 0.0]) - - # find the labels with maximum segment lengths and those without - table = self.seg_table - ends = table[:, OFFSET_END] - bitmap_without_maximum = ends == 0 - - # where() returns a tuple; this unpacks it - labels_with_maximum, = where(~bitmap_without_maximum) - labels_without_maximum, = where(bitmap_without_maximum) - - # labels without a maximum - res[0, labels_without_maximum] = 
probs_allow_transition - res[1:, labels_without_maximum] = probs_prevent_transition - - # labels with a maximum - seg_countdowns_initial = self.seg_countdowns_initial - - res[0, labels_with_maximum] = self.probs_force_transition - for label in labels_with_maximum: - seg_countdown_initial = seg_countdowns_initial[label] - minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] - - seg_countdown_allow = seg_countdown_initial - minimum + 1 - - res[1:seg_countdown_allow, label] = probs_allow_transition - res[seg_countdown_allow:, label] = probs_prevent_transition - - return res - - - @staticmethod - def make_dirichlet_name(name): - return "dirichlet_%s" % name - +# TODO +# if self.use_dinucleotide: +# yield self.make_dense_cpt_seg_dinucleotide_spec() + class DirichletTabParamSpec(TableParamSpec): type_name = "DIRICHLET_TAB" @@ -689,7 +665,7 @@ def generate_objects(self): yield self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, dirichlet_table) - + class DTParamSpec(ParamSpec): type_name = "DT" copy_attrs = ParamSpec.copy_attrs + ["seg_countdowns_initial", @@ -796,7 +772,7 @@ def make_mapping(self): else: dirichlet_spec = "" - dense_cpt_spec = param_spec.generate_dense_cpt_objects() + dense_cpt_spec = DenseCPTParamSpec(self) # seg_seg num_free_params += fullnum_subsegs * (fullnum_subsegs - 1) From 00fb818be5e50b3d4285b3bcb21a952f1a7acbd2 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 21 Aug 2020 12:17:05 -0400 Subject: [PATCH 49/72] Update input_master.py --- segway/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index 797bdb58..87843ab7 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -597,7 +597,7 @@ def make_dense_cpt_segCountDown_seg_segTransition_spec(self): # noqa return self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, cpt, dirichlet=self.len_seg_strength > 0) - def 
generate_dense_cpt_objects(self): + def generate_objects(self): # names of dense cpts names = ["start_seg", "seg_subseg", "seg_seg", "seg_subseg_subseg", "segCountDown_seg_segTransition"] From 5ac9c01c030a5c9977acdab53df259f8c50654a9 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 21 Aug 2020 14:28:46 -0400 Subject: [PATCH 50/72] test make_dense_cpt_dinucleotide --- segway/input_master.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index 87843ab7..931b2a22 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -623,9 +623,12 @@ def generate_objects(self): # create DenseCPTs and add to input_master.dense_cpt: InlineSection for i in range(len(names)): input_master.dense_cpt[names[i]] = np.squeeze(DenseCPT(prob[i]), axis=0) + + if self.use_dinucleotide: + print("use dinucleotide here") return input_master.dense_cpt.__str__() - + # TODO # if self.use_dinucleotide: # yield self.make_dense_cpt_seg_dinucleotide_spec() From 0df949e19f4346040a2a1d888a1953a42b1e513a Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Mon, 24 Aug 2020 23:46:58 -0400 Subject: [PATCH 51/72] changed uniform_from_shape in Array subclasses --- segway/gmtk/input_master.py | 163 +++++++++++++----------------------- 1 file changed, 60 insertions(+), 103 deletions(-) diff --git a/segway/gmtk/input_master.py b/segway/gmtk/input_master.py index f084781f..e4360aa7 100644 --- a/segway/gmtk/input_master.py +++ b/segway/gmtk/input_master.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, print_function import sys from collections import OrderedDict import numpy as np @@ -37,55 +37,6 @@ def __new__(cls, *args): input_array = array(args) obj = np.asarray(input_array).view(cls) return obj - - @classmethod - def uniform_from_shape(cls, shape, 
diag_value=0.0): - """ - Instantiate Array of a specific shape with probabilities set uniformly - in each leaf. - :param shape: Tuple[int]: shape of Array - :param: diag_value: float: optional value for the diagonal entry - Assuming that if len(shape) = 2, then this method is being called for - DenseCPT 'seg_seg' (which is a square matrix), if len(shape) = 3, - then this method is being called for DenseCPT 'seg_subseg_subseg'. - :return: - """ - if len(shape) == 1: - a = np.squeeze(DenseCPT(np.empty(shape)), axis=0) - a.fill(1.0 / shape[-1]) #  number of columns - else: - a = np.empty(shape) - div = shape[-1] - 1 - # if num_subsegs = 1 - if div == 0: - a.fill(1.0) - else: - value = (1.0 - diag_value) / div - a.fill(value) - # len(shape) = 2 => seg_seg => square matrix - if len(shape) == 2: - # set diagonal elements to 0.0 - diag_index = range(shape[0]) - a[diag_index, diag_index] = diag_value - - # len(shape) = 3 => seg_subseg_subseg - # => num_segs x square matrix - if len(shape) == 3: - # "diag_indices" to be set to 0: - # range(seg), diag_index, diag_index - diag_index = [] - for s in range(shape[-1]): - diag_index.append([s] * len(shape[1:])) - final_indices = [] - for i in range(shape[0]): - for item in diag_index: - index = [i] - index.extend(item) - final_indices.append(tuple(index)) - - for index in final_indices: - a[index] = diag_value - return a class Section(OrderedDict): @@ -283,7 +234,6 @@ def __init__(self): self.dpmf = InlineSection() self.dense_cpt = InlineSection() self.deterministic_cpt = InlineSection() - # TODO fix error self.mc = InlineMCSection(mean=self.mean, covar=self.covar) self.mx = InlineMXSection(dpmf=self.dpmf, mc=self.mc) self.name_collection = InlineSection() @@ -312,11 +262,8 @@ def save(self, filename, traindir='segway_output/traindir'): :return: None """ with open(filename, 'w') as filename: - print >> filename, '''# include "''' + traindir + '''/auxiliary/segway.inc"''' - print >> filename, self -# else: -# print('''# include 
"''' + traindir + '''/auxiliary/segway.inc"''', file=filename) -# print(self, file=filename) + print('# include "' + traindir + '/auxiliary/segway.inc"', file=filename) + print(self, file=filename) class DenseCPT(Array): @@ -347,15 +294,53 @@ def __str__(self): def uniform_from_shape(cls, *shape, **kwargs): """ :param: shape: int: shape of DenseCPT - :param: value: float: optional value for diagonal entry of DenseCPT (default is 0.0) + :param: kwargs: float: optional value for diagonal entry of DenseCPT (default is 0.0) :return: DenseCPT with uniform probabilities and given shape """ - if 'value' not in kwargs.keys(): - kwargs['value'] = 0.0 - a = super(DenseCPT, cls).uniform_from_shape(shape, kwargs['value']) - if a.shape != (1,) and len(shape) != 1: - return np.squeeze(DenseCPT(a)) - return a + if 'self' not in kwargs.keys(): + kwargs['self'] = 0.0 # set default value for diagonal entry to 0 + diag_value = kwargs['self'] + + if len(shape) == 1: + a = np.squeeze(DenseCPT(np.empty(shape)), axis=0) + a.fill(1.0 / shape[-1]) # uniform by number of columns + return a + else: + a = np.empty(shape) + div = shape[-1] - 1 + + # if num_subsegs = 1 + if div == 0: + a.fill(1.0) + else: + value = (1.0 - diag_value) / div + a.fill(value) + # len(shape) = 2 => seg_seg => square matrix + if len(shape) == 2: + # replace diagonal entries + diag_index = range(shape[0]) + a[diag_index, diag_index] = diag_value + + # len(shape) = 3 => seg_subseg_subseg + # => num_segs x square matrix + if len(shape) == 3: + # "diag_indices" to be set to 0: + # range(seg), diag_index, diag_index + diag_index = [] + for s in range(shape[-1]): + diag_index.append([s] * len(shape[1:])) + final_indices = [] + # prepare "diagonal" indices + for i in range(shape[0]): + for item in diag_index: + index = [i] + index.extend(item) + final_indices.append(tuple(index)) + for index in final_indices: # replace diagonal entries + a[index] = diag_value + + return np.squeeze(DenseCPT(a), axis=0) + class 
NameCollection(list): """ @@ -417,22 +402,7 @@ def get_dimension(self): Return dimension of this Mean object. :return: int: dimension of this Mean object """ - # return return len(self) - - @classmethod - def uniform_from_shape(cls, *shape, **kwargs): - """ - :param: shape: int: shape of Mean - :param: value: float: optional value for diagonal entry (default is 0.0) - :return: Mean with uniform probabilities and given shape - """ - if 'value' not in kwargs.keys(): - kwargs['value'] = 0.0 - a = super(Mean, cls).uniform_from_shape(shape, kwargs['value']) - if a.shape != (1,): - return np.squeeze(Mean(a)) - return Mean(a) class Covar(Array): @@ -458,20 +428,6 @@ def get_dimension(self): """ return len(self) - @classmethod - def uniform_from_shape(cls, *shape, **kwargs): - """ - :param shape: int: shape of Covar - :param: value: float: optional value for diagonal entry (default is 0.0) - :return: Covar with uniform probabilities and given shape - """ - if 'value' not in kwargs.keys(): - kwargs['value'] = 0.0 - a = super(Covar, cls).uniform_from_shape(shape, kwargs['value']) - if a.shape != (1,): - return np.squeeze(Covar(a)) - return Covar(a) - class DPMF(Array): """ @@ -495,19 +451,21 @@ def get_length(self): return len(self) @classmethod - def uniform_from_shape(cls, *shape, **kwargs): + def uniform_from_shape(cls, *shape): """ - :param shape: int: shape of DPMF - :param: value: float: optional value for diagonal entry (default is 0.0) + :param: shape: int: shape of DPMF :return: DPMF with uniform probabilities and given shape - """ - if 'value' not in kwargs.keys(): - kwargs['value'] = 0.0 - a = super(DPMF, cls).uniform_from_shape(shape, kwargs['value']) - if a.shape != (1,): - return np.squeeze(DPMF(a)) - return DPMF(a) + """ + a = np.empty(shape) + if len(shape) != 1: + raise ValueError("DPMF must be one-dimensional.") + else: + value = 1.0 / shape[0] + a.fill(value) + + return DPMF(a).squeeze(axis=0) + class MC: """ A single MC object. 
@@ -541,7 +499,6 @@ def __init__(self, mean, covar): super(DiagGaussianMC, self).__init__(COMPONENT_TYPE_DIAG_GAUSSIAN) self.mean = mean self.covar = covar - #MC.__init__(self, COMPONENT_TYPE_DIAG_GAUSSIAN) def __str__(self): """ From c0834443aea7ea990f01451a94b76250f1b07545 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 27 Aug 2020 10:47:29 -0400 Subject: [PATCH 52/72] added ParamSpec.generate_dirichlet_obj --- segway/input_master.py | 271 +++++++++++------------------------------ 1 file changed, 69 insertions(+), 202 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 931b2a22..0b02dca9 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python from __future__ import absolute_import, division """input_master.py: write input master files @@ -41,7 +42,7 @@ except TypeError: # Otherwise ignore the attempt pass - + if USE_MFSDG: # because tying not implemented yet COVAR_TIED = False @@ -89,7 +90,6 @@ def array2text(a): delimiter = "\n" * (ndim - 1) return delimiter.join(array2text(row) for row in a) - def make_spec(name, iterable): """ name: str, name of GMTK object type @@ -119,7 +119,6 @@ def make_spec(name, iterable): return "\n".join(all_lines) + "\n" - def prob_transition_from_expected_len(length): # formula from Meta-MEME paper, Grundy WN et al. CABIOS 13:397 # see also Reynolds SM et al. 
PLoS Comput Biol 4:e1000213 @@ -154,8 +153,7 @@ class ParamSpec(object): "num_track_groups", "track_groups", "num_mix_components", "means", "vars", "num_mix_components", "random_state", "tracks", "resolution", "card_seg_countdown", "seg_table", - "seg_countdowns_initial", "len_seg_strength", - "use_dinucleotide"] + "seg_countdowns_initial", "len_seg_strength", "num_bases"] jitter_std_bound = 0.2 track_names = [] @@ -170,7 +168,7 @@ def __init__(self, saver): def __str__(self): return make_spec(self.type_name, self.generate_objects()) - + def make_data(self): """ override this in subclasses @@ -181,7 +179,7 @@ def make_data(self): def get_head_track_names(self): """ Return list of head track names. - """ + """ head_track_names = [] for group in self.track_groups: head_track_names.append(group[0].name) @@ -201,14 +199,15 @@ def generate_gmtk_obj_names(self, obj, track_names): :param track_names: list[str]: list of track names :return: list[str]: list of GMTK object names """ + allowed_types = ["mx", "mc_diag", "mc_gamma", "mc_missing", "mean", "covar", "col", "mx_name", "dpmf", "gammascale", "gammashape", "tied_covar"] if not obj in allowed_types: raise ValueError("Undefined GMTK object type: {}".format(obj)) - num_segs = self.num_segs + num_segs = self.num_segs num_subsegs = self.num_subsegs - distribution = self.distribution + distribution = self.distribution num_mix_components = self.num_mix_components names = [] if obj == "covar": @@ -239,25 +238,14 @@ def generate_gmtk_obj_names(self, obj, track_names): for name in track_names: # TODO check component suffix diff if obj == "mc_diag": - line = "mc_{}_seg{}_subseg{}_{}".format(distribution, - i, j, name) - # TODO - - # if obj == "mc_gamma": - # covered in general name generation - # line = "{}_{}_seg{}_subseg{}_{}".format(obj, - # distribution, i, j, name) - - # TODO - elif obj == "mc_missing": - line = "" - + line = "mc_{}_seg{}_subseg{}_{}".format(distribution, i, j, name) else: line = 
"{}_seg{}_subseg{}_{}".format(obj, i, j, name) names.append(line) return names + def generate_name_collection(self, track_names): """ Generate string representation of NameCollection objects in input master. @@ -266,7 +254,7 @@ def generate_name_collection(self, track_names): # generate list of collection names collection_names = self.generate_gmtk_obj_names(obj="col", track_names=track_names) - #  generate list of all names in NameCollections + # generate list of all names in NameCollections names = self.generate_gmtk_obj_names("mx_name", track_names=track_names) num_tracks = len(track_names) @@ -282,6 +270,7 @@ def generate_name_collection(self, track_names): return input_master.name_collection.__str__() + def make_mean_data(self): num_segs = self.num_segs num_subsegs = self.num_subsegs @@ -318,7 +307,6 @@ def generate_mean_objects(self, track_names): input_master.mean[names_array[i, j, k]] = Mean(means[i, j, k]) return input_master.mean.__str__() - def generate_covar_objects(self, track_names): """ Generate string representation of Covar objects in input master. 
@@ -365,36 +353,6 @@ def generate_mc_objects(self, track_names): covar=covar_names[i]) return input_master.mc.__str__() - # # TODO if distribution is gamma - # elif self.distribution == DISTRIBUTION_GAMMA: - # option = "mc_gamma" - # names = generate_gmtk_obj_names(option, - # track_names=self.track_names, - # num_segs=self.num_segs, - # num_subsegs=self.num_subsegs, - # distribution=self.distribution, - # num_mix_components=self.num_mix_components) - # # generate gammashape and gammascale names for MC objects - # gamma_scale = generate_gmtk_obj_names("gammascale", - # track_names=self.track_names, - # num_segs=self.num_segs, - # num_subsegs=self.num_subsegs, - # distribution=self.distribution, - # num_mix_components=self.num_mix_components) - # - # gamma_shape = generate_gmtk_obj_names("gammashape", - # track_names=self.track_names, - # num_segs=self.num_segs, - # num_subsegs=self.num_subsegs, - # distribution=self.distribution, - # num_mix_components=self.num_mix_components) - # # create MC objects - # for i in range(len(names)): - # mc_obj = MC(name=names[i], dim=1, type="COMPONENT_TYPE_GAMMA", - # gamma_shape=gamma_shape[i], - # gamma_scale=gamma_scale[i]) - # input_master.update(mc_obj) - def generate_mx_objects(self, track_names): """Generate string representation of MX objects in input master. 
:param: track_names: list[str]: list of track names @@ -432,25 +390,6 @@ def generate_dpmf_objects(self, track_names): for i in range(len(names)): input_master.dpmf[names[i]] = DPMF(dpmf_values[i]) return input_master.dpmf.__str__() - - -class TableParamSpec(ParamSpec): - copy_attrs = ParamSpec.copy_attrs \ - + ["resolution", "card_seg_countdown", "seg_table", - "seg_countdowns_initial"] - - # see Segway paper - probs_force_transition = array([0.0, 0.0, 1.0]) - - def make_table_spec(self, name, table, ndim, extra_rows=[]): - header_rows = [name, ndim] - header_rows.extend(table.shape) - - rows = [" ".join(map(str, header_rows))] - rows.extend(extra_rows) - rows.extend([array2text(table), ""]) - - return "\n".join(rows) def calc_prob_transition(self, length): """Calculate probability transition from scaled expected length. @@ -465,6 +404,10 @@ def calc_prob_transition(self, length): def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # first values are the ones where segCountDown = 0 therefore # the transitions to segTransition = 2 occur early on + + # see Segway paper + probs_force_transition = array([0.0, 0.0, 1.0]) + card_seg_countdown = self.card_seg_countdown # by default, when segCountDown is high, never transition @@ -503,8 +446,8 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa # labels with a maximum seg_countdowns_initial = self.seg_countdowns_initial - - res[0, labels_with_maximum] = self.probs_force_transition + res[0, labels_with_maximum] = probs_force_transition + # res[0, labels_with_maximum] = self.probs_force_transition for label in labels_with_maximum: seg_countdown_initial = seg_countdowns_initial[label] minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] @@ -516,88 +459,7 @@ def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa return res - - @staticmethod - def make_dirichlet_name(name): - return "dirichlet_%s" % name - - -class DenseCPTParamSpec(TableParamSpec): - type_name = "DENSE_CPT" - 
copy_attrs = TableParamSpec.copy_attrs \ - + ["random_state", "len_seg_strength", "use_dinucleotide"] - - def make_table_spec(self, name, table, dirichlet=False): - """ - if dirichlet is True, this table has a corresponding DirichletTable - automatically generated name - """ - ndim = table.ndim - 1 # don't include output dim - - if dirichlet: - extra_rows = ["DirichletTable %s" % self.make_dirichlet_name(name)] - else: - extra_rows = [] - - return TableParamSpec.make_table_spec(self, name, table, ndim, - extra_rows) - - def make_empty_cpt(self): - num_segs = self.num_segs - - return zeros((num_segs, num_segs)) - - def make_dense_cpt_start_seg_spec(self): - num_segs = self.num_segs - cpt = fill_array(1.0 / num_segs, num_segs) - - return self.make_table_spec("start_seg", cpt) - - def make_dense_cpt_seg_subseg_spec(self): - num_subsegs = self.num_subsegs - cpt = fill_array(1.0 / num_subsegs, (self.num_segs, num_subsegs)) - - return self.make_table_spec("seg_subseg", cpt) - - def make_dense_cpt_seg_seg_spec(self): - cpt = make_zero_diagonal_table(self.num_segs) - - return self.make_table_spec("seg_seg", cpt) - - def make_dense_cpt_seg_subseg_subseg_spec(self): - cpt_seg = make_zero_diagonal_table(self.num_subsegs) - cpt = vstack_tile(cpt_seg, self.num_segs, 1) - - return self.make_table_spec("seg_subseg_subseg", cpt) - - def make_dinucleotide_table_row(self): - # simple one-parameter model - gc = self.random_state.uniform() - at = 1 - gc - - a = at / 2 - c = gc / 2 - g = gc - c - t = 1 - a - c - g - - acgt = array([a, c, g, t]) - - # shape: (16,) - return outer(acgt, acgt).ravel() - - def make_dense_cpt_seg_dinucleotide_spec(self): - table = [self.make_dinucleotide_table_row() - for seg_index in range(self.num_segs)] - - return self.make_table_spec("seg_dinucleotide", table) - - def make_dense_cpt_segCountDown_seg_segTransition_spec(self): # noqa - cpt = self.make_dense_cpt_segCountDown_seg_segTransition() - - return 
self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, cpt, - dirichlet=self.len_seg_strength > 0) - - def generate_objects(self): + def generate_dense_cpt_objects(self): # names of dense cpts names = ["start_seg", "seg_subseg", "seg_seg", "seg_subseg_subseg", "segCountDown_seg_segTransition"] @@ -612,39 +474,32 @@ def generate_objects(self): seg_subseg_subseg = (vstack_tile(cpt_seg, num_segs, 1)) segCountDown = self.make_dense_cpt_segCountDown_seg_segTransition() prob = [start_seg, seg_subseg, seg_seg, seg_subseg_subseg, segCountDown] - - # create corresponding DirichletTable generated name if necessary - for i in range(len(names[0:4])): - self.make_table_spec(names[i], prob[i]) - - # for DenseCPT segCountDown_seg_segTransition: - self.make_table_spec(names[4], prob[4], dirichlet=self.len_seg_strength > 0) - # create DenseCPTs and add to input_master.dense_cpt: InlineSection for i in range(len(names)): input_master.dense_cpt[names[i]] = np.squeeze(DenseCPT(prob[i]), axis=0) - - if self.use_dinucleotide: - print("use dinucleotide here") - + # adding dirichlet row if necessary + if self.len_seg_strength > 0: + dirichlet_row = ["DirichletTable %s" % self.make_dirichlet_name(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION)] + input_master.dense_cpt[NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION].extra_rows = dirichlet_row return input_master.dense_cpt.__str__() - -# TODO -# if self.use_dinucleotide: -# yield self.make_dense_cpt_seg_dinucleotide_spec() - - -class DirichletTabParamSpec(TableParamSpec): - type_name = "DIRICHLET_TAB" - copy_attrs = TableParamSpec.copy_attrs \ - + ["len_seg_strength", "num_bases", "card_seg_countdown", - "num_mix_components"] - - def make_table_spec(self, name, table): - dirichlet_name = self.make_dirichlet_name(name) - return TableParamSpec.make_table_spec(self, dirichlet_name, table, - table.ndim) + def make_dinucleotide_table_row(self): + pass + + def make_seg_dinucleotide(self): + pass + + def make_dirichlet_name(self, name): + return 
"dirichlet_{}".format(name) + + def make_dirichlet_table_spec(self, name, table): + dirichlet_name = self.make_dirichlet_name(name) + ndim = table.ndim + header_rows = [dirichlet_name, ndim] + header_rows.extend(table.shape) + rows = [" ".join(map(str, header_rows))] + rows.extend([array2text(table), ""]) + return "\n".join(rows) def make_dirichlet_table(self): probs = self.make_dense_cpt_segCountDown_seg_segTransition() @@ -661,14 +516,20 @@ def make_dirichlet_table(self): return pseudocounts - def generate_objects(self): + def generate_dirichlet_objects(self): # XXX: these called functions have confusing/duplicative names if self.len_seg_strength > 0: + header = ["DIRICHLET_TAB_IN_FILE inline"] + header.append("1\n") # only one DirichletTab for segCountDown_seg_segTransition + header.append("0") # index of dirichlet tab dirichlet_table = self.make_dirichlet_table() - yield self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, + value = self.make_dirichlet_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, dirichlet_table) + header.append(value) + return "\n".join(header) + else: + return "" - class DTParamSpec(ParamSpec): type_name = "DT" copy_attrs = ParamSpec.copy_attrs + ["seg_countdowns_initial", @@ -718,13 +579,6 @@ def generate_objects(self): assert supervision_type == SUPERVISION_UNSUPERVISED -class RealMatParamSpec(ParamSpec): - type_name = "REAL_MAT" - - def generate_objects(self): - yield "matrix_weightscale_1x1 1 1 1.0" - - class VirtualEvidenceSpec(ParamSpec): type_name = "VE_CPT" @@ -769,13 +623,8 @@ def make_mapping(self): include_filename = self.gmtk_include_filename_relative dt_spec = DTParamSpec(self) - - if self.len_seg_strength > 0: - dirichlet_spec = DirichletTabParamSpec(self) - else: - dirichlet_spec = "" - - dense_cpt_spec = DenseCPTParamSpec(self) + dirichlet_spec = param_spec.generate_dirichlet_objects() + dense_cpt_spec = param_spec.generate_dense_cpt_objects() # seg_seg num_free_params += fullnum_subsegs * (fullnum_subsegs - 1) @@ 
-821,3 +670,21 @@ def make_mapping(self): ve_spec = VirtualEvidenceSpec(self) return locals() # dict of vars set in this function + + 674,0-1 Bot + + + + + + + + + + + + + + + + From ba9518705ea8fc723c64e03876c0f2e0303e2fbf Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 27 Aug 2020 10:52:06 -0400 Subject: [PATCH 53/72] added DenseCPT attr extra_row --- segway/gmtk/input_master.py | 93 +++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 30 deletions(-) diff --git a/segway/gmtk/input_master.py b/segway/gmtk/input_master.py index e4360aa7..3e10b1c7 100644 --- a/segway/gmtk/input_master.py +++ b/segway/gmtk/input_master.py @@ -1,5 +1,5 @@ from __future__ import division, print_function -import sys +import sys from collections import OrderedDict import numpy as np from numpy import array, ndarray @@ -20,6 +20,7 @@ def array2text(a): delimiter = "\n" * (ndim - 1) return delimiter.join(array2text(row) for row in a) + class Object(str): def __new__(cls, _name, content, _kind): return str.__new__(cls, content) @@ -28,6 +29,7 @@ def __init__(self, name, content, kind): self.kind = kind self.name = name + class Array(ndarray): def __new__(cls, *args): """ @@ -50,7 +52,7 @@ def __init__(self): Initialize an empty Section object. """ super(Section, self).__init__() - + def __getattr__(self, name): if not name.startswith('_'): return self[name] @@ -63,7 +65,7 @@ def __setattr__(self, name, value): self[name] = value else: OrderedDict.__setattr__(self, name, value) - + def kind(self): """ Return string attribute kind of all GMTK objects in this Section object. 
@@ -121,7 +123,6 @@ def __init__(self, mean, covar): def __setattr__(self, key, value): OrderedDict.__setattr__(self, key, value) - def __str__(self): """ Returns string representation of all MC objects contained in this @@ -264,7 +265,7 @@ def save(self, filename, traindir='segway_output/traindir'): with open(filename, 'w') as filename: print('# include "' + traindir + '/auxiliary/segway.inc"', file=filename) print(self, file=filename) - + class DenseCPT(Array): """ @@ -280,16 +281,28 @@ def __str__(self): :return: """ line = [] - + num_parents = len(self.shape) - 1 line.append(str(num_parents)) # number of parents cardinality_line = map(str, self.shape) line.append(" ".join(cardinality_line)) # cardinalities + if 'extra_rows' not in self.__dict__.keys(): + extra_rows = [] + else: + extra_rows = self.extra_rows + line.extend(extra_rows) line.append(array2text(self)) line.append("\n") return "\n".join(line) - + + def __setattr__(self, key, value): + if key == 'extra_rows': + if key not in self.__dict__.keys(): + super(DenseCPT, self).__setattr__(key, value) + else: + raise ValueError("Attribute not allowed.") + @classmethod def uniform_from_shape(cls, *shape, **kwargs): """ @@ -297,18 +310,18 @@ def uniform_from_shape(cls, *shape, **kwargs): :param: kwargs: float: optional value for diagonal entry of DenseCPT (default is 0.0) :return: DenseCPT with uniform probabilities and given shape """ - if 'self' not in kwargs.keys(): - kwargs['self'] = 0.0 # set default value for diagonal entry to 0 - diag_value = kwargs['self'] + diag_value = kwargs.pop('self', 0.0) # set default value for diagonal entry to 0 if len(shape) == 1: - a = np.squeeze(DenseCPT(np.empty(shape)), axis=0) - a.fill(1.0 / shape[-1]) # uniform by number of columns - return a + a = np.empty(shape) + cpt = DenseCPT(a) + cpt = np.squeeze(cpt, axis=0) + cpt.fill(1.0 / shape[-1]) #  number of columns + return cpt + else: a = np.empty(shape) div = shape[-1] - 1 - # if num_subsegs = 1 if div == 0: a.fill(1.0) 
@@ -317,9 +330,9 @@ def uniform_from_shape(cls, *shape, **kwargs): a.fill(value) # len(shape) = 2 => seg_seg => square matrix if len(shape) == 2: - # replace diagonal entries + # set diagonal elements to 0.0 diag_index = range(shape[0]) - a[diag_index, diag_index] = diag_value + a[diag_index, diag_index] = diag_value # len(shape) = 3 => seg_subseg_subseg # => num_segs x square matrix @@ -330,18 +343,21 @@ def uniform_from_shape(cls, *shape, **kwargs): for s in range(shape[-1]): diag_index.append([s] * len(shape[1:])) final_indices = [] - # prepare "diagonal" indices for i in range(shape[0]): - for item in diag_index: + for item in diag_index: index = [i] index.extend(item) final_indices.append(tuple(index)) - for index in final_indices: # replace diagonal entries + + for index in final_indices: a[index] = diag_value - - return np.squeeze(DenseCPT(a), axis=0) + + cpt = DenseCPT(a) + + return np.squeeze(cpt, axis=0) + class NameCollection(list): """ A single NameCollection object. @@ -402,9 +418,9 @@ def get_dimension(self): Return dimension of this Mean object. :return: int: dimension of this Mean object """ - return len(self) - + return len(self) + class Covar(Array): """ A single Covar object. @@ -427,8 +443,8 @@ def get_dimension(self): :return: int: dimension of this Covar object """ return len(self) - - + + class DPMF(Array): """ A single DPMF object. @@ -449,23 +465,23 @@ def __str__(self): def get_length(self): return len(self) - + @classmethod def uniform_from_shape(cls, *shape): """ :param: shape: int: shape of DPMF :return: DPMF with uniform probabilities and given shape - """ + """ a = np.empty(shape) if len(shape) != 1: raise ValueError("DPMF must be one-dimensional.") else: value = 1.0 / shape[0] a.fill(value) - + return DPMF(a).squeeze(axis=0) - + class MC: """ A single MC object. 
@@ -568,7 +584,7 @@ def __init__(self, cardinality_parents, cardinality, dt): """ if not isinstance(cardinality_parents, tuple): self.cardinality_parents = (cardinality_parents, ) - else: + else: self.cardinality_parents = cardinality_parents self.cardinality = cardinality self.dt = dt @@ -588,3 +604,20 @@ def __str__(self): line.append(self.dt) line.append("\n") return "\n".join(line) + + + + + + + + + + + + + + + + + From 80cdf5afad563a78818a711f92f9775d71c508a2 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Thu, 27 Aug 2020 12:38:45 -0400 Subject: [PATCH 54/72] typo --- segway/input_master.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 0b02dca9..313040f9 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -670,21 +670,3 @@ def make_mapping(self): ve_spec = VirtualEvidenceSpec(self) return locals() # dict of vars set in this function - - 674,0-1 Bot - - - - - - - - - - - - - - - - From 6b22aaed54e8c7e9dad528d6993fc242cf93cc6b Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 1 Sep 2020 12:24:13 -0400 Subject: [PATCH 55/72] removed unused imports, USE_MFSDG options --- segway/input_master.py | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 313040f9..a7efab40 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -8,13 +8,10 @@ ## Copyright 2012, 2013 Michael M. 
Hoffman -from math import frexp, ldexp -from string import Template import sys from genomedata._util import fill_array -from numpy import (array, empty, float32, outer, set_printoptions, sqrt, tile, - vectorize, where, zeros) +from numpy import (array, empty, set_printoptions, sqrt, tile, where) import numpy as np from six.moves import map, range @@ -28,7 +25,7 @@ VIRTUAL_EVIDENCE_LIST_FILENAME) from .gmtk.input_master import (InputMaster, NameCollection, DenseCPT, - DeterministicCPT, DPMF, MC, MX, Covar, Mean, DiagGaussianMC) + DPMF, MX, Covar, Mean, DiagGaussianMC) # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of # scalars and vectors in the parameter output. By default in newer (> 1.14) @@ -43,7 +40,7 @@ # Otherwise ignore the attempt pass -if USE_MFSDG: +if USE_MFSDG: # because tying not implemented yet COVAR_TIED = False else: @@ -596,6 +593,8 @@ def make_virtual_evidence_spec(self): def generate_objects(self): yield self.make_virtual_evidence_spec() +class RealMatParamSpec: + class InputMasterSaver(Saver): resource_name = "input.master.tmpl" @@ -638,29 +637,19 @@ def make_mapping(self): if distribution in DISTRIBUTIONS_LIKE_NORM: mean_spec = param_spec.generate_mean_objects(head_track_names) covar_spec = param_spec.generate_covar_objects(head_track_names) - if USE_MFSDG: - real_mat_spec = RealMatParamSpec(self) - else: - real_mat_spec = "" + # TODO: class RealMatParamSpec + # for now this is sufficient because util.USE_MFSDG = False by default + real_mat_spec = "" mc_spec = param_spec.generate_mc_objects(head_track_names) if COVAR_TIED: num_free_params += (fullnum_subsegs + 1) * num_track_groups else: num_free_params += (fullnum_subsegs * 2) * num_track_groups + +# TODO: gamma distribution option - elif distribution == DISTRIBUTION_GAMMA: - mean_spec = "" - covar_spec = "" - - # XXX: another option is to calculate an ML estimate for - # the gamma distribution rather than the ML estimate for the - # mean and converting - 
real_mat_spec = GammaRealMatParamSpec(self) - mc_spec = param_spec.generate_mc_objects(head_track_names) - - num_free_params += (fullnum_subsegs * 2) * num_track_groups else: raise ValueError("distribution %s not supported" % distribution) From f1eca06e014457bc63f7d9bcb3cfc0882175cfbc Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 1 Sep 2020 12:25:09 -0400 Subject: [PATCH 56/72] removed unused import --- segway/gmtk/input_master.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/segway/gmtk/input_master.py b/segway/gmtk/input_master.py index 3e10b1c7..9b893251 100644 --- a/segway/gmtk/input_master.py +++ b/segway/gmtk/input_master.py @@ -1,5 +1,4 @@ from __future__ import division, print_function -import sys from collections import OrderedDict import numpy as np from numpy import array, ndarray @@ -605,19 +604,3 @@ def __str__(self): line.append("\n") return "\n".join(line) - - - - - - - - - - - - - - - - From 4af4d4a6b4408e481a27232fd3df8f3d8f9ef4e8 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 1 Sep 2020 12:27:09 -0400 Subject: [PATCH 57/72] Update input_master.py --- segway/input_master.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index a7efab40..7fece7e1 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -592,8 +592,6 @@ def make_virtual_evidence_spec(self): def generate_objects(self): yield self.make_virtual_evidence_spec() - -class RealMatParamSpec: class InputMasterSaver(Saver): From 5a0440233a9d04eb8c8661791a952616028caa63 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 1 Sep 2020 13:05:21 -0400 Subject: [PATCH 58/72] removed DIST_GAMMA import --- segway/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/segway/input_master.py b/segway/input_master.py index 7fece7e1..0ce1eb60 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -15,7 +15,7 @@ import numpy as np from six.moves import map, range -from ._util import (copy_attrs, data_string, DISTRIBUTION_GAMMA, +from ._util import (copy_attrs, data_string, DISTRIBUTION_NORM, DISTRIBUTION_ASINH_NORMAL, OFFSET_END, OFFSET_START, OFFSET_STEP, resource_substitute, Saver, SEGWAY_ENCODING, From 1a60a19e2318e8faeee3c91b221016d2943aeca3 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 1 Sep 2020 16:24:24 -0400 Subject: [PATCH 59/72] updated print format --- segway/gmtk/input_master.py | 130 +++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 48 deletions(-) diff --git a/segway/gmtk/input_master.py b/segway/gmtk/input_master.py index 9b893251..12f1aa47 100644 --- a/segway/gmtk/input_master.py +++ b/segway/gmtk/input_master.py @@ -18,7 +18,26 @@ def array2text(a): else: delimiter = "\n" * (ndim - 1) return delimiter.join(array2text(row) for row in a) + +def add_extra_line(l): + """ + :param: l: list[str] or str + Return copy of l with "\n" concatenated with the last item. + (l must have at least one item.) + For example: + l = ["a", "b", "c"] + new_l = add_extra_line(l) + new_l + ["a", "b", "c\n"] + """ + if isinstance(l, str): + return "{}\n".format(l) + last_element = "{}\n".format(l[-1]) + new_l = l[:] + new_l[-1] = last_element + return new_l + class Object(str): def __new__(cls, _name, content, _kind): @@ -77,6 +96,7 @@ def kind(self): else: assert section_kind == obj.kind, "Objects must be of same type." return section_kind + class InlineSection(Section): @@ -89,17 +109,35 @@ def __str__(self): # if no gmtk objects if len(self) == 0: return "" + # MC, MX are also one line types but have their own subclasses of InlineSection + one_line_types = ["MEAN", "COVAR", "DPMF"] # TODO: DPMF dim > 1? 
lines = ["{}_IN_FILE inline".format(self.kind())] lines.append(str(len(self)) + "\n") # total number of gmtk objects for i in range(len(self)): - lines.append(str(i)) # index of gmtk object - lines.append(list(self)[i]) # name of gmtk object - lines.append(list(self.values())[i].__str__()) - # string representation of gmtk object - - return "\n".join(lines) + obj_header = [] + obj_header.append(str(i)) # index of gmtk object + obj_header.append(list(self)[i]) # name of gmtk object + + # special formatting for some GMTK types: + if self.kind() == "NAME_COLLECTION": + # size of this NameCollection object + obj_header.append(str(len(list(self.values())[i]))) + if self.kind() == "DENSE_CPT": + # num_parents and cardinality of this DenseCPT + obj_header.append(list(self.values())[i].get_header_info()) + + if self.kind() in one_line_types: + # string representation of gmtk object + obj_header.append(list(self.values())[i].__str__()) + lines.append(" ".join(obj_header)) + else: + lines.append(" ".join(obj_header)) + # string representation of gmtk object + lines.append(list(self.values())[i].__str__()) + return "\n".join(add_extra_line(add_extra_line(lines))) + class InlineMCSection(InlineSection): """ @@ -132,9 +170,10 @@ def __str__(self): return "" else: lines = ["{}_IN_FILE inline".format(self.kind())] - lines.append(str(len(self)) + "\n") # total number of MC objects + lines.append("{}\n".format(str(len(self)))) # total number of MC objects for i in range(len(self)): - lines.append(str(i)) # index of MC object + obj_line = [] + obj_line.append(str(i)) # index of MC object # check if dimension of Mean and Covar of this MC are the same obj = list(self.values())[i] mean_name = obj.mean @@ -143,14 +182,14 @@ def __str__(self): # TODO delete MC? redefine? 
raise ValueError("Inconsistent dimensions of mean and covar associated to MC.") else: - lines.append(str(self.mean[mean_name].get_dimension())) + obj_line.append(str(self.mean[mean_name].get_dimension())) # dimension of MC - lines.append(str(obj.component_type)) # component type - lines.append(list(self)[i]) # name of MC - lines.append(obj.__str__()) # string representation of MC obj - - lines.append("\n") - return "\n".join(lines) + obj_line.append(str(obj.component_type)) # component type + obj_line.append(list(self)[i]) # name of MC + obj_line.append(obj.__str__()) # string representation of MC obj + lines.append(" ".join(obj_line)) + + return "\n".join(add_extra_line(lines)) class InlineMXSection(InlineSection): @@ -185,9 +224,10 @@ def __str__(self): return [] else: lines = ["{}_IN_FILE inline".format(self.kind())] - lines.append(str(len(self)) + "\n") # total number of MX objects + lines.append("{}\n".format(str(len(self)))) # total number of MX objects for i in range(len(self)): - lines.append(str(i)) # index of MX object + obj_line = [] + obj_line.append(str(i)) # index of MX object # check if number of components is equal to length of DPMF obj = list(self.values())[i] dpmf_name = obj.dpmf @@ -197,14 +237,14 @@ def __str__(self): raise ValueError( "Dimension of DPMF must be equal to number of components associated with this MX object.") else: - lines.append(str(dpmf_length)) - # dimension of MX - lines.append(list(self)[i]) # name of MX - lines.append(obj.__str__()) + obj_line.append(str(dpmf_length)) # dimension of MX + obj_line.append(list(self)[i]) # name of MX + obj_line.append(obj.__str__()) # string representation of this MX object + lines.append(" ".join(obj_line)) - lines.append("\n") - return "\n".join(lines) + + return "\n".join(add_extra_line(lines)) class InputMaster: @@ -280,18 +320,12 @@ def __str__(self): :return: """ line = [] - - num_parents = len(self.shape) - 1 - line.append(str(num_parents)) # number of parents - cardinality_line = 
map(str, self.shape) - line.append(" ".join(cardinality_line)) # cardinalities if 'extra_rows' not in self.__dict__.keys(): extra_rows = [] else: extra_rows = self.extra_rows line.extend(extra_rows) - line.append(array2text(self)) - line.append("\n") + line.append("{}\n".format(array2text(self))) return "\n".join(line) @@ -301,6 +335,16 @@ def __setattr__(self, key, value): super(DenseCPT, self).__setattr__(key, value) else: raise ValueError("Attribute not allowed.") + + def get_header_info(self): + """ + Return number of parents, cardinality line (called by Section.__str__()). + """ + line = [] + line.append(str(len(self.shape) - 1)) # number of parents + cardinality_line = map(str, self.shape) + line.append(" ".join(cardinality_line)) # cardinalities + return " ".join(line) @classmethod def uniform_from_shape(cls, *shape, **kwargs): @@ -356,7 +400,6 @@ def uniform_from_shape(cls, *shape, **kwargs): return np.squeeze(cpt, axis=0) - class NameCollection(list): """ A single NameCollection object. @@ -381,18 +424,13 @@ def __str__(self): """ if len(self) == 0: return "" - else: - line = [] - line.append(str(len(self))) - line.extend(self) - line.append("\n") - list.__str__(self) + line = [] + line.extend(self) # names return "\n".join(line) class Mean(Array): """ - TODO A single Mean object. 
""" kind = "MEAN" @@ -409,8 +447,7 @@ def __str__(self): line = [] line.append(str(self.get_dimension())) # dimension line.append(array2text(self)) - line.append("\n") - return "\n".join(line) + return " ".join(line) def get_dimension(self): """ @@ -433,8 +470,7 @@ def __str__(self): """ line = [str(self.get_dimension())] # dimension line.append(array2text(self)) # covar values - line.append("\n") - return "\n".join(line) + return " ".join(line) def get_dimension(self): """ @@ -459,8 +495,7 @@ def __str__(self): """ line = [str(self.get_length())] # dpmf length line.append(array2text(self)) # dpmf values - line.append("\n") - return "\n".join(line) + return " ".join(line) def get_length(self): return len(self) @@ -558,7 +593,7 @@ def __str__(self): line = [str(len(self.components))] # number of components line.append(self.dpmf) # dpmf name line.append(" ".join(self.components)) # component names - return "\n".join(line) + return " ".join(line) class DeterministicCPT: @@ -575,7 +610,7 @@ class DeterministicCPT: def __init__(self, cardinality_parents, cardinality, dt): """ Initialize a single DeterministicCPT object. 
- :param parent_cardinality: tuple[int]: cardinality of parents + :param cardinality_parents: tuple[int]: cardinality of parents (if empty, then number of parents = 0 :param cardinality: int: cardinality of self :param dt: name existing Decision Tree (DT) associated with this @@ -600,7 +635,6 @@ def __str__(self): cardinalities.extend(self.cardinality_parents) cardinalities.append(self.cardinality) line.append(" ".join(map(str, cardinalities))) # cardinalities of parent and self - line.append(self.dt) - line.append("\n") + line.append("{}\n".format(self.dt)) return "\n".join(line) From 6ce5c942518b9be09b27e60899e932315c323929 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 1 Sep 2020 18:13:30 -0400 Subject: [PATCH 60/72] Update input_master.py --- segway/gmtk/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/gmtk/input_master.py b/segway/gmtk/input_master.py index 12f1aa47..4d50267d 100644 --- a/segway/gmtk/input_master.py +++ b/segway/gmtk/input_master.py @@ -546,7 +546,7 @@ def __init__(self, mean, covar): :param covar: name of Covar obejct associated to this MC """ # more component types? 
- super(DiagGaussianMC, self).__init__(COMPONENT_TYPE_DIAG_GAUSSIAN) + super(DiagGaussianMC, self).__init__("COMPONENT_TYPE_DIAG_GAUSSIAN") self.mean = mean self.covar = covar From aeab9f4a8dad3c291d3c38b5f1fa012bf752c5ea Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 1 Sep 2020 18:19:39 -0400 Subject: [PATCH 61/72] Update input_master.py --- segway/gmtk/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/gmtk/input_master.py b/segway/gmtk/input_master.py index 4d50267d..b45815fc 100644 --- a/segway/gmtk/input_master.py +++ b/segway/gmtk/input_master.py @@ -136,7 +136,7 @@ def __str__(self): # string representation of gmtk object lines.append(list(self.values())[i].__str__()) - return "\n".join(add_extra_line(add_extra_line(lines))) + return "\n".join(add_extra_line(lines)) class InlineMCSection(InlineSection): From c3c82911148f9ded17f834767f741f328f5d39f5 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Tue, 1 Sep 2020 19:03:51 -0400 Subject: [PATCH 62/72] updated format of DirichletTab --- segway/input_master.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index 0ce1eb60..d3ce6a34 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -489,15 +489,6 @@ def make_seg_dinucleotide(self): def make_dirichlet_name(self, name): return "dirichlet_{}".format(name) - def make_dirichlet_table_spec(self, name, table): - dirichlet_name = self.make_dirichlet_name(name) - ndim = table.ndim - header_rows = [dirichlet_name, ndim] - header_rows.extend(table.shape) - rows = [" ".join(map(str, header_rows))] - rows.extend([array2text(table), ""]) - return "\n".join(rows) - def make_dirichlet_table(self): probs = self.make_dense_cpt_segCountDown_seg_segTransition() @@ -516,16 +507,22 @@ def 
make_dirichlet_table(self): def generate_dirichlet_objects(self): # XXX: these called functions have confusing/duplicative names if self.len_seg_strength > 0: - header = ["DIRICHLET_TAB_IN_FILE inline"] - header.append("1\n") # only one DirichletTab for segCountDown_seg_segTransition - header.append("0") # index of dirichlet tab + lines = ["DIRICHLET_TAB_IN_FILE inline"] + lines.append("1\n") # only one DirichletTab for segCountDown_seg_segTransition + row = ["0"] # index of dirichlet tab + row.append(self.make_dirichlet_name(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION)) + # name of dirichlet tab dirichlet_table = self.make_dirichlet_table() - value = self.make_dirichlet_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, - dirichlet_table) - header.append(value) - return "\n".join(header) + dim_shape = [dirichlet_table.ndim] + dim_shape.extend(dirichlet_table.shape) + row.append(" ".join(map(str, dim_shape))) + lines.append(" ".join(row)) + value = array2text(dirichlet_table) + lines.append("{}\n\n".format(value)) + return "\n".join(lines) else: return "" + class DTParamSpec(ParamSpec): type_name = "DT" From 2e44fe12f93583e6528663d1a87178374a11bf91 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Sun, 6 Sep 2020 17:13:26 -0400 Subject: [PATCH 63/72] added name generators for each gmtk type --- segway/input_master.py | 348 ++++++++++++++++++++++++++--------------- 1 file changed, 219 insertions(+), 129 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index d3ce6a34..248e1970 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -25,7 +25,7 @@ VIRTUAL_EVIDENCE_LIST_FILENAME) from .gmtk.input_master import (InputMaster, NameCollection, DenseCPT, - DPMF, MX, Covar, Mean, DiagGaussianMC) + DPMF, MX, Covar, Mean, DiagGaussianMC, InlineSection) # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of # scalars and vectors in the parameter output. 
By default in newer (> 1.14) @@ -172,17 +172,142 @@ def make_data(self): returns: container indexed by (seg_index, subseg_index, track_index) """ return None + + def get_template_component_suffix(self, component_number): + """Returns the substitution for the component suffix in the GMTK model + template. Empty if there is only one component""" + if self.num_mix_components == 1: + return "" + else: + return "_component{}".format(component_number) def get_head_track_names(self): """ - Return list of head track names. + Returns list containing the first track name in each track group. """ head_track_names = [] for group in self.track_groups: head_track_names.append(group[0].name) return head_track_names + + def generate_collection_names(self): + """ + Generate names of NameCollection objects. + """ + head_track_names = self.get_head_track_names() + names = [] + for name in head_track_names: + names.append("collection_seg_{}".format(name)) + return names + + def generate_name_collection_entries(self): + """ + Generate entries in NameCollection objects. + """ + head_track_names = self.get_head_track_names() + names = [] + for name in head_track_names: + for i in range(self.num_segs): + for j in range(self.num_subsegs): + names.append("mx_seg{}_subseg{}_{}".format(i, j, name)) + return names + + def generate_tied_covar_object_names(self): + """ + Generate tied Covar object names. + """ + head_track_names = self.get_head_track_names() + component_suffix = self.get_template_component_suffix() + names = [] + for track_name in head_track_names: + names.append("covar_{}{}".format(track_name, component_suffix)) + return names + + def generate_covar_object_names(self): + """ + Generate Covar object names. 
+ """ + head_track_names = self.get_head_track_names() + component_suffix = self.get_template_component_suffix() + names = [] + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + names.append("covar_seg{}_subseg{}_{}{}".format(i, j, + track_name, + component_suffix)) - def generate_gmtk_obj_names(self, obj, track_names): + return names + + + def generate_mean_object_names(self): + """ + Generate Mean object names. + """ + head_track_names = self.get_head_track_names() + names = [] + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + names.append("mean_seg{}_subseg{}_{}{}".format(i, j, + track_name, + component_suffix)) + + return names + + def generate_mx_object_names(self): + """ + Generate MX object names. + """ + head_track_names = self.get_head_track_names() + names = [] + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + names.append("mx_seg{}_subseg{}_{}".format(i, j, track_name)) + + return names + + def generate_diag_gaussian_mc_object_names(self): + """ + Generate DiagGaussianMC object names. + """ + head_track_names = self.get_head_track_names() + component_suffix = self.get_template_component_suffix() + names = [] + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + names.append("mc_{}_seg{}_subseg{}_{}{}".format(self.distribution, + i, j, track_name, + component_suffix)) + return names + + def generate_gamma_mc_object_names(self): + # todo: gammascale, gammashape + pass + + def generate_missing_mc_object_names(self): + pass + + def generate_dpmf_object_names(self): + """ + Generate DPMF object names. 
+ """ + head_track_names = self.get_head_track_names() + names = [] + if self.num_mix_components == 1: + names.append("dpmf_always") + else: + names.append("") + # TODO (with dirichlet extra rows) +# for i in range(self.num_segs): +# for j in range(self.num_subsegs): +# for track_name in head_track_names: +# names.append("dpmf_seg{}_subseg{}_{}".format(i, j, track_name)) + return names + + def generate_gmtk_object_names(self, gmtk_object_type): """ Generate GMTK object names for the types: NameCollection: "col" @@ -190,83 +315,54 @@ def generate_gmtk_obj_names(self, obj, track_names): Covar: "covar", "tied_covar" Mean: "mean" MX: "mx" - MC: "mc_diag", "mc_gamma", "mc_missing", "gammascale" + MC: "diag_gaussian_mc", "gamma_mc", "missing_mc", "gammascale" DPMF: "dpmf" - :param obj: str: type of gmtk object for which names must be generated - :param track_names: list[str]: list of track names + :param gmtk_object_type: str: type of gmtk object for which names must be generated :return: list[str]: list of GMTK object names """ - - allowed_types = ["mx", "mc_diag", "mc_gamma", "mc_missing", "mean", - "covar", "col", "mx_name", "dpmf", "gammascale", + allowed_types = ["mx", "diag_gaussian_mc", "gamma_mc", "missing_mc", "mean", + "covar", "collection_names", "collection_entries", "dpmf", "gammascale", "gammashape", "tied_covar"] - if not obj in allowed_types: - raise ValueError("Undefined GMTK object type: {}".format(obj)) - num_segs = self.num_segs - num_subsegs = self.num_subsegs - distribution = self.distribution - num_mix_components = self.num_mix_components - names = [] - if obj == "covar": - for name in track_names: - names.append("covar_{}".format(name)) - # todo check component suffix - elif obj == "tied_covar": - for name in track_names: - names.append("covar_{}".format(name)) - - elif obj == "col": - for name in track_names: - names.append("collection_seg_{}".format(name)) - - elif obj == "mx_name": - for name in track_names: - for i in range(num_segs): - for j 
in range(num_subsegs): - line = "mx_seg{}_subseg{}_{}".format(i, j, name) - names.append(line) - - elif obj == "dpmf" and num_mix_components == 1: - return ["dpmf_always"] - - else: - for i in range(num_segs): - for j in range(num_subsegs): - for name in track_names: - # TODO check component suffix diff - if obj == "mc_diag": - line = "mc_{}_seg{}_subseg{}_{}".format(distribution, i, j, name) - else: - line = "{}_seg{}_subseg{}_{}".format(obj, i, j, name) - names.append(line) - - return names - - - def generate_name_collection(self, track_names): + + if not gmtk_object_types in allowed_types: + raise ValueError("Undefined GMTK object type: {}".format(gmtk_object_type)) + + GMTK_OBJECT_NAME_GENERATORS = {'mx': generate_mx_object_names, + 'diag_gaussian_mc': generate_diag_gaussian_mc_object_names, + 'gamma_mc': generate_gamma_mc_object_names, + 'missing_mc': generate_missing_mc_object_names, + 'mean': generate_mean_object_names, + 'covar': generate_covar_object_names, + 'tied_covar': generate_covar_object_names, + 'collection_names': generate_collection_names, + 'collection_entries': generate_name_collection_entries, + 'dpmf': generate_dpmf_object_names} + + return GMTK_OBJECT_NAME_GENERATORS[gmtk_object_type]() + + def generate_name_collection(self): """ Generate string representation of NameCollection objects in input master. - :param: track_names: list[str]: list of track names - """ + """ # generate list of collection names - collection_names = self.generate_gmtk_obj_names(obj="col", - track_names=track_names) - # generate list of all names in NameCollections - names = self.generate_gmtk_obj_names("mx_name", - track_names=track_names) - num_tracks = len(track_names) - len_name_group = int(len(names) / num_tracks) + # num_track_groups (i.e. 
one for each head track) number + # of collection names generated + collection_names = self.generate_gmtk_object_names("collection_names") + # generate list of all names in NameCollections + # (num_segs * num_subsegs) number of names generated + names = self.generate_gmtk_object_names("collection_entries") + num_track_groups = self.num_track_groups + len_name_group = int(len(names) / num_track_groups) # names grouped by collection name_groups = [names[i:i + len_name_group] for i in range(0, len(names), len_name_group)] - # create NameCollection objects and add to - # input_master.name_collection: InlineSection + # create NameCollection objects and add to input master + for collection_name, collection_entry in for group_index in range(len(name_groups)): input_master.name_collection[collection_names[group_index]] = \ NameCollection(name_groups[group_index]) - return input_master.name_collection.__str__() - + return str(input_master.name_collection) def make_mean_data(self): num_segs = self.num_segs @@ -286,96 +382,90 @@ def make_mean_data(self): return means_tiled + (stds_tiled * noise) - def generate_mean_objects(self, track_names): + def generate_mean_objects(self): """ Generate string representation of Mean objects in input master. 
- :param: track_names: list[str]: list of track names """ # generate list of names of Mean objects - names = self.generate_gmtk_obj_names("mean", - track_names=track_names) + names = self.generate_gmtk_object_names("mean") means = self.make_mean_data() # array + num_track_groups = self.num_track_groups # number of head track names # dimensions of means: num_segs x num_subsegs x num_head_tracks # create Mean objects names_array = array(names).reshape((self.num_segs, self.num_subsegs, len(self.track_groups))) for i in range(self.num_segs): for j in range(self.num_subsegs): - for k in range(len(self.track_groups)): + for k in range(num_track_groups): input_master.mean[names_array[i, j, k]] = Mean(means[i, j, k]) - return input_master.mean.__str__() + + return str(input_master.mean) - def generate_covar_objects(self, track_names): + def generate_covar_objects(self): """ Generate string representation of Covar objects in input master. - :param: track_names: list[str]: list of track names """ if COVAR_TIED: - names = self.generate_gmtk_obj_names("tied_covar", - track_names=track_names) + names = self.generate_gmtk_object_names("tied_covar") else: - names = self.generate_gmtk_obj_names("covar", - track_names=track_names) - covars = self.vars # array of variance values - # create Covar objects - for i in range(len(names)): - input_master.covar[names[i]] = Covar(covars[i]) # TODO index error + names = self.generate_gmtk_object_names("covar") + covar_values = self.vars # array of variance values + # creating Covar objects and adding them to input master + dpmf_objects = map(Covar, covar_values) + input_master.covar.update(dict(zip(names, covar_objects))) - return input_master.covar.__str__() + return str(input_master.covar) - def generate_mc_objects(self, track_names): + def generate_mc_objects(self): """ Generate string representation of MC objects in input master. 
- :param: track_names: list[str]: list of track names """ - # if distribution is norm or asinh_norm + # if distribution is norm or asinh_norm, TODO for missing, gamma if self.distribution in DISTRIBUTIONS_LIKE_NORM: if USE_MFSDG: # TODO - option = "mc_missing" + option = "missing_mc" else: - option = "mc_diag" + option = "diag_gaussian_mc" # generate MC object names - names = self.generate_gmtk_obj_names(option, - track_names=track_names) - + names = self.generate_gmtk_object_names(option) + + # replicate covar names for iteration covar_names = list(input_master.mc.covar) * ( self.num_segs * self.num_subsegs) - # replicate covar names for iteration - mean_names = list(input_master.mc.mean) # list of all mean names + mean_names = list(input_master.mc.mean) - # create MC objects - for i in range(len(names)): - input_master.mc[names[i]] = DiagGaussianMC(mean=mean_names[i], - covar=covar_names[i]) - return input_master.mc.__str__() + # create MC objects and add them to input master + mc_objects = [] + for mean_name, covar_name in zip(mean_names, covar_names): + mc_objects.append(DiagGaussianMC(mean=mean_name, covar=covar_name)) + input_master.mc.update(dict(zip(names, mc_objects))) + + return str(input_master.mc) - def generate_mx_objects(self, track_names): + def generate_mx_objects(self): """Generate string representation of MX objects in input master. 
- :param: track_names: list[str]: list of track names """ # generate list of MX names - names = self.generate_gmtk_obj_names("mx", - track_names=track_names) - - mc_names = list(input_master.mc) # list of all mc names - dpmf_names = list(input_master.dpmf) # list of all dpmf names + names = self.generate_gmtk_object_names("mx") + mc_names = list(input_master.mc) # list of all mc names + dpmf_names = list(input_master.dpmf) # list of all dpmf names multiple = int(len(names) / len(dpmf_names)) dpmf_names *= multiple # replicate dpmf names for iteration - - # create MX objects - for i in range(len(names)): - input_master.mx[names[i]] = MX(dpmf=dpmf_names[i], - components=mc_names[i]) + mx_object = [] + # parameters required for creating MX object: names of mc, dpmf + for mc_name, dpmf_name in zip(mc_names, dpmf_names): + mx_objects.append(MX(dpmf=dpmf_name, components=mc_name)) + + # adding MX object to input master + input_master.mx.update(dict(zip(names, mx_objects))) return input_master.mx.__str__() def generate_dpmf_objects(self, track_names): """Generate string representation of DPMF objects in input master. 
- :param: track_names: list[str]: list of track names """ # generate a list of dpmf names - names = self.generate_gmtk_obj_names("dpmf", - track_names=track_names) + names = self.generate_gmtk_object_names("dpmf") # if single dpmf if self.num_mix_components == 1: input_master.dpmf[names[0]] = DPMF(1.0) @@ -383,10 +473,11 @@ def generate_dpmf_objects(self, track_names): # uniform probabilities dpmf_values = str(round(1.0 / self.num_mix_components, ROUND_NDIGITS)) - # create dpmf objects - for i in range(len(names)): - input_master.dpmf[names[i]] = DPMF(dpmf_values[i]) - return input_master.dpmf.__str__() + # creating DPMF objects and adding them to input master + dpmf_objects = map(DPMF, dpmf_values) + input_master.dpmf.update(dict(zip(names, dpmf_objects))) + + return str(input_master.dpmf) def calc_prob_transition(self, length): """Calculate probability transition from scaled expected length. @@ -478,7 +569,7 @@ def generate_dense_cpt_objects(self): if self.len_seg_strength > 0: dirichlet_row = ["DirichletTable %s" % self.make_dirichlet_name(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION)] input_master.dense_cpt[NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION].extra_rows = dirichlet_row - return input_master.dense_cpt.__str__() + return str(input_master.dense_cpt) def make_dinucleotide_table_row(self): pass @@ -523,7 +614,7 @@ def generate_dirichlet_objects(self): else: return "" - + class DTParamSpec(ParamSpec): type_name = "DT" copy_attrs = ParamSpec.copy_attrs + ["seg_countdowns_initial", @@ -600,7 +691,7 @@ class InputMasterSaver(Saver): "supervision_type", "use_dinucleotide", "mins", "means", "vars", "gmtk_include_filename_relative", "track_groups", - "num_mix_components", "virtual_evidence", "tracks"] + "num_mix_components", "virtual_evidence"] def make_mapping(self): # the locals of this function are used as the template mapping @@ -608,7 +699,7 @@ def make_mapping(self): # check that they are not used in the input.master template param_spec = ParamSpec(self) num_free_params = 0 
- + num_segs = self.num_segs num_subsegs = self.num_subsegs num_track_groups = self.num_track_groups @@ -625,31 +716,30 @@ def make_mapping(self): # segCountDown_seg_segTransition num_free_params += fullnum_subsegs - head_track_names = param_spec.get_head_track_names() - name_collection_spec = param_spec.generate_name_collection(head_track_names) + name_collection_spec = param_spec.generate_name_collection() distribution = self.distribution if distribution in DISTRIBUTIONS_LIKE_NORM: - mean_spec = param_spec.generate_mean_objects(head_track_names) - covar_spec = param_spec.generate_covar_objects(head_track_names) + mean_spec = param_spec.generate_mean_objects() + covar_spec = param_spec.generate_covar_objects() # TODO: class RealMatParamSpec # for now this is sufficient because util.USE_MFSDG = False by default real_mat_spec = "" - mc_spec = param_spec.generate_mc_objects(head_track_names) + mc_spec = param_spec.generate_mc_objects() if COVAR_TIED: num_free_params += (fullnum_subsegs + 1) * num_track_groups else: num_free_params += (fullnum_subsegs * 2) * num_track_groups -# TODO: gamma distribution option + # TODO: gamma distribution option else: raise ValueError("distribution %s not supported" % distribution) - dpmf_spec = param_spec.generate_dpmf_objects(head_track_names) - mx_spec = param_spec.generate_mx_objects(head_track_names) + dpmf_spec = param_spec.generate_dpmf_objects() + mx_spec = param_spec.generate_mx_objects() card_seg = num_segs ve_spec = VirtualEvidenceSpec(self) From 88e5b605d8e04ba2a6bb6cd770bd972d47685054 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Sun, 6 Sep 2020 19:25:12 -0400 Subject: [PATCH 64/72] Update input_master.py --- segway/input_master.py | 1 - 1 file changed, 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index 248e1970..e8e8668f 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -357,7 +357,6 @@ def 
generate_name_collection(self): name_groups = [names[i:i + len_name_group] for i in range(0, len(names), len_name_group)] # create NameCollection objects and add to input master - for collection_name, collection_entry in for group_index in range(len(name_groups)): input_master.name_collection[collection_names[group_index]] = \ NameCollection(name_groups[group_index]) From 0e475f94e246b9f67bb751ff7c0df85f240a687f Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Sun, 6 Sep 2020 19:35:20 -0400 Subject: [PATCH 65/72] fixed typos --- segway/input_master.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index e8e8668f..d9391b2d 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -25,7 +25,7 @@ VIRTUAL_EVIDENCE_LIST_FILENAME) from .gmtk.input_master import (InputMaster, NameCollection, DenseCPT, - DPMF, MX, Covar, Mean, DiagGaussianMC, InlineSection) + DPMF, MX, Covar, Mean, DiagGaussianMC) # NB: Currently Segway relies on older (Numpy < 1.14) printed representations of # scalars and vectors in the parameter output. By default in newer (> 1.14) @@ -245,6 +245,7 @@ def generate_mean_object_names(self): Generate Mean object names. """ head_track_names = self.get_head_track_names() + component_suffix = self.get_template_component_suffix() names = [] for i in range(self.num_segs): for j in range(self.num_subsegs): @@ -294,17 +295,13 @@ def generate_dpmf_object_names(self): """ Generate DPMF object names. 
""" - head_track_names = self.get_head_track_names() names = [] if self.num_mix_components == 1: names.append("dpmf_always") else: names.append("") # TODO (with dirichlet extra rows) -# for i in range(self.num_segs): -# for j in range(self.num_subsegs): -# for track_name in head_track_names: -# names.append("dpmf_seg{}_subseg{}_{}".format(i, j, track_name)) + return names def generate_gmtk_object_names(self, gmtk_object_type): @@ -324,7 +321,7 @@ def generate_gmtk_object_names(self, gmtk_object_type): "covar", "collection_names", "collection_entries", "dpmf", "gammascale", "gammashape", "tied_covar"] - if not gmtk_object_types in allowed_types: + if not gmtk_object_type in allowed_types: raise ValueError("Undefined GMTK object type: {}".format(gmtk_object_type)) GMTK_OBJECT_NAME_GENERATORS = {'mx': generate_mx_object_names, @@ -409,7 +406,7 @@ def generate_covar_objects(self): names = self.generate_gmtk_object_names("covar") covar_values = self.vars # array of variance values # creating Covar objects and adding them to input master - dpmf_objects = map(Covar, covar_values) + covar_objects = map(Covar, covar_values) input_master.covar.update(dict(zip(names, covar_objects))) return str(input_master.covar) @@ -451,7 +448,7 @@ def generate_mx_objects(self): dpmf_names = list(input_master.dpmf) # list of all dpmf names multiple = int(len(names) / len(dpmf_names)) dpmf_names *= multiple # replicate dpmf names for iteration - mx_object = [] + mx_objects = [] # parameters required for creating MX object: names of mc, dpmf for mc_name, dpmf_name in zip(mc_names, dpmf_names): mx_objects.append(MX(dpmf=dpmf_name, components=mc_name)) From dd1b4f85bcf93f09c3cbba3da45bd516a23e09b6 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Sun, 6 Sep 2020 19:40:48 -0400 Subject: [PATCH 66/72] tracks attr in InputMasterSaver --- segway/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/segway/input_master.py b/segway/input_master.py index d9391b2d..b3b62722 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -687,7 +687,7 @@ class InputMasterSaver(Saver): "supervision_type", "use_dinucleotide", "mins", "means", "vars", "gmtk_include_filename_relative", "track_groups", - "num_mix_components", "virtual_evidence"] + "num_mix_components", "virtual_evidence", "tracks"] def make_mapping(self): # the locals of this function are used as the template mapping From 5f8f0f19cf7477f66ba440916341d0a1847ef806 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Sun, 6 Sep 2020 19:51:10 -0400 Subject: [PATCH 67/72] Update input_master.py --- segway/input_master.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index b3b62722..df083b9a 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -324,16 +324,16 @@ def generate_gmtk_object_names(self, gmtk_object_type): if not gmtk_object_type in allowed_types: raise ValueError("Undefined GMTK object type: {}".format(gmtk_object_type)) - GMTK_OBJECT_NAME_GENERATORS = {'mx': generate_mx_object_names, - 'diag_gaussian_mc': generate_diag_gaussian_mc_object_names, - 'gamma_mc': generate_gamma_mc_object_names, - 'missing_mc': generate_missing_mc_object_names, - 'mean': generate_mean_object_names, - 'covar': generate_covar_object_names, - 'tied_covar': generate_covar_object_names, - 'collection_names': generate_collection_names, - 'collection_entries': generate_name_collection_entries, - 'dpmf': generate_dpmf_object_names} + GMTK_OBJECT_NAME_GENERATORS = {'mx': self.generate_mx_object_names, + 'diag_gaussian_mc': self.generate_diag_gaussian_mc_object_names, + 'gamma_mc': self.generate_gamma_mc_object_names, + 'missing_mc': self.generate_missing_mc_object_names, + 'mean': self.generate_mean_object_names, + 'covar': 
self.generate_covar_object_names, + 'tied_covar': self.generate_covar_object_names, + 'collection_names': self.generate_collection_names, + 'collection_entries': self.generate_name_collection_entries, + 'dpmf': self.generate_dpmf_object_names} return GMTK_OBJECT_NAME_GENERATORS[gmtk_object_type]() From f06c9b9fe9c08cb9e091fb0c32d4c92b2e912c6d Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Sun, 6 Sep 2020 20:00:34 -0400 Subject: [PATCH 68/72] Update input_master.py --- segway/input_master.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index df083b9a..095cebb6 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -217,10 +217,11 @@ def generate_tied_covar_object_names(self): Generate tied Covar object names. """ head_track_names = self.get_head_track_names() - component_suffix = self.get_template_component_suffix() names = [] - for track_name in head_track_names: - names.append("covar_{}{}".format(track_name, component_suffix)) + for component_number in range(self.num_mix_components): + for track_name in head_track_names: + component_suffix = self.get_template_component_suffix(component_number) + names.append("covar_{}{}".format(track_name, component_suffix)) return names def generate_covar_object_names(self): @@ -228,12 +229,13 @@ def generate_covar_object_names(self): Generate tied Covar object names. 
""" head_track_names = self.get_head_track_names() - component_suffix = self.get_template_component_suffix() names = [] - for i in range(self.num_segs): - for j in range(self.num_subsegs): - for track_name in head_track_names: - names.append("covar_seg{}_subseg{}_{}{}".format(i, j, + for component_number in range(self.num_mix_components): + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + component_suffix = self.get_template_component_suffix(component_number) + names.append("covar_seg{}_subseg{}_{}{}".format(i, j, track_name, component_suffix)) @@ -245,12 +247,13 @@ def generate_mean_object_names(self): Generate Mean object names. """ head_track_names = self.get_head_track_names() - component_suffix = self.get_template_component_suffix() names = [] - for i in range(self.num_segs): - for j in range(self.num_subsegs): - for track_name in head_track_names: - names.append("mean_seg{}_subseg{}_{}{}".format(i, j, + for component_number in range(self.num_mix_components): + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + component_suffix = self.get_template_component_suffix(component_number) + names.append("mean_seg{}_subseg{}_{}{}".format(i, j, track_name, component_suffix)) @@ -274,12 +277,13 @@ def generate_diag_gaussian_mc_object_names(self): Generate DiagGaussianMC object names. 
""" head_track_names = self.get_head_track_names() - component_suffix = self.get_template_component_suffix() names = [] - for i in range(self.num_segs): - for j in range(self.num_subsegs): - for track_name in head_track_names: - names.append("mc_{}_seg{}_subseg{}_{}{}".format(self.distribution, + for component_number in range(self.num_mix_components): + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + component_suffix = self.get_template_component_suffix(component_number) + names.append("mc_{}_seg{}_subseg{}_{}{}".format(self.distribution, i, j, track_name, component_suffix)) return names From 254bc5491866cf62d3ac0c73e5e701c7371003f0 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Sun, 6 Sep 2020 20:03:16 -0400 Subject: [PATCH 69/72] Update input_master.py --- segway/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index 095cebb6..576ba6ae 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -461,7 +461,7 @@ def generate_mx_objects(self): input_master.mx.update(dict(zip(names, mx_objects))) return input_master.mx.__str__() - def generate_dpmf_objects(self, track_names): + def generate_dpmf_objects(self): """Generate string representation of DPMF objects in input master. 
""" # generate a list of dpmf names From bd5020042ad2316171869df32ffc77791b7f1b35 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Sun, 6 Sep 2020 20:49:02 -0400 Subject: [PATCH 70/72] Update input_master.py --- segway/input_master.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index 576ba6ae..f6a059dc 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -334,7 +334,7 @@ def generate_gmtk_object_names(self, gmtk_object_type): 'missing_mc': self.generate_missing_mc_object_names, 'mean': self.generate_mean_object_names, 'covar': self.generate_covar_object_names, - 'tied_covar': self.generate_covar_object_names, + 'tied_covar': self.generate_tied_covar_object_names, 'collection_names': self.generate_collection_names, 'collection_entries': self.generate_name_collection_entries, 'dpmf': self.generate_dpmf_object_names} From 1e969b7e619888e034fb11355dcb69f76f75b577 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 18 Sep 2020 00:52:10 -0400 Subject: [PATCH 71/72] added gmtk format strings --- segway/input_master.py | 89 +++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/segway/input_master.py b/segway/input_master.py index f6a059dc..f1560b5e 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -189,45 +189,54 @@ def get_head_track_names(self): for group in self.track_groups: head_track_names.append(group[0].name) return head_track_names - + + def generate_collection_names(self): """ Generate names of NameCollection objects. 
""" + COLLECTION_NAME_FORMAT_STRING = "collection_seg_{track_name}" head_track_names = self.get_head_track_names() names = [] for name in head_track_names: - names.append("collection_seg_{}".format(name)) + names.append(COLLECTION_NAME_FORMAT_STRING.format(track_name=name)) return names def generate_name_collection_entries(self): """ Generate entries in NameCollection objects. """ + COLLECTION_ENTRY_FORMAT_STRING = \ + "mx_seg{seg_index}_subseg{subseg_index}_{track_name}" head_track_names = self.get_head_track_names() names = [] for name in head_track_names: for i in range(self.num_segs): for j in range(self.num_subsegs): - names.append("mx_seg{}_subseg{}_{}".format(i, j, name)) + names.append(COLLECTION_ENTRY_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=name)) return names def generate_tied_covar_object_names(self): """ Generate tied Covar object names. """ + TIED_COVAR_FORMAT_STRING = "covar_{track_name}{suffix}" head_track_names = self.get_head_track_names() names = [] for component_number in range(self.num_mix_components): for track_name in head_track_names: component_suffix = self.get_template_component_suffix(component_number) - names.append("covar_{}{}".format(track_name, component_suffix)) + names.append(TIED_COVAR_FORMAT_STRING.format(track_name=track_name, + suffix=component_suffix)) return names def generate_covar_object_names(self): """ Generate tied Covar object names. 
""" + COVAR_FORMAT_STRING = "covar_seg{seg_index}_subseg{subseg_index}_{track_name}{suffix}" head_track_names = self.get_head_track_names() names = [] for component_number in range(self.num_mix_components): @@ -235,17 +244,18 @@ def generate_covar_object_names(self): for j in range(self.num_subsegs): for track_name in head_track_names: component_suffix = self.get_template_component_suffix(component_number) - names.append("covar_seg{}_subseg{}_{}{}".format(i, j, - track_name, - component_suffix)) + names.append(COVAR_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=track_name, + suffix=component_suffix)) - return names - + return names def generate_mean_object_names(self): """ Generate Mean object names. """ + MEAN_FORMAT_STRING = "mean_seg{seg_index}_subseg{subseg_index}_{track_name}{suffix}" head_track_names = self.get_head_track_names() names = [] for component_number in range(self.num_mix_components): @@ -253,9 +263,10 @@ def generate_mean_object_names(self): for j in range(self.num_subsegs): for track_name in head_track_names: component_suffix = self.get_template_component_suffix(component_number) - names.append("mean_seg{}_subseg{}_{}{}".format(i, j, - track_name, - component_suffix)) + names.append(MEAN_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=track_name, + suffix=component_suffix)) return names @@ -263,12 +274,15 @@ def generate_mx_object_names(self): """ Generate MX object names. """ + MX_FORMAT_STRING = "mx_seg{seg_index}_subseg{subseg_index}_{track_name}" head_track_names = self.get_head_track_names() names = [] for i in range(self.num_segs): for j in range(self.num_subsegs): for track_name in head_track_names: - names.append("mx_seg{}_subseg{}_{}".format(i, j, track_name)) + names.append(MX_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=track_name)) return names @@ -276,6 +290,8 @@ def generate_diag_gaussian_mc_object_names(self): """ Generate DiagGaussianMC object names. 
""" + DIAG_GAUSSIAN_FORMAT_STRING = \ + "mc_{distribution}_seg{seg_index}_subseg{subseg_index}_{track_name}{suffix}" head_track_names = self.get_head_track_names() names = [] for component_number in range(self.num_mix_components): @@ -283,55 +299,56 @@ def generate_diag_gaussian_mc_object_names(self): for j in range(self.num_subsegs): for track_name in head_track_names: component_suffix = self.get_template_component_suffix(component_number) - names.append("mc_{}_seg{}_subseg{}_{}{}".format(self.distribution, - i, j, track_name, - component_suffix)) + names.append(DIAG_GAUSSIAN_FORMAT_STRING.format(distribution=self.distribution, + seg_index=i, + subseg_index=j, + track_name=track_name, + suffix=component_suffix)) return names def generate_gamma_mc_object_names(self): - # todo: gammascale, gammashape - pass - - def generate_missing_mc_object_names(self): - pass - + GAMMA_MC_FORMAT_STRING = \ + "mc_gamma_seg{seg_index}_subseg{subseg_index}_{track_name}" + names = [] + head_track_names = self.get_head_track_names() + for i in range(self.num_segs): + for j in range(self.num_subsegs): + for track_name in head_track_names: + names.append(GAMMA_MC_FORMAT_STRING.format(seg_index=i, + subseg_index=j, + track_name=track_name)) + return names + def generate_dpmf_object_names(self): """ Generate DPMF object names. 
""" + # to do for num_mix_components > 1: + DPMF_FORMAT_STRING = "" names = [] if self.num_mix_components == 1: names.append("dpmf_always") else: + # TODO (with dirichlet extra rows) names.append("") - # TODO (with dirichlet extra rows) - + return names def generate_gmtk_object_names(self, gmtk_object_type): """ Generate GMTK object names for the types: - NameCollection: "col" - entries in NameCollection: "mx_name" + name of NameCollection: "collection_names" + entries in NameCollection: "collection_entries" Covar: "covar", "tied_covar" Mean: "mean" MX: "mx" - MC: "diag_gaussian_mc", "gamma_mc", "missing_mc", "gammascale" + MC: "diag_gaussian_mc" DPMF: "dpmf" :param gmtk_object_type: str: type of gmtk object for which names must be generated :return: list[str]: list of GMTK object names """ - allowed_types = ["mx", "diag_gaussian_mc", "gamma_mc", "missing_mc", "mean", - "covar", "collection_names", "collection_entries", "dpmf", "gammascale", - "gammashape", "tied_covar"] - - if not gmtk_object_type in allowed_types: - raise ValueError("Undefined GMTK object type: {}".format(gmtk_object_type)) - GMTK_OBJECT_NAME_GENERATORS = {'mx': self.generate_mx_object_names, 'diag_gaussian_mc': self.generate_diag_gaussian_mc_object_names, - 'gamma_mc': self.generate_gamma_mc_object_names, - 'missing_mc': self.generate_missing_mc_object_names, 'mean': self.generate_mean_object_names, 'covar': self.generate_covar_object_names, 'tied_covar': self.generate_tied_covar_object_names, From 0a041f5d688a4c7a56c609a59c47845dc0dec7b5 Mon Sep 17 00:00:00 2001 From: Aparna Gopalakrishnan <48969923+aparnagopalakrishnan7@users.noreply.github.com> Date: Fri, 18 Sep 2020 01:13:17 -0400 Subject: [PATCH 72/72] Update input_master.py --- segway/input_master.py | 1 - 1 file changed, 1 deletion(-) diff --git a/segway/input_master.py b/segway/input_master.py index f1560b5e..8d3dc282 100644 --- a/segway/input_master.py +++ b/segway/input_master.py @@ -324,7 +324,6 @@ def 
generate_dpmf_object_names(self): Generate DPMF object names. """ # to do for num_mix_components > 1: - DPMF_FORMAT_STRING = "" names = [] if self.num_mix_components == 1: names.append("dpmf_always")