diff --git a/segway/data/input.master.tmpl b/segway/data/input.master.tmpl deleted file mode 100644 index a150d031..00000000 --- a/segway/data/input.master.tmpl +++ /dev/null @@ -1,74 +0,0 @@ -#include "${include_filename}" - -#if CARD_SEG != $card_seg -#error Specified number of segment labels (CARD_SEG) does not match the number used for this input master file ($card_seg) -#endif - -$dt_spec -$name_collection_spec -$dirichlet_spec - -DETERMINISTIC_CPT_IN_FILE inline -#if CARD_SUPERVISIONLABEL == -1 -5 -#else /* CARD_SUPERVISIONLABEL != -1 */ -6 -#endif /* CARD_SUPERVISIONLABEL == -1 */ - -0 seg_segCountDown -1 -CARD_SEG CARD_SEGCOUNTDOWN -map_seg_segCountDown - -1 frameIndex_ruler -1 -CARD_FRAMEINDEX CARD_RULER -map_frameIndex_ruler - -2 segTransition_ruler_seg_segCountDown_segCountDown -4 -CARD_SEGTRANSITION CARD_RULER CARD_SEG CARD_SEGCOUNTDOWN CARD_SEGCOUNTDOWN -map_segTransition_ruler_seg_segCountDown_segCountDown - -3 seg_seg_copy -1 -CARD_SEG CARD_SEG -internal:copyParent - -4 subseg_subseg_copy -1 -CARD_SUBSEG CARD_SUBSEG -internal:copyParent - -#if CARD_SUPERVISIONLABEL != -1 -5 supervisionLabel_seg_alwaysTrue -2 -CARD_SUPERVISIONLABEL CARD_SEG CARD_BOOLEAN -map_supervisionLabel_seg_alwaysTrue -#endif - -#if VIRTUAL_EVIDENCE == 1 -$ve_spec -#endif - -#ifndef INPUT_PARAMS_FILENAME - -$dense_cpt_spec -$mean_spec -$covar_spec -$dpmf_spec -$mc_spec -$mx_spec - -#else - -DENSE_CPT_IN_FILE INPUT_PARAMS_FILENAME ascii -MEAN_IN_FILE INPUT_PARAMS_FILENAME ascii -COVAR_IN_FILE INPUT_PARAMS_FILENAME ascii -DPMF_IN_FILE INPUT_PARAMS_FILENAME ascii -MC_IN_FILE INPUT_PARAMS_FILENAME ascii -MX_IN_FILE INPUT_PARAMS_FILENAME ascii - -#endif - -$real_mat_spec diff --git a/segway/data/map_frameIndex_ruler.dt.txt b/segway/data/map_frameIndex_ruler.dt.txt index 1283d3d2..33defd70 100644 --- a/segway/data/map_frameIndex_ruler.dt.txt +++ b/segway/data/map_frameIndex_ruler.dt.txt @@ -1,4 +1,3 @@ -map_frameIndex_ruler 1 % is the frameIndex divisible by RULER_SCALE -1 { mod(p0, 
RULER_SCALE) == 0 } diff --git a/segway/data/map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl b/segway/data/map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl index 3e294f64..2082c9f4 100644 --- a/segway/data/map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl +++ b/segway/data/map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl @@ -1,4 +1,3 @@ -map_segTransition_ruler_seg_segCountDown_segCountDown 4 0 2 2 default % segTransition(0) == 2 (seg transition): diff --git a/segway/data/map_seg_segCountDown.dt.tmpl b/segway/data/map_seg_segCountDown.dt.tmpl index 42e42f8d..6188a76d 100644 --- a/segway/data/map_seg_segCountDown.dt.tmpl +++ b/segway/data/map_seg_segCountDown.dt.tmpl @@ -1,4 +1,3 @@ -map_seg_segCountDown 1 % this is only used at the beginning of an observation track 0 $tree diff --git a/segway/data/map_seg_subseg_obs.dt.txt b/segway/data/map_seg_subseg_obs.dt.txt index eb5a3d11..0e29aa9e 100644 --- a/segway/data/map_seg_subseg_obs.dt.txt +++ b/segway/data/map_seg_subseg_obs.dt.txt @@ -1,5 +1,2 @@ -% XXXopt: good candidate for a linear combination C deterministic mapper - -map_seg_subseg_obs 2 % num parents -1 { p0*CARD_SUBSEG + p1 } diff --git a/segway/data/map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt b/segway/data/map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt index cdec97da..ce84c788 100644 --- a/segway/data/map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt +++ b/segway/data/map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt @@ -1,4 +1,3 @@ -map_supervisionLabel_seg_alwaysTrue 2 % num parents % first parent is the label: 0 = no label; 1+ = (desired-1) % second parent is target diff --git a/segway/data/preamble.tmpl b/segway/data/preamble.tmpl new file mode 100644 index 00000000..75ab2f26 --- /dev/null +++ b/segway/data/preamble.tmpl @@ -0,0 +1,5 @@ +#include "$include_file" + +#if CARD_SEG != $card_seg +#error Specified number of segment labels (CARD_SEG) does not match the 
number used for this input master file ($card_seg) +#endif diff --git a/segway/gmtk/input_master.py b/segway/gmtk/params.py similarity index 59% rename from segway/gmtk/input_master.py rename to segway/gmtk/params.py index 2f059319..26f8e6ce 100644 --- a/segway/gmtk/input_master.py +++ b/segway/gmtk/params.py @@ -1,11 +1,14 @@ from __future__ import annotations -from typing import List, Optional, Union +from typing import get_args, List, Optional, Union from numpy import array, asarray, empty, ndarray, squeeze -INPUT_MASTER_PREAMBLE = \ - """#define COMPONENT_TYPE_DIAG_GAUSSIAN 0""" +DEFAULT_PREAMBLE = \ +""" +#define COMPONENT_TYPE_DIAG_GAUSSIAN 0 +#define COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN 5 +""" COMPONENT_TYPE_DIAG_GAUSSIAN = 0 @@ -17,6 +20,10 @@ OBJ_KIND_DETERMINISTICCPT = "DETERMINISTIC_CPT" OBJ_KIND_MC = "MC" OBJ_KIND_MX = "MX" +OBJ_KIND_DT = "DT" +OBJ_KIND_VECPT = "VE_CPT" +OBJ_KIND_GENERICSTRING = "GENERIC_STRING" +OBJ_KIND_DIRICHLETTAB = "DIRICHLET_TAB" # "kind" refers to a kind of GMTK object. GMTK often calls these classes @@ -36,6 +43,15 @@ def array2text(a: Array) -> str: return delimiter.join(array2text(row) for row in a) +def make_cardinality_line(num_parents: int, cardinalities: ndarray) -> str: + """ + Create a string representation of the cardinality of an array with the + specified shape. This string includes the number of parents and the + cardinality of each parent. + """ + return " ".join(map(str, [num_parents, *cardinalities])) + + # Types of array data NumericArrayLike = Union[float, int, ndarray] @@ -50,10 +66,12 @@ def __new__(cls, *args: NumericArrayLike, keep_shape: bool = False) \ (dimension 1) from a single scalar input value. """ # Ensure all arguments belong to the correct type - if not all(isinstance(arg, NumericArrayLike) for arg in args): + # Python before v3.10 does not support using Union objects as the + # classinfo argument of isinstance. 
A tuple is used instead, + if not all(isinstance(arg, (float, int, ndarray)) for arg in args): # If union iterable, fix. Otherwise, hardwrite - raise TypeError("Argument has incompatible type. " - f"Expected {NumericArrayLike}") + raise TypeError(f"Expected {get_args(NumericArrayLike)}. " + "Argument has incompatible type.") input_array = array(args) @@ -68,13 +86,11 @@ def __new__(cls, *args: NumericArrayLike, keep_shape: bool = False) \ return res -class DenseCPT(Array): +class MultiDimArray(Array): """ - A single DenseCPT object. + Abstract class providing features for multidimensional arrays which are + represented on multiple lines. """ - kind = OBJ_KIND_DENSECPT - - # XXX: check if sums to 1.0 def __str__(self) -> str: """ @@ -87,10 +103,59 @@ def get_header_info(self) -> str: Return number of parents, cardinality line, for header in input.master section. """ - line = [str(len(self.shape) - 1)] # number of parents - cardinality_line = map(str, self.shape) - line.append(" ".join(cardinality_line)) # cardinalities - return " ".join(line) + return make_cardinality_line(len(self.shape) - 1, self.shape) + + +class DenseCPT(MultiDimArray): + """ + A single DenseCPT object. + """ + kind = OBJ_KIND_DENSECPT + + # todo check if sums to 1.0 + + def __new__(cls, *args: NumericArrayLike, keep_shape: bool = False) \ + -> Array: + """ + Create a new DenseCPT containing the provided value(s). + Verify the args sum to 1.0 (within floating point error) and set the + dirichlet_name attribute to None for no Dirichlet table referenced. + """ + # Create the new DenseCPT. Ensure all args have valid types. + res = super().__new__(cls, args, keep_shape=keep_shape) + # Sum all probabilities in the DenseCPT. Aggregate ints and floats + # arguments directly and aggregate the sum of ndarray arguments. + args_sum = sum(arg if isinstance(arg, (int, float)) else arg.sum() + for arg in args) + # Permit 1e-05 error for floating point errors. 
+ if abs(args_sum - 1.0) > (10 ** (-5)): + raise ValueError("Expected arguments to sum to 1.0 but instead " + f"summed to {args_sum}") + + res.dirichlet_name = None + + return res + + def set_dirichlet_table(self, dirichlet_name: str): + """ + Set the name of a Dirichlet table which this CPT will reference. + + :param: dirichlet_name: str: Dirichlet table name for this DenseCPT to + reference within its header + """ + self.dirichlet_name = dirichlet_name + + def get_header_info(self) -> str: + """ + Return multidimensional array header with dimensions. + If the dirichlet_name attribute is not None, include an additional line + in the header referencing the DirichletTable with that name. + """ + res = MultiDimArray.get_header_info(self) + if self.dirichlet_name: + dirichlet_line = f"DirichletTable dirichlet_{self.dirichlet_name}" + res = "\n".join((res, dirichlet_line)) + return res @classmethod def uniform_from_shape(cls, *shape: int, @@ -150,6 +215,20 @@ def uniform_from_shape(cls, *shape: int, return DenseCPT(values, keep_shape=True) +class DirichletTable(MultiDimArray): + """ + A single DirichletTable object. + """ + kind = OBJ_KIND_DIRICHLETTAB + + def get_header_info(self) -> str: + """ + Return number of parents, cardinality line, for header in + input.master section. + """ + return make_cardinality_line(len(self.shape), self.shape) + + class NameCollection(list): """ A single NameCollection object. @@ -181,7 +260,7 @@ def get_header_info(self) -> str: return str(len(self)) -class OneLineKind(Array): +class OneLineArray(Array): """ An abstract Python class acting as the parent for Python classes representing Array-like GMTK kinds that have one-line string @@ -200,7 +279,7 @@ def get_header_info(self) -> str: return " ".join(line) -class Mean(OneLineKind): +class Mean(OneLineArray): """ A single Mean object. __init__ and __str__ methods defined in superclass. 
@@ -208,7 +287,7 @@ class Mean(OneLineKind): kind = OBJ_KIND_MEAN -class Covar(OneLineKind): +class Covar(OneLineArray): """ A single Covar object. __init__ and __str__ methods defined in superclass. @@ -216,6 +295,36 @@ class Covar(OneLineKind): kind = OBJ_KIND_COVAR +class VirtualEvidence: + """ + A Virtual Evidence object, with syntax inferred from the input.master + template implementation. + Intended to be printed as one line. + Attributes: + num_segs: int: number of segments for this Virtual Evidence + ve_list_filename: str: filename (or C preprocessor macro) containing + the Virtual Evidence list + """ + kind = OBJ_KIND_VECPT + + def __init__(self, num_segs: int, ve_list_filename: str): + """ + Initialize a sinlge Virtual Evidence object. + :param num_segs: int: number of segments + :param ve_list_filename: str: filename containing Virtual Evidence list + """ + self.num_segs = num_segs + self.ve_list_filename = ve_list_filename + + def get_header_info(self) -> str: + """ + Return string representation of own information, for header in + input.master section. + """ + + return f"1 {self.num_segs} 2 {self.ve_list_filename} nfs:{self.num_segs} nis:0 fmt:ascii END" + + # These types must be defined before they are referenced, so these constants # are defined here rather than the top of the file. @@ -250,13 +359,33 @@ def convert( return cls(value) -class DPMF(OneLineKind): +class DPMF(OneLineArray): """ A single DPMF object. """ kind = OBJ_KIND_DPMF - # todo check if sums to 1.0 + def __new__(cls, *args: NumericArrayLike, keep_shape: bool = False) \ + -> Array: + """ + Create a new DPMF containing the provided value(s). + Verify the args sum to 1.0 (within floating point error) and set the + pseudocount attribute to None for no pseudocount. + """ + # Create the new DenseCPT. Ensure all args have valid types. + res = super().__new__(cls, args, keep_shape=keep_shape) + # Sum all probabilities in the DenseCPT. 
Aggregate ints and floats + # arguments directly and aggregate the sum of ndarray arguments. + args_sum = sum(arg if isinstance(arg, (int, float)) else arg.sum() + for arg in args) + # Permit 1e-05 error for floating point errors. + if abs(args_sum - 1.0) > (10 ** (-5)): + raise ValueError("Expected arguments to sum to 1.0 but instead " + f"summed to {args_sum}") + + res.pseudocount = None + + return res @classmethod def uniform_from_shape(cls, shape: int) -> DPMF: @@ -270,20 +399,41 @@ def uniform_from_shape(cls, shape: int) -> DPMF: return DPMF(dpmf_values, keep_shape=True) + def set_dirichlet_pseudocount(self, pseudocount: int): + """ + Set a pseudocount to include in the header alongside the + DirichletConst label. + """ + self.pseudocount = pseudocount + + def get_header_info(self) -> str: + """ + Return string representation of own information, for header in + input.master section. + If the pseudocount attribute was set, include the DirichletConst label + and that value in the header before the weights. + """ + line = [str(len(self))] # dimension + if self.pseudocount: + line.append(f"DirichletConst {self.pseudocount}") + line.append(array2text(self)) # array values + + return " ".join(line) + class MC: """ A single mixture component (MC) object. Attributes: - component_type: str: type of MC + component_type: str: type of MC, such as "COMPONENT_TYPE_DIAG_GAUSSIAN" + or "COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN" """ kind = OBJ_KIND_MC def __init__(self, component_type: str): """ Initialize a single MC object. 
- :param component_type: str: type of MC, such as - COMPONENT_TYPE_DIAG_GAUSSIAN + :param component_type: str: type of MC """ self.component_type = component_type @@ -295,7 +445,7 @@ def get_header_info(self) -> str: class DiagGaussianMC(MC, object): """ Attributes: - component_type = 0 + component_type = "COMPONENT_TYPE_DIAG_GAUSSIAN" = 0 mean: str: name of Mean object associated to this MC covar: str: name of Covar obejct associated to this MC """ @@ -317,6 +467,30 @@ def __str__(self) -> str: return " ".join([self.mean, self.covar]) +class MissingFeatureDiagGaussianMC(MC, object): + """ + Attributes: + component_type = "COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN" + mean: str: name of Mean object associated to this MC + covar: str: name of Covar obejct associated to this MC + """ + def __init__(self, mean: str, covar: str): + """ + Initialize a single MissingFeatureDiagGaussianMC object. + :param mean: name of Mean object associated to this MC + :param covar: name of Covar obejct associated to this MC + """ + super().__init__("COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN") + self.mean = mean + self.covar = covar + + def __str__(self) -> str: + """ + Return string representation of this MC object. + """ + return " ".join([self.mean, self.covar, "matrix_weightscale_1x1"]) + + class MX: """ A single mixture (MX) object. @@ -369,14 +543,14 @@ class DeterministicCPT: kind = OBJ_KIND_DETERMINISTICCPT def __init__(self, cardinality_parents: Union[tuple[int], int], - cardinality: int, dt: str): + cardinality: str, dt: str): """ Initialize a single DeterministicCPT object. 
:param cardinality_parents: tuple[int]: cardinality of parents - (if empty, then number of parents = 0 + (if empty, then number of parents = 0) :param cardinality: int: cardinality of self - :param dt: name existing Decision Tree (DT) associated with this - DeterministicCPT + :param dt: str: name existing Decision Tree (DT) associated with + this DeterministicCPT """ if not isinstance(cardinality_parents, tuple): self.cardinality_parents = (cardinality_parents, ) @@ -389,7 +563,7 @@ def __str__(self) -> str: """ Return string representation of this DeterministicCPT. """ - line = [str(len(self.cardinality_parents))] # lines + line = [str(len(self.cardinality_parents))] # number of parents cardinalities = list(self.cardinality_parents) cardinalities.append(self.cardinality) @@ -399,10 +573,35 @@ def __str__(self) -> str: return "\n".join(line) def get_header_info(self) -> str: + # No additional header information needed in input.master. + return "" + + +class GenericString(str): + """ + A class storing a generic string to write to input.master. + Attributes: + contents: str: Generic string to write to input.master + """ + kind = OBJ_KIND_GENERICSTRING + + def get_header_info(self) -> str: + # No additional header information + return "" + + +class DecisionTree(GenericString): + """ + A Decision Tree object. + """ + kind = OBJ_KIND_DT + + def __init__(self, tree: str): """ - No additional header information needed in input.master. + Initialize a DecisionTree object. 
+ :param tree: String representation of the tree """ - return "" + super().__init__(tree) class Section(dict): @@ -411,8 +610,11 @@ class Section(dict): Key: name of GMTK object Value: GMTK object Attributes: - kind: str: specifies the kind of GMTK object - (default assumes that `self` has no kind) + kind: str: specifies the kind of GMTK object (default is None) + str_before: str: string to print before the section, often a + preprocessor rule + str_after: str: string to print after the section, often a + preprocessor rule """ def __init__(self, kind: Optional[str] = None): """ @@ -420,6 +622,8 @@ def __init__(self, kind: Optional[str] = None): """ super().__init__() self.kind = kind + self.str_before = None + self.str_after = None def __setitem__( self, @@ -431,9 +635,9 @@ def __setitem__( if cls is not None: value = convert(cls, value) - # self.kind is undefined for objects that don't support type conversion + # self.kind is undefined for objects that dont support type conversion if not self.kind: - # Set self.kind as the kind of first GMTK type value passed + # set self.kind as the kind of first GMTK type value passed # consistency of kind for all values are checked in InlineSection # as the whole dictionary could be checked at once self.kind = value.kind @@ -463,40 +667,67 @@ def __str__(self) -> str: if len(self) == 0: return "" - # The section generates the index and name of GMTK object + # if str_before is set, use it to begin the section's lines + lines = [] + if self.str_before: + lines.append(self.str_before) + + # if stored items are generic strings, return them out without + # any additional formatting. 
Otherwise, apply formatting + if self.kind == OBJ_KIND_GENERICSTRING: + lines.extend(self.get_unformatted_lines()) + else: + lines.extend(self.get_formatted_lines()) + + # if str_after is set, use it to end the section's lines + if self.str_after: + lines.append(self.str_after) + + return "\n".join(lines + [""]) + + def get_formatted_lines(self) -> List[str]: + """ + Format the GMTK objects with a section header and object headers + """ lines = self.get_header_lines() for index, (key, value) in enumerate(self.items()): - # Use section information to generate index and name of GMTK object - obj_header = [str(index), key] + obj_header = [str(index), key] # Index and name of GMTK object - # Use GMTK object to generate additional special header information - # Append this to the index and name above + # Special header information for some GMTK types obj_header.append(value.get_header_info()) # Use rstrip to remove the trailing space for GMTK types with # no additional header information lines.append(" ".join(obj_header).rstrip()) - # One line kind objects have all data included in the header - # If not one line kind, write the object's remaining data lines - if not isinstance(value, OneLineKind): + # Unless a class where all data is on one line (OneLineArray and + # VirtualEvidence), write the object's remaining lines + if not isinstance(value, (OneLineArray, VirtualEvidence)): lines.append(str(value)) - return "\n".join(lines + [""]) + return lines + + def get_unformatted_lines(self) -> List[str]: + """ + Extract the string representation of all GMTK objects, with no + headers or additional formatting. Intended for representing + Generic String objects. + """ + return [str(value) for value in self.values()] class InlineMCSection(InlineSection): """ Special InlineSection subclass which contains MC objects. 
Attributes: - mean: the InlineSection object stored at InputMaster.mean - covar: the InlineSection object stored at InputMaster.covar + mean: InlineSection object referred to by InputMaster.mean + covar: InlineSection object referred to by InputMaster.covar """ def __init__(self, mean: InlineSection, covar: InlineSection): """ - :param mean: InlineSection: the InlineSection object stored at + :param mean: InlineSection: InlineSection object referred to by InputMaster.mean - :param covar: InlineSection: the InlineSection object stored at + :param covar: InlineSection: InlineSection object referred to by InputMaster.covar """ super().__init__(OBJ_KIND_MC) @@ -511,7 +742,12 @@ def __str__(self) -> str: if len(self) == 0: return "" - lines = self.get_header_lines() + # if str_before is set, use it to begin the section's lines + lines = [] + if self.str_before: + lines.append(self.str_before) + + lines.extend(self.get_header_lines()) for index, (name, obj) in enumerate(list(self.items())): # check if dimension of Mean and Covar of this MC are the same mean_ndim = len(self.mean[obj.mean]) @@ -528,6 +764,10 @@ def __str__(self) -> str: obj_line.append(str(obj)) lines.append(" ".join(obj_line)) + # if str_after is set, use it to end the section's lines + if self.str_after: + lines.append(self.str_after) + return "\n".join(lines + [""]) @@ -535,16 +775,20 @@ class InlineMXSection(InlineSection): """ Special InlineSection subclass which contains MX objects. 
Attributes: - dpmf: the InlineSection object stored at InputMaster.dpmf + dpmf: InlineSection object referred to by InputMaster.dpmf + components: InlineSection object referred to by InputMaster.mc """ - def __init__(self, dpmf: InlineSection): + def __init__(self, dpmf: InlineSection, mc: InlineSection): """ - :param dpmf: InlineSection: the InlineSection object stored at + :param dpmf: InlineSection: InlineSection object referred to by InputMaster.dpmf + :param components: InlineSection: InlineSection object referred to by + InputMaster.mc """ super().__init__(OBJ_KIND_MX) self.dpmf = dpmf + self.mc = mc def __str__(self) -> str: """ @@ -554,7 +798,12 @@ def __str__(self) -> str: if len(self) == 0: return "" - lines = self.get_header_lines() + # if str_before is set, use it to begin the section's lines + lines = [] + if self.str_before: + lines.append(self.str_before) + + lines.extend(self.get_header_lines()) for index, (name, obj) in enumerate(list(self.items())): # Assert number of components is equal to length of DPMF dpmf_ndim = len(self.dpmf[obj.dpmf]) @@ -571,6 +820,10 @@ def __str__(self) -> str: # string representation of this MX object lines.append(" ".join(obj_line)) + # if str_after is set, use it to end the section's lines + if self.str_after: + lines.append(self.str_after) + return "\n".join(lines + [""]) @@ -591,37 +844,42 @@ class InputMaster: input master """ - def __init__(self): + def __init__(self, preamble=DEFAULT_PREAMBLE): """ Initialize InputMaster instance with empty attributes (InlineSection and its subclasses). 
""" - self.deterministic_cpt = InlineSection(OBJ_KIND_DETERMINISTICCPT) + self.preamble = preamble + self.dt = InlineSection(OBJ_KIND_DT) self.name_collection = InlineSection(OBJ_KIND_NAMECOLLECTION) + self.dirichlet = InlineSection(OBJ_KIND_DIRICHLETTAB) + self.deterministic_cpt = InlineSection(OBJ_KIND_DETERMINISTICCPT) + self.virtual_evidence = InlineSection(OBJ_KIND_VECPT) + self.dense_cpt = InlineSection(OBJ_KIND_DENSECPT) self.mean = InlineSection(OBJ_KIND_MEAN) self.covar = InlineSection(OBJ_KIND_COVAR) - self.dense_cpt = InlineSection(OBJ_KIND_DENSECPT) self.dpmf = InlineSection(OBJ_KIND_DPMF) self.mc = InlineMCSection(mean=self.mean, covar=self.covar) - self.mx = InlineMXSection(dpmf=self.dpmf) + self.mx = InlineMXSection(dpmf=self.dpmf, mc=self.mc) def __str__(self) -> str: """ Return string representation of all the attributes (GMTK types) by calling the attributes' (InlineSection and its subclasses) `__str__()`. """ - sections = [self.deterministic_cpt, self.name_collection, self.mean, - self.covar, self.dense_cpt, self.dpmf, self.mc, self.mx] + sections = [self.preamble, self.dt, self.name_collection, + self.dirichlet, self.deterministic_cpt, + self.virtual_evidence, self.dense_cpt, self.mean, + self.covar, self.dpmf, self.mc, self.mx] - return "\n".join(str(section) for section in sections) + return "\n".join([str(section) for section in sections]) def save(self, filename: str) -> None: """ - Write the specified InputMaster object as a string representation to - the provided filename - Open filename for writing and write the string representation + Opens filename for writing and writes out + the contents of its attributes. 
:param: filename: str: path to input master file + (default assumes path to `traindir` is "traindir") """ - with open(filename, "w") as outfile: - print(INPUT_MASTER_PREAMBLE, file=outfile) - print(self, file=outfile) + with open(filename, "w") as file: + print(self, file=file) diff --git a/segway/input_master.py b/segway/input_master.py new file mode 100644 index 00000000..bad99868 --- /dev/null +++ b/segway/input_master.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python + +"""input_master.py: write input master files +""" + +__version__ = "$Revision$" + +# Copyright 2012, 2013 Michael M. Hoffman + +from numpy import (array, empty, outer, set_printoptions, sqrt, tile, where) +from six.moves import range + +from ._util import (data_string, DISTRIBUTION_ASINH_NORMAL, DISTRIBUTION_NORM, + fill_array, make_default_filename, OFFSET_END, + OFFSET_START, OFFSET_STEP, resource_substitute, + SUPERVISION_SEMISUPERVISED, SUPERVISION_UNSUPERVISED, + USE_MFSDG, VIRTUAL_EVIDENCE_LIST_FILENAME) +from .gmtk.params import (DecisionTree, DenseCPT, DeterministicCPT, + DiagGaussianMC, DirichletTable, DPMF, + InputMaster, MissingFeatureDiagGaussianMC, MX, + VirtualEvidence) + +# NB: Currently Segway relies on older (Numpy < 1.14) printed representations +# of scalars and vectors in the parameter output. By default in newer (> 1.14) +# versions printed output "giv[es] the shortest unique representation". 
+# See Numpy 1.14 release notes: https://docs.scipy.org/doc/numpy/release.html +# Under heading 'Many changes to array printing, disableable with the new +# "legacy" printing mode' +try: + # If it is a possibility, use the older printing style + set_printoptions(legacy="1.13") +except TypeError: + # Otherwise ignore the attempt + pass + +if USE_MFSDG: + # because tying not implemented yet + COVAR_TIED = False +else: + COVAR_TIED = True + + +NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION = "segCountDown_seg_segTransition" + +# define the pseudocount for training the mixture distribution weights +GAUSSIAN_MIXTURE_WEIGHTS_PSEUDOCOUNT = 100 + +# Cardinality of Segment Transition +CARD_SEGTRANSITION = 3 + +# Cardinality of Boolean variables +CARD_BOOL = 2 +CARD_RULER = CARD_BOOL + +# XXX: should be options +LEN_SEG_EXPECTED = 100000 +LEN_SUBSEG_EXPECTED = 100 + +JITTER_ORDERS_MAGNITUDE = 5 # log10(2**5) = 1.5 decimal orders of magnitude + +DISTRIBUTIONS_LIKE_NORM = frozenset([DISTRIBUTION_NORM, + DISTRIBUTION_ASINH_NORMAL]) + +# Number of digits for rounding input.master means. +# This allows consistency between Python 2 and Python 3 +# TODO[PY2-EOL]: remove +ROUND_NDIGITS = 12 + +INPUT_MASTER_NAME = "input.master.tmpl" + + +def vstack_tile(array_like, *reps): + reps = list(reps) + [1] + + return tile(array_like, reps) + + +def prob_transition_from_expected_len(length): + # formula from Meta-MEME paper, Grundy WN et al. CABIOS 13:397 + # see also Reynolds SM et al. PLoS Comput Biol 4:e1000213 + # ("duration modeling") + return length / (1 + length) + + +def save_input_master(runner, input_master_filename, params_dirpath=None, + instance_index=None): + """ + Save the input.master file using the Segway Toolkit. 
+ """ + + # Preamble + include_file = runner.gmtk_include_filename_relative + card_seg = runner.num_segs + preamble = make_preamble(include_file=include_file, + card_seg=card_seg) + + # Initialize InputMaster option + input_master = InputMaster(preamble=preamble) + + # Decision Trees (DT_IN_FILE) + # Decision Tree describing ruler from current frameIndex + map_frameIndex_ruler_tree = data_string("map_frameIndex_ruler.dt.txt") + input_master.dt["map_frameIndex_ruler"] = \ + DecisionTree(map_frameIndex_ruler_tree) + + # Decision Tree describing initial segCountDown from current segment + # variable + map_seg_segCountDown_tree = \ + make_segCountDown_tree_spec(runner, "map_seg_segCountDown.dt.tmpl") + input_master.dt["map_seg_segCountDown"] = \ + DecisionTree(map_seg_segCountDown_tree) + + # Decision Tree describing segCountDown from current segTransition, + # current ruler, current segment, and previous segCountDown + map_segTransition_ruler_seg_segCountDown_segCountDown_tree = \ + make_segCountDown_tree_spec(runner, + "map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl") + input_master.dt["map_segTransition_ruler_seg_segCountDown_segCountDown"] = \ + DecisionTree(map_segTransition_ruler_seg_segCountDown_segCountDown_tree) + + # XXXopt: good candidate for a linear combination C deterministic mapper + map_seg_subseg_obs_tree = data_string("map_seg_subseg_obs.dt.txt") + input_master.dt["map_seg_subseg_obs"] = \ + DecisionTree(map_seg_subseg_obs_tree) + + if runner.supervision_type == SUPERVISION_SEMISUPERVISED: + # Decision Tree used set segment under supervision + map_supervisionLabel_seg_alwaysTrue = \ + data_string("map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt") + input_master.dt["map_supervisionLabel_seg_alwaysTrue"] = \ + DecisionTree(map_supervisionLabel_seg_alwaysTrue) + else: + assert runner.supervision_type == SUPERVISION_UNSUPERVISED + + # Name Collection (NAME_COLLECTION_IN_LINE) + # For each track, create a name collection with a 
name for each choice of + # segment and subsegment + card_seg = runner.num_segs + card_subseg = runner.num_subsegs + for track_index, track_group in enumerate(runner.track_groups): + track_name = track_group[0].name + name_collection_name = f"collection_seg_{track_name}" + name_collection_items = [] + for seg_index in range(card_seg): + seg_name = f"seg{seg_index}" + for subseg_index in range(card_subseg): + subseg_name = f"subseg{subseg_index}" + mx_name = f"mx_{seg_name}_{subseg_name}_{track_name}" + name_collection_items.append(mx_name) + # Name Collection (NAME_COLLECTION_IN_LINE) + input_master.name_collection[name_collection_name] = \ + name_collection_items + + # Dirichlet Table (DIRICHLET_TAB_IN_FILE) + if runner.len_seg_strength > 0: + # Make Dirichlet table data + probs = make_dense_cpt_segCountDown_seg_segTransition(runner) + total_pseudocounts = runner.len_seg_strength * runner.num_bases + divisor = runner.card_seg_countdown * runner.num_segs + pseudocounts_per_row = total_pseudocounts / divisor + pseudocounts = (probs * pseudocounts_per_row).astype(int) + + input_master.dirichlet["dirichlet_segCountDown_seg_segTransition"] = \ + DirichletTable(pseudocounts, keep_shape=True) + + # Deterministic CPTs (DETERMINISTIC_CPT_IN_FILE) for the unsupervised case + card_segCountDown = runner.card_seg_countdown + # DeterministicCPT using map_seg_segCountDown Decision Tree to describe + # initial segCountDown from current segment variable + input_master.deterministic_cpt["seg_segCountDown"] = \ + DeterministicCPT((card_seg, ), card_segCountDown, + "map_seg_segCountDown") + card_frameIndex = runner.max_frames + # DeterministicCPT using map_frameIndex_ruler Decision Tree to describe + # ruler from current frameIndex + input_master.deterministic_cpt["frameIndex_ruler"] = \ + DeterministicCPT((card_frameIndex, ), CARD_RULER, + "map_frameIndex_ruler") + # DeterministicCPT using + # map_segTransition_ruler_seg_segCountDown_segCountDown Decision Tree to + # describe 
segCountDown from current segTransition, current ruler, current + # segment, and previous segCountDown + input_master.deterministic_cpt["segTransition_ruler_seg_segCountDown_segCountDown"] = \ + DeterministicCPT((CARD_SEGTRANSITION, CARD_RULER, card_seg, + card_segCountDown), card_segCountDown, + "map_segTransition_ruler_seg_segCountDown_segCountDown") + # DeterministicCPT which sets a segment from the previous segment value + input_master.deterministic_cpt["seg_seg_copy"] = \ + DeterministicCPT((card_seg, ), card_seg, + "internal:copyParent") + # DeterministicCPT which sets a subsegment from the previous subsegment + # value + input_master.deterministic_cpt["subseg_subseg_copy"] = \ + DeterministicCPT((card_subseg, ), card_subseg, + "internal:copyParent") + + card_supervisionlabel = runner.card_supervision_label + # Additional Deterministic CPT for the semisupervised case + if card_supervisionlabel != -1: + # Deterministic CPT which uses map_supervisionLabel_seg_alwaysTrue + # Decision Tree to set a segment under supervision + input_master.deterministic_cpt["supervisionLabel_seg_alwaysTrue"] = \ + DeterministicCPT((card_supervisionlabel, card_seg), + CARD_BOOL, + "map_supervisionLabel_seg_alwaysTrue") + + # Virtual Evidence (VE_CPT_IN_FILE) + input_master.virtual_evidence.line_before = "#if VIRTUAL_EVIDENCE == 1" + input_master.virtual_evidence["seg_virtualEvidence"] = \ + VirtualEvidence(card_seg, VIRTUAL_EVIDENCE_LIST_FILENAME) + input_master.virtual_evidence.line_after = "#endif" + + # DenseCPT begins the block conditional on INPUT_PARAMS_FILENAME + input_master.dense_cpt.line_before = "#ifndef INPUT_PARAMS_FILENAME" + # Dense CPTs (DENSE_CPT_IN_FILE) + # Dense CPT describing initial segment probabilities + input_master.dense_cpt["start_seg"] = DenseCPT.uniform_from_shape(card_seg) + # Dense CPT describing subsegment probabilities given segment + input_master.dense_cpt["seg_subseg"] = \ + DenseCPT(fill_array(1.0 / card_subseg, (card_seg, card_subseg)), + 
keep_shape=True) + # DenseCPT describing segment to segment transition + input_master.dense_cpt["seg_seg"] = \ + DenseCPT.uniform_from_shape(card_seg, card_seg, + self_transition=0) + # Dense CPT describing segment-subsegment to subsegment transition + input_master.dense_cpt["seg_subseg_subseg"] = \ + DenseCPT.uniform_from_shape(card_seg, card_subseg, card_subseg, + self_transition=0) + # Dense CPT describing SEGTRANSITION variable probability from + # SEGCOUNTDOWN and segment variables + input_master.dense_cpt[NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION] = \ + make_dense_cpt_segCountDown_seg_segTransition_cpt(runner) + if runner.use_dinucleotide: + input_master.dense_cpt["seg_dinucleotide"] = \ + make_dense_cpt_seg_dinucleotide_cpt(runner) + + distribution = runner.distribution + + # Objects for normal distributions + if distribution in DISTRIBUTIONS_LIKE_NORM: + # For each choice of track, segment, and subsegment, create + # num_mix_component Gaussian mixture components each with a unique + # mean and covariance value + for component in range(runner.num_mix_components): + mean_data = make_mean_data(runner) + covar_data = make_covar_data(runner) + for seg_index in range(card_seg): + seg_name = f"seg{seg_index}" + for subseg_index in range(card_subseg): + subseg_name = f"subseg{subseg_index}" + for track_index, track_group in enumerate(runner.track_groups): + track_name = track_group[0].name + + if runner.num_mix_components == 1: + component_suffix = "" + else: + component_suffix = f"_component{component}" + + # Mean (MEAN_IN_FILE) + mean_name = f"mean_{seg_name}_{subseg_name}_{track_name}{component_suffix}" + input_master.mean[mean_name] = \ + mean_data[seg_index, subseg_index, track_index] + + # Covar (COVAR_IN_FILE) + # If COVAR_TIED, write one covar per track and + # component + if COVAR_TIED: + covar_name = \ + f"covar_{track_name}{component_suffix}" + if seg_index == 0 and subseg_index == 0: + input_master.covar[covar_name] = \ + covar_data[seg_index, subseg_index, + 
track_index] + else: # Otherwise, write for every seg and subseg + covar_name = f"covar_{seg_name}_{subseg_name}_{track_name}{component_suffix}" + input_master.covar[covar_name] = \ + covar_data[seg_index, subseg_index, + track_index] + + # Diag Gaussian MC with mean and covar name + # (MC_IN_FILE) + mc_name = f"mc_{distribution}_{seg_name}_{subseg_name}_{track_name}{component_suffix}" + if USE_MFSDG: # Add weights to end of Gaussian + input_master.mc[mc_name] = \ + MissingFeatureDiagGaussianMC(mean=mean_name, + covar=covar_name) + else: + input_master.mc[mc_name] = \ + DiagGaussianMC(mean=mean_name, covar=covar_name) + + else: + raise ValueError("distribution %s not supported" % distribution) + + # DPMF (DPMF_IN_FILE) + if runner.num_mix_components == 1: + # Create a uniform DPMF over one variable + input_master.dpmf["dpmf_always"] = DPMF.uniform_from_shape(1) + else: + # For each choice of segment and subsegment, create a unique DPMF with + # a uniform distribution over num_mix_components variables + for seg_index in range(card_seg): + seg_name = f"seg{seg_index}" + for subseg_index in range(card_subseg): + subseg_name = f"subseg{subseg_index}" + for track_index, track_group in enumerate(runner.track_groups): + track_name = track_group[0].name + + dpmf_name = f"dpmf_{seg_name}_{subseg_name}_{track_name}" + dpmf_obj = \ + DPMF.uniform_from_shape(runner.num_mix_components) + dpmf_obj.set_dirichlet_pseudocount(GAUSSIAN_MIXTURE_WEIGHTS_PSEUDOCOUNT) + input_master.dpmf[dpmf_name] = dpmf_obj + + # Mixtures (MX_IN_FILE) + # For each choice of track, segment, and subsegment, create a Gaussian + # mixture model from that choice's mixture components and DPMF + for seg_index in range(card_seg): + seg_name = f"seg{seg_index}" + for subseg_index in range(card_subseg): + subseg_name = f"subseg{subseg_index}" + for track_index, track_group in enumerate(runner.track_groups): + track_name = track_group[0].name + mx_name = f"mx_{seg_name}_{subseg_name}_{track_name}" + + if 
runner.num_mix_components == 1: + dpmf_name = "dpmf_always" + else: + dpmf_name = f"dpmf_{seg_name}_{subseg_name}_{track_name}" + + mx_components = [] + for component in range(runner.num_mix_components): + if runner.num_mix_components == 1: + component_suffix = "" + else: + component_suffix = f"_component{component}" + mx_component_name = f"mc_{distribution}_{seg_name}_{subseg_name}_{track_name}{component_suffix}" + mx_components.append(mx_component_name) + + input_master.mx[mx_name] = MX(dpmf_name, mx_components) + + # Mixture collection ends the block conditional on INPUT_PARAMS_FILENAME + input_master.mx.line_after = \ +""" +#else + +DENSE_CPT_IN_FILE INPUT_PARAMS_FILENAME ascii +MEAN_IN_FILE INPUT_PARAMS_FILENAME ascii +COVAR_IN_FILE INPUT_PARAMS_FILENAME ascii +DPMF_IN_FILE INPUT_PARAMS_FILENAME ascii +MC_IN_FILE INPUT_PARAMS_FILENAME ascii +MX_IN_FILE INPUT_PARAMS_FILENAME ascii + +#endif +""" + + if not input_master_filename: + input_master_filename = \ + make_default_filename(input_master_filename, params_dirpath, + instance_index) + + input_master.save(input_master_filename) + + +def make_preamble(include_file, card_seg): + preamble_tmpl = "preamble.tmpl" + return resource_substitute(preamble_tmpl)(include_file=include_file, + card_seg=card_seg) + + +def make_segCountDown_tree_spec(runner, resourcename): + num_segs = runner.num_segs + seg_countdowns_initial = runner.seg_countdowns_initial + + header = ([str(num_segs)] + + [str(num_seg) for num_seg in range(num_segs - 1)] + + ["default"]) + + lines = [" ".join(header)] + + for seg_countdown_initial in seg_countdowns_initial: + lines.append(f" -1 {seg_countdown_initial}") + + tree = "\n".join(lines) + + return resource_substitute(resourcename)(tree=tree) + + +def make_dense_cpt_segCountDown_seg_segTransition(runner): # noqa + # first values are the ones where segCountDown = 0 therefore + # the transitions to segTransition = 2 occur early on + card_seg_countdown = runner.card_seg_countdown + + # by default, 
when segCountDown is high, never transition + res = empty((card_seg_countdown, runner.num_segs, CARD_SEGTRANSITION)) + + prob_seg_self_self, prob_seg_self_other = \ + calc_prob_transition(runner.resolution, LEN_SEG_EXPECTED) + + prob_subseg_self_self, prob_subseg_self_other = \ + calc_prob_transition(runner.resolution, LEN_SUBSEG_EXPECTED) + + # 0: no transition + # 1: subseg transition (no transition when CARD_SUBSEG == 1) + # 2: seg transition + probs_allow_transition = \ + array([prob_seg_self_self * prob_subseg_self_self, + prob_seg_self_self * prob_subseg_self_other, + prob_seg_self_other]) + + probs_prevent_transition = array([prob_subseg_self_self, + prob_subseg_self_other, + 0.0]) + + # find the labels with maximum segment lengths and those without + table = runner.seg_table + ends = table[:, OFFSET_END] + bitmap_without_maximum = ends == 0 + + # where() returns a tuple; this unpacks it + labels_with_maximum, = where(~bitmap_without_maximum) + labels_without_maximum, = where(bitmap_without_maximum) + + # labels without a maximum + res[0, labels_without_maximum] = probs_allow_transition + res[1:, labels_without_maximum] = probs_prevent_transition + + # labels with a maximum + seg_countdowns_initial = runner.seg_countdowns_initial + + res[0, labels_with_maximum] = array([0.0, 0.0, 1.0]) + for label in labels_with_maximum: + seg_countdown_initial = seg_countdowns_initial[label] + minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] + + seg_countdown_allow = seg_countdown_initial - minimum + 1 + + res[1:seg_countdown_allow, label] = probs_allow_transition + res[seg_countdown_allow:, label] = probs_prevent_transition + + res = res.squeeze() # Remove leading dimension of size 1 + return res + + +def make_dense_cpt_segCountDown_seg_segTransition_cpt(runner): + probs = make_dense_cpt_segCountDown_seg_segTransition(runner) + res = DenseCPT(probs, keep_shape=True) + + if runner.len_seg_strength > 0: + 
res.set_dirichlet_table(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION) + + return res + + +def calc_prob_transition(resolution, length): + """Calculate probability transition from scaled expected length. + """ + length_scaled = length // resolution + + prob_self_self = prob_transition_from_expected_len(length_scaled) + prob_self_other = 1.0 - prob_self_self + + return prob_self_self, prob_self_other + + +def make_dense_cpt_seg_dinucleotide_cpt(runner): + dinucleotide_table = [make_dinucleotide_table_row(runner) + for _ in range(runner.num_segs)] + return DenseCPT(dinucleotide_table) + + +def make_dinucleotide_table_row(runner): + # simple one-parameter model + gc = runner.random_state.uniform() + at = 1 - gc + + a = at / 2 + c = gc / 2 + g = gc - c + t = 1 - a - c - g + + acgt = array([a, c, g, t]) + + # shape: (16,) + return outer(acgt, acgt).ravel() + + +def make_mean_data(runner): + num_segs = runner.num_segs + num_subsegs = runner.num_subsegs + means = runner.means # indexed by track_index + + # maximum likelihood, adjusted by no more than 0.2*sd + stds = sqrt(runner.vars) + + # tile the means of each track (num_segs, num_subsegs times) + means_tiled = vstack_tile(means, num_segs, num_subsegs) + stds_tiled = vstack_tile(stds, num_segs, num_subsegs) + + jitter_std_bound = 0.2 + noise = runner.random_state.uniform(-jitter_std_bound, + jitter_std_bound, stds_tiled.shape) + + return means_tiled + (stds_tiled * noise) + + +def make_covar_data(runner): + return vstack_tile(runner.vars, runner.num_segs, runner.num_subsegs) diff --git a/segway/run.py b/segway/run.py index a091a29e..95812f7a 100644 --- a/segway/run.py +++ b/segway/run.py @@ -63,9 +63,9 @@ from .cluster import (is_running_locally, JobTemplateFactory, make_native_spec, RestartableJob, RestartableJobDict, Session) from .include import IncludeSaver +from .input_master import (INPUT_MASTER_NAME, save_input_master) from .observations import Observations from .output import IdentifySaver, PosteriorSaver -from 
.segway_input_master import InputMasterSaver from .structure import StructureSaver from .task import MSG_SUCCESS from .version import __version__ @@ -253,7 +253,6 @@ # "posterior%s.%%d.bedgraph.gz" BEDGRAPH_FILEBASEFMT = extjoin(PREFIX_POSTERIOR, "%%d", EXT_BEDGRAPH, EXT_GZ) - # "posterior%s.bed.gz" POSTERIOR_BED_FILEBASENAME = extjoin(PREFIX_POSTERIOR, EXT_BED, EXT_GZ) @@ -549,7 +548,7 @@ def __init__(self, runner, session, instance_index, num_segs): self.num_segs = num_segs self.instance_index = instance_index self.runner.input_master_filename = make_default_filename( - InputMasterSaver.resource_name, runner.params_dirpath, + INPUT_MASTER_NAME, runner.params_dirpath, instance_index) Thread.__init__(self) @@ -2043,9 +2042,19 @@ def save_input_master(self, instance_index=None, new=False): else: input_master_filename = self.input_master_filename - _, input_master_filename_is_new = \ - InputMasterSaver(self)(input_master_filename, self.params_dirpath, - self.clobber, instance_index) + if input_master_filename: + is_new = self.clobber or not Path(input_master_filename).exists() + else: + input_master_filename = make_default_filename(INPUT_MASTER_NAME, + self.params_dirpath, + instance_index) + is_new = True + + if is_new: + save_input_master(self, input_master_filename, self.params_dirpath, + instance_index) + + return input_master_filename def load_supervision(self): # The semi-supervised mode changes the DBN structure so there is an @@ -3227,10 +3236,8 @@ def init_train(self): # must be before file creation. 
Otherwise # input_master_filename_is_new will be wrong - input_master_filename, input_master_filename_is_new = \ - InputMasterSaver(self)(self.input_master_filename, - self.params_dirpath, self.clobber) - self.input_master_filename = input_master_filename + self.input_master_filename = \ + self.save_input_master() # save file locations to tab-delimited file self.save_tab_file(self, TRAIN_OPTION_TYPES, TRAIN_FILEBASENAME) @@ -3249,10 +3256,6 @@ def init_train(self): self.random_state = RandomState(instance_random_seed) self.save_input_master(index, True) - if not input_master_filename_is_new: - # do not overwrite existing file - input_master_filename = None - def get_thread_run_func(self): if len(self.num_segs_range) > 1 or self.num_instances > 1: return self.run_train_multithread @@ -3418,7 +3421,7 @@ def recover_filename(self, resource): # only want "input.master" not "input.0.master" if there is # only one instance if (not self.instance_make_new_params and - resource == InputMasterSaver.resource_name): + resource == INPUT_MASTER_NAME): instance_index = None old_filename = make_default_filename(resource, @@ -3455,7 +3458,7 @@ def recover_train_instance(self, last_log_likelihood, log_likelihood, # into our current one. If just using train-run-round, these will # already be present. 
self.input_master_filename = \ - self.recover_filename(InputMasterSaver.resource_name) + self.recover_filename(INPUT_MASTER_NAME) log_likelihood_tab_filename = self.log_likelihood_tab_filename recover_log_likelihood_tab_filepath.copy2( log_likelihood_tab_filename) @@ -4055,7 +4058,7 @@ def __init__(self, *args, **kwargs): group.add_argument("-i", "--input-master", metavar="FILE", help="use or create input master in FILE" " (default %s)" % - make_default_filename(InputMasterSaver.resource_name, + make_default_filename(INPUT_MASTER_NAME, DIRPATH_PARAMS)) group.add_argument("-s", "--structure", metavar="FILE", diff --git a/segway/segway_input_master.py b/segway/segway_input_master.py deleted file mode 100644 index 53efd984..00000000 --- a/segway/segway_input_master.py +++ /dev/null @@ -1,925 +0,0 @@ -#!/usr/bin/env python - -"""input_master.py: write input master files -""" - -__version__ = "$Revision$" - -# Copyright 2012, 2013 Michael M. Hoffman - -from math import frexp, ldexp -from string import Template -import sys - -from genomedata._util import fill_array -from numpy import (array, empty, float32, outer, set_printoptions, sqrt, tile, - vectorize, where, zeros) -from six.moves import map, range - -from ._util import (copy_attrs, data_string, DISTRIBUTION_ASINH_NORMAL, - DISTRIBUTION_GAMMA, DISTRIBUTION_NORM, - OFFSET_END, OFFSET_START, OFFSET_STEP, - resource_substitute, Saver, SEGWAY_ENCODING, - SUPERVISION_SEMISUPERVISED, - SUPERVISION_SUPERVISED, - SUPERVISION_UNSUPERVISED, USE_MFSDG, - VIRTUAL_EVIDENCE_LIST_FILENAME) - -# NB: Currently Segway relies on older (Numpy < 1.14) printed representations -# of scalars and vectors in the parameter output. By default in newer (> 1.14) -# versions printed output "giv[es] the shortest unique representation". 
-# See Numpy 1.14 release notes: https://docs.scipy.org/doc/numpy/release.html -# Under heading 'Many changes to array printing, disableable with the new -# "legacy" printing mode' -try: - # If it is a possibility, use the older printing style - set_printoptions(legacy="1.13") -except TypeError: - # Otherwise ignore the attempt - pass - -if USE_MFSDG: - # because tying not implemented yet - COVAR_TIED = False -else: - COVAR_TIED = True - -ABSOLUTE_FUDGE = 0.001 - -# define the pseudocount for training the mixture distribution weights -GAUSSIAN_MIXTURE_WEIGHTS_PSEUDOCOUNT = 100 - -# here to avoid duplication -NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION = "segCountDown_seg_segTransition" - -CARD_SEGTRANSITION = 3 - -# XXX: should be options -LEN_SEG_EXPECTED = 100000 -LEN_SUBSEG_EXPECTED = 100 - -JITTER_ORDERS_MAGNITUDE = 5 # log10(2**5) = 1.5 decimal orders of magnitude - -DISTRIBUTIONS_LIKE_NORM = frozenset([DISTRIBUTION_NORM, - DISTRIBUTION_ASINH_NORMAL]) - -# Number of digits for rounding input.master means. 
-# This allows consistency between Python 2 and Python 3 -# TODO[PY2-EOL]: remove -ROUND_NDIGITS = 12 - - -def vstack_tile(array_like, *reps): - reps = list(reps) + [1] - - return tile(array_like, reps) - - -def array2text(a): - ndim = a.ndim - if ndim == 1: - return " ".join(map(str, a)) - else: - delimiter = "\n" * (ndim - 1) - return delimiter.join(array2text(row) for row in a) - - -def make_spec(name, iterable): - """ - name: str, name of GMTK object type - iterable: iterable of strs - """ - items = list(iterable) - - header_lines = ["%s_IN_FILE inline" % name, str(len(items)), ""] - - indexed_items = ["%d %s" % indexed_item - for indexed_item in enumerate(items)] - - all_lines = header_lines + indexed_items - - # In Python 2, convert from unicode to bytes to prevent - # __str__method from being called twice - # Specifically in the string template standard library provided by Python - # 2, there is a call to a string escape sequence + tuple, e.g.: - # print("%s" % (some_string,)) - # This "some_string" has its own __str__ method called *twice* if if it is - # a unicode string in Python 2. Python 3 does not have this issue. This - # causes downstream issues since strings are generated often in our case - # for random numbers. Calling __str__ twice will often cause re-iterating - # the RNG which makes for inconsitent results between Python versions. - if sys.version[0] == "2": - all_lines = [line.encode(SEGWAY_ENCODING) for line in all_lines] - - return "\n".join(all_lines) + "\n" - - -def prob_transition_from_expected_len(length): - # formula from Meta-MEME paper, Grundy WN et al. CABIOS 13:397 - # see also Reynolds SM et al. 
PLoS Comput Biol 4:e1000213 - # ("duration modeling") - return length / (1 + length) - - -def make_zero_diagonal_table(length): - if length == 1: - return array([1.0]) # always return to self - - prob_self_self = 0.0 - prob_self_other = (1.0 - prob_self_self) / (length - 1) - - # set everywhere (diagonal to be rewritten) - res = fill_array(prob_self_other, (length, length)) - - # set diagonal - range_cpt = range(length) - res[range_cpt, range_cpt] = prob_self_self - - return res - - -def format_indexed_strs(fmt, num): - full_fmt = fmt + "%d" - return [full_fmt % index for index in range(num)] - - -def jitter_cell(cell, random_state): - """ - adds some random noise - """ - # get the binary exponent and subtract JITTER_ORDERS_MAGNITUDE - # e.g. 3 * 2**10 --> 1 * 2**5 - max_noise = ldexp(1, frexp(cell)[1] - JITTER_ORDERS_MAGNITUDE) - - return cell + random_state.uniform(-max_noise, max_noise) - - -jitter = vectorize(jitter_cell) - - -class ParamSpec(object): - """ - base class for parameter specifications used in input.master files - """ - type_name = None - object_tmpl = None - copy_attrs = ["distribution", "mins", "num_segs", "num_subsegs", - "num_track_groups", "track_groups", "num_mix_components"] - - def __init__(self, saver): - # copy all variables from saver that it copied from Runner - # XXX: override in subclasses to only copy subset - copy_attrs(saver, self, self.copy_attrs) - - def get_track_lt_min(self, track_index): - """ - returns a value less than a minimum in a track - """ - # XXX: refactor into a new function - min_track = self.mins[track_index] - - # fudge the minimum by a very small amount. 
this is not - # continuous, but hopefully we won't get values where it - # matters - # XXX: restore this after GMTK issues fixed - # if min_track == 0.0: - # min_track_fudged = FUDGE_TINY - # else: - # min_track_fudged = min_track - ldexp(abs(min_track), FUDGE_EP) - - # this happens for really big numbers or really small - # numbers; you only have 7 orders of magnitude to play - # with on a float32 - min_track_f32 = float32(min_track) - - assert min_track_f32 - float32(ABSOLUTE_FUDGE) != min_track_f32 - return min_track - ABSOLUTE_FUDGE - - def make_segnames(self): - return format_indexed_strs("seg", self.num_segs) - - def make_subsegnames(self): - return format_indexed_strs("subseg", self.num_subsegs) - - def get_template_component_suffix(self, component_number): - """Returns the subsitution for the component suffix in the GMTK model - template. Empty if there is only one component""" - if self.num_mix_components == 1: - return "" - else: - return "_component{}".format(component_number) - - def generate_tmpl_mappings(self): - # need segnames because in the tied covariance case, the - # segnames are replaced by "any" (see .make_covar_spec()), - # and only one mapping is produced - num_subsegs = self.num_subsegs - - track_groups = self.track_groups - num_track_groups = self.num_track_groups - - for seg_index, segname in enumerate(self.make_segnames()): - seg_offset = num_track_groups * num_subsegs * seg_index - - for subseg_index, subsegname in enumerate(self.make_subsegnames()): - subseg_offset = seg_offset + (num_track_groups * subseg_index) - - for track_group_index, track_group in enumerate(track_groups): - track_offset = subseg_offset + track_group_index - head_trackname = track_group[0].name - - # XXX: change name of index to track_offset in templates - # XXX: change name of track_index to track_group_index - yield dict(seg=segname, subseg=subsegname, - track=head_trackname, seg_index=seg_index, - subseg_index=subseg_index, - track_index=track_group_index, - 
index=track_offset, - distribution=self.distribution) - - def make_data(self): - """ - override this in subclasses - - returns: container indexed by (seg_index, subseg_index, track_index) - """ - return None - - def generate_objects(self): - """ - returns: iterable of strs containing GMTK parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - - data = self.make_data() - - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - - yield substitute(mapping) - - def __str__(self): - return make_spec(self.type_name, self.generate_objects()) - - -class DTParamSpec(ParamSpec): - type_name = "DT" - copy_attrs = ParamSpec.copy_attrs + ["seg_countdowns_initial", - "supervision_type"] - - def make_segCountDown_tree_spec(self, resourcename): # noqa - num_segs = self.num_segs - seg_countdowns_initial = self.seg_countdowns_initial - - header = ([str(num_segs)] + - [str(num_seg) for num_seg in range(num_segs - 1)] + - ["default"]) - - lines = [" ".join(header)] - - for seg, seg_countdown_initial in enumerate(seg_countdowns_initial): - lines.append(" -1 %d" % seg_countdown_initial) - - tree = "\n".join(lines) - - return resource_substitute(resourcename)(tree=tree) - - def make_map_seg_segCountDown_dt_spec(self): # noqa - return self.make_segCountDown_tree_spec("map_seg_segCountDown.dt.tmpl") - - def make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec(self): # noqa - template_name = \ - "map_segTransition_ruler_seg_segCountDown_segCountDown.dt.tmpl" - return self.make_segCountDown_tree_spec(template_name) - - def generate_objects(self): - yield data_string("map_frameIndex_ruler.dt.txt") - yield 
self.make_map_seg_segCountDown_dt_spec() - yield self.make_map_segTransition_ruler_seg_segCountDown_segCountDown_dt_spec() # noqa - yield data_string("map_seg_subseg_obs.dt.txt") - - supervision_type = self.supervision_type - if supervision_type == SUPERVISION_SEMISUPERVISED: - yield data_string("map_supervisionLabel_seg_alwaysTrue_semisupervised.dt.txt") # noqa - elif supervision_type == SUPERVISION_SUPERVISED: - # XXX: does not exist yet - yield data_string("map_supervisionLabel_seg_alwaysTrue_supervised.dt.txt") # noqa - else: - assert supervision_type == SUPERVISION_UNSUPERVISED - - -class TableParamSpec(ParamSpec): - copy_attrs = ParamSpec.copy_attrs \ - + ["resolution", "card_seg_countdown", "seg_table", - "seg_countdowns_initial"] - - # see Segway paper - probs_force_transition = array([0.0, 0.0, 1.0]) - - def make_table_spec(self, name, table, ndim, extra_rows=[]): - header_rows = [name, ndim] - header_rows.extend(table.shape) - - rows = [" ".join(map(str, header_rows))] - rows.extend(extra_rows) - rows.extend([array2text(table), ""]) - - return "\n".join(rows) - - def calc_prob_transition(self, length): - """Calculate probability transition from scaled expected length. 
- """ - length_scaled = length // self.resolution - - prob_self_self = prob_transition_from_expected_len(length_scaled) - prob_self_other = 1.0 - prob_self_self - - return prob_self_self, prob_self_other - - def make_dense_cpt_segCountDown_seg_segTransition(self): # noqa - # first values are the ones where segCountDown = 0 therefore - # the transitions to segTransition = 2 occur early on - card_seg_countdown = self.card_seg_countdown - - # by default, when segCountDown is high, never transition - res = empty((card_seg_countdown, self.num_segs, CARD_SEGTRANSITION)) - - prob_seg_self_self, prob_seg_self_other = \ - self.calc_prob_transition(LEN_SEG_EXPECTED) - - prob_subseg_self_self, prob_subseg_self_other = \ - self.calc_prob_transition(LEN_SUBSEG_EXPECTED) - - # 0: no transition - # 1: subseg transition (no transition when CARD_SUBSEG == 1) - # 2: seg transition - probs_allow_transition = \ - array([prob_seg_self_self * prob_subseg_self_self, - prob_seg_self_self * prob_subseg_self_other, - prob_seg_self_other]) - - probs_prevent_transition = array([prob_subseg_self_self, - prob_subseg_self_other, - 0.0]) - - # find the labels with maximum segment lengths and those without - table = self.seg_table - ends = table[:, OFFSET_END] - bitmap_without_maximum = ends == 0 - - # where() returns a tuple; this unpacks it - labels_with_maximum, = where(~bitmap_without_maximum) - labels_without_maximum, = where(bitmap_without_maximum) - - # labels without a maximum - res[0, labels_without_maximum] = probs_allow_transition - res[1:, labels_without_maximum] = probs_prevent_transition - - # labels with a maximum - seg_countdowns_initial = self.seg_countdowns_initial - - res[0, labels_with_maximum] = self.probs_force_transition - for label in labels_with_maximum: - seg_countdown_initial = seg_countdowns_initial[label] - minimum = table[label, OFFSET_START] // table[label, OFFSET_STEP] - - seg_countdown_allow = seg_countdown_initial - minimum + 1 - - res[1:seg_countdown_allow, 
label] = probs_allow_transition - res[seg_countdown_allow:, label] = probs_prevent_transition - - return res - - @staticmethod - def make_dirichlet_name(name): - return "dirichlet_%s" % name - - -class DenseCPTParamSpec(TableParamSpec): - type_name = "DENSE_CPT" - copy_attrs = TableParamSpec.copy_attrs \ - + ["random_state", "len_seg_strength", "use_dinucleotide"] - - def make_table_spec(self, name, table, dirichlet=False): - """ - if dirichlet is True, this table has a corresponding DirichletTable - automatically generated name - """ - ndim = table.ndim - 1 # don't include output dim - - if dirichlet: - extra_rows = ["DirichletTable %s" % self.make_dirichlet_name(name)] - else: - extra_rows = [] - - return TableParamSpec.make_table_spec(self, name, table, ndim, - extra_rows) - - def make_empty_cpt(self): - num_segs = self.num_segs - - return zeros((num_segs, num_segs)) - - def make_dense_cpt_start_seg_spec(self): - num_segs = self.num_segs - cpt = fill_array(1.0 / num_segs, num_segs) - - return self.make_table_spec("start_seg", cpt) - - def make_dense_cpt_seg_subseg_spec(self): - num_subsegs = self.num_subsegs - cpt = fill_array(1.0 / num_subsegs, (self.num_segs, num_subsegs)) - - return self.make_table_spec("seg_subseg", cpt) - - def make_dense_cpt_seg_seg_spec(self): - cpt = make_zero_diagonal_table(self.num_segs) - - return self.make_table_spec("seg_seg", cpt) - - def make_dense_cpt_seg_subseg_subseg_spec(self): - cpt_seg = make_zero_diagonal_table(self.num_subsegs) - cpt = vstack_tile(cpt_seg, self.num_segs, 1) - - return self.make_table_spec("seg_subseg_subseg", cpt) - - def make_dinucleotide_table_row(self): - # simple one-parameter model - gc = self.random_state.uniform() - at = 1 - gc - - a = at / 2 - c = gc / 2 - g = gc - c - t = 1 - a - c - g - - acgt = array([a, c, g, t]) - - # shape: (16,) - return outer(acgt, acgt).ravel() - - def make_dense_cpt_seg_dinucleotide_spec(self): - table = [self.make_dinucleotide_table_row() - for seg_index in 
range(self.num_segs)] - - return self.make_table_spec("seg_dinucleotide", table) - - def make_dense_cpt_segCountDown_seg_segTransition_spec(self): # noqa - cpt = self.make_dense_cpt_segCountDown_seg_segTransition() - - return self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, cpt, - dirichlet=self.len_seg_strength > 0) - - def generate_objects(self): - yield self.make_dense_cpt_start_seg_spec() - yield self.make_dense_cpt_seg_subseg_spec() - yield self.make_dense_cpt_seg_seg_spec() - yield self.make_dense_cpt_seg_subseg_subseg_spec() - yield self.make_dense_cpt_segCountDown_seg_segTransition_spec() - - if self.use_dinucleotide: - yield self.make_dense_cpt_seg_dinucleotide_spec() - - -class DirichletTabParamSpec(TableParamSpec): - type_name = "DIRICHLET_TAB" - copy_attrs = TableParamSpec.copy_attrs \ - + ["len_seg_strength", "num_bases", "card_seg_countdown", - "num_mix_components"] - - def make_table_spec(self, name, table): - dirichlet_name = self.make_dirichlet_name(name) - - return TableParamSpec.make_table_spec(self, dirichlet_name, table, - table.ndim) - - def make_dirichlet_table(self): - probs = self.make_dense_cpt_segCountDown_seg_segTransition() - - # XXX: the ratio is not exact as num_bases is not the same as - # the number of base-base transitions. 
It is surely close - # enough, though - total_pseudocounts = self.len_seg_strength * self.num_bases - divisor = self.card_seg_countdown * self.num_segs - pseudocounts_per_row = total_pseudocounts / divisor - - # astype(int) means flooring the floats - pseudocounts = (probs * pseudocounts_per_row).astype(int) - - return pseudocounts - - def generate_objects(self): - # XXX: these called functions have confusing/duplicative names - if self.len_seg_strength > 0: - dirichlet_table = self.make_dirichlet_table() - yield self.make_table_spec(NAME_SEGCOUNTDOWN_SEG_SEGTRANSITION, - dirichlet_table) - - -class NameCollectionParamSpec(ParamSpec): - type_name = "NAME_COLLECTION" - header_tmpl = "collection_seg_${track} ${fullnum_subsegs}" - row_tmpl = "mx_${seg}_${subseg}_${track}" - - def generate_objects(self): - num_segs = self.num_segs - num_subsegs = self.num_subsegs - track_groups = self.track_groups - - substitute_header = Template(self.header_tmpl).substitute - substitute_row = Template(self.row_tmpl).substitute - - fullnum_subsegs = num_segs * num_subsegs - - for track_group in track_groups: - head_trackname = track_group[0].name - - # XXX: rename in template: track -> head_trackname - mapping = dict(track=head_trackname, - fullnum_subsegs=fullnum_subsegs) - - rows = [substitute_header(mapping)] - for seg_index in range(num_segs): - seg = "seg%d" % seg_index - - for subseg_index in range(num_subsegs): - subseg = "subseg%d" % subseg_index - mapping = dict(seg=seg, subseg=subseg, - track=head_trackname) - - rows.append(substitute_row(mapping)) - - yield "\n".join(rows) - - -class MeanParamSpec(ParamSpec): - type_name = "MEAN" - object_tmpl = \ - "mean_${seg}_${subseg}_${track}${component_suffix} 1 ${datum}" - jitter_std_bound = 0.2 - - copy_attrs = ParamSpec.copy_attrs + ["means", "num_mix_components", - "random_state", "vars"] - - def make_data(self): - num_segs = self.num_segs - num_subsegs = self.num_subsegs - means = self.means # indexed by track_index - - # maximum 
likelihood, adjusted by no more than 0.2*sd - stds = sqrt(self.vars) - - # tile the means of each track (num_segs, num_subsegs times) - means_tiled = vstack_tile(means, num_segs, num_subsegs) - stds_tiled = vstack_tile(stds, num_segs, num_subsegs) - - jitter_std_bound = self.jitter_std_bound - noise = self.random_state.uniform(-jitter_std_bound, jitter_std_bound, - stds_tiled.shape) - - return means_tiled + (stds_tiled * noise) - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - - for component in range(self.num_mix_components): - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, - track_index] - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) - - mapping["datum"] = mapping["datum"] - yield substitute(mapping) - - -class CovarParamSpec(ParamSpec): - type_name = "COVAR" - object_tmpl = \ - "covar_${seg}_${subseg}_${track}${component_suffix} 1 ${datum}" - - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components", "vars"] - - def make_data(self): - return vstack_tile(self.vars, self.num_segs, self.num_subsegs) - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - for component in range(self.num_mix_components): - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if 
data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, - track_index] - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) - - mapping["datum"] = mapping["datum"] - yield substitute(mapping) - - -class TiedCovarParamSpec(CovarParamSpec): - object_tmpl = "covar_${track}${component_suffix} 1 ${datum}" - - def make_segnames(self): - return ["any"] - - def make_subsegnames(self): - return ["any"] - - -class RealMatParamSpec(ParamSpec): - type_name = "REAL_MAT" - - def generate_objects(self): - yield "matrix_weightscale_1x1 1 1 1.0" - - -class GammaRealMatParamSpec(RealMatParamSpec): - scale_tmpl = "gammascale_${seg}_${subseg}_${track} 1 1 ${datum}" - shape_tmpl = "gammashape_${seg}_${subseg}_${track} 1 1 ${datum}" - - copy_attrs = ParamSpec.copy_attrs \ - + ["means", "random_state", "vars"] - - def generate_objects(self): - means = self.means - vars = self.vars - - substitute_scale = Template(self.scale_tmpl).substitute - substitute_shape = Template(self.shape_tmpl).substitute - - # random start values are equivalent to the random start - # values of a Gaussian: - # - # means = scales * shapes - # vars = shapes * scales**2 - # - # therefore: - scales = vars / means - shapes = (means ** 2) / vars - - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - - scale = jitter(scales[track_index], self.random_state) - yield substitute_scale(dict(datum=scale, **mapping)) - - shape = jitter(shapes[track_index], self.random_state) - yield substitute_shape(dict(datum=shape, **mapping)) - - -class MCParamSpec(ParamSpec): - type_name = "MC" - - -class NormMCParamSpec(MCParamSpec): - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components"] - - if USE_MFSDG: - # dimensionality component_type name mean covar weights - object_tmpl = "1 COMPONENT_TYPE_MISSING_FEATURE_SCALED_DIAG_GAUSSIAN" \ - " 
mc_${distribution}_${seg}_${subseg}_${track}" \ - " mean_${seg}_${subseg}_${track} covar_${seg}_${subseg}_${track}" \ - " matrix_weightscale_1x1" - else: - # dimensionality component_type name mean covar - object_tmpl = "1 COMPONENT_TYPE_DIAG_GAUSSIAN" \ - " mc_${distribution}_${seg}_${subseg}_${track}${component_suffix}"\ - " mean_${seg}_${subseg}_${track}${component_suffix}" \ - " covar_${track}${component_suffix}" - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - substitute = Template(self.object_tmpl).substitute - for component in range(self.num_mix_components): - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - mapping["track"] = mapping["track"] - mapping["component_suffix"] = \ - self.get_template_component_suffix(component) - - yield substitute(mapping) - - -class GammaMCParamSpec(MCParamSpec): - object_tmpl = "1 COMPONENT_TYPE_GAMMA mc_gamma_${seg}_${subseg}_${track}" \ - " ${min_track} gammascale_${seg}_${subseg}_${track}" \ - " gammashape_${seg}_${subseg}_${track}" - - -class MXParamSpec(ParamSpec): - type_name = "MX" - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - object_tmpl = "1 mx_${seg}_${subseg}_${track} ${num_mix_components} " - - # If the number of mixture components is one - if self.num_mix_components == 1: - # Set the dense probabily mass function containing component - # responsibilites to be set to always 1 for 1 component - object_tmpl += "dpmf_always" - # Otherwise set the dense probability mass function based on number - # of components from the GMTK DPMF definition - else: - object_tmpl += "dpmf_${seg}_${subseg}_${track}" - - for component in range(self.num_mix_components): - add = " mc_${distribution}_${seg}_${subseg}_${track}%s" % ( - 
self.get_template_component_suffix(component)) - object_tmpl += add - substitute = Template(object_tmpl).substitute - - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - track_index = mapping["track_index"] - mapping["num_mix_components"] = self.num_mix_components - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, subseg_index, track_index] - yield substitute(mapping) - - -class DPMFParamSpec(DenseCPTParamSpec): - type_name = "DPMF" - copy_attrs = ParamSpec.copy_attrs + ["num_mix_components"] - - def generate_objects(self): - """ - returns: iterable of strs containing gmtk parameter objects starting - with names - """ - # If the number of mixture components is one - if self.num_mix_components == 1: - # Create a dense probability mass function of one value of 1 - # to fix the number of mixture components to one - yield "dpmf_always 1 1.0" - # Otherwise - else: - # Create a dense probability mass function of dirichlet constants - # with the same amount of mixture components - object_tmpl = "dpmf_${seg}_${subseg}_${track} " \ - "${num_mix_components} " \ - "DirichletConst %s ${weights}" % \ - GAUSSIAN_MIXTURE_WEIGHTS_PSEUDOCOUNT - component_weight = str(round(1.0 / self.num_mix_components, - ROUND_NDIGITS)) - weights = (" " + component_weight) * self.num_mix_components - substitute = Template(object_tmpl).substitute - data = self.make_data() - for mapping in self.generate_tmpl_mappings(): - mapping["weights"] = weights - track_index = mapping["track_index"] - mapping["num_mix_components"] = self.num_mix_components - if self.distribution == DISTRIBUTION_GAMMA: - mapping["min_track"] = self.get_track_lt_min(track_index) - - if data is not None: - seg_index = mapping["seg_index"] - subseg_index = mapping["subseg_index"] - mapping["datum"] = data[seg_index, 
subseg_index, - track_index] - yield substitute(mapping) - - -class VirtualEvidenceSpec(ParamSpec): - type_name = "VE_CPT" - - # According to GMTK specification (tksrc/GMTK_VECPT.cc) - # this should be of the format: - # CPT_name num_par par_card self_card VE_CPT_FILE - # nfs:nfloats nis:nints ... fmt:obsformat ... END - object_tmpl = "seg_virtualEvidence 1 %s 2 %s nfs:%s nis:0 fmt:ascii END" - copy_attrs = ParamSpec.copy_attrs + ["virtual_evidence", "num_segs"] - - def make_virtual_evidence_spec(self): - return self.object_tmpl % (self.num_segs, - VIRTUAL_EVIDENCE_LIST_FILENAME, - self.num_segs) - - def generate_objects(self): - yield self.make_virtual_evidence_spec() - - -class InputMasterSaver(Saver): - resource_name = "input.master.tmpl" - copy_attrs = ["num_bases", "num_segs", "num_subsegs", - "num_track_groups", "card_seg_countdown", - "seg_countdowns_initial", "seg_table", "distribution", - "len_seg_strength", "resolution", "random_state", - "supervision_type", "use_dinucleotide", "mins", "means", - "vars", "gmtk_include_filename_relative", "track_groups", - "num_mix_components", "virtual_evidence"] - - def make_mapping(self): - # the locals of this function are used as the template mapping - # use caution before deleting or renaming any variables - # check that they are not used in the input.master template - num_free_params = 0 - - num_segs = self.num_segs - num_subsegs = self.num_subsegs - num_track_groups = self.num_track_groups - fullnum_subsegs = num_segs * num_subsegs - - include_filename = self.gmtk_include_filename_relative - - dt_spec = DTParamSpec(self) - - if self.len_seg_strength > 0: - dirichlet_spec = DirichletTabParamSpec(self) - else: - dirichlet_spec = "" - - dense_cpt_spec = DenseCPTParamSpec(self) - - # seg_seg - num_free_params += fullnum_subsegs * (fullnum_subsegs - 1) - - # segCountDown_seg_segTransition - num_free_params += fullnum_subsegs - - distribution = self.distribution - if distribution in DISTRIBUTIONS_LIKE_NORM: - mean_spec = 
MeanParamSpec(self) - if COVAR_TIED: - covar_spec = TiedCovarParamSpec(self) - else: - covar_spec = CovarParamSpec(self) - - if USE_MFSDG: - real_mat_spec = RealMatParamSpec(self) - else: - real_mat_spec = "" - - mc_spec = NormMCParamSpec(self) - - if COVAR_TIED: - num_free_params += (fullnum_subsegs + 1) * num_track_groups - else: - num_free_params += (fullnum_subsegs * 2) * num_track_groups - elif distribution == DISTRIBUTION_GAMMA: - mean_spec = "" - covar_spec = "" - - # XXX: another option is to calculate an ML estimate for - # the gamma distribution rather than the ML estimate for the - # mean and converting - real_mat_spec = GammaRealMatParamSpec(self) - mc_spec = GammaMCParamSpec(self) - - num_free_params += (fullnum_subsegs * 2) * num_track_groups - else: - raise ValueError("distribution %s not supported" % distribution) - - mx_spec = MXParamSpec(self) - name_collection_spec = NameCollectionParamSpec(self) - card_seg = num_segs - dpmf_spec = DPMFParamSpec(self) - - ve_spec = VirtualEvidenceSpec(self) - - return locals() # dict of vars set in this function diff --git a/test/simplesubseg/touchstone/identifydir-full/posterior0.0.track.gz b/test/simplesubseg/touchstone/identifydir-full/posterior0.0.track.gz new file mode 100644 index 00000000..cff3923f Binary files /dev/null and b/test/simplesubseg/touchstone/identifydir-full/posterior0.0.track.gz differ diff --git a/test/simplesubseg/touchstone/identifydir-full/posterior0.1.track.gz b/test/simplesubseg/touchstone/identifydir-full/posterior0.1.track.gz new file mode 100644 index 00000000..4e4a8073 Binary files /dev/null and b/test/simplesubseg/touchstone/identifydir-full/posterior0.1.track.gz differ diff --git a/test/simplesubseg/touchstone/identifydir-full/posterior1.0.track.gz b/test/simplesubseg/touchstone/identifydir-full/posterior1.0.track.gz new file mode 100644 index 00000000..f03a675e Binary files /dev/null and b/test/simplesubseg/touchstone/identifydir-full/posterior1.0.track.gz differ diff --git 
a/test/simplesubseg/touchstone/identifydir-full/posterior1.1.track.gz b/test/simplesubseg/touchstone/identifydir-full/posterior1.1.track.gz new file mode 100644 index 00000000..6ab6ff58 Binary files /dev/null and b/test/simplesubseg/touchstone/identifydir-full/posterior1.1.track.gz differ