diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7b08948
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,80 @@
+CC=gcc
+LDLIBS=-lm -lc
+LDFLAGS=${LDLIBS}
+FLAGS_DEBUG=-g -Wall -DDEBUG -DLOGLEVEL=0 -Wextra -Wno-unused-function
+VERSION=-DVERSION="\"$(shell git log -n1 --pretty=format:%h%d) added cd_acc/do\""
+
+#CFLAGS=-O0 -std=c11 ${VERSION} ${FLAGS_DEBUG}
+#CFLAGS=-O2 -ftree-vectorize -msse2 -ftree-vectorizer-verbose=5 -std=c11 ${VERSION} ${FLAGS_DEBUG}
+CFLAGS=-O3 -std=c11 ${VERSION}
+
+srcfiles=$(wildcard src/*.c)
+objects=$(srcfiles:%.c=%.o)
+
+
+# compile (fast) production version
+# The binary is linked as CESAR and then renamed to cesar. cesar is phony, so
+# the rename runs on every invocation.
+.PHONY: default cesar
+default: cesar
+cesar: CESAR
+	mv CESAR cesar
+
+CESAR: ${objects}
+	${CC} ${CFLAGS} -o $@ $^ ${LDLIBS}
+
+
+# compile debugging version
+# The target-specific CFLAGS makes objects built for this target use -O0 and
+# FLAGS_DEBUG; without it they are compiled with the -O3 production flags and
+# FLAGS_DEBUG only reaches the link step. Objects left over from a production
+# build are not rebuilt automatically: run `make clean` before `make debug`.
+.PHONY: debug
+debug: CFLAGS=-O0 -std=c11 ${VERSION} ${FLAGS_DEBUG}
+debug: ${objects}
+	${CC} ${FLAGS_DEBUG} -o $@ $^ ${LDLIBS}
+
+
+# dev setup
+.PHONY: workspace
+workspace:
+	virtualenv venv
+
+
+# dev doc
+.PHONY: doc
+doc:
+	mkdir -p doc/
+	doxygen doxygen.conf
+
+
+# run the tests against the production binary
+# $(MAKE) rather than a literal make, so that command-line flags and the
+# jobserver are passed on to the sub-make.
+# NOTE(review): tests.sh is resolved through PATH, whereas test/valgrind runs
+# test/run.sh by relative path -- confirm which script is intended here.
+.PHONY: test
+test: ${objects} cesar
+	tests.sh ./cesar
+	$(MAKE) -C test
+
+.PHONY: test/valgrind
+test/valgrind: ${objects} cesar
+	test/run.sh cesar
+	$(MAKE) -C test valgrind
+
+# cleaners
+.PHONY: clean cleanCode cleanTest
+clean: cleanCode cleanTest
+	rm -rf doc
+	mkdir doc
+
+cleanCode:
+	rm -fv *.o
+	rm -fv src/*.o
+	rm -fv src/*.gch
+	rm -f cesar
+	rm -f debug
+
+cleanTest:
+	$(MAKE) -i -C test clean
diff --git a/doxygen.conf b/doxygen.conf
new file mode 100644
index 0000000..bff79e1
--- /dev/null
+++ b/doxygen.conf
@@ -0,0 +1,1519 @@
+# Doxyfile 1.6.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = CESAR 2.0 + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = 0.1 + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = doc/doxygen + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. 
Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. 
This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 2 + +# This tag can be used to specify a number of aliases that act +# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it parses. +# With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this tag. +# The format is ext=language, where ext is a file extension, and language is one of +# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, +# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. 
Note that for custom extensions you also need to set FILE_PATTERNS, otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = c=C h=C + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); vs. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. 
+ +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). 
The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols + +SYMBOL_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = YES + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespace are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. 
+# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. 
+ +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the (brief and detailed) documentation of class members so that constructors and destructors are listed first. If set to NO (the default) the constructors will appear in the respective orders defined by SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
+# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or define consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and defines in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. 
+ +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = NO + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. +# This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of +# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by +# doxygen. The layout file controls the global structure of the generated output files +# in an output format independent way. To create the layout file that represents +# doxygen's defaults, run doxygen with the -l option. You can optionally specify a +# file name after the option, if omitted DoxygenLayout.xml will be used as the name +# of the layout file. 
+ +LAYOUT_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. 
Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = src/ + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx +# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 + +FILE_PATTERNS = *.h *.c + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. 
This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix filesystem feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). 
+ +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command <filter> <input-file>, where <filter> +# is the value of the INPUT_FILTER tag, and <input-file> is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. +# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. 
Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. 
+ +ALPHABETICAL_INDEX = NO + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# If the HTML_TIMESTAMP tag is set to YES then the generated HTML +# documentation will contain the timestamp.
+ +HTML_TIMESTAMP = NO + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = NO + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. 
+ +DOCSET_FEEDNAME = "Doxygen generated docs" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. 
+ +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER +# are set, an additional index file will be generated that can be used as input for +# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated +# HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. +# For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see +# Qt Help Project / Custom Filters. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's +# filter section matches. +# Qt Help Project / Filter Attributes. + +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. + +QHG_LOCATION = + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it.
+ +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# When the SEARCHENGINE tag is enabled, doxygen will generate a search box for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using HTML help (GENERATE_HTMLHELP) or Qt help (GENERATE_QHP) +# there is already a search function so this one should typically +# be disabled.
+ +SEARCHENGINE = YES + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = YES + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, a4wide, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4wide + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing!
+ +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include source code with syntax highlighting in the LaTeX output. Note that which sources are shown also depends on other settings such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. 
+ +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. + +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). 
These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. 
+ +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. +# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. 
Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the include files +# in the INCLUDE_PATH (see below) will be searched if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon.
Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). 
+ +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option is superseded by the HAVE_DOT option below. This is only a +# fallback. It is recommended to install and use dot, since it yields more +# powerful graphs. + +CLASS_DIAGRAMS = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. + +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = YES + +# By default doxygen will write a font called FreeSans.ttf to the output +# directory and reference it in all dot files that doxygen generates. This +# font does not include all possible unicode characters however, so when you need +# these (or just want a differently looking font) you can specify the font name +# using DOT_FONTNAME.
You need to make sure dot is able to find the font, +# which can be done by putting it in a standard location or by setting the +# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory +# containing the font. + +DOT_FONTNAME = FreeSans + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the output directory to look for the +# FreeSans.ttf font (which doxygen will put there itself). If you specify a +# different font using DOT_FONTNAME you can set the path where dot +# can find it using this tag. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = YES + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances.
+ +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot.
Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lie further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read).
+ +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/extra/example1.fa b/extra/example1.fa new file mode 100644 index 0000000..0671c4c --- /dev/null +++ b/extra/example1.fa @@ -0,0 +1,5 @@ +>human +gCCTGGGAACTTCACCTACCACATCCCTGTCAGTAGTGGCACCCCACTGCACCTCAGCCTGACTCTGCAGATGaa +#### +>mouse +CTTTCCTCATTTCCTCAGGCTTCAGTATAGCATGAGGCTGAGGAGGAGAGAGGGAGACCGGCAAAGTGGCCTTGCTTAGGTACCATCTTTGCCCCTTTAGGCTTGGCAACTTCACCTACCACATCCCTGTCAGCAGCAGCACACCACTGCACCTCAGCCTGACCCTGCAGATGAAGTGAGTGCTGGTGTGTGGGTATGTGTGGGGGACCATGTGGAAGCCCTCAGAAAAGTGAAAGCCAAGTGCTTACTAAATTTATTACGTGGAGGGTCCAGGC diff --git a/extra/example2.fa b/extra/example2.fa new file mode 100644 index 0000000..4bc93a4 --- /dev/null +++ b/extra/example2.fa @@ -0,0 +1,7 @@ +>human +GTCACAATCATTGGTTACACCCTGGGGATTCCTGACGTCATCATGGGGATCACCTTCCTGGCTGCTGGGACCAGCGTGCCTGACTGCATGGCCAGCCTCATTGTGGCCAGACAAg +#### +>mouse +CTCCAAGGTTACCATCATCGGCTACACACTAGGGATCCCTGATGTCATCATGGGGATCACCTTCCTGGCTGCCGGAACCAGCGTGCCAGACTGCATGGCCAGCCTCATTGTAGCCAGACAAGGTGG +>sheep +TCCCAGGTCACGATCATCGGCTACACGCTGGGGATTCCTGACGTCATCATGGGGAGACAAGGTGGGGCCCACGTGGGGAGGGCTGGGAAGGGAAGCCAGGCCTCCCTACTTAGGGGGTAGGGGGAGCTTGCCTGG diff --git a/extra/samples/example1.in b/extra/samples/example1.in new file mode 100644 index 0000000..0cba66f --- /dev/null +++ b/extra/samples/example1.in @@ -0,0 +1,4 @@ +>reference 
+gCCTGGGAACTTCACCTACCACATCCCTGTCAGTAGTGGCACCCCACTGCACCTCAGCCTGACTCTGCAGATGaa +>mouse +CTTTCCTCATTTCCTCAGGCTTCAGTATAGCATGAGGCTGAGGAGGAGAGAGGGAGACCGGCAAAGTGGCCTTGCTTAGGTACCATCTTTGCCCCTTTAGGCTTGGCAACTTCACCTACCACATCCCTGTCAGCAGCAGCACACCACTGCACCTCAGCCTGACCCTGCAGATGAAGTGAGTGCTGGTGTGTGGGTATGTGTGGGGGACCATGTGGAAGCCCTCAGAAAAGTGAAAGCCAAGTGCTTACTAAATTTATTACGTGGAGGGTCCAGGC diff --git a/extra/samples/example1.out b/extra/samples/example1.out new file mode 100644 index 0000000..d12656b --- /dev/null +++ b/extra/samples/example1.out @@ -0,0 +1,4 @@ +>referenceExon + gCCTGGGAACTTCACCTACCACATCCCTGTCAGTAGTGGCACCCCACTGCACCTCAGCCTGACTCTGCAGATGaa +>mouse +ctttcctcatttcctcaggcttcagtatagcatgaggctgaggaggagagagggagaccggcaaagtggccttgcttaggtaccatctttgcccctttaggCTTGGCAACTTCACCTACCACATCCCTGTCAGCAGCAGCACACCACTGCACCTCAGCCTGACCCTGCAGATGaagtgagtgctggtgtgtgggtatgtgtgggggaccatgtggaagccctcagaaaagtgaaagccaagtgcttactaaatttattacgtggagggtccaggc diff --git a/extra/samples/sample0.in b/extra/samples/sample0.in new file mode 100644 index 0000000..320cf6d --- /dev/null +++ b/extra/samples/sample0.in @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatagacCCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa diff --git a/extra/samples/sample0.out b/extra/samples/sample0.out new file mode 100644 index 0000000..320cf6d --- /dev/null +++ b/extra/samples/sample0.out @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatagacCCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa diff --git a/extra/samples/sample1.in b/extra/samples/sample1.in new file mode 100644 index 0000000..bce00c7 --- /dev/null +++ b/extra/samples/sample1.in @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatag--CCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa diff --git a/extra/samples/sample1.out b/extra/samples/sample1.out new file mode 100644 index 0000000..bce00c7 --- /dev/null +++ b/extra/samples/sample1.out @@ -0,0 +1,4 @@ +>referenceExon + 
aaCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatag--CCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa diff --git a/extra/samples/sample2.in b/extra/samples/sample2.in new file mode 100644 index 0000000..6584b8f --- /dev/null +++ b/extra/samples/sample2.in @@ -0,0 +1,4 @@ +>referenceExon + aCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatagCCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa diff --git a/extra/samples/sample2.out b/extra/samples/sample2.out new file mode 100644 index 0000000..35310c4 --- /dev/null +++ b/extra/samples/sample2.out @@ -0,0 +1,4 @@ +>referenceExon + aCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatag-CCCAAACCCAAACCCAAACCCAAACCCaagtaaaaa diff --git a/extra/samples/sample3.in b/extra/samples/sample3.in new file mode 100644 index 0000000..4bb0700 --- /dev/null +++ b/extra/samples/sample3.in @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatagacCCCAAACCCAAACCCAAACCCAAACCCgtaaaaa diff --git a/extra/samples/sample3.out b/extra/samples/sample3.out new file mode 100644 index 0000000..bb0c746 --- /dev/null +++ b/extra/samples/sample3.out @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatagacCCCAAACCCAAACCCAAACCCAAACCC--gtaaaaa diff --git a/extra/samples/sample4.in b/extra/samples/sample4.in new file mode 100644 index 0000000..3af23c9 --- /dev/null +++ b/extra/samples/sample4.in @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCC +>query +cttccccatttttatctcatagacCCC diff --git a/extra/samples/sample4.out b/extra/samples/sample4.out new file mode 100644 index 0000000..2a31299 --- /dev/null +++ b/extra/samples/sample4.out @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCC +>query 
+cttccccatttttatctcatagac------------------------------------------------------------------------------------------CCC------------ diff --git a/extra/samples/sample5.in b/extra/samples/sample5.in new file mode 100644 index 0000000..88deb8c --- /dev/null +++ b/extra/samples/sample5.in @@ -0,0 +1,4 @@ +>referenceExon + aaCCTAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCC +>query +cttccccatttttatctcatagacCCT diff --git a/extra/samples/sample5.out b/extra/samples/sample5.out new file mode 100644 index 0000000..e8ff643 --- /dev/null +++ b/extra/samples/sample5.out @@ -0,0 +1,4 @@ +>referenceExon + aaCCTAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCCAAACCC +>query +cttccccatttttatctcatagacCCT------------------------------------------------------------------------------------------------------ diff --git a/extra/samples/sample6.in b/extra/samples/sample6.in new file mode 100644 index 0000000..53985b8 --- /dev/null +++ b/extra/samples/sample6.in @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatagacCCCAAACCAAACCCAAACCCAAACCCaagtaaaaa diff --git a/extra/samples/sample6.out b/extra/samples/sample6.out new file mode 100644 index 0000000..02dea63 --- /dev/null +++ b/extra/samples/sample6.out @@ -0,0 +1,4 @@ +>referenceExon + aaCCCAAACCCAAACCCAAACCCAAACCCaa +>query +cttccccatttttatctcatagacCCCAAA-CCAAACCCAAACCCAAACCCaagtaaaaa diff --git a/extra/samples/sample7.in b/extra/samples/sample7.in new file mode 100644 index 0000000..ac43b82 --- /dev/null +++ b/extra/samples/sample7.in @@ -0,0 +1,4 @@ +>referenceExon + CCTGGGAACTTCACCTACTGGCACATCCCTGTCAGTAGTGGCACCCCACTGCACCTCAGCCTGACTCTGCAGATG +>mouse +cac------------------------ATCCCTGTCAGCAGCAGCACACCACTGCACCTCAGCCTGACCCTGCAGATGgtgagtgctggtgtgtgggtatgtgtgggggaccatgtggaagccctcagaaaagtgaaagccaagtgcttactaaatttattacgtggagggtccaggc diff --git a/extra/samples/sample7.out 
b/extra/samples/sample7.out new file mode 100644 index 0000000..f609488 --- /dev/null +++ b/extra/samples/sample7.out @@ -0,0 +1,4 @@ +>referenceExon +CCTGGGAACTTCACCTACTGGCACATCCCTGTCAGTAGTGGCACCCCACTGCACCTCAGCCTGACTCTGCAGATG +>mouse +---------------------CACATCCCTGTCAGCAGCAGCACACCACTGCACCTCAGCCTGACCCTGCAGATGgtgagtgctggtgtgtgggtatgtgtgggggaccatgtggaagccctcagaaaagtgaaagccaagtgcttactaaatttattacgtggagggtccaggc diff --git a/extra/tables/human/acc_profile.txt b/extra/tables/human/acc_profile.txt new file mode 100644 index 0000000..0aeb784 --- /dev/null +++ b/extra/tables/human/acc_profile.txt @@ -0,0 +1,23 @@ +A T C G +0.228 0.371 0.251 0.150 +0.214 0.381 0.255 0.150 +0.200 0.395 0.258 0.148 +0.180 0.408 0.264 0.147 +0.161 0.427 0.269 0.142 +0.146 0.443 0.272 0.139 +0.134 0.456 0.276 0.134 +0.122 0.466 0.284 0.129 +0.112 0.485 0.281 0.122 +0.103 0.503 0.278 0.116 +0.094 0.521 0.280 0.106 +0.085 0.555 0.260 0.101 +0.087 0.522 0.283 0.107 +0.098 0.495 0.294 0.113 +0.107 0.464 0.325 0.104 +0.114 0.462 0.335 0.089 +0.085 0.509 0.344 0.062 +0.087 0.555 0.297 0.061 +0.239 0.285 0.272 0.204 +0.057 0.295 0.646 0.002 +1.000 0.000 0.000 0.000 +0.000 0.000 0.000 1.000 diff --git a/extra/tables/human/do_profile.txt b/extra/tables/human/do_profile.txt new file mode 100644 index 0000000..e3f1080 --- /dev/null +++ b/extra/tables/human/do_profile.txt @@ -0,0 +1,7 @@ +A T C G +0.000 0.000 0.000 1.000 +0.000 0.992 0.008 0.000 +0.611 0.027 0.025 0.336 +0.701 0.114 0.073 0.112 +0.088 0.077 0.056 0.779 +0.179 0.483 0.149 0.190 diff --git a/extra/tables/human/eth_codon_sub.txt b/extra/tables/human/eth_codon_sub.txt new file mode 100644 index 0000000..db058be --- /dev/null +++ b/extra/tables/human/eth_codon_sub.txt @@ -0,0 +1,65 @@ +AAA 0.40849 0.01506 0.26198 0.01904 0.01527 0.00650 0.01149 0.00774 0.09164 0.00886 0.06529 0.01076 0.00660 0.00143 0.00546 0.00199 0.03077 0.00710 0.01811 0.00826 0.00398 0.00212 0.00365 0.00235 0.04568 0.02826 0.03891 0.02978 0.00271 0.00126 0.00142 0.00159 
0.01551 0.00399 0.00889 0.00437 0.00663 0.00287 0.00496 0.00357 0.00548 0.00272 0.00424 0.00339 0.00423 0.00159 0.00221 0.00206 0.00000 0.00093 0.00000 0.00119 0.00521 0.00275 0.00405 0.00343 0.00000 0.00141 0.00127 0.00163 0.00240 0.00044 0.00188 0.00054 +AAC 0.01100 0.41485 0.00959 0.25289 0.00916 0.01865 0.00960 0.01375 0.00594 0.06004 0.00586 0.04124 0.00198 0.00335 0.00212 0.00227 0.00651 0.02338 0.00584 0.01640 0.00191 0.00320 0.00236 0.00226 0.00323 0.00648 0.00382 0.00479 0.00094 0.00154 0.00076 0.00105 0.00487 0.02458 0.00468 0.01611 0.00379 0.00645 0.00491 0.00461 0.00562 0.01425 0.00697 0.01034 0.00173 0.00277 0.00133 0.00205 0.00000 0.00490 0.00000 0.00365 0.00484 0.00759 0.00499 0.00589 0.00000 0.00411 0.00048 0.00314 0.00098 0.00121 0.00101 0.00096 +AAG 0.29591 0.01484 0.46315 0.01686 0.01135 0.00675 0.01565 0.00657 0.06807 0.00889 0.09736 0.00994 0.00415 0.00151 0.00852 0.00177 0.02260 0.00788 0.02791 0.00789 0.00319 0.00209 0.00438 0.00206 0.04198 0.03598 0.05322 0.03283 0.00210 0.00128 0.00207 0.00145 0.01041 0.00391 0.01442 0.00412 0.00500 0.00321 0.00602 0.00320 0.00395 0.00301 0.00626 0.00326 0.00309 0.00161 0.00278 0.00172 0.00000 0.00102 0.00000 0.00117 0.00413 0.00263 0.00413 0.00291 0.00000 0.00157 0.00212 0.00158 0.00185 0.00037 0.00225 0.00053 +AAT 0.01230 0.22372 0.00965 0.35450 0.00971 0.01103 0.00955 0.01893 0.00611 0.03574 0.00553 0.06048 0.00225 0.00204 0.00231 0.00365 0.00734 0.01464 0.00582 0.02313 0.00232 0.00241 0.00236 0.00313 0.00360 0.00378 0.00354 0.00675 0.00105 0.00094 0.00075 0.00183 0.00592 0.01542 0.00501 0.02542 0.00453 0.00409 0.00434 0.00605 0.00568 0.00863 0.00573 0.01384 0.00191 0.00185 0.00136 0.00276 0.00000 0.00315 0.00000 0.00568 0.00509 0.00479 0.00474 0.00690 0.00000 0.00269 0.00051 0.00472 0.00127 0.00070 0.00115 0.00126 +ACA 0.00913 0.00750 0.00600 0.00899 0.25029 0.13459 0.19326 0.15274 0.00817 0.01420 0.00526 0.01523 0.01986 0.00584 0.01536 0.00737 0.00560 0.00217 0.00379 0.00283 0.01224 0.00653 0.00870 
0.00787 0.00267 0.00206 0.00270 0.00253 0.00425 0.00196 0.00286 0.00240 0.00384 0.00164 0.00303 0.00234 0.03211 0.01449 0.02058 0.01735 0.00429 0.00314 0.00381 0.00377 0.01548 0.00686 0.00917 0.00835 0.00000 0.00058 0.00000 0.00094 0.03303 0.01719 0.02545 0.02118 0.00000 0.00201 0.00073 0.00287 0.00561 0.00101 0.00409 0.00135 +ACC 0.00368 0.01445 0.00338 0.00966 0.12735 0.27128 0.15326 0.14524 0.00311 0.02754 0.00306 0.01691 0.00817 0.01213 0.00809 0.00755 0.00277 0.00339 0.00247 0.00257 0.00521 0.01038 0.00571 0.00659 0.00181 0.00266 0.00188 0.00208 0.00183 0.00310 0.00199 0.00203 0.00185 0.00284 0.00185 0.00191 0.01364 0.02925 0.01658 0.01571 0.00251 0.00550 0.00274 0.00392 0.00703 0.01272 0.00621 0.00776 0.00000 0.00112 0.00000 0.00109 0.01656 0.02719 0.01767 0.01793 0.00000 0.00354 0.00047 0.00265 0.00231 0.00178 0.00237 0.00124 +ACG 0.00240 0.00274 0.00289 0.00309 0.06746 0.05654 0.09850 0.05631 0.00190 0.00596 0.00256 0.00568 0.00401 0.00227 0.00719 0.00266 0.00151 0.00077 0.00162 0.00083 0.00238 0.00239 0.00353 0.00211 0.00110 0.00104 0.00134 0.00083 0.00115 0.00088 0.00126 0.00097 0.00095 0.00072 0.00115 0.00068 0.00725 0.00561 0.01040 0.00546 0.00121 0.00131 0.00165 0.00123 0.00399 0.00271 0.00395 0.00312 0.00000 0.00029 0.00000 0.00041 0.00836 0.00688 0.01102 0.00670 0.00000 0.00077 0.00036 0.00107 0.00146 0.00042 0.00168 0.00053 +ACT 0.00395 0.00960 0.00296 0.01494 0.13029 0.13094 0.13760 0.20890 0.00312 0.01788 0.00272 0.02573 0.00854 0.00653 0.00841 0.01267 0.00326 0.00220 0.00226 0.00341 0.00616 0.00640 0.00592 0.00999 0.00147 0.00186 0.00159 0.00288 0.00199 0.00184 0.00188 0.00302 0.00199 0.00217 0.00191 0.00300 0.01560 0.01487 0.01514 0.02512 0.00264 0.00349 0.00294 0.00526 0.00740 0.00730 0.00644 0.01160 0.00000 0.00087 0.00000 0.00141 0.01903 0.01644 0.01878 0.02496 0.00000 0.00251 0.00036 0.00380 0.00274 0.00120 0.00261 0.00179 +AGA 0.04383 0.00389 0.02882 0.00452 0.00654 0.00263 0.00435 0.00293 0.28665 0.00702 0.17898 0.00863 0.00367 0.00076 
0.00259 0.00093 0.01140 0.00452 0.00656 0.00499 0.00136 0.00094 0.00176 0.00091 0.15015 0.07900 0.10946 0.09469 0.00143 0.00072 0.00092 0.00089 0.00318 0.00097 0.00176 0.00108 0.00238 0.00127 0.00199 0.00139 0.00867 0.00287 0.00520 0.00310 0.00220 0.00077 0.00097 0.00091 0.00000 0.00054 0.00000 0.00064 0.00193 0.00124 0.00168 0.00148 0.00000 0.00142 0.00227 0.00178 0.00126 0.00022 0.00090 0.00034 +AGC 0.00598 0.05551 0.00531 0.03735 0.01604 0.03288 0.01929 0.02368 0.00991 0.36420 0.01051 0.23871 0.00275 0.00449 0.00280 0.00327 0.00473 0.00982 0.00423 0.00721 0.00268 0.00491 0.00337 0.00323 0.00534 0.01107 0.00541 0.00744 0.00097 0.00177 0.00101 0.00141 0.00350 0.00931 0.00350 0.00623 0.00695 0.01246 0.00848 0.00855 0.01324 0.03782 0.01570 0.02320 0.00277 0.00442 0.00208 0.00354 0.00000 0.00239 0.00000 0.00204 0.01118 0.01804 0.01438 0.01360 0.00000 0.01623 0.00090 0.01193 0.00126 0.00122 0.00112 0.00103 +AGG 0.02740 0.00337 0.03617 0.00359 0.00369 0.00227 0.00515 0.00224 0.15703 0.00653 0.25775 0.00725 0.00203 0.00066 0.00346 0.00076 0.00714 0.00432 0.00762 0.00393 0.00113 0.00084 0.00184 0.00076 0.10130 0.07793 0.11840 0.07455 0.00099 0.00076 0.00101 0.00067 0.00180 0.00078 0.00242 0.00084 0.00173 0.00121 0.00229 0.00103 0.00398 0.00250 0.00907 0.00238 0.00124 0.00083 0.00130 0.00072 0.00000 0.00046 0.00000 0.00063 0.00119 0.00098 0.00157 0.00105 0.00000 0.00142 0.00446 0.00140 0.00094 0.00018 0.00092 0.00029 +AGT 0.00519 0.02725 0.00425 0.04518 0.01230 0.01443 0.01315 0.02436 0.00871 0.17062 0.00833 0.28774 0.00236 0.00190 0.00217 0.00399 0.00376 0.00484 0.00287 0.00781 0.00227 0.00221 0.00221 0.00324 0.00370 0.00420 0.00392 0.00833 0.00078 0.00079 0.00067 0.00132 0.00297 0.00496 0.00277 0.00752 0.00596 0.00565 0.00577 0.00853 0.00981 0.01644 0.01097 0.02710 0.00206 0.00213 0.00162 0.00356 0.00000 0.00132 0.00000 0.00247 0.00919 0.00856 0.00960 0.01208 0.00000 0.00834 0.00074 0.01247 0.00094 0.00056 0.00100 0.00106 +ATA 0.00219 0.00090 0.00122 0.00115 0.01102 
0.00479 0.00637 0.00556 0.00254 0.00135 0.00160 0.00162 0.19608 0.08460 0.02113 0.08774 0.00126 0.00054 0.00080 0.00080 0.00156 0.00090 0.00123 0.00102 0.00105 0.00060 0.00077 0.00083 0.01595 0.00859 0.00903 0.00872 0.00101 0.00027 0.00069 0.00040 0.00474 0.00216 0.00362 0.00270 0.00102 0.00061 0.00083 0.00062 0.03922 0.02017 0.02211 0.02166 0.00000 0.00041 0.00000 0.00061 0.00257 0.00124 0.00180 0.00121 0.00000 0.00053 0.00037 0.00065 0.01789 0.00215 0.01059 0.00280 +ATC 0.00101 0.00324 0.00095 0.00224 0.00691 0.01517 0.00770 0.00905 0.00112 0.00470 0.00111 0.00279 0.18032 0.36846 0.02111 0.22364 0.00116 0.00139 0.00095 0.00125 0.00114 0.00196 0.00137 0.00127 0.00062 0.00097 0.00074 0.00092 0.01435 0.02784 0.01379 0.01781 0.00065 0.00074 0.00061 0.00041 0.00338 0.00675 0.00469 0.00436 0.00069 0.00166 0.00104 0.00101 0.03416 0.07875 0.03548 0.04577 0.00000 0.00151 0.00000 0.00128 0.00176 0.00241 0.00179 0.00178 0.00000 0.00186 0.00048 0.00147 0.01339 0.00850 0.01114 0.00576 +ATG 0.00423 0.00224 0.00584 0.00277 0.01992 0.01108 0.02671 0.01278 0.00419 0.00321 0.00639 0.00348 0.04935 0.02313 0.58867 0.02730 0.00406 0.00167 0.00445 0.00235 0.00306 0.00211 0.00373 0.00228 0.00220 0.00199 0.00313 0.00209 0.02734 0.01925 0.03256 0.01949 0.00169 0.00073 0.00232 0.00077 0.00923 0.00577 0.01064 0.00608 0.00201 0.00153 0.00332 0.00170 0.02529 0.01522 0.03171 0.01708 0.00000 0.00124 0.00000 0.00153 0.00620 0.00345 0.00700 0.00418 0.00000 0.00126 0.00206 0.00152 0.02594 0.00453 0.03811 0.00543 +ATT 0.00123 0.00193 0.00097 0.00350 0.00763 0.00826 0.00788 0.01538 0.00120 0.00300 0.00113 0.00512 0.16368 0.19576 0.02181 0.31668 0.00111 0.00106 0.00098 0.00186 0.00137 0.00132 0.00113 0.00201 0.00090 0.00065 0.00070 0.00113 0.01400 0.01601 0.01217 0.02313 0.00071 0.00051 0.00074 0.00096 0.00419 0.00373 0.00414 0.00630 0.00087 0.00105 0.00089 0.00159 0.03429 0.04170 0.03137 0.06628 0.00000 0.00098 0.00000 0.00168 0.00194 0.00186 0.00190 0.00256 0.00000 0.00122 0.00050 0.00199 0.01350 
0.00508 0.01210 0.00825 +CAA 0.01490 0.00431 0.00969 0.00550 0.00454 0.00237 0.00351 0.00309 0.01154 0.00339 0.00824 0.00377 0.00184 0.00079 0.00254 0.00087 0.26231 0.02332 0.14369 0.02704 0.01374 0.00618 0.01156 0.00656 0.02448 0.01132 0.01691 0.01436 0.00687 0.00262 0.00367 0.00333 0.01305 0.00297 0.00911 0.00341 0.00407 0.00209 0.00371 0.00240 0.00290 0.00150 0.00246 0.00161 0.00267 0.00122 0.00149 0.00156 0.00000 0.00213 0.00000 0.00263 0.00511 0.00283 0.00366 0.00321 0.00000 0.00153 0.00150 0.00175 0.00326 0.00073 0.00284 0.00090 +CAC 0.00361 0.01630 0.00355 0.01154 0.00185 0.00305 0.00187 0.00219 0.00481 0.00740 0.00524 0.00511 0.00083 0.00100 0.00110 0.00087 0.02452 0.41696 0.02240 0.28133 0.00407 0.00786 0.00564 0.00486 0.01100 0.02839 0.01124 0.02084 0.00228 0.00488 0.00196 0.00360 0.00216 0.00490 0.00240 0.00309 0.00143 0.00212 0.00186 0.00146 0.00120 0.00281 0.00144 0.00173 0.00099 0.00142 0.00084 0.00093 0.00000 0.02363 0.00000 0.01766 0.00233 0.00371 0.00263 0.00296 0.00000 0.00500 0.00100 0.00393 0.00153 0.00365 0.00148 0.00267 +CAG 0.02161 0.00954 0.02948 0.01075 0.00756 0.00521 0.00928 0.00529 0.01636 0.00748 0.02167 0.00709 0.00289 0.00161 0.00685 0.00188 0.35405 0.05250 0.52647 0.05483 0.01993 0.01301 0.03017 0.01156 0.03288 0.02725 0.05789 0.02780 0.01045 0.00687 0.01124 0.00720 0.01992 0.00656 0.02826 0.00642 0.00643 0.00528 0.00923 0.00477 0.00391 0.00358 0.00643 0.00350 0.00407 0.00261 0.00379 0.00287 0.00000 0.00459 0.00000 0.00470 0.00821 0.00593 0.00939 0.00574 0.00000 0.00310 0.00572 0.00338 0.00622 0.00148 0.00764 0.00156 +CAT 0.00339 0.00922 0.00287 0.01470 0.00194 0.00187 0.00163 0.00275 0.00429 0.00439 0.00384 0.00665 0.00099 0.00073 0.00124 0.00123 0.02293 0.22692 0.01887 0.33894 0.00376 0.00480 0.00463 0.00671 0.00924 0.01386 0.00858 0.02509 0.00201 0.00314 0.00181 0.00514 0.00242 0.00305 0.00220 0.00439 0.00150 0.00111 0.00167 0.00174 0.00119 0.00168 0.00126 0.00264 0.00091 0.00089 0.00076 0.00115 0.00000 0.01282 0.00000 0.02091 
0.00221 0.00230 0.00209 0.00359 0.00000 0.00311 0.00121 0.00552 0.00166 0.00209 0.00149 0.00300 +CCA 0.00250 0.00165 0.00178 0.00225 0.01289 0.00579 0.00716 0.00760 0.00178 0.00249 0.00170 0.00296 0.00296 0.00102 0.00249 0.00139 0.01784 0.00503 0.01051 0.00575 0.32539 0.17662 0.22917 0.19929 0.00451 0.00264 0.00359 0.00313 0.01060 0.00323 0.00523 0.00455 0.00230 0.00101 0.00202 0.00122 0.01506 0.00722 0.01036 0.00890 0.00218 0.00185 0.00212 0.00198 0.00489 0.00226 0.00283 0.00250 0.00000 0.00052 0.00000 0.00069 0.02995 0.01275 0.01891 0.01569 0.00000 0.00078 0.00066 0.00126 0.00521 0.00075 0.00437 0.00094 +CCC 0.00113 0.00233 0.00098 0.00198 0.00580 0.00975 0.00608 0.00667 0.00105 0.00386 0.00106 0.00243 0.00143 0.00147 0.00145 0.00114 0.00678 0.00820 0.00579 0.00620 0.14911 0.30688 0.17731 0.17279 0.00231 0.00448 0.00240 0.00324 0.00369 0.00639 0.00264 0.00411 0.00115 0.00139 0.00130 0.00097 0.00721 0.01193 0.00906 0.00791 0.00134 0.00233 0.00173 0.00169 0.00220 0.00303 0.00172 0.00199 0.00000 0.00112 0.00000 0.00079 0.01305 0.02477 0.01391 0.01529 0.00000 0.00146 0.00026 0.00117 0.00205 0.00154 0.00207 0.00097 +CCG 0.00076 0.00067 0.00081 0.00076 0.00303 0.00210 0.00352 0.00242 0.00077 0.00104 0.00092 0.00095 0.00077 0.00040 0.00100 0.00038 0.00497 0.00231 0.00527 0.00235 0.07588 0.06954 0.12303 0.06422 0.00157 0.00154 0.00262 0.00144 0.00259 0.00131 0.00272 0.00166 0.00068 0.00047 0.00079 0.00045 0.00387 0.00313 0.00691 0.00291 0.00072 0.00089 0.00109 0.00068 0.00113 0.00088 0.00113 0.00089 0.00000 0.00028 0.00000 0.00028 0.00667 0.00483 0.00890 0.00515 0.00000 0.00043 0.00045 0.00053 0.00143 0.00030 0.00188 0.00038 +CCT 0.00146 0.00193 0.00114 0.00302 0.00821 0.00726 0.00631 0.01221 0.00119 0.00299 0.00112 0.00419 0.00193 0.00112 0.00184 0.00203 0.00844 0.00595 0.00604 0.01019 0.19753 0.20286 0.19224 0.31933 0.00265 0.00284 0.00220 0.00598 0.00430 0.00442 0.00312 0.00900 0.00143 0.00105 0.00135 0.00142 0.01009 0.00899 0.00938 0.01445 0.00160 0.00188 0.00169 
0.00281 0.00256 0.00214 0.00207 0.00391 0.00000 0.00082 0.00000 0.00139 0.01758 0.01622 0.01606 0.02968 0.00000 0.00143 0.00034 0.00259 0.00261 0.00096 0.00293 0.00206 +CGA 0.01001 0.00097 0.00814 0.00122 0.00098 0.00070 0.00116 0.00063 0.06878 0.00173 0.05289 0.00168 0.00069 0.00019 0.00062 0.00032 0.01108 0.00473 0.00604 0.00493 0.00157 0.00095 0.00165 0.00093 0.14856 0.08404 0.09460 0.10057 0.00154 0.00076 0.00078 0.00070 0.00080 0.00028 0.00062 0.00030 0.00063 0.00042 0.00065 0.00033 0.00159 0.00071 0.00139 0.00095 0.00052 0.00025 0.00029 0.00025 0.00000 0.00040 0.00000 0.00066 0.00080 0.00051 0.00078 0.00048 0.00000 0.00138 0.00182 0.00146 0.00055 0.00015 0.00056 0.00019 +CGC 0.00872 0.00274 0.00983 0.00181 0.00107 0.00145 0.00154 0.00112 0.05098 0.00506 0.05731 0.00269 0.00056 0.00042 0.00079 0.00033 0.00722 0.01721 0.00705 0.01042 0.00130 0.00260 0.00228 0.00141 0.11837 0.27708 0.11422 0.16725 0.00093 0.00259 0.00106 0.00145 0.00058 0.00077 0.00085 0.00052 0.00058 0.00115 0.00113 0.00057 0.00103 0.00274 0.00137 0.00131 0.00038 0.00064 0.00046 0.00044 0.00000 0.00165 0.00000 0.00127 0.00069 0.00149 0.00118 0.00084 0.00000 0.00620 0.00170 0.00285 0.00067 0.00061 0.00061 0.00027 +CGG 0.01170 0.00157 0.01417 0.00165 0.00136 0.00100 0.00193 0.00094 0.06881 0.00241 0.08484 0.00244 0.00070 0.00032 0.00122 0.00034 0.01050 0.00664 0.01459 0.00628 0.00172 0.00136 0.00378 0.00106 0.12984 0.11128 0.18720 0.11799 0.00157 0.00111 0.00188 0.00119 0.00085 0.00048 0.00140 0.00048 0.00092 0.00067 0.00166 0.00071 0.00137 0.00135 0.00315 0.00099 0.00061 0.00051 0.00069 0.00044 0.00000 0.00056 0.00000 0.00064 0.00097 0.00089 0.00158 0.00075 0.00000 0.00206 0.00512 0.00183 0.00089 0.00026 0.00121 0.00028 +CGT 0.00572 0.00126 0.00558 0.00200 0.00081 0.00071 0.00076 0.00108 0.03800 0.00212 0.03410 0.00331 0.00048 0.00025 0.00052 0.00035 0.00569 0.00786 0.00447 0.01173 0.00096 0.00117 0.00133 0.00184 0.08810 0.10402 0.07532 0.15866 0.00073 0.00098 0.00063 0.00158 0.00056 0.00036 
0.00048 0.00050 0.00055 0.00054 0.00063 0.00064 0.00085 0.00102 0.00086 0.00166 0.00034 0.00033 0.00030 0.00044 0.00000 0.00088 0.00000 0.00158 0.00064 0.00072 0.00069 0.00109 0.00000 0.00239 0.00105 0.00408 0.00051 0.00029 0.00056 0.00043 +CTA 0.00077 0.00037 0.00053 0.00046 0.00202 0.00092 0.00157 0.00111 0.00085 0.00041 0.00067 0.00046 0.01368 0.00577 0.01004 0.00644 0.00403 0.00127 0.00249 0.00140 0.00480 0.00197 0.00354 0.00196 0.00201 0.00085 0.00148 0.00108 0.10620 0.04915 0.06203 0.05216 0.00050 0.00015 0.00039 0.00017 0.00189 0.00090 0.00143 0.00106 0.00044 0.00025 0.00039 0.00031 0.00805 0.00397 0.00533 0.00398 0.00000 0.00069 0.00000 0.00094 0.00279 0.00104 0.00206 0.00118 0.00000 0.00042 0.00088 0.00070 0.07200 0.00381 0.05304 0.00424 +CTC 0.00078 0.00130 0.00070 0.00089 0.00202 0.00337 0.00260 0.00222 0.00092 0.00161 0.00111 0.00101 0.01591 0.02419 0.01526 0.01589 0.00332 0.00589 0.00354 0.00470 0.00315 0.00739 0.00385 0.00436 0.00212 0.00516 0.00227 0.00313 0.10612 0.26700 0.10385 0.14785 0.00052 0.00054 0.00061 0.00033 0.00188 0.00320 0.00265 0.00206 0.00054 0.00091 0.00067 0.00056 0.00902 0.01816 0.00898 0.01018 0.00000 0.00315 0.00000 0.00233 0.00198 0.00322 0.00200 0.00197 0.00000 0.00272 0.00113 0.00185 0.07661 0.01939 0.07639 0.01149 +CTG 0.00188 0.00137 0.00243 0.00153 0.00633 0.00465 0.00801 0.00487 0.00254 0.00199 0.00318 0.00183 0.03601 0.02580 0.05557 0.02600 0.01002 0.00508 0.01246 0.00582 0.01099 0.00658 0.01728 0.00662 0.00470 0.00453 0.00827 0.00434 0.28833 0.22357 0.37963 0.22568 0.00137 0.00064 0.00186 0.00062 0.00595 0.00430 0.00903 0.00451 0.00115 0.00120 0.00225 0.00120 0.02453 0.01807 0.03125 0.01804 0.00000 0.00307 0.00000 0.00381 0.00703 0.00406 0.00858 0.00432 0.00000 0.00221 0.00507 0.00275 0.21527 0.01736 0.27841 0.01931 +CTT 0.00084 0.00076 0.00068 0.00151 0.00213 0.00191 0.00248 0.00316 0.00099 0.00111 0.00085 0.00146 0.01399 0.01340 0.01338 0.01988 0.00366 0.00376 0.00321 0.00665 0.00385 0.00412 0.00424 0.00768 0.00170 
0.00250 0.00211 0.00439 0.09753 0.12805 0.09079 0.22610 0.00055 0.00041 0.00053 0.00053 0.00216 0.00201 0.00243 0.00294 0.00056 0.00063 0.00062 0.00088 0.00865 0.00905 0.00770 0.01443 0.00000 0.00195 0.00000 0.00295 0.00232 0.00186 0.00201 0.00325 0.00000 0.00159 0.00108 0.00239 0.07121 0.00989 0.07363 0.01629 +GAA 0.01709 0.00734 0.01015 0.01010 0.00708 0.00360 0.00504 0.00430 0.00731 0.00571 0.00472 0.00677 0.00335 0.00101 0.00240 0.00127 0.02968 0.00467 0.01839 0.00649 0.00403 0.00239 0.00358 0.00253 0.00401 0.00208 0.00311 0.00319 0.00193 0.00093 0.00114 0.00114 0.39774 0.06078 0.26124 0.06884 0.01546 0.00619 0.01168 0.00739 0.01880 0.00680 0.01319 0.00830 0.00774 0.00281 0.00428 0.00352 0.00000 0.00088 0.00000 0.00127 0.00515 0.00280 0.00418 0.00343 0.00000 0.00069 0.00061 0.00102 0.00193 0.00040 0.00179 0.00057 +GAC 0.00347 0.02932 0.00302 0.02079 0.00239 0.00438 0.00301 0.00370 0.00177 0.01201 0.00163 0.00896 0.00070 0.00091 0.00082 0.00072 0.00534 0.00838 0.00479 0.00647 0.00140 0.00229 0.00195 0.00147 0.00111 0.00216 0.00139 0.00163 0.00045 0.00076 0.00042 0.00067 0.04806 0.43708 0.05027 0.26165 0.00421 0.00744 0.00592 0.00459 0.00701 0.01655 0.00702 0.01040 0.00170 0.00305 0.00124 0.00205 0.00000 0.00228 0.00000 0.00184 0.00256 0.00364 0.00299 0.00264 0.00000 0.00132 0.00022 0.00121 0.00061 0.00054 0.00049 0.00036 +GAG 0.01224 0.00882 0.01758 0.01069 0.00698 0.00452 0.00761 0.00515 0.00508 0.00715 0.00795 0.00789 0.00285 0.00119 0.00412 0.00165 0.02591 0.00649 0.03262 0.00738 0.00442 0.00337 0.00523 0.00297 0.00390 0.00381 0.00639 0.00346 0.00188 0.00136 0.00193 0.00138 0.32656 0.07947 0.47190 0.07833 0.01369 0.00828 0.02097 0.00840 0.01223 0.00812 0.02184 0.00829 0.00699 0.00358 0.00705 0.00374 0.00000 0.00109 0.00000 0.00149 0.00499 0.00373 0.00551 0.00391 0.00000 0.00108 0.00115 0.00110 0.00196 0.00051 0.00215 0.00057 +GAT 0.00390 0.01967 0.00326 0.03509 0.00349 0.00301 0.00290 0.00525 0.00202 0.00822 0.00179 0.01390 0.00108 0.00052 0.00089 0.00139 
0.00627 0.00540 0.00480 0.00953 0.00173 0.00164 0.00193 0.00203 0.00123 0.00151 0.00141 0.00234 0.00053 0.00048 0.00042 0.00088 0.05572 0.26784 0.05072 0.43347 0.00512 0.00450 0.00523 0.00810 0.00733 0.00956 0.00728 0.01881 0.00211 0.00179 0.00146 0.00383 0.00000 0.00142 0.00000 0.00322 0.00305 0.00251 0.00290 0.00412 0.00000 0.00094 0.00026 0.00182 0.00060 0.00030 0.00066 0.00065 +GCA 0.00420 0.00329 0.00280 0.00445 0.03406 0.01529 0.02203 0.01940 0.00316 0.00652 0.00261 0.00783 0.00906 0.00303 0.00755 0.00429 0.00533 0.00178 0.00342 0.00232 0.01518 0.00861 0.01179 0.01026 0.00183 0.00120 0.00194 0.00181 0.00420 0.00194 0.00285 0.00258 0.00890 0.00307 0.00630 0.00364 0.23923 0.11918 0.15637 0.14627 0.01430 0.00833 0.01186 0.01034 0.02504 0.01103 0.01617 0.01387 0.00000 0.00064 0.00000 0.00092 0.03383 0.01772 0.02497 0.02121 0.00000 0.00242 0.00070 0.00319 0.00544 0.00100 0.00402 0.00152 +GCC 0.00216 0.00666 0.00214 0.00477 0.01826 0.03896 0.02024 0.02197 0.00200 0.01389 0.00218 0.00881 0.00490 0.00719 0.00561 0.00454 0.00325 0.00314 0.00333 0.00203 0.00864 0.01692 0.01130 0.01087 0.00143 0.00281 0.00168 0.00210 0.00238 0.00393 0.00245 0.00284 0.00423 0.00644 0.00453 0.00380 0.14159 0.30926 0.16163 0.17132 0.00900 0.01747 0.01036 0.01073 0.01353 0.02825 0.01244 0.01575 0.00000 0.00132 0.00000 0.00116 0.02187 0.03748 0.02494 0.02424 0.00000 0.00536 0.00071 0.00380 0.00296 0.00242 0.00297 0.00160 +GCG 0.00105 0.00142 0.00113 0.00142 0.00727 0.00619 0.01053 0.00628 0.00088 0.00265 0.00115 0.00253 0.00230 0.00140 0.00290 0.00141 0.00162 0.00077 0.00164 0.00086 0.00348 0.00361 0.00701 0.00318 0.00063 0.00077 0.00116 0.00070 0.00106 0.00091 0.00144 0.00097 0.00224 0.00144 0.00322 0.00124 0.05211 0.04534 0.09721 0.04571 0.00369 0.00373 0.00604 0.00340 0.00602 0.00467 0.00805 0.00492 0.00000 0.00039 0.00000 0.00033 0.00788 0.00695 0.01329 0.00722 0.00000 0.00115 0.00035 0.00117 0.00115 0.00052 0.00190 0.00052 +GCT 0.00252 0.00447 0.00200 0.00662 0.02052 0.01964 0.01850 
0.03483 0.00205 0.00895 0.00174 0.01249 0.00575 0.00436 0.00555 0.00719 0.00350 0.00203 0.00283 0.00300 0.01000 0.01053 0.00989 0.01638 0.00106 0.00131 0.00168 0.00236 0.00263 0.00237 0.00241 0.00391 0.00474 0.00373 0.00431 0.00642 0.16307 0.16078 0.15293 0.26402 0.00971 0.01006 0.00926 0.01592 0.01546 0.01441 0.01257 0.02627 0.00000 0.00104 0.00000 0.00163 0.02398 0.02228 0.02320 0.03371 0.00000 0.00351 0.00044 0.00515 0.00335 0.00154 0.00328 0.00250 +GGA 0.00358 0.00502 0.00228 0.00574 0.00469 0.00290 0.00379 0.00338 0.01183 0.01281 0.00620 0.01328 0.00201 0.00064 0.00169 0.00092 0.00391 0.00153 0.00214 0.00190 0.00227 0.00165 0.00227 0.00168 0.00473 0.00217 0.00299 0.00288 0.00100 0.00058 0.00057 0.00069 0.01115 0.00526 0.00580 0.00537 0.01473 0.00781 0.01140 0.00897 0.34780 0.16769 0.23567 0.18952 0.00616 0.00210 0.00285 0.00254 0.00000 0.00027 0.00000 0.00053 0.00478 0.00329 0.00402 0.00361 0.00000 0.00209 0.00124 0.00270 0.00105 0.00035 0.00089 0.00053 +GGC 0.00170 0.01217 0.00166 0.00834 0.00328 0.00607 0.00391 0.00427 0.00375 0.03494 0.00371 0.02125 0.00114 0.00147 0.00124 0.00106 0.00193 0.00345 0.00187 0.00255 0.00184 0.00273 0.00267 0.00188 0.00203 0.00553 0.00281 0.00332 0.00054 0.00093 0.00057 0.00074 0.00385 0.01186 0.00368 0.00669 0.00820 0.01447 0.01101 0.00888 0.16015 0.33181 0.16396 0.19735 0.00251 0.00482 0.00206 0.00290 0.00000 0.00094 0.00000 0.00074 0.00366 0.00538 0.00514 0.00397 0.00000 0.00686 0.00092 0.00466 0.00063 0.00069 0.00076 0.00058 +GGG 0.00199 0.00449 0.00260 0.00417 0.00300 0.00227 0.00372 0.00271 0.00511 0.01093 0.01016 0.01069 0.00118 0.00069 0.00202 0.00067 0.00238 0.00133 0.00253 0.00144 0.00159 0.00154 0.00247 0.00127 0.00297 0.00208 0.00492 0.00210 0.00065 0.00051 0.00080 0.00055 0.00563 0.00379 0.00746 0.00384 0.00880 0.00646 0.01343 0.00616 0.16960 0.12355 0.25839 0.12550 0.00376 0.00205 0.00386 0.00221 0.00000 0.00030 0.00000 0.00044 0.00341 0.00258 0.00448 0.00252 0.00000 0.00192 0.00306 0.00207 0.00082 0.00032 0.00123 
0.00035 +GGT 0.00139 0.00579 0.00118 0.00876 0.00258 0.00284 0.00242 0.00422 0.00265 0.01406 0.00232 0.02298 0.00076 0.00058 0.00090 0.00105 0.00136 0.00139 0.00120 0.00263 0.00129 0.00130 0.00134 0.00184 0.00178 0.00173 0.00134 0.00353 0.00044 0.00037 0.00037 0.00067 0.00308 0.00488 0.00246 0.00863 0.00667 0.00583 0.00658 0.00921 0.11872 0.12944 0.10924 0.23409 0.00222 0.00182 0.00168 0.00362 0.00000 0.00039 0.00000 0.00083 0.00274 0.00248 0.00303 0.00362 0.00000 0.00285 0.00053 0.00500 0.00058 0.00029 0.00050 0.00054 +GTA 0.00125 0.00070 0.00081 0.00088 0.00768 0.00369 0.00568 0.00431 0.00136 0.00122 0.00088 0.00127 0.03506 0.01433 0.00968 0.01644 0.00164 0.00058 0.00101 0.00066 0.00231 0.00123 0.00161 0.00122 0.00071 0.00036 0.00060 0.00053 0.00839 0.00435 0.00550 0.00482 0.00209 0.00058 0.00150 0.00070 0.01171 0.00533 0.00845 0.00649 0.00280 0.00119 0.00237 0.00161 0.13050 0.06004 0.08321 0.06700 0.00000 0.00032 0.00000 0.00044 0.00327 0.00151 0.00226 0.00187 0.00000 0.00066 0.00031 0.00086 0.01148 0.00135 0.00710 0.00175 +GTC 0.00077 0.00185 0.00070 0.00139 0.00560 0.01096 0.00634 0.00698 0.00078 0.00319 0.00096 0.00216 0.02964 0.05429 0.00957 0.03284 0.00123 0.00136 0.00107 0.00106 0.00175 0.00278 0.00206 0.00168 0.00055 0.00100 0.00082 0.00084 0.00680 0.01441 0.00666 0.00830 0.00124 0.00171 0.00127 0.00098 0.00848 0.01828 0.01077 0.00994 0.00157 0.00376 0.00213 0.00217 0.09869 0.23710 0.10050 0.12299 0.00000 0.00124 0.00000 0.00094 0.00265 0.00371 0.00242 0.00250 0.00000 0.00190 0.00037 0.00148 0.00684 0.00590 0.00665 0.00350 +GTG 0.00207 0.00172 0.00231 0.00198 0.01441 0.01030 0.01777 0.01185 0.00190 0.00289 0.00292 0.00315 0.06259 0.04713 0.03843 0.04759 0.00289 0.00156 0.00298 0.00175 0.00423 0.00304 0.00508 0.00311 0.00125 0.00139 0.00216 0.00147 0.01759 0.01373 0.02219 0.01360 0.00365 0.00133 0.00481 0.00154 0.02394 0.01551 0.03579 0.01669 0.00410 0.00310 0.00771 0.00384 0.26346 0.19360 0.36780 0.20480 0.00000 0.00114 0.00000 0.00126 0.00649 0.00409 
0.00747 0.00474 0.00000 0.00197 0.00141 0.00261 0.01844 0.00438 0.02497 0.00505 +GTT 0.00099 0.00135 0.00073 0.00205 0.00673 0.00661 0.00719 0.01096 0.00092 0.00252 0.00083 0.00355 0.03145 0.03118 0.01061 0.05157 0.00155 0.00088 0.00116 0.00135 0.00191 0.00180 0.00205 0.00302 0.00055 0.00069 0.00070 0.00111 0.00673 0.00798 0.00657 0.01306 0.00154 0.00113 0.00131 0.00207 0.01053 0.01007 0.01122 0.01789 0.00188 0.00224 0.00227 0.00427 0.10879 0.12150 0.10502 0.21390 0.00000 0.00068 0.00000 0.00127 0.00326 0.00273 0.00266 0.00395 0.00000 0.00160 0.00054 0.00206 0.00775 0.00321 0.00774 0.00556 +TAA 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.57489 0.00000 0.30525 0.00000 0.00000 0.00000 0.00000 0.00000 0.22054 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 +TAC 0.00052 0.00380 0.00051 0.00276 0.00055 0.00112 0.00079 0.00097 0.00064 0.00200 0.00062 0.00155 0.00070 0.00121 0.00091 0.00090 0.00249 0.02629 0.00218 0.01767 0.00047 0.00119 0.00076 0.00075 0.00104 0.00302 0.00105 0.00259 0.00137 0.00290 0.00131 0.00208 0.00045 0.00148 0.00045 0.00090 0.00057 0.00099 0.00103 0.00083 0.00024 0.00085 0.00037 0.00054 0.00061 0.00145 0.00069 0.00080 0.00000 0.52036 0.00000 0.34215 0.00165 0.00423 0.00206 0.00278 0.00000 0.01125 0.00267 0.00797 0.00245 0.03633 0.00182 0.02556 +TAG 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 
0.00000 0.00000 0.00000 0.16413 0.00000 0.47206 0.00000 0.00000 0.00000 0.00000 0.00000 0.10117 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 +TAT 0.00055 0.00233 0.00048 0.00411 0.00073 0.00090 0.00093 0.00129 0.00063 0.00141 0.00070 0.00239 0.00086 0.00085 0.00093 0.00127 0.00254 0.01622 0.00184 0.02381 0.00051 0.00070 0.00062 0.00104 0.00140 0.00193 0.00100 0.00386 0.00154 0.00177 0.00135 0.00259 0.00054 0.00099 0.00051 0.00169 0.00068 0.00072 0.00073 0.00108 0.00038 0.00055 0.00044 0.00095 0.00069 0.00091 0.00063 0.00123 0.00000 0.28240 0.00000 0.43553 0.00182 0.00266 0.00190 0.00437 0.00000 0.00668 0.00257 0.01282 0.00248 0.02244 0.00177 0.03261 +TCA 0.00246 0.00313 0.00173 0.00372 0.02612 0.01384 0.01893 0.01764 0.00190 0.00783 0.00134 0.00900 0.00365 0.00118 0.00378 0.00148 0.00498 0.00216 0.00325 0.00254 0.02250 0.01161 0.01514 0.01332 0.00171 0.00105 0.00152 0.00158 0.00463 0.00152 0.00251 0.00206 0.00221 0.00139 0.00171 0.00162 0.02522 0.01372 0.01763 0.01604 0.00346 0.00277 0.00342 0.00316 0.00521 0.00257 0.00326 0.00320 0.00000 0.00138 0.00000 0.00184 0.23909 0.11502 0.16910 0.12683 0.00000 0.00437 0.00180 0.00593 0.01330 0.00185 0.00823 0.00264 +TCC 0.00156 0.00588 0.00132 0.00419 0.01627 0.02719 0.01865 0.01823 0.00147 0.01511 0.00133 0.01003 0.00212 0.00193 0.00252 0.00170 0.00331 0.00412 0.00281 0.00317 0.01146 0.02639 0.01312 0.01472 0.00133 0.00273 0.00168 0.00211 0.00207 0.00297 0.00174 0.00198 0.00144 0.00237 0.00153 0.00160 0.01581 0.02815 0.01860 0.01783 0.00285 0.00488 0.00311 0.00343 0.00288 0.00430 0.00246 0.00321 0.00000 0.00423 0.00000 0.00322 0.13767 0.29800 0.16395 0.15853 0.00000 0.01140 0.00119 0.00754 0.00461 0.00611 0.00437 0.00336 +TCG 0.00062 0.00105 0.00056 0.00112 0.00653 0.00479 0.00809 0.00564 0.00054 0.00326 0.00057 0.00305 0.00083 0.00039 0.00138 0.00047 0.00116 0.00079 0.00121 0.00078 0.00461 0.00401 0.00654 0.00395 0.00054 0.00058 0.00080 0.00055 0.00111 0.00050 0.00099 0.00058 0.00058 0.00053 0.00061 0.00050 
0.00603 0.00507 0.00964 0.00503 0.00094 0.00126 0.00146 0.00114 0.00117 0.00076 0.00122 0.00085 0.00000 0.00056 0.00000 0.00062 0.05482 0.04441 0.09137 0.04539 0.00000 0.00164 0.00106 0.00187 0.00193 0.00062 0.00328 0.00090 +TCT 0.00208 0.00489 0.00156 0.00647 0.02149 0.01922 0.01948 0.02968 0.00187 0.01221 0.00152 0.01518 0.00221 0.00152 0.00327 0.00251 0.00402 0.00352 0.00292 0.00530 0.01512 0.01746 0.01500 0.02887 0.00132 0.00164 0.00151 0.00344 0.00251 0.00195 0.00198 0.00370 0.00189 0.00184 0.00172 0.00280 0.02029 0.01952 0.02071 0.02892 0.00335 0.00386 0.00325 0.00537 0.00382 0.00311 0.00306 0.00497 0.00000 0.00298 0.00000 0.00566 0.16272 0.16995 0.17963 0.27638 0.00000 0.00831 0.00124 0.01443 0.00597 0.00370 0.00551 0.00694 +TGA 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.26099 0.00000 0.22269 0.00000 0.00000 0.00000 0.00000 0.00000 0.67829 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 +TGC 0.00059 0.00238 0.00058 0.00175 0.00142 0.00264 0.00155 0.00207 0.00125 0.01013 0.00143 0.00728 0.00068 0.00111 0.00069 0.00083 0.00133 0.00414 0.00109 0.00319 0.00052 0.00116 0.00087 0.00097 0.00266 0.00847 0.00289 0.00524 0.00063 0.00187 0.00070 0.00126 0.00026 0.00064 0.00033 0.00044 0.00161 0.00300 0.00230 0.00209 0.00135 0.00464 0.00172 0.00294 0.00094 0.00164 0.00088 0.00140 0.00000 0.00838 0.00000 0.00603 0.00390 0.00849 0.00450 0.00578 0.00000 0.52365 0.00371 0.31521 0.00117 0.00438 0.00118 0.00341 +TGG 0.00056 0.00029 0.00083 0.00035 0.00054 0.00036 0.00076 0.00031 0.00209 0.00059 0.00469 0.00068 0.00049 0.00030 0.00118 0.00036 0.00137 0.00087 0.00212 0.00130 0.00046 0.00022 0.00096 0.00024 0.00367 0.00244 0.00752 
0.00241 0.00137 0.00081 0.00169 0.00090 0.00024 0.00011 0.00037 0.00013 0.00049 0.00041 0.00072 0.00028 0.00084 0.00065 0.00287 0.00057 0.00046 0.00034 0.00066 0.00049 0.00000 0.00208 0.00000 0.00243 0.00168 0.00093 0.00305 0.00090 0.00000 0.00388 0.90947 0.00412 0.00171 0.00193 0.00447 0.00229 +TGT 0.00063 0.00167 0.00055 0.00284 0.00187 0.00182 0.00199 0.00290 0.00145 0.00687 0.00130 0.01006 0.00076 0.00081 0.00076 0.00125 0.00140 0.00300 0.00110 0.00523 0.00078 0.00086 0.00099 0.00162 0.00260 0.00359 0.00237 0.00827 0.00096 0.00117 0.00081 0.00175 0.00036 0.00054 0.00031 0.00080 0.00196 0.00196 0.00216 0.00283 0.00161 0.00291 0.00172 0.00476 0.00112 0.00118 0.00108 0.00166 0.00000 0.00548 0.00000 0.01067 0.00488 0.00518 0.00475 0.00926 0.00000 0.29089 0.00363 0.48068 0.00143 0.00289 0.00142 0.00503 +TTA 0.00072 0.00040 0.00049 0.00059 0.00283 0.00124 0.00211 0.00162 0.00079 0.00056 0.00068 0.00059 0.01628 0.00572 0.01011 0.00659 0.00203 0.00090 0.00157 0.00122 0.00250 0.00117 0.00207 0.00126 0.00076 0.00065 0.00090 0.00081 0.07640 0.03765 0.04914 0.04040 0.00053 0.00021 0.00043 0.00020 0.00259 0.00119 0.00165 0.00143 0.00048 0.00031 0.00052 0.00043 0.01168 0.00423 0.00593 0.00486 0.00000 0.00131 0.00000 0.00160 0.00849 0.00246 0.00381 0.00297 0.00000 0.00084 0.00117 0.00111 0.17793 0.00694 0.07980 0.00794 +TTC 0.00030 0.00115 0.00023 0.00075 0.00117 0.00218 0.00138 0.00163 0.00032 0.00125 0.00029 0.00081 0.00449 0.00834 0.00406 0.00569 0.00104 0.00497 0.00086 0.00354 0.00083 0.00201 0.00100 0.00106 0.00047 0.00136 0.00059 0.00106 0.00930 0.02189 0.00910 0.01289 0.00025 0.00043 0.00026 0.00023 0.00110 0.00223 0.00170 0.00151 0.00037 0.00077 0.00047 0.00049 0.00315 0.00839 0.00324 0.00462 0.00000 0.04448 0.00000 0.03328 0.00271 0.00750 0.00280 0.00423 0.00000 0.00720 0.00302 0.00515 0.01594 0.51137 0.01299 0.28288 +TTG 0.00094 0.00069 0.00100 0.00089 0.00342 0.00210 0.00404 0.00256 0.00094 0.00083 0.00110 0.00103 0.01597 0.00789 0.02462 0.00978 0.00293 0.00145 
0.00320 0.00182 0.00348 0.00195 0.00451 0.00235 0.00129 0.00098 0.00201 0.00146 0.09329 0.06223 0.10535 0.06926 0.00082 0.00028 0.00078 0.00037 0.00317 0.00197 0.00449 0.00232 0.00068 0.00061 0.00131 0.00061 0.01198 0.00682 0.01330 0.00805 0.00000 0.00161 0.00000 0.00189 0.00872 0.00387 0.01070 0.00455 0.00000 0.00140 0.00506 0.00182 0.13228 0.00937 0.19038 0.01160 +TTT 0.00037 0.00089 0.00032 0.00132 0.00152 0.00148 0.00172 0.00238 0.00048 0.00103 0.00047 0.00149 0.00570 0.00550 0.00473 0.00900 0.00125 0.00354 0.00088 0.00493 0.00101 0.00123 0.00124 0.00223 0.00058 0.00060 0.00062 0.00153 0.01007 0.01263 0.00986 0.02068 0.00035 0.00028 0.00028 0.00049 0.00162 0.00143 0.00167 0.00239 0.00055 0.00063 0.00050 0.00089 0.00399 0.00486 0.00363 0.00780 0.00000 0.03048 0.00000 0.04712 0.00377 0.00402 0.00395 0.00773 0.00000 0.00547 0.00351 0.00873 0.01777 0.27555 0.01565 0.49052 + diff --git a/extra/tables/human/firstCodon_profile.txt b/extra/tables/human/firstCodon_profile.txt new file mode 100644 index 0000000..5010b26 --- /dev/null +++ b/extra/tables/human/firstCodon_profile.txt @@ -0,0 +1,14 @@ +A T C G +0.209 0.192 0.316 0.282 +0.191 0.187 0.272 0.350 +0.183 0.184 0.342 0.291 +0.198 0.196 0.336 0.271 +0.193 0.178 0.235 0.394 +0.174 0.188 0.329 0.310 +0.240 0.116 0.388 0.256 +0.453 0.063 0.103 0.380 +0.281 0.123 0.395 0.201 +0.180 0.076 0.465 0.280 +0.999 0.000 0.001 0.000 +0.000 0.999 0.000 0.000 +0.000 0.001 0.000 0.999 diff --git a/extra/tables/human/lastCodon_profile.txt b/extra/tables/human/lastCodon_profile.txt new file mode 100644 index 0000000..d8b02dc --- /dev/null +++ b/extra/tables/human/lastCodon_profile.txt @@ -0,0 +1,7 @@ +A T C G +0.25 0.25 0.25 0.25 +0.25 0.25 0.25 0.25 +0.25 0.25 0.25 0.25 +0.25 0.25 0.25 0.25 +0.25 0.25 0.25 0.25 +0.25 0.25 0.25 0.25 diff --git a/extra/tables/human/u12_acc_profile.txt b/extra/tables/human/u12_acc_profile.txt new file mode 100644 index 0000000..d491adb --- /dev/null +++ b/extra/tables/human/u12_acc_profile.txt @@ -0,0 
+1,23 @@ +A T C G +0.158 0.446 0.169 0.226 +0.175 0.531 0.147 0.147 +0.220 0.401 0.203 0.175 +0.085 0.520 0.254 0.141 +0.096 0.520 0.271 0.113 +0.062 0.429 0.469 0.040 +0.068 0.407 0.480 0.045 +0.130 0.525 0.322 0.023 +0.209 0.508 0.249 0.034 +0.305 0.441 0.130 0.124 +0.441 0.288 0.175 0.096 +0.367 0.220 0.345 0.068 +0.305 0.288 0.322 0.085 +0.198 0.271 0.322 0.209 +0.226 0.350 0.282 0.141 +0.237 0.333 0.215 0.215 +0.209 0.288 0.282 0.220 +0.130 0.390 0.322 0.158 +0.158 0.350 0.277 0.215 +0.090 0.350 0.542 0.017 +1.000 0.000 0.000 0.000 +0.000 0.000 1.000 0.000 diff --git a/extra/tables/human/u12_donor_profile.txt b/extra/tables/human/u12_donor_profile.txt new file mode 100644 index 0000000..1717a69 --- /dev/null +++ b/extra/tables/human/u12_donor_profile.txt @@ -0,0 +1,7 @@ +A T C G +1.000 0.000 0.000 0.000 +0.000 1.000 0.000 0.000 +1.000 0.000 0.000 0.000 +0.000 1.000 0.000 0.000 +0.000 0.000 1.000 0.000 +0.005 0.024 0.971 0.000 diff --git a/precompiledBinary_x86_64/cesar b/precompiledBinary_x86_64/cesar new file mode 100755 index 0000000..c93bcaf Binary files /dev/null and b/precompiledBinary_x86_64/cesar differ diff --git a/src/Alignment.c b/src/Alignment.c new file mode 100644 index 0000000..cc325cf --- /dev/null +++ b/src/Alignment.c @@ -0,0 +1,247 @@ +/** + * Alignment composition + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include +#include + +#include "Params.h" +#include "Alignment.h" +#include "Logging.h" +#include "SafeAlloc.h" + +/** + * Write the most probable deletion to a string buffer. + * @param deletion a partial codon match state, e.g. 1nt-Deletion or 2nt-Deletion. + * @param emission_table the emission table where to pick the probabilties from. + * @param reference array of Literals. + * @param query array of Literals. + * @param result pre-allocated string buffer. + * @return success boolean. 
+ */ +bool find_best_deletion(struct State* deletion, struct EmissionTable* emission_table, Literal* reference, Literal* query, char* result) { + + double prob=LOGODD_NEGINF, highest_prob=LOGODD_NEGINF; + uint_fast8_t best_pos=0, pos=0; + Literal lookup_query[3]; + + if (deletion->num_emissions == 2) { + for(; pos < 3; pos++) { + for (uint_fast8_t i=0; i < 3; i++) { + if (pos == i) { + lookup_query[i] = LITERAL_N; + continue; + } + if (pos > i) { + lookup_query[i] = query[i]; + continue; + } + if (pos < i) { + lookup_query[i] = query[i-1]; + continue; + } + } + prob = EmissionTable__by_literals(emission_table, reference, lookup_query); + if (prob >= highest_prob) { + highest_prob = prob; + for (uint_fast8_t j=0; j < 3; j++) { + result[j] = Literal__char(lookup_query[j]); + } + result[pos] = '-'; + } + logv(6, "lookup_query: %c%c%c (%c%c/%c%c)\tprob (highest): %E (%E)\tbest deletion: '%s' at pos=%u\n", + Literal__char(lookup_query[0]), Literal__char(lookup_query[1]), Literal__char(lookup_query[2]), + Literal__char(reference[0]), Literal__char(reference[1]), + Literal__char(query[0]), Literal__char(query[1]), + prob, highest_prob, result, best_pos + ); + } + return true; + } else if (deletion->num_emissions == 1) { + strncpy(result, "---", 3); + for(; pos < 3; pos++) { + for (uint_fast8_t i=0; i < 3; i++) { + if (pos == i) { + lookup_query[i] = query[0]; + } else { + lookup_query[i] = LITERAL_N; + } + } + prob = EmissionTable__by_literals(deletion->emission_table, reference, lookup_query); + if (prob >= highest_prob) { + highest_prob = prob; + best_pos = pos; + } + logv(6, "lookup_query: %c%c%c (%c%c%c/%c)\tprob (highest): %E (%E)\tbest deletion: '%s' at pos=%u\n", + Literal__char(lookup_query[0]), Literal__char(lookup_query[1]), Literal__char(lookup_query[2]), + Literal__char(reference[0]),Literal__char(reference[1]),Literal__char(reference[2]), + Literal__char(query[0]), + prob, highest_prob, result, best_pos + ); + } + result[best_pos] = Literal__char(query[0]); + 
return true; + } + return false; +} + +/** + * Create a new alignment. + * @param fasta the Fasta that contains reference and query sequences. + * @param query_id the query index of which the alignment shall be created. + * @param params the Params. + * @param path_length the length of the Viterbi path. + * @param path the Viterbi path. + * @return the alignment. + */ +struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struct Params* params, size_t path_length, struct State** path) { + uint8_t reference_id = 0; + const char lower = 'a' - 'A'; + + struct Alignment* self = (struct Alignment*) SAFEMALLOC(sizeof(struct Alignment)); + size_t length = fasta->queries[query_id]->length; + for (uint8_t ref_id=0; ref_id < fasta->num_references; ref_id++) { + length += fasta->references[ref_id]->length; + } + self->reference = (char*) SAFECALLOC(sizeof(char), length+1+20*fasta->num_references); + self->query = (char*) SAFECALLOC(sizeof(char), length+1+20*fasta->num_references); + + // assemble the alignment + char* deletion; + char bases[4] = ""; + + uint_fast8_t pending_deletion=0; + size_t q = 0, r = 0, t = 0; + for (size_t i=1; i < path_length; i++) { + uint8_t j=0, emissions = path[i]->num_emissions; + + // deleting 1nt and 2nt will always emit 3 bases/dashes to maintain reading frame intact. 
+ if (!strncmp("delete_1nt", path[i]->name, 10) || !strncmp("delete_2nt", path[i]->name, 10)) { + emissions = 3; + } + + if (emissions == 0) { + if (i > 0) { + logv(3, "i=%lu\tt=%lu\tj=%u\tq=%lu\tr=%lu\tref=''\tqry=''\t%s -> %s (%s)", i, t, j, q, r, path[i-1]->name, path[i]->name, ""); + } else { + logv(3, "i=%lu\tt=%lu\tj=%u\tq=%lu\tr=%lu\tref=''\tqry=''\t[] -> %s (%s)", i, t, j, q, r, path[i]->name, ""); + } + } + for (; j < emissions; j++) { + if (r >= fasta->references[reference_id]->length && reference_id < fasta->num_references-1) { + reference_id++; + logv(3, "reference id: %u/%u", reference_id, fasta->num_references); + r = 0; + } + if (!strncmp("intron", path[i]->name, 6)) { + self->reference[t+j] = ' '; + self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]) + lower; + } else if (!strncmp("split_codon", path[i]->name, 11)) { + self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]) + lower; + self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]) + lower; + } else if (!strncmp("match_codon", path[i]->name, 11)) { + self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]); + self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]); + } else if (!strncmp("match_split", path[i]->name, 11)) { + self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]) + lower; + self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]) + lower; + } else if (!strncmp("match", path[i]->name, 5)) { // donor, acceptor + self->reference[t+j] = ' '; + self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]) + lower; + } else if (!strncmp("insert", path[i]->name, 6)) { + self->reference[t+j] = '-'; + self->query[t+j] = Literal__char(fasta->queries[query_id]->sequence[q++]); + } else if (!strncmp("delete", path[i]->name, 6)) { + if (pending_deletion == 0) { + deletion = (char*) SAFECALLOC(sizeof(char), 3); + 
find_best_deletion( + path[i], + params->emission_table_61_LAMBDA, + &fasta->references[reference_id]->sequence[r], + &fasta->queries[query_id]->sequence[q], + deletion + ); + pending_deletion = 3; + q += path[i]->num_emissions; + } + self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]); + self->query[t+j] = deletion[3-pending_deletion--]; + + if (pending_deletion == 0) { + free(deletion); + } + } + Literal__str(path[i]->num_emissions, path[i]->reference, bases); + if (i > 0) { + logv(3, "i=%lu\tt=%lu\tj=%u\tq=%lu\tr=%lu\tref='%c'\tqry='%c'\t%s -> %s (%s)", i, t, j, q, r, self->reference[t+j], self->query[t+j], path[i-1]->name, path[i]->name, bases); + } else { + logv(3, "i=%lu\tt=%lu\tj=%u\tq=%lu\tr=%lu\tref='%c'\tqry='%c'\t[] -> %s (%s)", i, t, j, q, r, self->reference[t+j], self->query[t+j], path[i]->name, bases); + } + } + + if (!strncmp("between_split_donor", path[i-1]->name, 19) && !strncmp("between_acc_split", path[i]->name, 17)) { + for (j=0; j < 19; j++) { + self->reference[t+j] = '>'; + self->query[t+j] = '-'; + } + } + if (i > 0 && !strncmp("between_acc_split", path[i-1]->name, 17) && + !strncmp("between_split_ins", path[i]->name, 17)) { + for (; j < fasta->references[reference_id]->start_split_length; j++) { + self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]) + lower; + self->query[t+j] = '-'; + } + } + if (i > 0 && !strncmp("between_split_donor", path[i]->name, 19) && + !strncmp("end_codon", path[i-1]->name, 9) && path[i-1]->custom == + fasta->references[reference_id]->num_codons-1) { + for (; j < fasta->references[reference_id]->end_split_length; j++) { + self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r++]) + lower; + self->query[t+j] = '-'; + } + } + if (i > 0 && (!strncmp("start_first_codon", path[i-1]->name, 17) || + !strncmp("end_codon", path[i-1]->name, 9)) && !strncmp("end_codon", + path[i]->name, 9)) { + // determine the number of deleted codons 
from the state names + size_t deleted_codons = path[i]->custom; + if (!strncmp("end_codon", path[i-1]->name, 9)) { + deleted_codons -= path[i-1]->custom; + } else if (!strncmp("start_first_codon", path[i-1]->name, 17)) { + deleted_codons -= -1; + } + + char bases[4] = ""; + Literal__str(path[i]->num_emissions, path[i]->reference, bases); + for (; j < 3*deleted_codons; j++) { + if (r >= fasta->references[reference_id]->length && reference_id < fasta->num_references+1) { + r = 0; + reference_id++; + } + self->reference[t+j] = Literal__char(fasta->references[reference_id]->sequence[r]); + self->query[t+j] = '-'; + logv(3, "i=%lu\tt=%lu\tj=%u\tq=%lu\tr=%lu\tref='%c'\tqry='%c'\t%s -> %s (%s)", i, t, j, q, r, self->reference[t+j], self->query[t+j], path[i-1]->name, path[i]->name, bases); + r++; + } + } + t += j; + } + + return self; +} + + +/** + * Destroy an alignment. + * @param self the alignment. + * @return success boolean. + */ +bool Alignment__destroy(struct Alignment* self) { + free(self->reference); + free(self->query); + free(self); + return true; +} diff --git a/src/Alignment.h b/src/Alignment.h new file mode 100644 index 0000000..8ba4133 --- /dev/null +++ b/src/Alignment.h @@ -0,0 +1,23 @@ +/** + * Alignment type and function definition. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#ifndef ALIGNMENT_H_ +#define ALIGNMENT_H_ + +#include "Fasta.h" +#include "Params.h" +#include "State.h" +#include "Literal.h" + +typedef struct Alignment { + size_t length; + char* reference; + char* query; +} Alignment; + +struct Alignment* Alignment__create(struct Fasta* fasta, uint8_t query_id, struct Params* params, size_t path_length, struct State** sequence); +bool Alignment__destroy(struct Alignment* self); + +#endif // ALIGNMENT_H_ diff --git a/src/Arguments.c b/src/Arguments.c new file mode 100644 index 0000000..e458d63 --- /dev/null +++ b/src/Arguments.c @@ -0,0 +1,127 @@ +/** + * Argument parser implementation. 
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include + +#include "Logging.h" +#include "Params.h" + +#include "Arguments.h" + +#ifndef VERSION +#define VERSION "0.01 build" +#endif + +/** + * Print information about the program. + */ +void print_version() { + printf("Cesar %s\n" + "Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede\n" + "\n" + "Align a pair of sequences using HMM.\n", + VERSION); +} + +/** + * Print a list of available input parameters. + */ +void print_help() { + print_version(); + printf("\n" + "Usage: cesar \n" + " -c/--clade \n" + " -m/--matrix \n" + " -p/--profiles \n" + " -i/--split_codon_emissions \n" + " -f/--firstexon\n" + " -l/--lastexon\n" + " -x/--max-memory\n" + " -v/--verbosity <0 .. 2>\n" + " -V/--version\n" + " -s/--set name1=value1 .. nameN=valueN\n" + " -h/--help\n"); +} + +/** + * Read arguments from user input and store them in a Params struct. + * @param argc number of given arguments. + * @param argv pointer to argument vector + * @param parameters a Params struct. + * @return success boolean. + */ +bool Arguments__read(int argc, char** argv, struct Params* parameters) { + if (argc < 2) { + print_help(); + die("Insufficient number of arguments. Please provide at least an input file."); + } + + uint8_t num_input_files = 0; + int i = 1; // [0] is the cesar binary itself. 
+ while (argv[i] && i < argc) { + char* argument = argv[i]; + + if (!strcmp(argument, "-h") || !strcmp(argument, "-?") || !strcmp(argument, "--help")) { + print_help(); + return false; + } else if (!strcmp(argument, "-V") || !strcmp(argument, "--version")) { + print_version(); + return false; + } + + if (argument[0] == '-' && argc <= i+1) { + print_help(); + die("Insufficient number of parameters for argument %s.", argument); + } + if (!strcmp(argument, "-i") || !strcmp(argument, "--split_codon_emissions")) { + Params__set_via_str(parameters, "split_emissions_acceptor", argv[++i]); + Params__set_via_str(parameters, "split_emissions_donor", argv[++i]); + } else if (!strcmp(argument, "-x") || !strcmp(argument, "--max-memory")) { + Params__set_via_str(parameters, "max_memory", argv[++i]); + } else if (!strcmp(argument, "-c") || !strcmp(argument, "--clade")) { + Params__set_via_str(parameters, "clade", argv[++i]); + } else if (!strcmp(argument, "-m") || !strcmp(argument, "--matrix")) { + Params__set_via_str(parameters, "eth_file", argv[++i]); + } else if (!strcmp(argument, "-v") || !strcmp(argument, "--verbosity")) { + g_loglevel = (char) atoi(argv[++i]); + } else if (!strcmp(argument, "-p") || !strcmp(argument, "--profiles")) { + Params__set_via_str(parameters, "acc_profile", argv[++i]); + Params__set_via_str(parameters, "do_profile", argv[++i]); + } else if (!strcmp(argument, "-s") || !strcmp(argument, "--set")) { + int j = i+1; + for (; j < argc; j++) { + if (argv[j][0] == '-') { + break; + } + char* key = strtok(argv[j], "="); + char* val = strtok(NULL, "="); + if(!Params__set_via_str(parameters, key, val)) { + warn("Ignoring unknown parameter `%s'.", key); + } + } + i = j; + } else if (!strcmp(argument, "-d") || !strcmp(argument, "--dot")) { + Params__set_via_str(parameters, "dot", argv[++i]); + } else if (!strcmp(argument, "-f") || !strcmp(argument, "--firstexon")) { + parameters->firstexon = true; + } else if (!strcmp(argument, "-l") || !strcmp(argument, 
"--lastexon")) { + parameters->lastexon = true; + } else { + num_input_files++; + if (num_input_files > 1) { + die("Too many input files given, e.g.: %s (%u)", argv[i], num_input_files); + } + Params__set_via_str(parameters, "fasta_file", argv[i]); + } + + i++; + } // while + if (num_input_files == 0) { + die("Too few input files given: %u", num_input_files); + } + return true; +} diff --git a/src/Arguments.h b/src/Arguments.h new file mode 100644 index 0000000..a3db64b --- /dev/null +++ b/src/Arguments.h @@ -0,0 +1,15 @@ +/** + * Argument parser. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#ifndef ARGUMENTS_H_ +#define ARGUMENTS_H_ + +#include "Params.h" + +void print_version(); +void print_help(); +bool Arguments__read(int argc, char** argv, struct Params* parameters); + +#endif // ARGUMENTS_H_ diff --git a/src/Cesar.c b/src/Cesar.c new file mode 100644 index 0000000..23b3495 --- /dev/null +++ b/src/Cesar.c @@ -0,0 +1,186 @@ +/** + * CESAR entry point. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include +#include + +#include "Fasta.h" +#include "Profile.h" +#include "Logging.h" +#include "SafeAlloc.h" +#include "Viterbi.h" +#include "Alignment.h" +#include "Arguments.h" +#include "EmissionTable.h" +#include "Sequence.h" +#include "Model.h" + + +char g_loglevel = LOGLEVEL; + +/** + * The main program. 
+ */ +int main(int argc, char* argv[argc]) { + struct EmissionTable emission_tables[6]; + + struct Params parameters; + Params__create(¶meters, emission_tables); + + if (!Arguments__read(argc, argv, ¶meters)) { + return 1; + } + + Params__set_paths(¶meters); + + struct Fasta fasta; + Fasta__init(&fasta); + Fasta__read(&fasta, parameters.fasta_file); + + struct Profile** acceptors = SAFEMALLOC(sizeof(Profile*) * fasta.num_references); + struct Profile** donors = SAFEMALLOC(sizeof(Profile*) * fasta.num_references); + + char prefix[PATH_STRING_LENGTH] = "extra/tables/"; + if(strchr(parameters.clade, '/') != NULL) { + memset(prefix, 0, strlen(prefix)); + } + + for (uint8_t i=0; i < fasta.num_references; i++) { + struct Sequence* reference = fasta.references[i]; + + if (reference->acceptor[0] != '\0') { + char name[STATE_NAME_LENGTH]; + sprintf(name, "ref%iacc", i); + acceptors[i] = Profile__create(name); + char path[PROFILE_FILENAME_LENGTH]; + sprintf(path, "%s%s/%s", prefix, parameters.clade, reference->acceptor); + if (access(path, R_OK) != -1) { + Profile__read(acceptors[i], path); + } else { + Profile__read(acceptors[i], reference->acceptor); + } + } else { + if (fasta.num_references > 1) { + warn("Missing acceptor profile for reference %u.", i); + } + acceptors[i] = Profile__create("acceptor"); + if (i == 0 && ((parameters.firstexon && fasta.num_references == 1) || (!parameters.firstexon && fasta.num_references > 1))) { + Profile__read(acceptors[i], parameters.first_codon_profile); + } else { + Profile__read(acceptors[i], parameters.acc_profile); + } + } + logv(1, "Reference %u uses acceptor:\t%s", i, acceptors[i]->filename); + + if (reference->acceptor[0] != '\0') { + char name[STATE_NAME_LENGTH]; + sprintf(name, "ref%idon", i); + donors[i] = Profile__create(name); + char path[PROFILE_FILENAME_LENGTH]; + sprintf(path, "%s%s/%s", prefix, parameters.clade, reference->donor); + if (access(path, R_OK) != -1) { + Profile__read(donors[i], path); + } else { + 
Profile__read(donors[i], reference->donor); + } + } else { + if (fasta.num_references > 1) { + warn("Missing donor profile for reference %u.", i); + } + donors[i] = Profile__create("donor"); + if (i == fasta.num_references-1 && ((parameters.lastexon && fasta.num_references == 1) || (!parameters.lastexon && fasta.num_references > 1))) { + Profile__read(donors[i], parameters.last_codon_profile); + } else { + Profile__read(donors[i], parameters.do_profile); + } + } + logv(1, "Reference %u uses donor:\t%s", i, donors[i]->filename); + } + + size_t rlength = 0; + for (uint8_t i=0; i < fasta.num_references; i++) { + logv(1, "Reference %u length: %lu", i, fasta.references[i]->length); + logv(1, "Reference %u split codon lengths: %u %u", i, fasta.references[i]->start_split_length, fasta.references[i]->end_split_length); + rlength += 11 + 6 * fasta.references[i]->length + donors[i]->length + acceptors[i]->length; + } + size_t qlength = 0; + for (uint8_t i=0; i < fasta.num_queries; i++) { + logv(1, "Query %u length: %lu", i, fasta.queries[i]->length); + qlength += fasta.queries[i]->length; + } + float mem = 7e-9*(rlength*qlength + 4*rlength); // in GB. Factor of 17 bytes is taken from measurements. + if (mem > (float)parameters.max_memory) { + die("The memory consumption is limited to %u GB by default. Your attempt requires %u GB. 
You can change the limit via --max-memory.", parameters.max_memory, (uint8_t)mem); + } else { + logv(1, "Expecting a memory consumption of: %u GB", (uint8_t)mem); + } + + if (g_loglevel >= 7) { + char* tmp; + for (uint8_t i=0; i < fasta.num_references; i++) { + tmp = SAFECALLOC(sizeof(char), SEQUENCENAMELENGTH + fasta.references[i]->length); + Literal__str(fasta.references[i]->length, fasta.references[i]->sequence, tmp); + logv(1, ">original %s\n%s", fasta.references[i]->name, tmp); + free(tmp); + } + + for (uint8_t i=0; i < fasta.num_queries; i++) { + tmp = SAFECALLOC(sizeof(char), SEQUENCENAMELENGTH + fasta.queries[i]->length); + Literal__str(fasta.queries[i]->length, fasta.queries[i]->sequence, tmp); + logv(1, ">original %s\n%s", fasta.queries[i]->name, tmp); + free(tmp); + } + } + + clock_t time = clock(); + struct HMM* hmm = multi_exon(¶meters, &fasta, acceptors, donors); + logv(1, "HMM construction (sec):\t%f", (float)(clock() - time) / CLOCKS_PER_SEC); + + if (parameters.dot[0] != '\0') { + FILE * dotfile = fopen(parameters.dot, "w"); + HMM__dot(hmm, dotfile); + fclose(dotfile); + } + + for (uint8_t q=0; q < fasta.num_queries; q++) { + + size_t length = 2*fasta.queries[q]->length; + for (uint8_t i=0; i < fasta.num_references; i++) { + length += fasta.references[i]->length; + } + + struct State** path = (struct State**) SAFEMALLOC(sizeof(struct State*) * length); + size_t path_length = 0; + + time = clock(); + Viterbi(hmm, fasta.queries[q]->length, fasta.queries[q]->sequence, &path_length, path); + logv(1, "Viterbi (sec):\t%f", (float)(clock() - time) / CLOCKS_PER_SEC); + + struct Alignment* alignment = Alignment__create(&fasta, q, ¶meters, path_length, path); + printf(">%s\n%s\n", "referenceExon", alignment->reference); + printf(">%s\n%s\n", fasta.queries[q]->name, alignment->query); + Alignment__destroy(alignment); + + free(path); + + } + + HMM__destroy(hmm); + for (uint8_t i=0; i +#include +#include +#include +#include + +#include "Logging.h" +#include 
"SafeAlloc.h" + +#include "EmissionTable.h" + +/** + * Read distribution from file. + * @param self a pointer to the emission table. + * @param filename the path to the file. + * @return success boolean. + */ +bool EmissionTable__read(struct EmissionTable* self, char* filename) { + if (self->distribution != LAMBDA_DISTRIBUTION) { + die("Cannot load file into non-lambda distributed emission table."); + } + + // read distribution from file here. + FILE* file_descriptor = fopen(filename, "r"); + if (file_descriptor == NULL) { + die("Cannot open file: %s", filename); + } + + #define LINELENGTH 1024 + #define DELIMITERS " \t" + + bool done = false; + size_t lineno=0; + while (!done) { + // fill the line + char line[LINELENGTH]; + for (size_t i=0; i < LINELENGTH; i++) { + assert(i < LINELENGTH); + + char c = fgetc(file_descriptor); + + bool stop = false; + switch (c) { + case EOF: + line[i] = '\0'; + done = true; + stop = true; + break; + case '\n': + line[i] = '\0'; + stop = true; + break; + default: + line[i] = c; + } + + if (stop) { + break; + } + } + + // set matrix entries using the tokens + size_t i=0; + size_t line_offset = 0; + char* token = strtok(line, DELIMITERS); + while (token != NULL) { + // skip commented lines + if (token[0] == '#') { + if (i == 0) { + line_offset++; + } + token = NULL; + break; + } + + // check the first token: it should be the codon of which the array index + // corresponds to the line number. 
+ if (i==0) { + Literal* codon = (Literal*) SAFEMALLOC(sizeof(Literal) * self->num_literals); + for (uint8_t j=0; j < self->num_literals; j++) { + codon[j] = Literal__from_char(token[j]); + } + + size_t expected = lineno - line_offset; + EMISSION_ID_T index = Literal__uint(self->num_literals, codon); + free(codon); + + if (index != expected) { + die("Unsupported order of oligomers found in %s:%lu: Expected %lu, got %u (%s)", filename, lineno+1, expected, index, token); + } + } else { + double prob; + sscanf(token, "%lf", &prob); + LogoddMatrix__set(self->values, lineno-line_offset, i-1, Logodd__log(prob)); + } + + token = strtok(NULL, DELIMITERS); + i++; + } + lineno++; + } + + fclose(file_descriptor); + + return true; +} + +/** + * Set individual entries in the EmissionTable. + * @param self emission table. + * @param sequence Literal sequence whose matches to all other sequences are set to logodd. + * @param logodd the logodd value. + * @return success boolean. + */ +bool EmissionTable__set(struct EmissionTable* self, Literal sequence[], LOGODD_T logodd) { + bool result = true; + // for each query, set logodd concerning + EMISSION_ID_T row = Literal__uint(self->num_literals, sequence); + for (uint8_t column=0; column < pow(4, self->num_literals); column++) { + result &= LogoddMatrix__set(self->values, column, row, logodd); + if (!result) { + break; + } + } + return result; +} + + +/** + * Set the emission of given literal sequence to zero probability. + * @param self emission table. + * @param sequence sequence of literals that will be forbidden. + */ +bool EmissionTable__forbid(struct EmissionTable* self, Literal sequence[]) { + return EmissionTable__set(self, sequence, LOGODD_NEGINF); +} + +/** + * Recursively check for Ns in query and replace them by A,C,T,G while recording those in visited. 
+ */
+void EmissionTable__variants(uint8_t num_literals, Literal query[num_literals], uint8_t position, bool visited[]) {
+    if (g_loglevel >= 6) {
+        // Literal__str writes num_literals characters plus the terminator
+        char tmp[num_literals + 1];
+        Literal__str(num_literals, query, tmp);
+        logv(7, "Variants? %s[%u/%u]", tmp, position, num_literals);
+    }
+
+    // all positions are resolved: record the resulting oligomer
+    if (position >= num_literals) {
+        visited[Literal__uint(num_literals, query)] = true;
+        return;
+    }
+
+    if (query[position] != LITERAL_N) {
+        EmissionTable__variants(num_literals, query, position+1, visited);
+        return;
+    }
+
+    // work on a copy so the caller's query keeps its N
+    Literal copy[num_literals];
+    for (uint8_t i=0; i < num_literals; i++) {
+        copy[i] = query[i];
+    }
+
+    for (Literal l=LITERAL_A; l < LITERAL_N; l++) {
+        copy[position] = l;
+        EmissionTable__variants(num_literals, copy, position+1, visited);
+    }
+}
+
+/**
+ * Look up an emission table for two arrays of literals.
+ * If reference or query contain N literals, the probabilities of all
+ * variants (see EmissionTable__variants) are averaged.
+ * @param self the emission table.
+ * @param reference will be used to look up the row.
+ * @param query will be used to look up the column.
+ * @return the log odd to emit query.
+ */
+LOGODD_T EmissionTable__by_literals(struct EmissionTable* self, Literal reference[], Literal query[]) {
+    uint8_t reference_Ns = Literal__Ns(self->num_literals, reference);
+    uint8_t query_Ns = Literal__Ns(self->num_literals, query);
+
+    if (query_Ns == 0 && reference_Ns == 0) {
+        EMISSION_ID_T row = Literal__uint(self->num_literals, reference);
+        EMISSION_ID_T column = Literal__uint(self->num_literals, query);
+
+        if (g_loglevel >= 6) {
+            char ref[self->num_literals + 1];
+            char qry[self->num_literals + 1];
+            Literal__str(self->num_literals, reference, ref);
+            Literal__str(self->num_literals, query, qry);
+            logv(7, " by literals: qry=%s=%i x ref=%s=%i", qry, column, ref, row);
+        }
+
+        return EmissionTable__get(self, column, row);
+    }
+
+    size_t num_rows = self->values->num_rows;
+    size_t num_columns = self->values->num_columns;
+
+    // neither variant set depends on the loops below: compute both once
+    bool* visited_reference = (bool*) SAFECALLOC(sizeof(bool), num_columns);
+    EmissionTable__variants(self->num_literals, reference, 0, visited_reference);
+    bool* visited_query = (bool*) SAFECALLOC(sizeof(bool), num_columns);
+    EmissionTable__variants(self->num_literals, query, 0, visited_query);
+
+    // size_t counters: the table may have 256 rows/columns
+    size_t reference_visits = 0;
+    double total_sum = 0;
+    for (size_t row = 0; row < num_rows; row++) {
+        if (!visited_reference[row]) {
+            continue;
+        }
+        reference_visits++;
+
+        size_t query_visits = 0;
+        double sum = 0;
+        for (size_t column = 0; column < num_columns; column++) {
+            if (!visited_query[column]) {
+                continue;
+            }
+
+            query_visits++;
+            sum += Logodd__exp(LogoddMatrix__get(self->values, column, row));
+            logv(7, "Visit: %02x", (unsigned) column);
+        }
+
+        logv(7, "query_visits=%zu\tsum=%f", query_visits, sum);
+
+        total_sum += sum / query_visits;
+    }
+
+    free(visited_query);
+    free(visited_reference);
+
+    logv(7, "reference_visits=%zu\ttotalsum=%f", reference_visits, total_sum);
+
+    return Logodd__log(total_sum / reference_visits);
+}
+
+/**
+ * Initialize an Emissiontable.
+ * Allocates a square table with 4^num_emissions rows and columns.
+ * Unknown distributions terminate the process (die).
+ * @param self the EmissionTable.
+ * @param num_emissions the number of literals per table entry.
+ * @param distribution the type of distribution.
+ * @return success boolean.
+ */
+bool EmissionTable__init(struct EmissionTable* self, EMISSION_ID_T num_emissions, Distribution distribution) {
+    self->distribution = distribution;
+    self->num_literals = num_emissions;
+
+    // initial value of every table entry
+    LOGODD_T value = LOGODD_NEGINF;
+
+    switch (distribution) {
+        case UNIFORM_DISTRIBUTION:
+            value = Logodd__log((LOGODD_T) pow(.25, num_emissions));
+            break;
+        case LAMBDA_DISTRIBUTION:
+            value = LOGODD_NEGINF;
+            break;
+        default:
+            die("Unknown distribution: %x", distribution);
+    }
+
+    size_t dimension = (size_t) pow(4, num_emissions);
+    self->values = LogoddMatrix__create(dimension, dimension, value);
+
+    return true;
+}
+
+/**
+ * Destroy an emission table's sub structure.
+ * Notice: This only frees memory that has been allocated during init, not the given pointer itself.
+ * @param self the EmissionTable to deconstruct.
+ * @return success boolean.
+ */
+bool EmissionTable__destroy(struct EmissionTable* self) {
+    LogoddMatrix__destroy(self->values);
+    return true;
+}
+
+/**
+ * Lookup function for the emission table.
+ * @param self emission table.
+ * @param column table column.
+ * @param row table row.
+ * @return emission log odd.
+ */
+LOGODD_T EmissionTable__get(struct EmissionTable* self, EMISSION_ID_T column, EMISSION_ID_T row) {
+    return LogoddMatrix__get(self->values, (size_t) column, (size_t) row);
+}
+
+/**
+ * Compose a string representation of the emission table.
+ * @param self emission table.
+ * @param buffer storage of the resulting string.
+ * @return success boolean.
+ */
+bool EmissionTable__str(struct EmissionTable* self, char buffer[]) {
+    return LogoddMatrix__str(self->values, buffer);
+}
+
+
+/**
+ * Check whether a row contains at least one entry other than LOGODD_NEGINF.
+ * @param self emission table.
+ * @param row the table row to inspect.
+ * @return true if any column of the row differs from LOGODD_NEGINF.
+ */
+bool EmissionTable__emittable(struct EmissionTable* self, EMISSION_ID_T row) {
+    // a size_t counter is required: an 8 bit counter never reaches the bound
+    // of a table with 256 columns
+    for (size_t column = 0; column < self->values->num_columns; column++) {
+        if (LogoddMatrix__get(self->values, column, (size_t) row) != LOGODD_NEGINF) {
+            return true;
+        }
+    }
+    return false;
+}
diff --git a/src/EmissionTable.h b/src/EmissionTable.h
new file mode 100644
index 0000000..d32b582
--- /dev/null
+++ b/src/EmissionTable.h
@@ -0,0 +1,33 @@
+/**
+ * EmissionTable definition.
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#ifndef EMISSIONTABLE_H_ +#define EMISSIONTABLE_H_ + +#include +#include + +#include "Logodd.h" +#include "Literal.h" +#include "Matrix.h" +#include "Distribution.h" + +typedef struct EmissionTable { + LogoddMatrix* values; + Distribution distribution; + uint8_t num_literals; +} EmissionTable; + + +bool EmissionTable__read(struct EmissionTable* self, char* filename); +LOGODD_T EmissionTable__by_literals(struct EmissionTable* self, Literal reference[], Literal query[]); +bool EmissionTable__init(struct EmissionTable* self, EMISSION_ID_T num_literals, Distribution distribution); +bool EmissionTable__destroy(struct EmissionTable* self); +bool EmissionTable__set(struct EmissionTable* self, Literal sequence[], LOGODD_T logodd); +bool EmissionTable__forbid(struct EmissionTable* self, Literal sequence[]); +LOGODD_T EmissionTable__get(struct EmissionTable* self, uint_fast8_t column, uint_fast8_t row); +bool EmissionTable__str(struct EmissionTable* self, char buffer[]); +bool EmissionTable__emittable(struct EmissionTable* self, EMISSION_ID_T reference); + +#endif // EMISSIONTABLE_H_ diff --git a/src/Fasta.c b/src/Fasta.c new file mode 100644 index 0000000..0991558 --- /dev/null +++ b/src/Fasta.c @@ -0,0 +1,329 @@ +/** + * Fasta parser implementation. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include +#include +#include + +#include "Logging.h" +#include "SafeAlloc.h" +#include "Literal.h" +#include "Params.h" + +#include "Fasta.h" + +#ifndef g_loglevel +#define g_loglevel 0 +#endif + +/** + * Initialize an empty Fasta struct. + * @param self a Fasta struct. + * @return success boolean. + */ +bool Fasta__init(struct Fasta* self) { + self->num_references = 0; + self->num_queries = 0; + self->queries = SAFEMALLOC(0); + self->references = SAFEMALLOC(0); + return self->references != NULL && self->queries != NULL; +} + + +/** + * Destroy a Fasta struct. + * @param self a Fasta struct. 
+ * @return success boolean.
+ */
+bool Fasta__destroy(struct Fasta* self) {
+    for(uint8_t i = 0; i < self->num_references; i++) {
+        Sequence__destroy(self->references[i]);
+    }
+    free(self->references);
+
+    for(uint8_t i = 0; i < self->num_queries; i++) {
+        Sequence__destroy(self->queries[i]);
+    }
+    free(self->queries);
+
+    return true;
+}
+
+
+/**
+ * Add a new reference sequence.
+ * The counter is a uint8_t, hence at most UINT8_MAX references can be stored.
+ * On failure the Fasta struct is left unchanged.
+ * @param self a Fasta struct.
+ * @param reference a reference sequence.
+ * @return success boolean; false if the limit is reached or the array could not grow.
+ */
+bool Fasta__add_reference(struct Fasta* self, struct Sequence* reference) {
+    // incrementing beyond UINT8_MAX would wrap to 0 and shrink the array
+    if (self->num_references == UINT8_MAX) {
+        return false;
+    }
+
+    // keep the old pointer until the new one is known to be valid
+    struct Sequence** grown = SAFEREALLOC(self->references, sizeof(struct Sequence*) * (self->num_references + 1));
+    if (grown == NULL) {
+        return false;
+    }
+
+    grown[self->num_references++] = reference;
+    self->references = grown;
+    return true;
+}
+
+
+/**
+ * Add a new query sequence.
+ * The counter is a uint8_t, hence at most UINT8_MAX queries can be stored.
+ * On failure the Fasta struct is left unchanged.
+ * @param self a Fasta struct.
+ * @param query a query sequence.
+ * @return success boolean; false if the limit is reached or the array could not grow.
+ */
+bool Fasta__add_query(struct Fasta* self, struct Sequence* query) {
+    if (self->num_queries == UINT8_MAX) {
+        return false;
+    }
+
+    struct Sequence** grown = SAFEREALLOC(self->queries, sizeof(struct Sequence*) * (self->num_queries + 1));
+    if (grown == NULL) {
+        return false;
+    }
+
+    grown[self->num_queries++] = query;
+    self->queries = grown;
+    return true;
+}
+
+
+/**
+ * Read a Fasta file.
+ * @param self a Fasta struct.
+ * @param filename
+ * @return success boolean.
+ */ +bool Fasta__read(struct Fasta* self, char* filename) { + FILE* file_descriptor = fopen(filename, "r"); + if (file_descriptor == NULL) { + die("Cannot open file: %s", filename); + } + + struct Sequence* sequence = SAFECALLOC(sizeof(Sequence), 1); + Sequence__init(sequence); + + uint8_t name_length = 0; + uint8_t acc_length = 0; + uint8_t do_length = 0; + uint8_t state = 0; + size_t lineno = 0; + bool reached_codons = false; + bool reached_queries = false; + + // initialize new sequence + while (1) { + char c = fgetc(file_descriptor); + + if (c == '\n') { + lineno++; + } + + logv(3, "Fasta-read-in sequence: %p\treferences: %u\tqueries: %u\tline: %lu\tstate: %u\tc: '%c'", sequence, self->num_references, self->num_queries, lineno, state, c); + switch (state) { + case 0: // awaiting '>' for sequence name + if (c == EOF) { + state = 3; + break; + } + + if (c == '#') { + reached_queries = true; + state = 13; + break; + } + + if (c == '>') { + state = 1; + break; + } + + break; + + case 1: // reading sequence name + if (c == '\t') { + sequence->name[name_length] = '\0'; + state = 11; + break; + } + if (c == '\n') { + sequence->name[name_length] = '\0'; + state = 2; + break; + } + sequence->name[name_length++] = c; + break; + + case 11: // reading acceptor profile file name + if (c == '\t') { + sequence->acceptor[acc_length] = '\0'; + acc_length = 0; + logv(3, "Full acceptor filename: %s", sequence->acceptor); + state = 12; + break; + } + sequence->acceptor[acc_length++] = c; + break; + + case 12: // reading donor profile file name + if (c == '\n') { + sequence->donor[do_length] = '\0'; + do_length = 0; + logv(3, "Full donor filename: %s", sequence->donor); + state = 2; + break; + } + sequence->donor[do_length++] = c; + break; + + case 13: // between references and query + if (c == '\n') { + state = 0; + break; + } + break; + + case 2: // read sequence + if (c == '\n') { + state = 0; + + if (sequence != NULL) { + // save current sequence + if (reached_queries) { + 
Fasta__add_query(self, sequence); + } else { + sequence->num_codons = floor(sequence->num_codon_bases / 3); + Fasta__add_reference(self, sequence); + } + } + + // initialize new sequence + sequence = SAFECALLOC(sizeof(Sequence), 1); + Sequence__init(sequence); + name_length = 0, + reached_codons = false; + + break; + } + // handle aligned fasta + if (c == ' ') { + sequence->num_align_spaces++; + break; // skip spaces + } + if (c == '-') { + // ignore deletes + break; + } + if (c == '|') { + sequence->start_split_length += sequence->num_codon_bases; + sequence->codons_offset = sequence->start_split_length; + sequence->end_split_length = 0; + sequence->num_codon_bases = 0; + reached_codons = false; + state = 21; + break; + } + if ('a' <= c && c <= 'z') { // lower case + // lower case indicate split codon + if(!reached_codons) { + sequence->codons_offset++; + } + if (sequence->length < 2) { + sequence->start_split_length++; // 3 = codon length + } else { + sequence->end_split_length++; + } + } else if ('A' <= c && c <= 'Z') { // upper case + reached_codons = true; + sequence->num_codon_bases++; + } + Sequence__append(sequence, Literal__from_char(c)); + break; + + case 21: // full codons following pipes + if (c == '\n') { + state = 0; + + if (sequence != NULL) { + // save current sequence + if (reached_queries) { + Fasta__add_query(self, sequence); + } else { + sequence->num_codons = floor(sequence->num_codon_bases / 3); + Fasta__add_reference(self, sequence); + } + } + + // initialize new sequence + sequence = SAFECALLOC(sizeof(Sequence), 1); + Sequence__init(sequence); + name_length = 0, + reached_codons = false; + + break; + } + if (c == '-') { + // ignore deletes + break; + } + if (c == '|') { + state = 22; + break; + } + reached_codons = true; + sequence->num_codon_bases++; + Sequence__append(sequence, Literal__from_char(c)); + break; + + case 22: // split codon donor end + if (c == '\n') { + state = 0; + + if (sequence != NULL) { + // save current sequence + if 
(reached_queries) { + Fasta__add_query(self, sequence); + } else { + sequence->num_codons = floor(sequence->num_codon_bases / 3); + Fasta__add_reference(self, sequence); + } + } + + // initialize new sequence + sequence = SAFECALLOC(sizeof(Sequence), 1); + Sequence__init(sequence); + name_length = 0, + reached_codons = false; + + break; + } + if (c == '-') { + // ignore deletes + break; + } + sequence->end_split_length++; + Sequence__append(sequence, Literal__from_char(c)); + break; + + case 3: // stop + if (sequence != NULL && !reached_queries) { + logv(1, "No separator line beginning with '#' found. Interpreting last sequence as query sequence."); + logv(3, "References: %u, Queries: %u", self->num_references, self->num_queries); + // remove from references, add to queries + Fasta__add_query(self, self->references[self->num_references-1]); + self->references[self->num_references-1] = NULL; + self->num_references--; + logv(3, "References: %u, Queries: %u", self->num_references, self->num_queries); + } + fclose(file_descriptor); + return true; + + default: + fclose(file_descriptor); + die("Unknown state: %u", state); + } // switch + } // while + + fclose(file_descriptor); + return false; +} diff --git a/src/Fasta.h b/src/Fasta.h new file mode 100644 index 0000000..cf578dc --- /dev/null +++ b/src/Fasta.h @@ -0,0 +1,27 @@ +/** + * Fasta parser declarations + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#ifndef FASTA_H_ +#define FASTA_H_ + +#include + +#include "Params.h" +#include "Sequence.h" + +typedef struct Fasta { + uint8_t num_references; + uint8_t num_queries; + struct Sequence** references; + struct Sequence** queries; +} Fasta; + +bool Fasta__init(struct Fasta* self); +bool Fasta__destroy(struct Fasta* self); +bool Fasta__read(struct Fasta* self, char* filename); +bool Fasta__add_reference(struct Fasta* self, struct Sequence* reference); +bool Fasta__add_query(struct Fasta* self, struct Sequence* query); + +#endif // FASTA_H_ diff --git a/src/HMM.c 
b/src/HMM.c
new file mode 100644
index 0000000..7e53afe
--- /dev/null
+++ b/src/HMM.c
@@ -0,0 +1,193 @@
+/**
+ * HMM implementation.
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "Logging.h"
+#include "SafeAlloc.h"
+
+#include "HMM.h"
+
+/**
+ * Create an empty Hidden Markov Model.
+ * The states are zero-initialized; start and end transitions are not.
+ * @param num_states the maximum number of states.
+ * @param num_starts the maximum number of possible first states.
+ * @param num_ends the maximum number of final states.
+ * @return pointer to the new HMM.
+ */
+struct HMM* HMM__create(size_t num_states, size_t num_starts, size_t num_ends) {
+    struct HMM* self = (struct HMM*) SAFEMALLOC(sizeof(struct HMM));
+
+    // every capacity has to match the size of its own array, HMM__set_start
+    // and HMM__set_end rely on it to stay within bounds
+    self->starts = (struct Transition*) SAFEMALLOC(sizeof(Transition) * num_starts);
+    self->num_starts = 0;
+    self->max_starts = num_starts;
+
+    self->ends = (struct Transition*) SAFEMALLOC(sizeof(Transition) * num_ends);
+    self->num_ends = 0;
+    self->max_ends = num_ends;
+
+    self->states = (struct State*) SAFECALLOC(num_states, sizeof(struct State));
+    self->num_states = 0;
+    self->max_states = num_states;
+
+    return self;
+}
+
+/**
+ * Remove HMM from memory.
+ * Frees the transition and state arrays allocated by HMM__create as well as
+ * the HMM itself.
+ * @param self a HMM.
+ * @return success boolean.
+ */
+bool HMM__destroy(struct HMM* self) {
+    free(self->starts);
+    free(self->ends);
+    free(self->states);
+    free(self);
+    return true;
+}
+
+/**
+ * Add state.
+ * Terminates the process (die) if all reserved states are in use.
+ * @param self a HMM.
+ * @return pointer to the next used state; its id equals its index.
+ */
+struct State* HMM__new_state(struct HMM* self) {
+    if (self->num_states >= self->max_states) {
+        die("More states requested than reserved: %zu of %zu", self->num_states+1, self->max_states);
+    }
+
+    STATE_ID_T id = self->num_states++;
+    self->states[id].id = id;
+
+    return &self->states[id];
+}
+
+/**
+ * Set state to be a start state.
+ * @param self a HMM.
+ * @param transition the new transition.
+ * @return success boolean.
+ */
+bool HMM__set_start(struct HMM* self, struct Transition transition) {
+    if (self->num_starts >= self->max_starts) {
+        return false;
+    }
+
+    self->starts[self->num_starts++] = transition;
+    return true;
+}
+
+/**
+ * Add a transition that leads to the end of the model.
+ * @param self a HMM.
+ * @param transition the new transition.
+ * @return success boolean; false if all reserved end transitions are in use.
+ */
+bool HMM__set_end(struct HMM* self, struct Transition transition) {
+    if (self->num_ends >= self->max_ends) {
+        return false;
+    }
+
+    self->ends[self->num_ends++] = transition;
+    return true;
+}
+
+/**
+ * Write a graphviz (dot) representation of a HMM.
+ * Notice: an origin state with an empty name gets the name "x" assigned.
+ * @param self a HMM.
+ * @param file an open stream the representation will be written to.
+ * @return success boolean; false if the stream reports a write error.
+ */
+bool HMM__dot(struct HMM* self, FILE * file) {
+    fprintf(file, "digraph {\n");
+    fprintf(file, " rankdir = \"LR\"\n");
+
+    // all counters are size_t like the counts they are compared with
+    for (size_t i=0; i < self->num_states; i++) {
+        struct State* state = &self->states[i];
+
+        for (size_t j=0; j < state->num_incoming; j++) {
+            struct Transition* incoming = &state->incoming[j];
+            struct State* origin = &self->states[incoming->origin];
+
+            if (strlen(origin->name) == 0) {
+                origin->name[0] = 'x';
+            }
+            fprintf(file, " %s_"SID" -> %s_"SID" [label=\"%le\"];\n", origin->name, origin->id, state->name, state->id, Logodd__exp(incoming->logodd));
+        }
+    }
+    for (size_t i=0; i < self->num_starts; i++) {
+        struct Transition t = self->starts[i];
+        struct State* state = &self->states[t.origin];
+
+        fprintf(file, " START -> %s_"SID" [label=\"%le\"];\n", state->name, state->id, Logodd__exp(t.logodd));
+    }
+    for (size_t i=0; i < self->num_ends; i++) {
+        struct Transition t = self->ends[i];
+        struct State* state = &self->states[t.origin];
+
+        fprintf(file, " %s_"SID" -> END [label=\"%le\"];\n", state->name, state->id, Logodd__exp(t.logodd));
+    }
+    fprintf(file, "}\n");
+
+    return ferror(file) == 0;
+}
+
+/**
+ * Normalize transitions
+ * @param hmm a HMM.
+ * @param left a state of which the outgoing transitions are to be normalized.
+ * @return success boolean.
+ */
+bool HMM__normalize(struct HMM* hmm, struct State* left) {
+    // capacity of the scratch arrays below
+    enum { MAX_OUTGOING = 15 };
+
+    // gather all outgoing edges that leave the left state
+    uint8_t num_outgoing = 0;
+    struct Transition* outgoing[MAX_OUTGOING];
+    struct State* rights[MAX_OUTGOING];
+
+    for (size_t j=0; j < hmm->num_states; j++) {
+        struct State* right = &hmm->states[j];
+
+        for (size_t k=0; k < right->num_incoming; k++) {
+            struct Transition* transition = &right->incoming[k];
+
+            if (transition->origin != left->id) {
+                continue;
+            }
+            if (num_outgoing >= MAX_OUTGOING) {
+                die("Too many outgoing transitions at state %s (limit: %d)", left->name, MAX_OUTGOING);
+            }
+            rights[num_outgoing] = right;
+            outgoing[num_outgoing++] = transition;
+        }  // O(+deg)
+    }  // O(+deg * S)
+    for (size_t k=0; k < hmm->num_ends; k++) {
+        struct Transition* transition = &hmm->ends[k];
+
+        if (transition->origin != left->id) {
+            continue;
+        }
+        if (num_outgoing >= MAX_OUTGOING) {
+            die("Too many outgoing transitions at state %s (limit: %d)", left->name, MAX_OUTGOING);
+        }
+        // NULL marks a transition to the end of the model
+        rights[num_outgoing] = NULL;
+        outgoing[num_outgoing++] = transition;
+    }
+
+    // perform sumexp(outgoings) (This is how yahmm does it.)
+    LOGODD_T total = LOGODD_NEGINF;
+    for (uint8_t j=0; j < num_outgoing; j++) {
+        total = Logodd__sumexp(total, outgoing[j]->logodd);
+    }
+
+    // log(1) = 0: the probabilities already sum up to one
+    if (total == 0.0) {
+        logv(5, "No normalization\t%s ==%E==> X", left->name, total);
+        return true;
+    }
+
+    // substract log(sumexp) from all outgoing transition log odds
+    for (uint8_t j=0; j < num_outgoing; j++) {
+        LOGODD_T old = outgoing[j]->logodd;
+        outgoing[j]->logodd = Logodd__add(outgoing[j]->logodd, -total);
+        if (rights[j] != NULL) {
+            logv(5, "Normalization:\t%s ==(%f to %f)==> %s", left->name, Logodd__exp(old), Logodd__exp(outgoing[j]->logodd), rights[j]->name);
+        } else {
+            logv(5, "Normalization:\t%s ==(%f to %f)==> END", left->name, Logodd__exp(old), Logodd__exp(outgoing[j]->logodd));
+        }
+    }
+    return true;
+} // O(+deg * S)
diff --git a/src/HMM.h b/src/HMM.h
new file mode 100644
index 0000000..126739c
--- /dev/null
+++ b/src/HMM.h
@@ -0,0 +1,38 @@
+/**
+ * HMM definition.
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#ifndef HMM_H_ +#define HMM_H_ + +#include +#include + +#include "State.h" +#include "Transition.h" + +#define HMM_MAX_STRING_LENGTH 4096 + +typedef struct HMM { + struct Transition* starts; + size_t num_starts; + size_t max_starts; + + struct Transition* ends; + size_t num_ends; + size_t max_ends; + + struct State* states; + size_t num_states; + size_t max_states; +} HMM; + +struct HMM* HMM__create(size_t states, size_t num_starts, size_t num_ends); +bool HMM__destroy(struct HMM* hmm); +struct State* HMM__new_state(struct HMM* self); +bool HMM__set_start(struct HMM* self, struct Transition transition); +bool HMM__set_end(struct HMM* self, struct Transition transition); +bool HMM__dot(struct HMM* self, FILE* file); +bool HMM__normalize(struct HMM* self, struct State* state); + +#endif // HMM_H_ diff --git a/src/Literal.c b/src/Literal.c new file mode 100644 index 0000000..1e53433 --- /dev/null +++ b/src/Literal.c @@ -0,0 +1,109 @@ +/** + * Literal definition. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#include "Logging.h" + +#include "Literal.h" + +/** + * Given a character, return its Literal representation. + * An invalid character causes this function to kill the process (die). + * @param c a character. + * @return the according Literal. + */ +Literal Literal__from_char(char c) { + switch (c) { + case 'a': + case 'A': + return LITERAL_A; + case 'c': + case 'C': + return LITERAL_C; + case 'g': + case 'G': + return LITERAL_G; + case 't': + case 'T': + return LITERAL_T; + case 'n': + case 'N': + return LITERAL_N; + default: + die("Unkown literal 0x%x=%c", c, c); + } + return LITERAL_N; +} + +/** + * Given a Literal, return its character representation. + * This function kills the process if the literal is unknown (die). + * @param literal a Literal. + * @return character representation. 
+ */
+char Literal__char(Literal literal) {
+    if (literal == LITERAL_A) {
+        return 'A';
+    }
+    if (literal == LITERAL_C) {
+        return 'C';
+    }
+    if (literal == LITERAL_G) {
+        return 'G';
+    }
+    if (literal == LITERAL_T) {
+        return 'T';
+    }
+    if (literal == LITERAL_N) {
+        return 'N';
+    }
+    die("Unknown literal 0x%x", literal);
+    return 'N';
+}
+
+/**
+ * Write the character representation of a Literal array into a string.
+ * @param length the number of Literals.
+ * @param literals the array of Literals.
+ * @param result buffer of at least length+1 characters; it is terminated with '\0'.
+ */
+void Literal__str(size_t length, Literal literals[length], char result[length+1]) {
+    size_t position = 0;
+    while (position < length) {
+        result[position] = Literal__char(literals[position]);
+        position++;
+    }
+    result[position] = '\0';
+}
+
+
+/**
+ * Pack an array of literals into an unsigned integer, two bits per literal,
+ * the first literal ending up in the most significant position.
+ * A warning is printed for every N literal; its value is added nonetheless.
+ * @param length size of the array.
+ * @param array the array of literals.
+ * @return the packed value.
+ */
+EMISSION_ID_T Literal__uint(uint8_t length, Literal array[length]) {
+    EMISSION_ID_T id = 0;
+
+    for (uint8_t i = 0; i < length; i++) {
+        Literal literal = array[i];
+        if (literal == LITERAL_N) {
+            warn("N literal found.");
+        }
+        id = (id << 2) + literal;
+    }
+
+    return id;
+}
+
+
+/**
+ * Count the N-Literals of an array.
+ * @param length length of the given array.
+ * @param array an array of Literals.
+ * @return the number of N-Literals.
+ */
+uint8_t Literal__Ns(uint8_t length, Literal array[length]) {
+    uint8_t count = 0;
+    for (uint8_t i = 0; i < length; i++) {
+        if (array[i] == LITERAL_N) {
+            count++;
+        }
+    }
+    return count;
+}
diff --git a/src/Literal.h b/src/Literal.h
new file mode 100644
index 0000000..45e3fcb
--- /dev/null
+++ b/src/Literal.h
@@ -0,0 +1,28 @@
+/**
+ * Literal definition.
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#ifndef LITERAL_H_ +#define LITERAL_H_ + +#include +#include + +#define EMISSION_ID_T uint_fast8_t + +typedef enum Literal { + LITERAL_A, + LITERAL_C, + LITERAL_G, + LITERAL_T, + LITERAL_N +} Literal; +#define NUM_LITERALS 4 + +Literal Literal__from_char(char c); +char Literal__char(Literal literal); +void Literal__str(size_t length, Literal literals[length], char buffer[length]); +EMISSION_ID_T Literal__uint(uint8_t length, Literal array[length]); +uint8_t Literal__Ns(uint8_t length, Literal array[length]); + +#endif // LITERAL_H_ diff --git a/src/Logging.h b/src/Logging.h new file mode 100644 index 0000000..bd039fe --- /dev/null +++ b/src/Logging.h @@ -0,0 +1,43 @@ +/** + * Logging macros. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#ifndef LOGGING_H_ +#define LOGGING_H_ + +#include +#include +#include + +// set loglevel +#ifndef LOGLEVEL +#if DEBUG +#define LOGLEVEL 5 +#else +#define LOGLEVEL 0 +#endif // DEBUG +#endif // LOGLEVEL + +#define FAIL_EXIT 1 + +extern char g_loglevel; + +#define warn(format, ...)\ + fprintf(stderr, "WARNING! %s:%d %s():\t" format "\n", __FILE__, __LINE__, __func__, ##__VA_ARGS__);\ + +/** logv prints to stderr if given level exceeds LOGLEVEL + * http://stackoverflow.com/questions/1644868/c-define-macro-for-debug-printing + */ +#define logv(level, format, ...) if (level <= g_loglevel)\ + fprintf(stderr, "VERBOSE%i %s:%d %s():\t" format "\n", level, __FILE__, __LINE__, __func__, ##__VA_ARGS__);\ + +/** + * Kill the program after printing some information to stderr. + */ +#define die(format, ...)\ + fprintf(stderr, "\x1B[31mCRITICAL %s:%d %s():\t\x1B[0m" format "\n", __FILE__, __LINE__, __func__, ##__VA_ARGS__); exit(FAIL_EXIT); + + + +#endif // LOGGING_H_ diff --git a/src/Logodd.c b/src/Logodd.c new file mode 100644 index 0000000..a9cd40b --- /dev/null +++ b/src/Logodd.c @@ -0,0 +1,104 @@ +/** + * Logodd operations. 
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+#include <math.h>
+
+#include "Logging.h"
+#include "Logodd.h"
+
+/**
+ * Export a value to log-space. (Natural logarithm)
+ * Hereby, the result will be LOGODD_NEGINF if the given value = 0.
+ * @param input a value
+ * @return the natural logarithm of input
+ */
+LOGODD_T Logodd__log(LOGODD_T input) {
+    if (input == 0) {
+        return LOGODD_NEGINF;
+    }
+    return log(input);
+}
+
+/**
+ * Import a value from log-space. (Natural logarithm)
+ * Hereby, the result will be 0 if the given value = LOGODD_NEGINF.
+ * @param input a value in log-space
+ * @return e raised to the power of input
+ */
+LOGODD_T Logodd__exp(LOGODD_T input) {
+    if (input == LOGODD_NEGINF) {
+        return 0;
+    }
+    return exp(input);
+}
+
+/**
+ * Compute the sum of two log odds.
+ * This method absorbs LOGODD_NEGINF, meaning that the sum of any value to
+ * LOGODD_NEGINF will result in LOGODD_NEGINF.
+ * NAN arguments terminate the process (die).
+ * @param a a log odd.
+ * @param b a log odd.
+ * @return the sum of a and b.
+ */
+LOGODD_T Logodd__add(LOGODD_T a, LOGODD_T b) {
+    if (isnan(a)) {
+        die("a is NAN");
+    } else if (isnan(b)) {
+        die("b is NAN");
+    }
+    if (a == LOGODD_NEGINF || b == LOGODD_NEGINF) {
+        return LOGODD_NEGINF;
+    }
+    return a + b;
+}
+
+/**
+ * Compute the sum of two probabilities given in log space.
+ * This method tries to avoid loosing precision due to conversion from log space.
+ * NAN arguments terminate the process (die).
+ * @param a a log odd.
+ * @param b a log odd.
+ * @return log(exp(a)+exp(b)).
+ */
+LOGODD_T Logodd__sumexp(LOGODD_T a, LOGODD_T b) {
+    if (isnan(a) || isnan(b)) {
+        die("Either a, b or both are NAN.");
+    }
+
+    // factor out the larger operand so the exponent is never positive
+    LOGODD_T larger = a > b ? a : b;
+    LOGODD_T smaller = a > b ? b : a;
+    if (smaller == LOGODD_NEGINF) {
+        return larger;
+    }
+
+    // log1p stays accurate where 1 + x would round to 1
+    return larger + log1p(Logodd__exp(smaller - larger));
+}
+
+
+/**
+ * Compute the subtraction of two probabilities given in log space.
+ * This method tries to avoid loosing precision due to conversion from log space.
+ * @param a a log odd. + * @param b a log odd. + * @return log(exp(a)-exp(b)). + */ +LOGODD_T Logodd__subexp(LOGODD_T a, LOGODD_T b) { + if (isnan(a) || isnan(b)) { + die("Either a, b or both are NAN."); + } + if (a > b) { + if (b == LOGODD_NEGINF) { + return a; + } + return a + Logodd__log(1 - Logodd__exp(b - a)); + } + if (a == LOGODD_NEGINF) { + return b; + } + return b + Logodd__log(1 - Logodd__exp(a - b)); +} diff --git a/src/Logodd.h b/src/Logodd.h new file mode 100644 index 0000000..2f00175 --- /dev/null +++ b/src/Logodd.h @@ -0,0 +1,23 @@ +/** + * Logodd definition. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#ifndef LOGODD_H_ +#define LOGODD_H_ + +#include +#include +#include + +//#define LOGODD_T long double +//#define LOGODD_NEGINF -LDBL_MAX +#define LOGODD_T double +#define LOGODD_NEGINF -DBL_MAX + +LOGODD_T Logodd__log(LOGODD_T input); +LOGODD_T Logodd__exp(LOGODD_T input); +LOGODD_T Logodd__add(LOGODD_T summand, LOGODD_T addend); +LOGODD_T Logodd__sumexp(LOGODD_T a, LOGODD_T b); +LOGODD_T Logodd__subexp(LOGODD_T a, LOGODD_T b); + +#endif // LOGODD_H_ diff --git a/src/Matrix.c b/src/Matrix.c new file mode 100644 index 0000000..e69f297 --- /dev/null +++ b/src/Matrix.c @@ -0,0 +1,260 @@ +/** + * LogoddLogoddMatrix implementation. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include +#include +#include + +#include "Stateid.h" +#include "Logodd.h" +#include "Logging.h" +#include "SafeAlloc.h" + +#include "Matrix.h" + +/** + * Create a PathMatrix. + * @param columns number of columns. + * @param rows number of rows. + * @param default_value value, the matrix will be filled with ab initio. + * @return pointer to the created matrix. 
+ */
+struct PathMatrix* PathMatrix__create(size_t columns, size_t rows, STATE_ID_T default_value) {
+    logv(2, "create(%lux%lu, "SID")", (unsigned long) columns, (unsigned long) rows, default_value);
+    PathMatrix* self = (struct PathMatrix*) SAFEMALLOC(sizeof(struct PathMatrix));
+
+    self->num_rows = rows;
+    self->num_columns = columns;
+
+    self->v = (STATE_ID_T*) SAFEMALLOC(sizeof(STATE_ID_T) * rows * columns);
+    for (size_t i=0; i < rows * columns; i++) {
+        self->v[i] = default_value;
+    }
+
+    return self;
+}
+
+/**
+ * Destroy a PathMatrix.
+ * Frees the values as well as the matrix itself.
+ * @param self a PathMatrix.
+ * @return success boolean.
+ */
+bool PathMatrix__destroy(struct PathMatrix* self) {
+    free(self->v);
+    free(self);
+    return true;
+}
+
+/**
+ * Set a value in the matrix.
+ * Coordinates outside of the matrix terminate the process (die).
+ * @param self the PathMatrix.
+ * @param column the first value of the coordinates.
+ * @param row the second value of the coordinates.
+ * @param value the value that will be written.
+ * @return success boolean.
+ */
+bool PathMatrix__set(struct PathMatrix* self, size_t column, size_t row, STATE_ID_T value) {
+    // same guard as in PathMatrix__get: never write outside of the allocation
+    if (column >= self->num_columns || row >= self->num_rows) {
+        die("Invalid matrix access: %zux%zu[%zu][%zu]", self->num_columns, self->num_rows, column, row);
+    }
+
+    self->v[self->num_rows * column + row] = value;
+    return true;
+}
+
+/**
+ * Get a value in the matrix.
+ * Coordinates outside of the matrix terminate the process (die).
+ * @param self the PathMatrix.
+ * @param column the first value of the coordinates.
+ * @param row the second value of the coordinates.
+ * @return the value.
+ */
+STATE_ID_T PathMatrix__get(struct PathMatrix* self, size_t column, size_t row) {
+    if (column >= self->num_columns || row >= self->num_rows) {
+        die("Invalid matrix access: %zux%zu[%zu][%zu]", self->num_columns, self->num_rows, column, row);
+    }
+
+    // values of one column are stored next to each other
+    return self->v[self->num_rows * column + row];
+}
+
+/**
+ * Compose a string representation for the PathMatrix.
+ * @param self a PathMatrix.
+ * @param buffer a pre-allocated string that will contain the resulting representation.
+ * @return success boolean.
+ */ +bool PathMatrix__str(struct PathMatrix* self, char* buffer) { + buffer[0] = '\0'; + char tmp[255] = ""; + for (size_t row=0; row < self->num_rows; row++) { + sprintf(tmp, "%lu\t", row); + strcat(buffer, tmp); + /* + if (row == 0 && self->num_rows > 1) { + strcat(buffer, "/ "); + } else if (row == self->num_rows-1 && self->num_rows > 1) { + strcat(buffer, "\\ "); + } else { + strcat(buffer, "[ "); + } + */ + + for (size_t column=0; column < self->num_columns; column++) { + STATE_ID_T id = PathMatrix__get(self, column, row); + if (id == STATE_MAX_ID) { + sprintf(tmp, "NA\t"); + } else { + sprintf(tmp, SID"\t", id); + } + strcat(buffer, tmp); + } + + /* + if (row == 0 && self->num_rows > 1) { + strcat(buffer, "\\"); + } else if (row == self->num_rows-1 && self->num_rows > 1) { + strcat(buffer, "/"); + } else { + strcat(buffer, "]"); + } + */ + strcat(buffer, "\n"); + } + + return true; +} + +/** + * Get the size of a PathMatrix in Bytes. + * @param self a PathMatrix. + * @return the number of Bytes used by this PathMatrix + */ +size_t PathMatrix__bytes(struct PathMatrix* self) { + return (self->num_rows * self->num_columns * sizeof(STATE_ID_T)); +} + + +/*****************************************/ + + +/** + * Create a LogoddMatrix. + * @param columns number of columns. + * @param rows number of rows. + * @param default_value the resulting matrix will be filled with this value. + * @return pointer to resulting matrix. + */ +struct LogoddMatrix* LogoddMatrix__create(size_t columns, size_t rows, LOGODD_T default_value) { + logv(2, "create(%lux%lu, %E)", (unsigned long) columns, (unsigned long) rows, default_value); + LogoddMatrix* self = (struct LogoddMatrix*) SAFEMALLOC(sizeof(struct LogoddMatrix)); + + self->num_rows = rows; + self->num_columns = columns; + + self->v = (LOGODD_T*) SAFEMALLOC(sizeof(LOGODD_T) * rows * columns); + for (size_t i=0; i < rows * columns; i++) { + self->v[i] = default_value; + } + + return self; +} + +/** + * Destroy a LogoddMatrix. 
+ * @param self a LogoddMatrix.
+ * @return success boolean.
+ */
+bool LogoddMatrix__destroy(LogoddMatrix* self) {
+  // Accept NULL like free() does instead of dereferencing it.
+  if (self == NULL) {
+    return true;
+  }
+  free(self->v);
+  free(self);
+  return true;
+}
+
+/**
+ * Set a value in the LogoddMatrix
+ * Cells are stored column by column: index = num_rows * column + row.
+ * Out-of-range coordinates are rejected via die(), exactly as in
+ * LogoddMatrix__get, instead of writing outside the cell array.
+ * @param self the LogoddMatrix.
+ * @param column the first value of the coordinates.
+ * @param row the second value of the coordinates.
+ * @param value the value that will be written.
+ * @return success boolean.
+ */
+bool LogoddMatrix__set(LogoddMatrix* self, size_t column, size_t row, LOGODD_T value) {
+  if (column >= self->num_columns || row >= self->num_rows) {
+    die("Invalid matrix access: %lux%lu[%lu][%lu]",
+        (unsigned long) self->num_columns, (unsigned long) self->num_rows,
+        (unsigned long) column, (unsigned long) row);
+    return false;
+  }
+
+  self->v[self->num_rows * column + row] = value;
+  return true;
+}
+
+/**
+ * Get a value from the LogoddMatrix
+ * Out-of-range coordinates are reported via die().
+ * @param self the LogoddMatrix.
+ * @param column the first value of the coordinates.
+ * @param row the second value of the coordinates.
+ * @return the value.
+ */
+LOGODD_T LogoddMatrix__get(LogoddMatrix* self, size_t column, size_t row) {
+  if (column >= self->num_columns || row >= self->num_rows) {
+    // size_t is cast because %lu expects unsigned long.
+    die("Invalid matrix access: %lux%lu[%lu][%lu]",
+        (unsigned long) self->num_columns, (unsigned long) self->num_rows,
+        (unsigned long) column, (unsigned long) row);
+  }
+
+  return self->v[self->num_rows * column + row];
+}
+
+/**
+ * Write string representation of a LogoddMatrix to buffer.
+ * @param self a LogoddMatrix.
+ * @param buffer pre-allocated string that will contain the resulting representation.
+ * @return success boolean.
+ */
+bool LogoddMatrix__str(LogoddMatrix* self, char* buffer) {
+  // NOTE: no size is passed in, so the caller must provide a buffer large
+  // enough for the whole table; nothing here can enforce a bound.
+  //
+  // Write at a moving cursor instead of strcat()-ing onto the start of the
+  // buffer, which rescanned everything written so far for every cell.
+  char* cursor = buffer;
+  *cursor = '\0';
+
+  if (self->num_columns * self->num_rows > 64*64) {
+    warn("You tried to print a very large table.");
+  }
+
+  for (size_t row=0; row < self->num_rows; row++) {
+    // Row label; size_t is cast because %lu expects unsigned long.
+    cursor += sprintf(cursor, "%lu\t", (unsigned long) row);
+
+    for (size_t column=0; column < self->num_columns; column++) {
+      LOGODD_T logodd = LogoddMatrix__get(self, column, row);
+      if (logodd == LOGODD_NEGINF) {
+        // LOGODD_NEGINF is rendered as "-inf" rather than as a number.
+        cursor += sprintf(cursor, "-inf\t");
+      } else {
+        cursor += sprintf(cursor, "%+E\t", logodd);
+      }
+    }
+
+    cursor += sprintf(cursor, "\n");
+  }
+
+  return true;
+}
+
+/**
+ * Get the size of a LogoddMatrix in Bytes.
+ * Only the cell array is counted, not the LogoddMatrix struct itself.
+ * @param self a LogoddMatrix.
+ * @return number of Bytes used by given LogoddMatrix.
+ */
+size_t LogoddMatrix__bytes(struct LogoddMatrix* self) {
+  return (self->num_rows * self->num_columns * sizeof(LOGODD_T));
+}
diff --git a/src/Matrix.h b/src/Matrix.h
new file mode 100644
index 0000000..6778217
--- /dev/null
+++ b/src/Matrix.h
@@ -0,0 +1,53 @@
+/**
+ * LogoddMatrix definition.
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#ifndef VMATRIX_H_ +#define VMATRIX_H_ + +#include + +#include "Logodd.h" +#include "State.h" + + +typedef struct PathMatrix { + size_t num_rows; + size_t num_columns; + STATE_ID_T* v; +} PathMatrix; + +struct PathMatrix* PathMatrix__create(size_t columns, size_t rows, STATE_ID_T default_value); + +bool PathMatrix__destroy(struct PathMatrix* self); + +bool PathMatrix__set(struct PathMatrix* self, size_t column, size_t row, STATE_ID_T value); + +STATE_ID_T PathMatrix__get(struct PathMatrix* self, size_t column, size_t row); + +bool PathMatrix__str(struct PathMatrix* self, char* buffer); + +size_t PathMatrix__bytes(struct PathMatrix* self); + +/******************************************/ + +typedef struct LogoddMatrix { + size_t num_rows; + size_t num_columns; + LOGODD_T* v; +} LogoddMatrix; + +struct LogoddMatrix* LogoddMatrix__create(size_t columns, size_t rows, LOGODD_T default_value); + +bool LogoddMatrix__destroy(struct LogoddMatrix* self); + +bool LogoddMatrix__set(struct LogoddMatrix* self, size_t column, size_t row, LOGODD_T value); + +LOGODD_T LogoddMatrix__get(struct LogoddMatrix* self, size_t column, size_t row); + +bool LogoddMatrix__str(struct LogoddMatrix* self, char buffer[]); + +size_t LogoddMatrix__bytes(struct LogoddMatrix* self); + + +#endif // VMATRIX_H_ diff --git a/src/Model.c b/src/Model.c new file mode 100644 index 0000000..f593c21 --- /dev/null +++ b/src/Model.c @@ -0,0 +1,364 @@ +/** + * Align an exon to a reference + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "Logging.h" +#include "SafeAlloc.h" +#include "Distribution.h" +#include "EmissionTable.h" +#include "HMM.h" +#include "State.h" +#include "Params.h" +#include "Profile.h" +#include "Fasta.h" + +#include "Model.h" + + +bool create_profile_chain(struct HMM* hmm, struct Profile* profile, struct State** start, struct State** end) { + struct 
State* previous_state = NULL; + for (int i=0; i < profile->length; i++) { + struct State* current_state = HMM__new_state(hmm); + + Literal reference_base[1]; + reference_base[0] = LITERAL_A; + + char name[STATE_NAME_LENGTH]; + sprintf(name, "match_%s", profile->name); + State__init(current_state, name, 1, reference_base, &profile->emission_tables[i]); + current_state->custom = i; + if (previous_state != NULL) { + State__add_incoming(previous_state, 0, current_state); + } + + if (i == 0) { + *start = current_state; + } else if (i >= profile->length-1) { + *end = current_state; + } + + previous_state = current_state; + } // Chain end. + assert(*start != NULL); + assert(*end != NULL); + return true; +} + + +bool forward_deletions(struct HMM* hmm, struct Params* params, size_t num_codons, struct State** codons) { + for (size_t i=0; i < num_codons; i++) { + for (uint8_t j=0; j < params->num_factors; j++) { + if (i+j+1 >= num_codons) { + break; + } + if (i == 0) { + State__add_incoming(codons[i], params->multiple_cd_factors[j] + params->cd_acc, codons[i+j+2]); + logv(5, "cd_acc: %e from: %lu to: %lu", Logodd__exp(params->multiple_cd_factors[j]+params->cd_acc), i, i+j+2); + } else if (num_codons == i+j+2) { + State__add_incoming(codons[i], params->multiple_cd_factors[j] + params->cd_do, codons[i+j+2]); + logv(5, "cd_do: %e from: %lu to: %lu", Logodd__exp(params->multiple_cd_factors[j]+params->cd_do), i, i+j+2); + } else { + State__add_incoming(codons[i], Logodd__log(params->multiple_cd_factors[j] * Logodd__exp(params->cd_logodd)), codons[i+j+2]); + } + } +#if !NONORMALIZE + HMM__normalize(hmm, codons[i]); +#endif // NONORMALIZE + } + return true; +} + +bool match_codon(struct HMM* hmm, Params* params, size_t index, Literal codon[3], bool last, struct State** curr_cluster) { + char codon_str[4] = ""; + Literal__str(3, codon, codon_str); + + /* + for (uint8_t i=0; i < params->num_stop_codons; i++) { + if (codon[0] == params->stop_codons[i*3] && + codon[0+1] == 
params->stop_codons[i*3+1] && + codon[0+2] == params->stop_codons[i*3+2]) { + die("Reference contains full stop codon %s: Codon %lu", codon_str, index); + } + } + */ + + // match_curr_codon + struct State* match_codon = HMM__new_state(hmm); + struct State* insert_codon_codon = HMM__new_state(hmm); + struct State* insert_1nt_codon = HMM__new_state(hmm); + struct State* delete_1nt_codon = HMM__new_state(hmm); + struct State* delete_2nt_codon = HMM__new_state(hmm); + struct State* next_cluster = HMM__new_state(hmm); + + State__init(match_codon, "match_codon", 3, codon, params->emission_table_64_LAMBDA); + match_codon->custom = index; + State__init_uniform(insert_codon_codon, "insert_codon_codon", 3, params->emission_table_61_LAMBDA); + insert_codon_codon->custom = index; + State__init_uniform(insert_1nt_codon, "insert_1nt_codon", 1, params->emission_table_4_UNIFORM); + insert_1nt_codon->custom = index; + State__init_uniform(delete_1nt_codon, "delete_1nt_codon", 2, params->emission_table_16_UNIFORM); + delete_1nt_codon->custom = index; + State__init_uniform(delete_2nt_codon, "delete_2nt_codon", 1, params->emission_table_4_UNIFORM); + delete_2nt_codon->custom = index; + State__init_silent(next_cluster, "end_codon"); + next_cluster->custom = index; + + State__add_incoming(*curr_cluster, params->js_c1, match_codon); + + State__add_incoming(match_codon, params->fs_logodd, insert_1nt_codon); + State__add_incoming(insert_1nt_codon, params->nti_js, next_cluster); + if (last) { + State__add_incoming(*curr_cluster, params->cd_do, next_cluster); + State__add_incoming(match_codon, params->c3_i1_do, insert_codon_codon); + State__add_incoming(insert_codon_codon, params->i3_i1_do, insert_codon_codon); + State__add_incoming(insert_codon_codon, params->i3_js_do, next_cluster); + } else { + if (index == 0) { + State__add_incoming(*curr_cluster, params->cd_acc, next_cluster); + } else { + State__add_incoming(*curr_cluster, params->js_js, next_cluster); + } + 
State__add_incoming(match_codon, params->c3_i1, insert_codon_codon); + State__add_incoming(insert_codon_codon, params->i3_i1, insert_codon_codon); + State__add_incoming(insert_codon_codon, params->i3_js, next_cluster); + } + State__add_incoming(insert_1nt_codon, params->nti_nti, insert_1nt_codon); + + State__add_incoming(match_codon, params->c3_js, next_cluster); + + + + State__add_incoming(*curr_cluster, params->fs_logodd, delete_1nt_codon); + State__add_incoming(*curr_cluster, params->fs_logodd, delete_2nt_codon); + State__add_incoming(delete_1nt_codon, 0, next_cluster); + State__add_incoming(delete_2nt_codon, 0, next_cluster); + + *curr_cluster = next_cluster; + return true; +} + + +bool create_codon_chain(struct HMM* hmm, struct Params* params, size_t* num_codons, struct State** codons, size_t num_literals, Literal* reference, struct State* start, struct State** end) { + if (num_literals % 3 != 0) { + die("A reference is out of frame by %lu nt.", num_literals % 3); + } + + Literal codon[3]; + codons[0] = start; + *num_codons = 0; + + struct State* curr_cluster = start; + for (size_t i=0; i+2 < num_literals; i+=3) { + codon[0] = reference[i]; + codon[1] = reference[i+1]; + codon[2] = reference[i+2]; + //logv(8, "Cluster i=%lu\tnum_codons=%lu\tcodon=%x%x%x", i, (*num_codons)+1, codon[0], codon[1], codon[2]); + // Create a cluster for this codon. 
+ match_codon(hmm, params, *num_codons, codon, i+3 >= num_literals, &curr_cluster); + codons[1 + (*num_codons)++] = curr_cluster; + } + *end = curr_cluster; + return true; +} + + +struct HMM* multi_exon(struct Params* params, struct Fasta* fasta, struct Profile** acceptors, struct Profile** donors) { + // read profiles and count states + size_t num_states = 0; + for (uint8_t i=0; i < fasta->num_references; i++) { + struct Sequence* reference = fasta->references[i]; + + num_states += 6 + 6 * reference->num_codons + 1 + 2 + 2; + + num_states += acceptors[i]->length; + num_states += donors[i]->length; + } + + struct HMM* hmm = HMM__create(num_states, 3, 3); + Params__recalculate(params); + + uint8_t num_split_codons = 0; + struct State** split_codons = (struct State**) SAFEMALLOC(sizeof(struct State*) * fasta->num_references * 2); + struct State* former_intron = NULL; + struct State* first_intron = NULL; + + logv(1, "There are %i references.", fasta->num_references); + for (uint8_t i=0; i < fasta->num_references; i++) { + struct Sequence* reference = fasta->references[i]; + + // intron start + struct State* intron_start; + if (former_intron == NULL) { + intron_start = HMM__new_state(hmm); + State__init_uniform(intron_start, "intron_start", 1, params->emission_table_4_UNIFORM); + State__add_incoming(intron_start, params->b2_b2, intron_start); + first_intron = intron_start; + } else { + intron_start = former_intron; + } + + // acceptor + struct State* match_acceptor = NULL; + struct State* match_acceptor_end = NULL; + create_profile_chain(hmm, acceptors[i], &match_acceptor, &match_acceptor_end); + State__add_incoming(intron_start, params->b2_acc, match_acceptor); + + struct State* between_acc_split = HMM__new_state(hmm); + State__init_silent(between_acc_split, "between_acc_split"); + split_codons[num_split_codons++] = between_acc_split; + State__add_incoming(intron_start, params->b2_bas, between_acc_split); + State__add_incoming(match_acceptor_end, 0, 
between_acc_split); + + logv(1, "reference->start_split_length: %u", reference->start_split_length); + struct State* split_codon_acceptor = HMM__new_state(hmm); + switch (reference->start_split_length) { + case 0: + State__init_silent(split_codon_acceptor, "split_codon_acceptor"); + break; + case 1: + State__init_uniform(split_codon_acceptor, "split_codon_acceptor", 1, params->emission_table_4_UNIFORM); + break; + case 2: + State__init_uniform(split_codon_acceptor, "split_codon_acceptor", 2, params->emission_table_16_UNIFORM); + break; + default: + die("Invalid number of split codon nucleotides: %u", reference->start_split_length); + } + State__add_incoming(between_acc_split, Logodd__log(1.0 - Logodd__exp(params->fs_logodd)), split_codon_acceptor); + + struct State* insert_1nt_acceptor = HMM__new_state(hmm); + State__init_uniform(insert_1nt_acceptor, "insert_1nt_acceptor", 1, params->emission_table_4_UNIFORM); + State__add_incoming(insert_1nt_acceptor, params->nti_nti, insert_1nt_acceptor); + + struct State* insert_codon_acceptor = HMM__new_state(hmm); + State__init_uniform(insert_codon_acceptor, "insert_codon_acceptor", 3, params->emission_table_61_LAMBDA); + State__add_incoming(insert_codon_acceptor, params->i3_i1_acc, insert_codon_acceptor); + + struct State* between_split_ins = HMM__new_state(hmm); + State__init_silent(between_split_ins, "between_split_ins"); + State__add_incoming(split_codon_acceptor, 0.0, between_split_ins); + State__add_incoming(between_acc_split, params->fs_logodd, between_split_ins); + State__add_incoming(between_split_ins, params->splice_nti, insert_1nt_acceptor); + State__add_incoming(between_split_ins, params->splice_i1, insert_codon_acceptor); + + // codons + struct State* start_first_codon = HMM__new_state(hmm); + State__init_silent(start_first_codon, "start_first_codon"); + State__add_incoming(between_split_ins, params->splice_js, start_first_codon); + State__add_incoming(insert_1nt_acceptor, params->nti_js, start_first_codon); + 
State__add_incoming(insert_codon_acceptor, params->i3_js_acc, start_first_codon); + + size_t num_codons = 0; + struct State* end_last_codon = NULL; + struct State** codons = (struct State**) SAFEMALLOC(sizeof(struct State*) * (2 + reference->num_codons)); + + // Iterate over Codons + create_codon_chain(hmm, params, &num_codons, codons, reference->num_codon_bases, &reference->sequence[reference->codons_offset], start_first_codon, &end_last_codon); + + logv(1, "Codons:\t%lu", num_codons); + assert(num_codons == reference->num_codons); + + forward_deletions(hmm, params, num_codons, codons); + + free(codons); + + // donor + logv(1, "reference->end_split_length: %u", reference->end_split_length); + struct State* split_codon_donor = HMM__new_state(hmm); + switch (reference->end_split_length) { + case 0: + State__init_silent(split_codon_donor, "split_codon_donor"); + break; + case 1: + State__init_uniform(split_codon_donor, "split_codon_donor", 1, params->emission_table_4_UNIFORM); + break; + case 2: + State__init_uniform(split_codon_donor, "split_codon_donor", 2, params->emission_table_16_UNIFORM); + break; + default: + die("Invalid number of split codon nucleotides in file %s: %u", params->fasta_file, params->split_emissions_donor); + } + State__add_incoming(end_last_codon, params->js_scd, split_codon_donor); + + struct State* between_split_donor = HMM__new_state(hmm); + State__init_silent(between_split_donor, "between_split_donor"); + split_codons[num_split_codons++] = between_split_donor; + + State__add_incoming(end_last_codon, params->fs_logodd, between_split_donor); + State__add_incoming(split_codon_donor, 0., between_split_donor); + + struct State* match_donor = NULL; + struct State* match_donor_end = NULL; + create_profile_chain(hmm, donors[i], &match_donor, &match_donor_end); + if (i+1 == fasta->num_references) { + State__add_incoming(between_split_donor, params->bsd_do, match_donor); + } else { + State__add_incoming(between_split_donor, params->bsd_do_id, 
match_donor); + } + + // intron end + struct State* intron_end = HMM__new_state(hmm); + State__init_uniform(intron_end, "intron_end", 1, params->emission_table_4_UNIFORM); + State__add_incoming(intron_end, params->e1_e1, intron_end); + State__add_incoming(between_split_donor, params->skip_do, intron_end); + State__add_incoming(match_donor_end, params->do2_e1, intron_end); + former_intron = intron_end; + + struct Transition t; + + // where to start + if (i == 0) { + t.origin=first_intron->id; + t.logodd=params->b1_b2; + HMM__set_start(hmm, t); + + t.origin=match_acceptor->id; + t.logodd=params->b1_acc; + HMM__set_start(hmm, t); + + t.origin=between_acc_split->id; + t.logodd=params->b1_bas; + HMM__set_start(hmm, t); + } + + // where to end + if (i+1 == fasta->num_references) { + t.origin = intron_end->id; + t.logodd = params->e1_e2; + HMM__set_end(hmm, t); + + t.origin = between_split_donor->id; + t.logodd = params->bsd_e2; + HMM__set_end(hmm, t); + + t.origin = match_donor_end->id; + t.logodd = params->do2_e2; + HMM__set_end(hmm, t); + } + + if(params->dirty) { + for(STATE_ID_T i=0; i < hmm->num_states; i++) { + HMM__normalize(hmm, &hmm->states[i]); + } + } + } // end for + +#if !NONORMALIZE + for (uint8_t i=1; i+1 < num_split_codons; i+=2) { + State__add_incoming(split_codons[i], params->intron_del, split_codons[i+1]); + HMM__normalize(hmm, split_codons[i]); + } +#endif + free(split_codons); + + return hmm; +} diff --git a/src/Model.h b/src/Model.h new file mode 100644 index 0000000..56cdfbf --- /dev/null +++ b/src/Model.h @@ -0,0 +1,16 @@ +/** + * Model construction. 
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#ifndef MODEL_H_ +#define MODEL_H_ + +#include "Literal.h" +#include "Profile.h" +#include "Fasta.h" +#include "HMM.h" + +struct HMM* multi_exon(struct Params* params, struct Fasta* fasta, struct Profile** acceptors, struct Profile** donors); + +#endif // MODEL_H_ diff --git a/src/Params.c b/src/Params.c new file mode 100644 index 0000000..4925af6 --- /dev/null +++ b/src/Params.c @@ -0,0 +1,446 @@ +/** + * Param implementation. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#include +#include +#include + +#include "EmissionTable.h" +#include "Logging.h" +#include "SafeAlloc.h" + +#include "Params.h" + +#define MEMORYLIMIT 16 + +bool Params__create(struct Params* self, struct EmissionTable emission_tables[6]) { + self->dirty = false; + self->firstexon = false; + self->lastexon = false; + self->max_memory = MEMORYLIMIT; + + self->num_stop_codons = 3; + self->stop_codons = (Literal*) SAFEMALLOC(sizeof(Literal) * 3 * self->num_stop_codons); + self->stop_codons[0] = LITERAL_T; + self->stop_codons[1] = LITERAL_G; + self->stop_codons[2] = LITERAL_A; + + self->stop_codons[3] = LITERAL_T; + self->stop_codons[4] = LITERAL_A; + self->stop_codons[5] = LITERAL_A; + + self->stop_codons[6] = LITERAL_T; + self->stop_codons[7] = LITERAL_A; + self->stop_codons[8] = LITERAL_G; + + self->split_emissions_acceptor = 99; + self->split_emissions_donor = 99; + + self->emission_table_4_UNIFORM = &emission_tables[0]; + self->emission_table_16_UNIFORM = &emission_tables[1]; + self->emission_table_64_UNIFORM = &emission_tables[2]; + self->emission_table_64_LAMBDA = &emission_tables[3]; + self->emission_table_61_LAMBDA = &emission_tables[4]; + self->emission_table_61_UNIFORM = &emission_tables[5]; + + //strncpy(self->matrices_path_prefix, "CESAR/matrices", PATH_STRING_LENGTH-100); + strncpy(self->dot, "\0", PATH_STRING_LENGTH-1); + strncpy(self->clade, "human", PATH_STRING_LENGTH-1); + + //strncpy(self->blosum_file, 
"blosum_freq62", PATH_STRING_LENGTH-1); + + strncpy(self->eth_file, "extra/tables/%s/eth_codon_sub.txt", PATH_STRING_LENGTH-1); + strncpy(self->acc_profile, "extra/tables/%s/acc_profile.txt", PATH_STRING_LENGTH-1); + strncpy(self->first_codon_profile, "extra/tables/%s/firstCodon_profile.txt", PATH_STRING_LENGTH-1); + strncpy(self->do_profile, "extra/tables/%s/do_profile.txt", PATH_STRING_LENGTH-1); + strncpy(self->last_codon_profile, "extra/tables/%s/lastCodon_profile.txt", PATH_STRING_LENGTH-1); + strncpy(self->u12_acc_profile, "extra/tables/%s/u12_acc_profile.txt", PATH_STRING_LENGTH-1); + strncpy(self->u12_donor_profile, "extra/tables/%s/u12_donor_profile.txt", PATH_STRING_LENGTH-1); + + EmissionTable__init(self->emission_table_4_UNIFORM, 1, UNIFORM_DISTRIBUTION); + EmissionTable__init(self->emission_table_16_UNIFORM, 2, UNIFORM_DISTRIBUTION); + EmissionTable__init(self->emission_table_64_UNIFORM, 3, UNIFORM_DISTRIBUTION); + EmissionTable__init(self->emission_table_64_LAMBDA, 3, LAMBDA_DISTRIBUTION); + EmissionTable__init(self->emission_table_61_LAMBDA, 3, LAMBDA_DISTRIBUTION); + EmissionTable__init(self->emission_table_61_UNIFORM, 3, UNIFORM_DISTRIBUTION); + + self->no_leading_introns_prob = 0.5; + self->no_trailing_introns_prob = 0.49; + + // transition probabilities that are currently 1.0: + self->acc_acc = 0; + self->acc2_ii = 0; + self->do_do = 0; + self->ii1_ii2 = 0; + self->i1_i2 = 0; + self->i2_i3 = 0; + self->c2_c3 = 0; + + self->stop_codon_emission_logodd = Logodd__log(0.00019625991262287552); // based on 1704/8,682,364 + + long double multiple_cd_factors[] = { + 0.432, // delete 2 codons at once = 0.432 + 1cd_logodd + 0.276, // delete 3 codons at once ...; + 0.208, // 4; + 0.164, // ... + 0.147, + 0.133, + 0.127, + 0.123, + 0.118 // delete 10 codons at once ... 
+ }; + self->num_factors = 9; + memcpy(self->multiple_cd_factors, multiple_cd_factors, sizeof(long double) * self->num_factors); + long double sum_multiple_cd_factors = 0; + for (uint8_t i=0; i < self->num_factors; i++) { + sum_multiple_cd_factors += self->multiple_cd_factors[i]; + } + + self->fs_logodd = Logodd__log(0.0001); + self->ci_logodd = Logodd__log(0.01); + self->ci2_logodd = Logodd__log(0.2); + + self->skip_acc = self->fs_logodd; + self->skip_do = self->fs_logodd; + + self->splice_i1 = self->ci_logodd; + self->splice_nti = self->fs_logodd; + self->nti_nti_logodd = Logodd__log(0.25); + self->total_cd_logodd = Logodd__log(0.025); + + // factors for splice site + self->cd_logodd = self->total_cd_logodd - Logodd__log(1.0 + sum_multiple_cd_factors); + self->cd_acc = Logodd__log(0.012); + self->cd_do = Logodd__log(0.012); + + self->c3_i1 = self->ci_logodd; + self->c3_i1_do = self->ci_logodd; + self->i3_i1 = self->ci2_logodd; + self->i3_i1_acc = Logodd__log(0.4); + self->i3_i1_do = Logodd__log(0.4); + + self->c3_js = self->ci2_logodd; + + self->i3_js = Logodd__subexp(0., self->ci2_logodd); + + self->nti_nti = self->nti_nti_logodd; + self->nti_js = Logodd__subexp(0., self->nti_nti); + + self->e1_e1 = Logodd__log(0.9); // TODO test more thoroughly. + // only matters with long introns + + self->b2_b2 = Logodd__log(0.9); // TODO test more thoroughly. 
+ // only matters with long introns + + self->no_leading_introns_logodd = Logodd__log(self->no_leading_introns_prob); + self->no_trailing_introns_logodd = Logodd__log(self->no_trailing_introns_prob); + + self->bsd_e2 = self->fs_logodd; + self->b1_bas = self->fs_logodd; + self->b2_bas = self->fs_logodd; + return true; +} + + +bool Params__set_paths(struct Params* self) { + char tmp[PATH_STRING_LENGTH]; + sprintf(tmp, self->eth_file, self->clade); + strncpy(self->eth_file, tmp, PATH_STRING_LENGTH); + sprintf(tmp, self->acc_profile, self->clade); + strncpy(self->acc_profile, tmp, PATH_STRING_LENGTH); + sprintf(tmp, self->first_codon_profile, self->clade); + strncpy(self->first_codon_profile, tmp, PATH_STRING_LENGTH); + sprintf(tmp, self->do_profile, self->clade); + strncpy(self->do_profile, tmp, PATH_STRING_LENGTH); + sprintf(tmp, self->last_codon_profile, self->clade); + strncpy(self->last_codon_profile, tmp, PATH_STRING_LENGTH); + sprintf(tmp, self->u12_acc_profile, self->clade); + strncpy(self->u12_acc_profile, tmp, PATH_STRING_LENGTH); + sprintf(tmp, self->u12_donor_profile, self->clade); + strncpy(self->u12_donor_profile, tmp, PATH_STRING_LENGTH); + + /* + logv(1, "fasta_file:\t%s", self->fasta_file); + logv(1, "eth_file:\t%s", self->eth_file); + logv(1, "acc_profile:\t%s", self->acc_profile); + logv(1, "first_codon_profile:\t%s", self->first_codon_profile); + logv(1, "do_profile:\t%s", self->do_profile); + logv(1, "last_codon_profile:\t%s", self->last_codon_profile); + logv(1, "u12_acc_profile:\t%s", self->u12_acc_profile); + logv(1, "u12_donor_profile:\t%s", self->u12_donor_profile); + */ + + return true; +} + + +/** Inserts are like matches against random reference sequences. + * + * In other words, inserts are treated like a match for any reference and have + * the same probability for every reference. However each query has a different + * probability that depends on the substitution matrix. 
+ * + * We use the substitution matrix to calculate the insert matrix. For that we + * need two prerequisites: + * 1. the sum of all probabilities *scores_sum* to normalize the values. + * 2. the sum of all stop codon emission probabilites *fix_factor* to + * re-adjust all non-stop codon probs. + * Afterwards (3.) we will remove the stop codons by setting all their + * emission probabilites to zero while we slightly increase all other values + * by the factor (1+fix_factor) to make sure, those sum up to one again. + * + **/ +bool Params__make_insert_table(struct Params* self, struct EmissionTable* table) { + + // 1. get the total sum of all probabilities + LOGODD_T scores_sum = LOGODD_NEGINF, stop_codon_sum = LOGODD_NEGINF; + for (uint8_t query=0; query < table->values->num_columns; query++) { + for (uint8_t reference=0; reference < table->values->num_rows; reference++) { + scores_sum = Logodd__sumexp(scores_sum, EmissionTable__get(table, query, reference)); + } + } + + // 2. get the sum of normalized stop codon emission probs + for (uint8_t i=0; i < self->num_stop_codons; i++) { + LOGODD_T sum_raw_insertion_score = LOGODD_NEGINF; + for (uint8_t query=0; query < table->values->num_columns; query++) { + sum_raw_insertion_score = Logodd__sumexp(sum_raw_insertion_score, EmissionTable__get(table, query, Literal__uint(3, &self->stop_codons[i*3]))); + } + stop_codon_sum = Logodd__sumexp(stop_codon_sum, sum_raw_insertion_score); + } + stop_codon_sum = Logodd__add(stop_codon_sum, -scores_sum); + + // 3. 
set (query x reference) + LOGODD_T fix_factor = stop_codon_sum/1.0; + for (uint8_t query=0; query < table->values->num_columns; query++) { + + // handle stop_codons + bool is_stop_codon = false; + for (uint8_t i=0; i < self->num_stop_codons; i++) { + uint8_t stop_codon_id = Literal__uint(3, &self->stop_codons[i*3]); + is_stop_codon = query == stop_codon_id; + if (is_stop_codon) { + LogoddMatrix__set(table->values, query, stop_codon_id, LOGODD_NEGINF); + break; + } + } + if (is_stop_codon) { + continue; + } + + // handle all other + LOGODD_T sum_raw_insertion_score = LOGODD_NEGINF; + for (uint8_t reference=0; reference < table->values->num_rows; reference++) { + sum_raw_insertion_score = Logodd__sumexp(sum_raw_insertion_score, EmissionTable__get(table, query, reference)); + } + + double normalized = sum_raw_insertion_score + Logodd__sumexp(0, fix_factor) - scores_sum; + + for (uint8_t reference=0; reference < table->values->num_rows; reference++) { + LogoddMatrix__set(table->values, query, reference, normalized); + } + } + + return true; +} + + +bool Params__recalculate(struct Params* self) { + long double sum_multiple_cd_factors = 0; + for (uint8_t i=0; i < self->num_factors; i++) { + sum_multiple_cd_factors += self->multiple_cd_factors[i]; + } + + EmissionTable__read(self->emission_table_64_LAMBDA, self->eth_file); + + self->cd_logodd = self->total_cd_logodd - Logodd__log(1.0 + sum_multiple_cd_factors); + self->splice_js = Logodd__subexp(Logodd__subexp(0, self->fs_logodd), self->splice_i1); + + self->js_js = self->cd_logodd; + self->js_c1 = Logodd__subexp(Logodd__subexp(0, Logodd__sumexp(self->fs_logodd, self->fs_logodd)), self->total_cd_logodd); + self->bas_sca = Logodd__subexp(0, self->fs_logodd); + + self->nti_nti = self->nti_nti_logodd; + self->nti_js = Logodd__subexp(0, self->nti_nti); + + self->c3_js = Logodd__subexp(Logodd__subexp(0, self->fs_logodd), self->c3_i1); + + self->i3_i1 = self->ci2_logodd; + self->i3_js = Logodd__subexp(0, self->ci2_logodd); + 
self->i3_js_do = Logodd__subexp(0, self->i3_i1_do); + self->i3_js_acc = Logodd__subexp(0, self->i3_i1_acc); + + self->b2_acc = Logodd__subexp(Logodd__subexp(0, self->b2_b2), self->skip_acc); + self->b1_b2 = Logodd__add(Logodd__subexp(0, self->b1_bas), self->no_leading_introns_logodd); + self->b1_acc = Logodd__add(Logodd__subexp(0, self->b1_bas), self->no_leading_introns_logodd); + + self->js_scd = Logodd__subexp(0, self->fs_logodd); + self->bsd_do = Logodd__subexp(Logodd__subexp(0, self->skip_do), self->bsd_e2); + self->bsd_do_id = Logodd__subexp(Logodd__subexp(Logodd__subexp(0, self->skip_do), self->bsd_e2), self->intron_del); + self->do2_e1 = Logodd__log(1.0 - self->no_trailing_introns_prob); + self->do2_e2 = self->no_trailing_introns_logodd; + + self->e1_e2 = Logodd__subexp(0, self->e1_e1); + + // assign substitutions to non-/stop codons + for (uint8_t this=0; this < self->num_stop_codons; this++) { + uint8_t row = Literal__uint(3, &self->stop_codons[this*3]); + + for (uint8_t column=0; column < self->emission_table_61_UNIFORM->values->num_columns; column++) { + bool overwrite = true; + for (uint8_t other=0; overwrite && other < self->num_stop_codons; other++) { + if (column == Literal__uint(3, &self->stop_codons[other*3])) { + overwrite = false; + } + } + + LOGODD_T logodd = LogoddMatrix__get(self->emission_table_61_UNIFORM->values, column, row); + LOGODD_T new_value = self->stop_codon_emission_logodd - Logodd__log(3); + if (overwrite) { + new_value = Logodd__subexp(0, Logodd__exp(self->stop_codon_emission_logodd)) + logodd; + } + logv(7, "(%ix%i)\t%f -> %f", column, row, Logodd__exp(logodd), Logodd__exp(new_value)); + LogoddMatrix__set(self->emission_table_61_UNIFORM->values, column, row, new_value); + } + } + + EmissionTable__read(self->emission_table_61_LAMBDA, self->eth_file); + Params__make_insert_table(self, self->emission_table_61_LAMBDA); + + if (g_loglevel > 3) { + FILE* matrix_log = fopen("cesar_matrix.log", "a"); + char tmp[1024000] = ""; + 
EmissionTable__str(self->emission_table_64_LAMBDA, tmp); + fprintf(matrix_log, "64Lambda:\n%s", tmp); + EmissionTable__str(self->emission_table_61_LAMBDA, tmp); + fprintf(matrix_log, "61Lambda:\n%s", tmp); + EmissionTable__str(self->emission_table_64_UNIFORM, tmp); + fprintf(matrix_log, "64Uniform:\n%s", tmp); + EmissionTable__str(self->emission_table_61_UNIFORM, tmp); + fprintf(matrix_log, "61Uniform:\n%s", tmp); + fclose(matrix_log); + } + + return true; +} + +bool Params__set_via_str(struct Params* self, char* string, char* value) { + #define STRING_LENGTH 10 + const void* STRING_DICT[STRING_LENGTH][2] = { + {"clade", &self->clade}, + {"eth_file", &self->eth_file}, + {"acc_profile", &self->acc_profile}, + {"do_profile", &self->do_profile}, + {"first_codon_profile", &self->first_codon_profile}, + {"last_codon_profile", &self->last_codon_profile}, + {"u12_acc_profile", &self->u12_acc_profile}, + {"u12_donor_profile", &self->u12_donor_profile}, + {"fasta_file", &self->fasta_file}, + {"dot", &self->dot} + }; + #define LODD_LENGTH 51 + const void* LODD_DICT[LODD_LENGTH][2] = { + {"fs_prob", &self->fs_logodd}, + {"ci_prob", &self->ci_logodd}, + {"ci2_prob", &self->ci2_logodd}, + {"nti_nti_prob", &self->nti_nti_logodd}, + {"total_cd_prob", &self->total_cd_logodd}, + {"cd_acc", &self->cd_acc}, + {"cd_do", &self->cd_do}, + {"c3_i1_do", &self->c3_i1_do}, + {"i3_i1_do", &self->i3_i1_do}, + {"i3_i1_acc", &self->i3_i1_acc}, + {"i3_js_do", &self->i3_js_do}, + {"i3_js_acc", &self->i3_js_acc}, + {"stop_codon_emission_prob", &self->stop_codon_emission_logodd}, + {"no_leading_introns_prob", &self->no_leading_introns_logodd}, + {"no_traling_introns_prob", &self->no_trailing_introns_logodd}, + {"intron_del", &self->intron_del}, + {"split_do1", &self->split_do1}, + {"js_js", &self->js_js}, + {"bas_sca", &self->bas_sca}, + {"bsd_do", &self->bsd_do}, + {"b1_bas", &self->b1_bas}, + {"b2_bas", &self->b2_bas}, + {"b1_acc", &self->b1_acc}, + {"b1_b2", &self->b1_b2}, + {"b2_b2", 
&self->b2_b2}, + {"b2_acc", &self->b2_acc}, + {"acc_acc", &self->acc_acc}, + {"acc2_ii", &self->acc2_ii}, + {"skip_acc", &self->skip_acc}, + {"js_scd", &self->js_scd}, + {"do_do", &self->do_do}, + {"do2_e1", &self->do2_e1}, + {"do2_e2", &self->do2_e2}, + {"skip_do", &self->skip_do}, + {"splice_js", &self->splice_js}, + {"splice_nti", &self->splice_nti}, + {"nti_js", &self->nti_js}, + {"nti_nti", &self->nti_nti}, + {"splice_i1", &self->splice_i1}, + {"ii1_ii2", &self->ii1_ii2}, + {"i1_i2", &self->i1_i2}, + {"i2_i3", &self->i2_i3}, + {"i3_i1", &self->i3_i1}, + {"i3_js", &self->i3_js}, + {"js_c1", &self->js_c1}, + {"c2_c3", &self->c2_c3}, + {"c3_i1", &self->c3_i1}, + {"c3_js", &self->c3_js}, + {"bsd_e2", &self->bsd_e2}, + {"e1_e1", &self->e1_e1}, + {"e1_e2", &self->e1_e2}, + }; +#define UINT_LENGTH 3 + const void* UINT_DICT[UINT_LENGTH][2] = { + {"split_emissions_acceptor", &self->split_emissions_acceptor}, + {"split_emissions_donor", &self->split_emissions_donor}, + {"max_memory", &self->max_memory} + }; + + // assume value is string: copy it into the matching fixed-size path buffer + for (int i=0; i < STRING_LENGTH; i++) { + if (!strcmp(STRING_DICT[i][0], string)) { + snprintf((char*) STRING_DICT[i][1], PATH_STRING_LENGTH, "%s", value); /* bounded copy that always NUL-terminates, unlike strncpy with a full-length count */ + logv(1, "Setting %s := %s", string, (char*) STRING_DICT[i][1]); + return true; + } + } + + // assume value is a float: a probability that is stored through Logodd__log + for (int i=0; i < LODD_LENGTH; i++) { + if (!strcmp(LODD_DICT[i][0], string)) { + double prob; + if (sscanf(value, "%lf", &prob) != 1) { return false; } /* reject unparsable input instead of reading an uninitialized prob */ + *((LOGODD_T*) LODD_DICT[i][1]) = Logodd__log(prob); + logv(1, "Setting %s := %E", string, *((LOGODD_T*) LODD_DICT[i][1])); + self->dirty = true; + return true; + } + } + + // assume value is a uint (stored truncated to uint8_t; unparsable input yields 0) + for (int i=0; i < UINT_LENGTH; i++) { + if (!strcmp(UINT_DICT[i][0], string)) { + unsigned int uint = 0; + sscanf(value, "%u", &uint); + *((uint8_t*) UINT_DICT[i][1]) = (uint8_t) uint; + logv(1, "Setting %s := %u", string, *((uint8_t*) UINT_DICT[i][1])); + return true; + } + } + + return false; /* no parameter of that name in any of the three tables */ +} + +void Params__destroy(struct Params* self) { +
EmissionTable__destroy(self->emission_table_4_UNIFORM); + EmissionTable__destroy(self->emission_table_16_UNIFORM); + EmissionTable__destroy(self->emission_table_64_UNIFORM); + EmissionTable__destroy(self->emission_table_64_LAMBDA); + EmissionTable__destroy(self->emission_table_61_LAMBDA); + EmissionTable__destroy(self->emission_table_61_UNIFORM); + free(self->stop_codons); /* only the members are released here; the Params struct itself is not freed */ +} diff --git a/src/Params.h b/src/Params.h new file mode 100644 index 0000000..50c23a2 --- /dev/null +++ b/src/Params.h @@ -0,0 +1,132 @@ +/** + * Params definition. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#ifndef PARAM_CONTAINER_H_ +#define PARAM_CONTAINER_H_ + +#include +#include +#include + +#include "Logodd.h" +#include "Literal.h" +#include "Profile.h" +#include "EmissionTable.h" +#include "Sequence.h" + +#define PATH_STRING_LENGTH 255 /* size in bytes of each char buffer at the top of struct Params */ +#define NUM_CS_DELETIONS 10 + +typedef struct Params { + char clade[PATH_STRING_LENGTH], + //blosum_file[PATH_STRING_LENGTH], + eth_file[PATH_STRING_LENGTH], + acc_profile[PATH_STRING_LENGTH], + do_profile[PATH_STRING_LENGTH], + first_codon_profile[PATH_STRING_LENGTH], + last_codon_profile[PATH_STRING_LENGTH], + u12_acc_profile[PATH_STRING_LENGTH], + u12_donor_profile[PATH_STRING_LENGTH], + fasta_file[PATH_STRING_LENGTH], + dot[PATH_STRING_LENGTH]; + + struct EmissionTable* emission_table_64_UNIFORM, + * emission_table_16_UNIFORM, + * emission_table_4_UNIFORM, + * emission_table_61_UNIFORM, + * emission_table_61_LAMBDA, + * emission_table_64_LAMBDA; + + bool dirty; + + size_t num_factors; + long double multiple_cd_factors[10]; + + long double no_leading_introns_prob, + no_trailing_introns_prob; + + uint8_t split_emissions_acceptor, + split_emissions_donor, + max_memory; + + Literal* stop_codons; + uint8_t num_stop_codons; + bool multiexon, lastexon, firstexon, forcelong; + + LOGODD_T stop_codon_emission_logodd, + fs_logodd, + ci_logodd, + ci2_logodd, + nti_nti_logodd, + total_cd_logodd, + cd_logodd, + + cd_acc, + cd_do, + +
no_leading_introns_logodd, + no_trailing_introns_logodd, + intron_del, + + acc2_split, + split_do1, + + c3_i1_do, + i3_i1_acc, + i3_i1_do, + i3_js_acc, + i3_js_do, + + js_js, + bas_sca, + b1_bas, + b2_bas, + + b1_acc, + b1_b2, + b2_b2, + b2_acc, + acc_acc, + acc2_ii, + skip_acc, + + js_scd, + bsd_do, + bsd_do_id, + do_do, + do2_e1, + do2_e2, + skip_do, + + splice_js, + + splice_nti, + nti_js, + nti_nti, + + splice_i1, + ii1_ii2, + i1_i2, + i2_i3, + i3_i1, + i3_js, + + js_c1, + c2_c3, + c3_i1, + c3_js, + + bsd_e2, + e1_e1, + e1_e2; +} Params; // struct params + +bool Params__create(struct Params* self, struct EmissionTable emission_tables[6]); +void Params__destroy(struct Params* self); +bool Params__recalculate(struct Params* self); +bool Params__set_paths(struct Params* self); +bool Params__set_via_str(struct Params* self, char* string, char* value); + +#endif // PARAM_CONTAINER_H_ diff --git a/src/Profile.c b/src/Profile.c new file mode 100644 index 0000000..eb4cf6a --- /dev/null +++ b/src/Profile.c @@ -0,0 +1,164 @@ +/** + * Profile implementation. 
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include +#include + +#include "EmissionTable.h" +#include "Logging.h" +#include "SafeAlloc.h" + +#include "Profile.h" + +bool Profile__read(struct Profile* self, char filename[]) { /* parses a profile file: a header row of literals, then one row of probabilities per profile position */ + snprintf(self->filename, sizeof(self->filename), "%s", filename); /* bounded copy, always NUL-terminated */ + + FILE* file_descriptor = fopen(filename, "r"); + if (file_descriptor == NULL) { + die("Cannot open file: %s", filename); + } + + #define LINELENGTH 1000 + #define DELIMITERS " \t" + + bool done = false; + size_t lineno = 0; + while (!done) { + // fill the line; the last byte is never written, so the buffer always stays NUL-terminated + char line[LINELENGTH] = "\0"; + for (size_t i=0; i < LINELENGTH-1; i++) { + assert(i < LINELENGTH); + + int c = fgetc(file_descriptor); /* int, so that EOF stays distinct from every byte value */ + + bool stop = false; + switch (c) { + case EOF: + line[i] = '\0'; + done = true; + stop = true; /* fallthrough */ + case '\n': + line[i] = '\0'; + i=0; + stop = true; + break; + default: + line[i] = (char) c; + } + + if (stop) { + break; + } + } + + size_t i=0; + size_t line_offset = 0; + Literal keys[4]; + char* token = strtok(line, DELIMITERS); + struct EmissionTable* etable; + while (token != NULL) { + + // skip commented lines + if (token[0] == '#') { + if (i == 0) { + line_offset++; + } + token = NULL; // fulfill postcondition + break; + } + + // the first line looks like `A\tC\tG\tT\n' + if (lineno == 0) { + // In a profile, num_literals of each emission_table will be const 1. + keys[i] = Literal__from_char(token[0]); + + token = strtok(NULL, DELIMITERS); + i++; + continue; + } + + + if (i == 0) { + etable = Profile__add_emission(self); + } + + assert(self->length == lineno-line_offset); + + double prob; + if (sscanf(token, "%lf", &prob) != 1) { die("Cannot parse probability: %s", token); } /* never store an uninitialized prob */ + LogoddMatrix__set(etable->values, keys[i], 0, Logodd__log(prob)); + + token = strtok(NULL, DELIMITERS); + i++; + } + + lineno++; + } + + fclose(file_descriptor); + + return true; +} + +LOGODD_T Profile__by_literals(struct Profile* self, Literal query[]) { + LOGODD_T result = 0; + Literal reference = LITERAL_A; // not important, which base it is.
+ for (size_t i=0; i < self->length; i++) { + LOGODD_T summand = EmissionTable__by_literals(&self->emission_tables[i], &query[i], &reference); + result = Logodd__add(result, summand); + if (result == LOGODD_NEGINF) { + return result; + } + } + return result; +} + +struct Profile* Profile__create(char name[STATE_NAME_LENGTH]) { + struct Profile* self = (struct Profile*) SAFEMALLOC(sizeof(Profile)); + self->emission_tables = NULL; // will be SAFEREALLOCed when tables are added during read + snprintf(self->name, STATE_NAME_LENGTH, "%s", name); /* truncates if needed but always NUL-terminates, unlike strncpy with a full-length count */ + self->length=0; + return self; +} + +void Profile__destroy(struct Profile* self) { + for (size_t i=0; i < self->length; i++) { + EmissionTable__destroy(&self->emission_tables[i]); + } + free(self->emission_tables); + free(self); /* self was heap-allocated by Profile__create */ +} + +struct EmissionTable* Profile__add_emission(struct Profile* self) { + size_t pos = self->length++; + + self->emission_tables = (struct EmissionTable*) SAFEREALLOC(self->emission_tables, sizeof(struct EmissionTable) * self->length); + + EmissionTable__init(&self->emission_tables[pos], 1, LAMBDA_DISTRIBUTION); + return &self->emission_tables[pos]; /* pointer is invalidated by the next Profile__add_emission (realloc) */ +} + +bool Profile__str(struct Profile* self, char* buffer) { + char tmp[255] = ""; + sprintf(tmp, "%s(%u)\n", self->name, self->length); + strcat(buffer, tmp); + for (uint8_t t=0; t < self->length; t++) { + sprintf(tmp, "%i\t", t); + strcat(buffer, tmp); + for (Literal l=0; l < LITERAL_N; l++) { + sprintf(tmp, "%E", EmissionTable__get(&self->emission_tables[t], l, 0)); + strcat(buffer, tmp); + if (l == LITERAL_T) { + sprintf(tmp, "\n"); + } else { + sprintf(tmp, "\t"); + } + strcat(buffer, tmp); + } + } + return true; +} diff --git a/src/Profile.h b/src/Profile.h new file mode 100644 index 0000000..dc5f5a7 --- /dev/null +++ b/src/Profile.h @@ -0,0 +1,28 @@ +/** + * Profile definition + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#ifndef PROFILE_H_ +#define PROFILE_H_ + +#include "State.h" +#include "EmissionTable.h" + +#define PROFILE_FILENAME_LENGTH
256 + +typedef struct Profile { + char name[STATE_NAME_LENGTH]; + char filename[PROFILE_FILENAME_LENGTH]; + uint8_t length; + struct EmissionTable* emission_tables; +} Profile; + +struct Profile* Profile__create(char name[STATE_NAME_LENGTH]); +void Profile__destroy(struct Profile* self); +bool Profile__read(struct Profile* self, char filename[]); +struct EmissionTable* Profile__add_emission(struct Profile* self); +LOGODD_T Profile__by_literals(struct Profile* self, Literal query[]); +bool Profile__str(struct Profile* self, char* buffer); + +#endif // PROFILE_H_ diff --git a/src/SafeAlloc.h b/src/SafeAlloc.h new file mode 100644 index 0000000..d50765f --- /dev/null +++ b/src/SafeAlloc.h @@ -0,0 +1,37 @@ +/** + * Macros for safety checks after memory allocations: each wrapper reports the failed request and exits. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#define outofmem(n, line) fprintf(stderr, "CRITICAL %s:%zu %s():\tOut of memory: %zu bytes\n", __FILE__, line, __func__, n); + +//https://stackoverflow.com/posts/16298916/revisions +static void* safe_malloc(size_t n, size_t line) { + void* p = malloc(n); + if (!p) { + outofmem(n, line); + exit(EXIT_FAILURE); + } + return p; +} +#define SAFEMALLOC(n) safe_malloc(n, __LINE__) + +static void* safe_calloc(size_t s, size_t n, size_t line) { + void* p = calloc(s, n); + if (!p) { + outofmem(s * n, line); /* report the whole request, not just one factor */ + exit(EXIT_FAILURE); + } + return p; +} +#define SAFECALLOC(s, n) safe_calloc(s, n, __LINE__) + +static void* safe_realloc(void* ptr, size_t n, size_t line) { + void* p = realloc(ptr, n); + if (!p) { + outofmem(n, line); + exit(EXIT_FAILURE); + } + return p; +} +#define SAFEREALLOC(ptr, n) safe_realloc(ptr, n, __LINE__) diff --git a/src/Sequence.c b/src/Sequence.c new file mode 100644 index 0000000..7f69f7f --- /dev/null +++ b/src/Sequence.c @@ -0,0 +1,59 @@ +/** + * Sequence management + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include + +#include "Literal.h" +#include "Logging.h" +#include "SafeAlloc.h" + +#include "Sequence.h" + +bool
Sequence__init(struct Sequence* self) { + self->length = 0; + self->length_reserved = 1000; + self->sequence = SAFECALLOC(sizeof(Literal), self->length_reserved); + self->name[0] = '\0'; + self->start_split_length = 0; + self->end_split_length = 0; + self->codons_offset = 0; + self->num_codons = 0; + self->num_codon_bases = 0; + self->num_states = 0; + self->num_align_spaces = 0; + self->genome_location_start = 0; /* previously the only member left uninitialized */ + self->acceptor[0] = '\0'; + self->donor[0] = '\0'; + + logv(2, "init: length reserved: %lu", self->length_reserved); + return true; +} + +bool Sequence__destroy(struct Sequence* self) { + free(self->sequence); + return true; +} + +bool Sequence__set_profiles(struct Sequence* self, char acceptor[], char donor[]) { + strncpy(self->acceptor, acceptor, sizeof(self->acceptor)-1); self->acceptor[sizeof(self->acceptor)-1] = '\0'; /* bounded: truncate rather than overflow the member */ + strncpy(self->donor, donor, sizeof(self->donor)-1); self->donor[sizeof(self->donor)-1] = '\0'; + return true; +} + +bool Sequence__append(struct Sequence* self, Literal literal) { + // increase memory space if necessary (capacity doubles each time) + if (self->length+1 >= self->length_reserved) { + self->length_reserved *= 2; + logv(2, "length reserved: %lu", self->length_reserved); + self->sequence = SAFEREALLOC(self->sequence, sizeof(Literal) * self->length_reserved); + if (self->sequence == NULL) { /* defensive only: SAFEREALLOC exits the process on failure */ + return false; + } + } + self->sequence[self->length++] = literal; + return true; +} diff --git a/src/Sequence.h b/src/Sequence.h new file mode 100644 index 0000000..1ba0353 --- /dev/null +++ b/src/Sequence.h @@ -0,0 +1,37 @@ +/** + * Sequence container declarations + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#ifndef SEQUENCE_H_ +#define SEQUENCE_H_ + +#include + +#include "Literal.h" +#include "Profile.h" + +#define SEQUENCENAMELENGTH 255 + +typedef struct Sequence { + size_t length; + Literal* sequence; + char name[SEQUENCENAMELENGTH], + acceptor[PROFILE_FILENAME_LENGTH], + donor[PROFILE_FILENAME_LENGTH]; + uint8_t start_split_length, + end_split_length; + size_t num_states, + codons_offset, + num_codon_bases, + num_align_spaces, + num_codons, + length_reserved, + genome_location_start; +} Sequence; + +bool
Sequence__init(struct Sequence* self); +bool Sequence__destroy(struct Sequence* self); +bool Sequence__append(struct Sequence* self, Literal literal); + +#endif // SEQUENCE_H_ diff --git a/src/State.c b/src/State.c new file mode 100644 index 0000000..42f9734 --- /dev/null +++ b/src/State.c @@ -0,0 +1,111 @@ +/** + * State implementation. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +#include +#include +#include + +#include "Logging.h" +#include "EmissionTable.h" +#include "SafeAlloc.h" + +#include "State.h" + +/** + * Initialize a State. + * @param self a state. + * @param name the state name. + * @param num_emissions the number of literals the state will emit. + * @param emission_reference an array of the reference sequence. + * @param emission_table a pointer to an emission table. + * @return success boolean. + */ +bool State__init(struct State* self, char name[STATE_NAME_LENGTH], uint8_t num_emissions, Literal emission_reference[num_emissions], struct EmissionTable* emission_table) { + logv(5, "State init:\t%s", name); + strncpy(self->name, name, STATE_NAME_LENGTH-1); + self->name[STATE_NAME_LENGTH-1] = '\0'; + + if (g_loglevel >= 5) { + char tmp[4] = ""; + Literal__str(num_emissions, emission_reference, tmp); + logv(5, SID"\tref=%s\t%s(%i)", self->id, tmp, name, num_emissions); + } + + self->num_incoming = 0; + self->num_emissions = num_emissions; + for (uint8_t i=0; i < num_emissions; i++) { + self->reference[i] = emission_reference[i]; + } + self->emission_table = emission_table; + + // Die if emission table does not allow the state to emit its reference. + // This originally was thought to avoid stop codon emissions. 
+ /* + if (num_emissions) { + uint8_t reference_id = Literal__uint(num_emissions, emission_reference); + if (!EmissionTable__emittable(emission_table, reference_id)) { + char tmp[4] = ""; + Literal__str(num_emissions, emission_reference, tmp); + die("State cannot emit reference %s: %s ", tmp, self->name); + } + } + */ + + return true; +} + +/** + * Fewer arguments for irrelevant reference: the reference is filled with LITERAL_A. + * @param self a state. + * @param name the name of a state. + * @param num_emissions number of literals emitted by this state. + * @param emission_table the table containing num_emissions entries. + * @return success boolean. + */ +bool State__init_uniform(struct State* self, char name[STATE_NAME_LENGTH], uint8_t num_emissions, struct EmissionTable* emission_table) { + Literal emission_reference[STATE_MAX_REF_LEN] = { LITERAL_A, LITERAL_A, LITERAL_A }; + return State__init(self, name, num_emissions, emission_reference, emission_table); +} + +/** + * Silent State: zero emissions, no reference and no emission table. + * @param self a state. + * @param name a string. + * @return success boolean. + */ +bool State__init_silent(struct State* self, char name[STATE_NAME_LENGTH]) { + return State__init(self, name, 0, NULL, NULL); +} + +/** + * Add an incoming transition to the target state; dies when the target already holds STATE_MAX_NUM_INCOMING transitions. + * @param self the origin of the transition. + * @param logodd the log of the probability to transition from self to target. + * @param target the target state of the incoming transition. + * @return success boolean. + */ +bool State__add_incoming(struct State* self, LOGODD_T logodd, struct State* target) { + if(target->num_incoming >= STATE_MAX_NUM_INCOMING) { /* incoming[] has exactly STATE_MAX_NUM_INCOMING slots */ + die("Too many incoming transitions for state %s ("SID").", target->name, target->id); + } + logv(5, "State incoming:\t%s--(%f)-->%s", self->name, Logodd__exp(logodd), target->name); + struct Transition t = {.origin = self->id, .logodd = logodd}; + target->incoming[target->num_incoming++] = t; + return true; +} + +/** + * Compose a string representation of the state.
+ * @param self the state. + * @param buffer a string that can be filled. + * @return success boolean. + */ +bool State__str(struct State* self, char* buffer) { + char* tmp = SAFEMALLOC(sizeof(char) * self->num_emissions+1); + Literal__str(self->num_emissions, self->reference, tmp); + sprintf(buffer, "State {.name=\"%s\", .id=%Lu, .num_emissions=%u, .reference=\"%s\", .num_incoming=%u}", self->name, (long long unsigned int) self->id, (unsigned int) self->num_emissions, tmp, (unsigned int) self->num_incoming); + free(tmp); + return true; +} diff --git a/src/State.h b/src/State.h new file mode 100644 index 0000000..19ec901 --- /dev/null +++ b/src/State.h @@ -0,0 +1,39 @@ +/** + * State definition. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#ifndef STATE_H_ +#define STATE_H_ + +#include +#include + +#include "Literal.h" +#include "Transition.h" +#include "Logodd.h" +#include "Stateid.h" + +#define STATE_NAME_LENGTH 20 +#define STATE_MAX_REF_LEN 3 +#define STATE_MAX_NUM_INCOMING 15 +#define STATE_CUSTOM_T uint16_t + +typedef struct State { + STATE_ID_T id; + char name[STATE_NAME_LENGTH]; + struct EmissionTable* emission_table; + uint8_t num_emissions; + Literal reference[STATE_MAX_REF_LEN]; + uint8_t num_incoming; + struct Transition incoming[STATE_MAX_NUM_INCOMING]; + STATE_CUSTOM_T custom; +} State; + +bool State__init(struct State* self, char name[STATE_NAME_LENGTH], uint8_t num_emissions, Literal emission_reference[num_emissions], struct EmissionTable* emission_table); +bool State__init_uniform(struct State* self, char name[STATE_NAME_LENGTH], uint8_t num_emissions, struct EmissionTable* emission_table); +bool State__init_silent(struct State* self, char name[STATE_NAME_LENGTH]); +bool State__add_incoming(struct State* self, LOGODD_T logodd, struct State* target); +bool State__str(struct State* self, char* buffer); +LOGODD_T State__remainder(struct State* self); + +#endif // STATE_H_ diff --git a/src/Stateid.h b/src/Stateid.h new file mode 100644 index 
0000000..5516158 --- /dev/null +++ b/src/Stateid.h @@ -0,0 +1,20 @@ +/** + * State id definition. + * Copyright 2017 Peter Schwede + * + * These lines are separated from State.h to solve dependencies issues. + */ + +#include + +#ifndef STATE_ID_H_ +#define STATE_ID_H_ + +//#define STATE_ID_T size_t +//#define STATE_MAX_ID SIZE_MAX + +#define STATE_ID_T uint32_t +#define STATE_MAX_ID UINT32_MAX +#define SID "%u" //PRIu32 + +#endif // STATE_ID_H_ diff --git a/src/Transition.h b/src/Transition.h new file mode 100644 index 0000000..9f583e8 --- /dev/null +++ b/src/Transition.h @@ -0,0 +1,16 @@ +/** + * Transition definition. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ +#ifndef TRANSITION_H_ +#define TRANSITION_H_ + +#include "Logodd.h" +#include "Stateid.h" + +typedef struct Transition { + STATE_ID_T origin; + LOGODD_T logodd; +} Transition; + +#endif // TRANSITION_H_ diff --git a/src/Viterbi.c b/src/Viterbi.c new file mode 100644 index 0000000..5f16601 --- /dev/null +++ b/src/Viterbi.c @@ -0,0 +1,366 @@ +/** + * Perform the viterbi on a given HMM. + * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede + */ + +// http://www.geeksforgeeks.org/dynamically-allocate-2d-array-c/ + +#include +#include +#include +#include +#include +#include + +#include "Logging.h" +#include "Stateid.h" +#include "EmissionTable.h" +#include "Transition.h" +#include "Matrix.h" + +#include "Viterbi.h" + +/** + * Create a new viterbi matrix and fill its values with -inf + * @param hmm pointer to hidden markov model + * @param num_observations query length + */ +struct LogoddMatrix* Viterbi__init_logodd_matrix(struct HMM* hmm, size_t num_observations) { + LOGODD_T default_value = LOGODD_NEGINF; + + struct LogoddMatrix* vmatrix = LogoddMatrix__create(4, hmm->num_states, default_value); // 4 = 3 previous + 1 current + + return vmatrix; +} + +/** + * Create a new path matrix and fill its values with empty state ids. + * STATE_MAX_ID is used as empty state id. 
+ * @param hmm pointer to hidden markov model + * @param num_observations query length + */ +struct PathMatrix* Viterbi__init_path_matrix(struct HMM* hmm, size_t num_observations) { + struct PathMatrix* pmatrix = PathMatrix__create(num_observations + 1, hmm->num_states, STATE_MAX_ID); // +1 for virtual start state + + for (size_t i = 0; i < hmm->num_starts; i++) { + struct Transition start = hmm->starts[i]; + PathMatrix__set(pmatrix, 0, start.origin, start.origin); + } + + return pmatrix; +} + + +/** + * Look up the emission log-odd of a state for the num_emissions literals that end right before index t; silent states yield 0, a too-small t yields LOGODD_NEGINF. + * @param observations the query, i.e. the sequence of observed literals. + * @param t the point of time of observation -- an index for observations. + * @param state the State whom the caller requests the emission of + */ +LOGODD_T Viterbi__get_emission_logodd(Literal* observations, size_t t, struct State* state) { + if (state->num_emissions == 0) { + return 0; + } + if (t < state->num_emissions) { + return LOGODD_NEGINF; + } + for (uint8_t i=0; i < state->num_emissions; i++) { + logv(6, "Observation: %lu", t); + logv(6, "Logodd lookup for: %c [%u]", Literal__char(observations[t-state->num_emissions+i]), i); + } + + return EmissionTable__by_literals( + state->emission_table, + state->reference, + &observations[t-state->num_emissions] + ); +} + + +/** + * Perform the viterbi recursion step at time t. + * @param vmatrix the viterbi matrix + * @param pmatrix the path matrix + * @param hmm the hidden markov model + * @param observations the sequence of observed literals + * @param t the observation index; at t == 0 (the initialization column) only the final silent-state pass runs. + * + * # Viterbi sub steps at observation i for an HMM with silent states and + * without silent loops: (Durbin et al., 1998, p.
71) + * (i) For all emitting states l, calculate viterbi(l, i) from + * max(viterbi(k, i-x)+p)+e where p is the prob of t(l,p,k) + * (ii) For all silent states l, set viterbi(l, i) to max(viterbi(k, i)+p) + * where p is the probability of t=(l,p,k) (no e because silent) + * (iii) Starting from the lowest numbered silent state l set viterbi(l, i) to + * max(viterbi(k, i)+p) for all silent states k < l + * + * Basically first serve all emitting states, then all silent states and + * finally let silent states trickle down their probs before starting with + * emitting states again. + * + * Those steps can be reduced to one general step only iff there is no doubt + * that all silent chains from states A to B result in a lower probability compared to + * any emitting chain from A to B. + * + * # Each viterbi sub step (all values are log-odds, so products become sums): + * 1. for each state k1 at time t + * 1.1 for each transition a=(k0,p,k1) to k1 select the maximum pmax=(viterbi(t-num_emissions,k0)+p) + * 1.2 add the emission log-odd of the observation ending at time t to pmax + * 1.3 assign viterbi(t,k1) := pmax + * 1.4 assign path(t, k1) := k0 + */ +void Viterbi__step(struct LogoddMatrix* vmatrix, struct PathMatrix* pmatrix, struct HMM* hmm, Literal* observations, size_t t) { + if (t != 0) { + for(STATE_ID_T sid=0; sid < hmm->num_states; sid++) { + LogoddMatrix__set(vmatrix, t%4, sid, LOGODD_NEGINF); + } + } + + uint8_t silent = 0; + if (t==0) { + silent = 2; + } + + for(; silent<=2; silent++) { + + // 1.
+ for (STATE_ID_T i = 0; i < hmm->num_states; i++) { + struct State* state = &hmm->states[i]; + + LOGODD_T max_logodd = LOGODD_NEGINF; + STATE_ID_T origin_id = STATE_MAX_ID; + + if (state->num_emissions > t) { + logv(6, "t=%lu\ti=%s="SID"\tCurrent state emits too much to emit this early:\t%u > %lu", t, state->name, i, state->num_emissions, t); + continue; + } + + if (silent == 0 && state->num_emissions == 0) { + continue; + } + if (silent > 0 && state->num_emissions > 0) { + continue; + } + + LOGODD_T emission_logodd = Viterbi__get_emission_logodd(observations, t, state); + + char ref[4] = "", qry[4] = ""; + if (g_loglevel > 5) { + Literal__str(state->num_emissions, state->reference, ref); + Literal__str(state->num_emissions, &observations[t-state->num_emissions], qry); + logv(6, "t=%lu\ti=%s="SID"\tqry=%s=%i\tref=%s=%i\temission_logodd=%E", + t, state->name, i, + qry, Literal__uint(state->num_emissions, &observations[t-state->num_emissions]), + ref, Literal__uint(state->num_emissions, state->reference), + emission_logodd); + } + + if (state->num_emissions > 0 && emission_logodd == LOGODD_NEGINF) { + if (g_loglevel > 5) { + logv(6, "t=%lu\ti=%s="SID"\tCurrent state cannot emit observation\t%s:\t%E", t, state->name, i, qry, emission_logodd); + } + continue; + } + + // 1.1 for each transition t=(k0, p, k1) select the maximum probability pmax=(viterbi(t-1,k0)*p) + logv(6, "t=%lu\ti=%s="SID"\t1 select max prob", t, state->name, i); + for (size_t j=0; j < state->num_incoming; j++) { + struct Transition transition = state->incoming[j]; + + if (silent == 2) { + if (transition.origin >= i) { + continue; + } + if (hmm->states[transition.origin].num_emissions > 0) { + continue; + } + } + + LOGODD_T origin_logodd = LogoddMatrix__get(vmatrix, ((size_t) t-state->num_emissions)%4, transition.origin); + + logv(6, "t=%lu\ti=%s="SID"\tcheck (%lu / %u), t'=t-%u", t, state->name, i, j+1, state->num_incoming, state->num_emissions); + logv(6, 
"t=%lu\ti=%s="SID"\torigin_logodd=%E\tt.logodd=%E\t\tt.origin=%s="SID, t, state->name, i, origin_logodd, transition.logodd, hmm->states[transition.origin].name, transition.origin); + + if (origin_logodd == LOGODD_NEGINF) { + continue; + } + + // pmax = viterbi(t-1,k0)*p + LOGODD_T sum = Logodd__add(transition.logodd, origin_logodd); + + // is pmax maximal? + if (max_logodd < sum) { + max_logodd = sum; + origin_id = transition.origin; + logv(6, "t=%lu\ti=%s="SID"\tnew max_logodd:\t%E\t"SID, t, state->name, i, max_logodd, origin_id); + } + + } // O( state->incoming ) = O( deg+(k) ) + + + if (max_logodd == LOGODD_NEGINF) { + logv(6, "t=%lu\ti=%s="SID"\tmax_logodd=%E\torigin=%s="SID"", t, state->name, i, max_logodd, "[]", origin_id); + continue; + } + logv(6, "t=%lu\ti=%s="SID"\tmax_logodd=%E\torigin=%s="SID"", t, state->name, i, max_logodd, hmm->states[origin_id].name, origin_id); + + + // 1.2 multiply pm with emission probability of observation at time t + logv(6, "t=%lu\ti=%s="SID"\t2 multiply emission logodd %E", t, state->name, i, emission_logodd); + max_logodd = Logodd__add(emission_logodd, max_logodd); + + // 1.3 assign viterbi(t,k1) <= pm + if (max_logodd > LogoddMatrix__get(vmatrix, ((size_t) t)%4, state->id)) { + LOGODD_T prevv = LogoddMatrix__get(vmatrix, t%4, state->id); + if (prevv == LOGODD_NEGINF) { + logv(6, "t=%lu\ti=%s="SID"\tassign v(%lu,%s="SID") = -inf := %E", t, state->name, i, t, state->name, state->id, max_logodd); + } else { + logv(6, "t=%lu\ti=%s="SID"\tassign v(%lu,%s="SID") = %E := %E", t, state->name, i, t, state->name, state->id, prevv, max_logodd); + } + LogoddMatrix__set(vmatrix, t%4, state->id, max_logodd); + logv(6, "t=%lu\ti=%s="SID"\tassign p(%lu,%s="SID") := "SID"", t, state->name, i, t, state->name, state->id, origin_id); + PathMatrix__set(pmatrix, t, state->id, origin_id); + } + } + } // hmm->states O( deg+(k) * S + s*s) (s = silent states in S) +} // O(S * deg+(k)) + +/** + * Run viterbi on an HMM. 
+ * The given path pointer contains the most probable path through the HMM that ends in an ending state. + * Notice: This creates a rolling 4 x S log-odd matrix and a (T+1) x S path matrix in memory, where S is the number of states in the HMM and T is the number of observations. + * @param hmm the HMM. + * @param num_observations the length of the query. + * @param observations the array of observed literals. + * @param path_length output: overwritten with the length of the resulting path. + * @param path a pointer to an array for the sequence of state pointers; it needs room for 2*num_observations entries. + */ +void Viterbi(struct HMM* hmm, size_t num_observations, Literal* observations, size_t* path_length, struct State** path) { + if (num_observations == 0) { + die("No observations."); + } + + // Running viterbi on an HMM without any states is rejected via die(). + // The same is the case if the HMM has no start or end states. + if (hmm->num_states == 0) { + die("HMM is empty."); + } + if (hmm->num_starts == 0 ) { + die("HMM has zero start states."); + } + if (hmm->num_ends == 0) { + die("HMM has zero end states."); + } + + logv(1, "Num states:\t%lu", hmm->num_states); + logv(1, "Num observations:\t%lu", num_observations); + + // The init of a dynamically sized 2d array requires >=c99 + struct LogoddMatrix* vmatrix = Viterbi__init_logodd_matrix(hmm, num_observations); + struct PathMatrix* pmatrix = Viterbi__init_path_matrix(hmm, num_observations); + + // init the first column: the initial states.
+ for (size_t i = 0; i < hmm->num_starts; i++) { + struct Transition transition = hmm->starts[i]; + LogoddMatrix__set(vmatrix, 0, transition.origin, transition.logodd); + } // O(S) + + for (size_t t=0; t <= num_observations; t++) { + Viterbi__step(vmatrix, pmatrix, hmm, observations, t); + } // O(deg+(k) * S * T) = O(S * T) + + if (g_loglevel > 3) { + if (hmm->num_states < 200 && num_observations < 300) { + char tmp[1024000] = ""; + FILE * matrixlog = fopen("cesar_matrix.log", "w"); + fprintf(matrixlog, "###states\n"); + for (size_t i=0; i < hmm->num_states; i++) { + State__str(&hmm->states[i], tmp); + fprintf(matrixlog, "%lu\t%s\n", i, tmp); + } + fprintf(matrixlog, "###states_end\n"); + LogoddMatrix__str(vmatrix, tmp); + fprintf(matrixlog, "###vmatrix:\n%s\n###vmatrix_end\n", tmp); + PathMatrix__str(pmatrix, tmp); + fprintf(matrixlog, "###pmatrix:\n%s\n###pmatrix_end\n", tmp); + fclose(matrixlog); + } + } + + /** + * Find the backtrace start + */ + LOGODD_T logodd = LOGODD_NEGINF; + STATE_ID_T best_end = STATE_MAX_ID; + for (size_t k=0; k < hmm->num_ends; k++) { + Transition end = hmm->ends[k]; + LOGODD_T current = LogoddMatrix__get(vmatrix, (size_t) num_observations%4, (size_t) end.origin); + LOGODD_T sum = Logodd__add(current, end.logodd); + logv(2, "Checking end:\t%s\t%lE", hmm->states[end.origin].name, sum); + if (logodd < sum) { + best_end = end.origin; + logodd = sum; + } + } // O(S) + + logv(1, "LogoddMatrix (WxH):\t%lu x %lu", vmatrix->num_columns, vmatrix->num_rows); + logv(1, "LogoddMatrix (bytes):\t%lu", LogoddMatrix__bytes(vmatrix)); + logv(1, "PathMatrix (WxH):\t%lu x %lu", pmatrix->num_columns, pmatrix->num_rows); + logv(1, "PathMatrix (bytes):\t%lu", PathMatrix__bytes(pmatrix)); + logv(1, "States (bytes):\t%lu", sizeof(State) * hmm->num_states); + + if (best_end == STATE_MAX_ID || logodd == LOGODD_NEGINF) { + die("No valid path found."); + } + + + logv(1, "Viterbi logodd:\t%lE", logodd); + logv(1, "Viterbi prob:\t%lE", Logodd__exp(logodd)); + + 
*path_length = num_observations*2; + + logv(4, "path[%lu] := "SID, *path_length-1, best_end); + path[*path_length-1] = &hmm->states[best_end]; + + size_t i, t = num_observations; + for(i=*path_length-1; i > 0; i--) { + struct State* state = path[i]; + path[i-1] = &hmm->states[PathMatrix__get(pmatrix, t, state->id)]; + + if (g_loglevel >= 4) { + char qry[4] = "", ref[4] = ""; + Literal__str(state->num_emissions, &observations[t-state->num_emissions], qry); + Literal__str(state->num_emissions, state->reference, ref); + /* + uint8_t qry_id = Literal__uint(state->num_emissions, &observations[t-state->num_emissions]); + uint8_t ref_id = Literal__uint(state->num_emissions, state->reference); + logv(4, + "path[%lu] := Pget(%lu, %s="SID") = %s="SID"(%u)\te(qry=%s=%i, ref=%s=%i)=%E\tv=%E", + i-1, + t, state->name, state->id, + path[i-1]->name, path[i-1]->id, path[i-1]->num_emissions, + qry, qry_id, ref, ref_id, Viterbi__get_emission_logodd(observations, t, state), + LogoddMatrix__get(vmatrix, t, state->id)); + */ + } + + if (t == 0 && state->num_emissions > 0) { + break; + } + t -= state->num_emissions; + } // O(T) + + // shrink path + const size_t offset = i; + for(i=0; i < *path_length - offset; i++) { + if (i < *path_length - offset) { + path[i] = path[i+offset]; + } else { + path[i] = NULL; + } + } // O(T) + *path_length -= offset; + + LogoddMatrix__destroy(vmatrix); + PathMatrix__destroy(pmatrix); +} diff --git a/src/Viterbi.h b/src/Viterbi.h new file mode 100644 index 0000000..e4f8e5a --- /dev/null +++ b/src/Viterbi.h @@ -0,0 +1,16 @@ +/** + * Viterbi function declaration. 
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+
+//#include "Literal.h"
+#include "State.h"
+#include "HMM.h"
+
+struct LogoddMatrix* Viterbi__init_logodd_matrix(struct HMM* hmm, size_t num_observations);
+
+struct PathMatrix* Viterbi__init_path_matrix(struct HMM* hmm, size_t num_observations);
+
+LOGODD_T Viterbi__get_emission_logodd(Literal* observations, size_t t, struct State* state);
+
+void Viterbi(struct HMM* hmm, size_t num_observations, Literal* observation, size_t* path_length, struct State** path);
diff --git a/test/EmissionTable.c b/test/EmissionTable.c
new file mode 100644
index 0000000..f668de8
--- /dev/null
+++ b/test/EmissionTable.c
@@ -0,0 +1,66 @@
+/**
+ * EmissionTable Tests
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+#include <criterion/criterion.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "Distribution.h"
+
+#include "EmissionTable.h"
+
+struct EmissionTable* EmissionTable__dummy;
+
+void EmissionTable__setup(void) {
+    EmissionTable__dummy = (struct EmissionTable*) malloc(sizeof(EmissionTable));
+    EmissionTable__init(EmissionTable__dummy, 1, UNIFORM_DISTRIBUTION);
+}
+
+void EmissionTable__teardown(void) {
+    EmissionTable__destroy(EmissionTable__dummy);
+    free(EmissionTable__dummy);
+}
+
+Test(emissiontable, by_literals, .init=EmissionTable__setup, .fini=EmissionTable__teardown) {
+    EmissionTable__init(EmissionTable__dummy, 1, LAMBDA_DISTRIBUTION);
+    Literal reference[1] = { LITERAL_A };
+    Literal query[1] = { LITERAL_A };
+    EmissionTable__set(EmissionTable__dummy, query, (LOGODD_T) .1);
+    /* Compare at LOGODD_T precision, the same precision the value was stored with. */
+    cr_assert_eq(EmissionTable__by_literals(EmissionTable__dummy, reference, query), (LOGODD_T) .1);
+}
+
+Test(emissiontable, by_literals__too_many, .init=EmissionTable__setup, .fini=EmissionTable__teardown) {
+    Literal literals[3] = { LITERAL_C, LITERAL_A, LITERAL_A };
+    LOGODD_T logodd = EmissionTable__by_literals(EmissionTable__dummy, literals, literals);
+    cr_assert_eq(log(0.25), logodd);
+}
+
+Test(emissiontable, read64, .fini=EmissionTable__teardown) {
+    EmissionTable__dummy = (struct EmissionTable*) malloc(sizeof(EmissionTable));
+    EmissionTable__init(EmissionTable__dummy, 3, LAMBDA_DISTRIBUTION);
+    EmissionTable__read(EmissionTable__dummy, "../extra/tables/eth_codon_sub.txt");
+    cr_assert_eq(EmissionTable__dummy->num_literals, (uint8_t) 3);
+    cr_assert_eq(EmissionTable__get(EmissionTable__dummy, 0, 0), Logodd__log(0.40849));
+    cr_assert_eq(EmissionTable__get(EmissionTable__dummy, 63, 0), Logodd__log(0.00037));
+    cr_assert_eq(EmissionTable__get(EmissionTable__dummy, 0, 63), Logodd__log(0.00054));
+}
+
+Test(emissiontable, different_sizes) {
+    for (int size=1; size < 3; size++) {
+        struct EmissionTable* dummy = (struct EmissionTable*) malloc(sizeof(EmissionTable));
+        EmissionTable__init(dummy, size, LAMBDA_DISTRIBUTION);
+
+        /*
+        char tmp[1000] = "\0";
+        EmissionTable__str(dummy, tmp);
+        printf("etable dummy:\n%s\n", tmp);
+        */
+
+        EmissionTable__destroy(dummy);
+        /* The struct itself was malloc'ed above; release it as EmissionTable__teardown() does. */
+        free(dummy);
+    }
+}
diff --git a/test/HMM.c b/test/HMM.c
new file mode 100644
index 0000000..880afcf
--- /dev/null
+++ b/test/HMM.c
@@ -0,0 +1,100 @@
+/**
+ * HMM Tests
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+#include <criterion/criterion.h>
+#include "stdio.h"
+#include "math.h"
+
+#include "State.h"
+#include "Transition.h"
+#include "EmissionTable.h"
+
+#include "HMM.h"
+
+struct HMM* TestHMM__hmm;
+struct EmissionTable TestHMM__et[1];
+
+
+void HMM__setup(void) {
+    EmissionTable__init(&TestHMM__et[0], 1, UNIFORM_DISTRIBUTION);
+
+    TestHMM__hmm = HMM__create(3, 3, 3);
+
+    struct State* rock = HMM__new_state(TestHMM__hmm);
+    struct State* paper = HMM__new_state(TestHMM__hmm);
+    struct State* scissors = HMM__new_state(TestHMM__hmm);
+
+    State__init_uniform(rock, "Rock", 1, &TestHMM__et[0]);
+    State__init_uniform(paper, "Paper", 1, &TestHMM__et[0]);
+    State__init_uniform(scissors, "Scissors", 1, &TestHMM__et[0]);
+
+    State__add_incoming(rock, log(.33), rock);
+    State__add_incoming(rock, log(.33), paper);
+    State__add_incoming(rock, log(.33), scissors);
+    State__add_incoming(paper, log(.33), rock);
+    State__add_incoming(paper, log(.33), paper);
+    State__add_incoming(paper, log(.33), scissors);
+    State__add_incoming(scissors, log(.33), rock);
+    State__add_incoming(scissors, log(.33), paper);
+    State__add_incoming(scissors, log(.33), scissors);
+
+    struct Transition t = {.origin=rock->id, .logodd=log(.33)};
+    HMM__set_start(TestHMM__hmm, t);
+    HMM__set_end(TestHMM__hmm, t);
+    t.origin = paper->id;
+    HMM__set_start(TestHMM__hmm, t);
+    HMM__set_end(TestHMM__hmm, t);
+    t.origin = scissors->id;
+    HMM__set_start(TestHMM__hmm, t);
+    HMM__set_end(TestHMM__hmm, t);
+}
+
+
+void HMM__teardown(void) {
+    HMM__destroy(TestHMM__hmm);
+    EmissionTable__destroy(&TestHMM__et[0]);
+}
+
+
+Test(hmm, HMM__new_state) {
+    TestHMM__hmm = HMM__create(3, 0, 0);
+
+    struct State* one = HMM__new_state(TestHMM__hmm);
+    struct State* two = HMM__new_state(TestHMM__hmm);
+    struct State* three = HMM__new_state(TestHMM__hmm);
+
+    cr_assert_eq(one->id, 0);
+    cr_assert_eq(two->id, 1);
+    cr_assert_eq(three->id, 2);
+
+    cr_assert_eq(TestHMM__hmm->states[0].id, 0);
+    cr_assert_eq(TestHMM__hmm->states[1].id, 1);
+    cr_assert_eq(TestHMM__hmm->states[2].id, 2);
+
+    cr_assert_eq(one->num_incoming, 0);
+    cr_assert_eq(two->num_incoming, 0);
+
+    /* This test registers no .fini hook, so the HMM created above is released here. */
+    HMM__destroy(TestHMM__hmm);
+}
+
+
+Test(hmm, names_and_origins, .init=HMM__setup, .fini=HMM__teardown) {
+    cr_assert_str_eq(TestHMM__hmm->states[0].name, "Rock");
+    cr_assert_str_eq(TestHMM__hmm->states[1].name, "Paper");
+    cr_assert_str_eq(TestHMM__hmm->states[2].name, "Scissors");
+
+    char buffer[255];
+    State__str(&TestHMM__hmm->states[0], buffer);
+
+    cr_assert_eq(TestHMM__hmm->states[0].id, 0);
+    cr_assert_eq(TestHMM__hmm->states[1].id, 1);
+    cr_assert_eq(TestHMM__hmm->states[2].id, 2);
+
+    cr_assert_eq(TestHMM__hmm->states[0].num_incoming, 3);
+    cr_assert_eq(TestHMM__hmm->states[0].incoming[0].origin, 0);
+    cr_assert_eq(TestHMM__hmm->states[0].incoming[0].logodd, log(0.33));
+
+    cr_assert_str_eq(TestHMM__hmm->states[TestHMM__hmm->states[0].incoming[0].origin].name, "Rock");
+}
diff --git a/test/Literal.c b/test/Literal.c
new file mode 100644
index 0000000..e75cd19
--- /dev/null
+++ b/test/Literal.c
@@ -0,0 +1,49 @@
+/**
+ * Literal Tests
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+#include <criterion/criterion.h>
+#include <stdio.h>
+
+#include "EmissionTable.h"
+#include "HMM.h"
+
+#include "Literal.h"
+
+
+Test(literal, from_char) {
+    cr_assert_eq(Literal__from_char('A'), LITERAL_A);
+    cr_assert_eq(Literal__from_char('C'), LITERAL_C);
+    cr_assert_eq(Literal__from_char('G'), LITERAL_G);
+    cr_assert_eq(Literal__from_char('T'), LITERAL_T);
+}
+
+
+Test(literal, _char) {
+    cr_assert_eq(Literal__char(LITERAL_A), 'A');
+    cr_assert_eq(Literal__char(LITERAL_C), 'C');
+    cr_assert_eq(Literal__char(LITERAL_G), 'G');
+    cr_assert_eq(Literal__char(LITERAL_T), 'T');
+}
+
+
+Test(literal, str) {
+    Literal codon[3] = { LITERAL_A, LITERAL_C, LITERAL_G };
+    char buffer[4] = "\0";
+    Literal__str(3, codon, buffer);
+    cr_assert_eq(buffer[0], 'A');
+    cr_assert_eq(buffer[1], 'C');
+    cr_assert_eq(buffer[2], 'G');
+    cr_assert_str_eq(buffer, "ACG\0");
+}
+
+
+Test(literal, _uint) {
+    Literal sequence[3] = { LITERAL_A, LITERAL_C, LITERAL_G };
+    cr_assert_eq(Literal__uint(3, sequence), 0x06);
+    cr_assert_eq(Literal__uint(2, sequence), 0x01);
+    cr_assert_eq(Literal__uint(1, sequence), 0x00);
+    cr_assert_eq(Literal__uint(2, &sequence[1]), 0x06);
+    cr_assert_eq(Literal__uint(1, &sequence[2]), 0x02);
+    cr_assert_eq(Literal__uint(0, &sequence[3]), 0x00);
+}
diff --git a/test/Logodd.c b/test/Logodd.c
new file mode 100644
index 0000000..8b874ac
--- /dev/null
+++ b/test/Logodd.c
@@ -0,0 +1,29 @@
+/**
+ * Logodd Tests
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+#include <criterion/criterion.h>
+
+#include <math.h>
+
+#include "Logodd.h"
+
+
+Test(logodd, exp) {
+    cr_assert_eq(Logodd__exp(0), 1);
+    cr_assert_eq(Logodd__exp(LOGODD_NEGINF), 0);
+}
+
+
+Test(logodd, log) {
+    cr_assert_eq(Logodd__log(1), 0);
+    cr_assert_eq(Logodd__log(0), LOGODD_NEGINF);
+}
+
+
+Test(logodd, add) {
+    cr_assert_eq(Logodd__add(Logodd__log(0), Logodd__log(0)), LOGODD_NEGINF);
+    cr_assert_eq(Logodd__add(Logodd__log(1), Logodd__log(0)), LOGODD_NEGINF);
+    cr_assert_eq(Logodd__add(Logodd__log(0), Logodd__log(1)), LOGODD_NEGINF);
+    cr_assert_eq(Logodd__add(LOGODD_NEGINF, .1234), LOGODD_NEGINF);
+}
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 0000000..63be8e0
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,41 @@
+# Copyright 2016 Peter Schwede
+
+SILENCE=@
+
+CC = cc
+LD = cc
+
+LIB = criterion
+LIBLOC = ${HOME}/usr/local/lib/
+INC = ${LIBLOC}../include
+
+CFLAGS = -c -g -O0 -Wextra -Wall -DDEBUG -I../src/ -I${INC} --std=c11
+
+LDLIBS = -L${LIBLOC} -l${LIB} -lm -lc
+
+srcfiles := $(filter-out ../src/Cesar.c, $(wildcard ../src/*.c) $(wildcard *.c))
+objects := $(srcfiles:%.c=%.o)
+
+
+default: all
+
+
+.PHONY: all
+all: AllTests
+	./AllTests -j1
+	#AllTests
+
+
+.PHONY: valgrind
+valgrind: AllTests
+	valgrind -q --track-origins=yes --leak-check=yes ./AllTests
+
+
+.PHONY: AllTests
+AllTests: ${objects} ${srcfiles}
+	${CC} -o $@ ${objects} ${LDLIBS}
+
+
+.PHONY: clean
+clean:
+	${SILENCE}rm -f ${tests} *.o AllTests cesar_matrix.log
diff --git a/test/Matrix.c b/test/Matrix.c
new file mode 100644
index 0000000..b047fa8
--- /dev/null
+++ b/test/Matrix.c
@@ -0,0 +1,43 @@
+/**
+ * Matrix Test
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+#include <criterion/criterion.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "Logodd.h"
+
+#include "Matrix.h"
+
+struct LogoddMatrix* TestMatrix__dummy;
+struct PathMatrix* TestMatrix__puppet;
+
+Test(matrix, LogoddMatrix__str) {
+    LOGODD_T dephault = .5;
+    TestMatrix__dummy = LogoddMatrix__create(3, 3, dephault);
+
+    char buffer[1024] = "";
+    LogoddMatrix__str(TestMatrix__dummy, buffer);
+
+    char cmp[1024] = "0\t+5.000000E-01\t+5.000000E-01\t+5.000000E-01\t\n1\t+5.000000E-01\t+5.000000E-01\t+5.000000E-01\t\n2\t+5.000000E-01\t+5.000000E-01\t+5.000000E-01\t\n";
+
+    cr_assert_str_eq(buffer, cmp);
+
+    LogoddMatrix__destroy(TestMatrix__dummy);
+}
+
+Test(matrix, PathMatrix__str) {
+    STATE_ID_T dephault = 42;
+    TestMatrix__puppet = PathMatrix__create(3, 3, dephault);
+
+    char buffer[1024] = "";
+    PathMatrix__str(TestMatrix__puppet, buffer);
+
+    char cmp[1024] = "0\t42\t42\t42\t\n1\t42\t42\t42\t\n2\t42\t42\t42\t\n";
+
+    cr_assert_str_eq(buffer, cmp);
+
+    PathMatrix__destroy(TestMatrix__puppet);
+}
diff --git a/test/Normalize.c b/test/Normalize.c
new file mode 100644
index 0000000..7e9cb3c
--- /dev/null
+++ b/test/Normalize.c
@@ -0,0 +1,12 @@
+/**
+ * Test for normalize.
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+
+#include <criterion/criterion.h>
+#include <stdio.h>
+
+#include "HMM.h"
+#include "Logging.h"
+#include "EmissionTable.h"
+
diff --git a/test/Profile.c b/test/Profile.c
new file mode 100644
index 0000000..428b489
--- /dev/null
+++ b/test/Profile.c
@@ -0,0 +1,42 @@
+/**
+ * Test reading profiles.
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+
+#include <criterion/criterion.h>
+#include <stdio.h>
+
+#include "Logodd.h"
+
+#include "Profile.h"
+
+
+struct Profile* TestProfile__dummy;
+
+
+void TestProfile__setup(void) {
+    TestProfile__dummy = Profile__create("dummy");
+    Profile__read(TestProfile__dummy, "../extra/tables/human/do_profile.txt");
+}
+
+
+void TestProfile__teardown(void) {
+    Profile__destroy(TestProfile__dummy);
+}
+
+
+Test(profile, create_and_destroy, .init=TestProfile__setup, .fini=TestProfile__teardown) {
+}
+
+
+Test(profile, read_sequence, .init=TestProfile__setup, .fini=TestProfile__teardown) {
+    Literal profile[5] = {LITERAL_A, LITERAL_G, LITERAL_G, LITERAL_G, LITERAL_G};
+    cr_assert_eq(Profile__by_literals(TestProfile__dummy, profile), LOGODD_NEGINF);
+}
+
+
+Test(profile, read, .init=TestProfile__setup, .fini=TestProfile__teardown) {
+    cr_assert_eq(EmissionTable__get(&TestProfile__dummy->emission_tables[0], LITERAL_A, 0), LOGODD_NEGINF);
+    cr_assert_eq(EmissionTable__get(&TestProfile__dummy->emission_tables[0], LITERAL_G, 0), 0);
+    cr_assert_eq(EmissionTable__get(&TestProfile__dummy->emission_tables[5], LITERAL_T, 0), Logodd__log(0.483));
+}
diff --git a/test/State.c b/test/State.c
new file mode 100644
index 0000000..c50760e
--- /dev/null
+++ b/test/State.c
@@ -0,0 +1,78 @@
+/**
+ * State Tests
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+#include <criterion/criterion.h>
+
+#include "stdio.h"
+
+#include "EmissionTable.h"
+#include "State.h"
+
+struct State* TestState__dummy;
+
+void State__setup(void) {
+    TestState__dummy = malloc(sizeof(struct State));
+}
+
+void State__teardown(void) {
+    free(TestState__dummy);
+}
+
+Test(state, namelength, .init=State__setup, .fini=State__teardown) {
+    char name[STATE_NAME_LENGTH] = "0123456789012345678\0";
+    State__init_silent(TestState__dummy, name);
+
+    cr_assert_str_eq(TestState__dummy->name, name);
+}
+
+Test(state, string_representation, .init=State__setup, .fini=State__teardown) {
+    State__init_silent(TestState__dummy, "Foobar");
+    cr_assert_str_eq(TestState__dummy->name, "Foobar");
+
+    TestState__dummy->id = 4711;
+    cr_expect_eq(TestState__dummy->id, 4711);
+    cr_expect_eq(TestState__dummy->num_incoming, 0);
+
+    State__add_incoming(TestState__dummy, 0.5, TestState__dummy);
+    cr_expect_eq(TestState__dummy->num_incoming, 1);
+    cr_assert_eq(TestState__dummy->incoming[0].origin, 4711);
+    cr_assert_eq(TestState__dummy->incoming[0].logodd, 0.5);
+}
+
+Test(state, namelength_toolong, .init=State__setup, .fini=State__teardown) {
+    char name[41] = "0123456789012345678";
+    State__init_silent(TestState__dummy, name);
+
+    cr_assert_eq(strlen(TestState__dummy->name), STATE_NAME_LENGTH-1);
+    cr_assert_str_eq(TestState__dummy->name, "0123456789012345678");
+}
+
+Test(state, string_single_emission, .init=State__setup, .fini=State__teardown) {
+    State__init_silent(TestState__dummy, "Foobar");
+    TestState__dummy->id = 4711;
+
+    char buffer[255];
+    State__str(TestState__dummy, buffer);
+
+    cr_assert_str_eq(buffer, "State {.name=\"Foobar\", .id=4711, .num_emissions=0, .reference=\"\", .num_incoming=0}");
+}
+
+Test(state, string_triple_emission, .init=State__setup, .fini=State__teardown) {
+    struct EmissionTable* etable = (struct EmissionTable*) malloc(sizeof(EmissionTable));
+    TestState__dummy->id = 4711;
+    EmissionTable__init(etable, 3, UNIFORM_DISTRIBUTION);
+
+    State__init_uniform(TestState__dummy, "Foobarbaz", 3, etable);
+    cr_assert_eq(TestState__dummy->reference[0], LITERAL_A);
+    cr_assert_eq(TestState__dummy->reference[1], LITERAL_A);
+    cr_assert_eq(TestState__dummy->reference[2], LITERAL_A);
+
+    char buffer[255];
+    State__str(TestState__dummy, buffer);
+    cr_assert_str_eq(buffer, "State {.name=\"Foobarbaz\", .id=4711, .num_emissions=3, .reference=\"AAA\", .num_incoming=0}");
+
+    EmissionTable__destroy(etable);
+    /* The table struct was malloc'ed by this test, so free it as well. */
+    free(etable);
+}
diff --git a/test/Viterbi.c b/test/Viterbi.c
new file mode 100644
index 0000000..8a2796d
--- /dev/null
+++ b/test/Viterbi.c
@@ -0,0 +1,109 @@
+/**
+ * Viterbi Tests
+ * Copyright 2017 MPI-CBG/MPI-PKS Peter Schwede
+ */
+#include <criterion/criterion.h>
+#include <math.h>
+#include <stddef.h>
+
+#include "EmissionTable.h"
+#include "HMM.h"
+
+#include "Viterbi.h"
+
+struct HMM* TestViterbi__hmm;
+struct EmissionTable TestViterbi__et[5];
+size_t TestViterbi__num_emissiontables = 0;
+
+
+void TestViterbi__simpliest_setup(void) {
+    EmissionTable__init(&TestViterbi__et[0], 1, UNIFORM_DISTRIBUTION);
+    TestViterbi__num_emissiontables = 1;
+
+    TestViterbi__hmm = HMM__create(1, 1, 1);
+
+    struct State* s = HMM__new_state(TestViterbi__hmm);
+    State__init_uniform(s, "1nt_uni", 1, &TestViterbi__et[0]);
+    State__add_incoming(s, 0.5, s);
+
+    struct Transition t = {.origin=s->id, .logodd=0};
+    HMM__set_start(TestViterbi__hmm, t);
+    t.logodd = log(0.5);
+    HMM__set_end(TestViterbi__hmm, t);
+}
+
+
+void TestViterbi__complex_setup(void) {
+    EmissionTable__init(&TestViterbi__et[0], 1, UNIFORM_DISTRIBUTION);
+    EmissionTable__init(&TestViterbi__et[1], 3, UNIFORM_DISTRIBUTION);
+    TestViterbi__num_emissiontables = 2;
+
+    TestViterbi__hmm = HMM__create(2, 1, 1);
+
+    struct State* s0 = HMM__new_state(TestViterbi__hmm);
+    struct State* s1 = HMM__new_state(TestViterbi__hmm);
+
+    State__init_uniform(s0, "1nt_uni", 1, &TestViterbi__et[0]);
+    State__init_uniform(s1, "3nt_uni", 3, &TestViterbi__et[1]);
+
+    State__add_incoming(s0, 0.5, s0);
+    State__add_incoming(s0, 0.5, s1);
+    State__add_incoming(s1, 0.5, s1);
+
+    struct Transition t = {.origin=s0->id, .logodd=log(0.5)};
+    HMM__set_start(TestViterbi__hmm, t);
+
+    t.origin=s1->id;
+    HMM__set_end(TestViterbi__hmm, t);
+}
+
+
+void TestViterbi__simpliest_teardown(void) {
+    HMM__destroy(TestViterbi__hmm);
+
+    for (size_t i=0; i < TestViterbi__num_emissiontables; i++) {
+        EmissionTable__destroy(&TestViterbi__et[i]);
+    }
+    TestViterbi__num_emissiontables = 0;
+}
+
+
+void TestViterbi__validate_simple_path(void) {
+    /* Viterbi() starts its backtrace at path[2 * num_observations - 1]
+     * and only compacts the result afterwards, so the buffer must hold
+     * twice as many entries as there are observations. */
+    enum { NUM_OBSERVATIONS = 3 };
+    size_t path_length = 0;
+    struct State* path[2 * NUM_OBSERVATIONS] = { NULL };
+    Literal observations[NUM_OBSERVATIONS] = { LITERAL_A, LITERAL_A, LITERAL_A };
+
+    Viterbi(TestViterbi__hmm, NUM_OBSERVATIONS, observations, &path_length, path);
+
+    /* Make sure the entries inspected below were actually produced. */
+    cr_assert_geq(path_length, NUM_OBSERVATIONS);
+    for (size_t i=0; i < NUM_OBSERVATIONS; i++) {
+        struct State* state = path[i];
+        /*
+        char tmp[255] = "\0";
+        State__str(state, tmp);
+        printf("%c -> %s\n", Literal__char(observations[i]), tmp);
+        */
+        cr_assert_neq(state, NULL);
+        cr_assert_eq(state->id, 0);
+    }
+}
+
+
+Test(viterbi, build_and_breakdown, .init=TestViterbi__simpliest_setup, .fini=TestViterbi__simpliest_teardown) {
+    cr_assert(1);
+}
+
+
+Test(viterbi, single_emission, .init=TestViterbi__simpliest_setup, .fini=TestViterbi__simpliest_teardown) {
+    //TestViterbi__validate_simple_path();
+}
+
+
+Test(viterbi, complex__HMM, .init=TestViterbi__complex_setup, .fini=TestViterbi__simpliest_teardown) {
+
+}