diff --git a/examples/newick-phylip-unrooted/newick-phylip-unrooted.c b/examples/newick-phylip-unrooted/newick-phylip-unrooted.c index a6b8469..97198f4 100644 --- a/examples/newick-phylip-unrooted/newick-phylip-unrooted.c +++ b/examples/newick-phylip-unrooted/newick-phylip-unrooted.c @@ -93,7 +93,6 @@ int main(int argc, char * argv[]) { unsigned int i; unsigned int tip_nodes_count, inner_nodes_count, nodes_count, branch_count; - unsigned int sequence_count; unsigned int matrix_count, ops_count; unsigned int * matrix_indices; double * branch_lengths; @@ -104,7 +103,7 @@ int main(int argc, char * argv[]) /* we accept only two arguments - the newick tree (unrooted binary) and the alignment in the form of FASTA reads */ if (argc != 3) - fatal(" syntax: %s [newick] [fasta]", argv[0]); + fatal(" syntax: %s [newick] [phylip]", argv[0]); /* parse the unrooted binary tree in newick format, and store the number of tip nodes in tip_nodes_count */ @@ -162,12 +161,18 @@ int main(int argc, char * argv[]) } /* read PHYLIP alignment */ - pll_msa_t * msa = pll_phylip_parse_msa(argv[2], &sequence_count); + pll_phylip_t * fd = pll_phylip_open(argv[2], pll_map_phylip); + if (!fd) + fatal(pll_errmsg); + + pll_msa_t * msa = pll_phylip_parse_interleaved(fd); if (!msa) fatal(pll_errmsg); + pll_phylip_close(fd); + /* compress site patterns */ - if (sequence_count != tip_nodes_count) + if ((unsigned int)(msa->count) != tip_nodes_count) fatal("Number of sequences does not match number of leaves in tree"); #ifdef COMPRESS diff --git a/examples/parsimony/npr-pars.c b/examples/parsimony/npr-pars.c index 871ffa7..0175629 100644 --- a/examples/parsimony/npr-pars.c +++ b/examples/parsimony/npr-pars.c @@ -66,7 +66,6 @@ int main(int argc, char * argv[]) { unsigned int i,j,n; unsigned int tip_nodes_count, inner_nodes_count, nodes_count, branch_count; - unsigned int sequence_count; unsigned int ops_count; pll_parsimony_t * pars; pll_pars_buildop_t * operations; @@ -129,11 +128,17 @@ int main(int argc, char * argv[]) } /* read PHYLIP alignment */ - pll_msa_t * msa = pll_phylip_parse_msa(argv[2], &sequence_count); + pll_phylip_t * fd = pll_phylip_open(argv[2], pll_map_phylip); + if (!fd) + fatal(pll_errmsg); + + pll_msa_t * msa = pll_phylip_parse_interleaved(fd); if (!msa) fatal(pll_errmsg); - if (sequence_count != tip_nodes_count) + pll_phylip_close(fd); + + if ((unsigned int)(msa->count) != tip_nodes_count) fatal("Number of sequences does not match number of leaves in tree"); /* create the PLL parsimony instance diff --git a/src/Makefile.am b/src/Makefile.am index 30f21fb..7f7ee8b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -31,13 +31,12 @@ core_pmatrix.c \ core_likelihood.c \ parse_utree.y \ parse_rtree.y \ -parse_phylip.y \ lex_utree.l \ lex_rtree.l \ -lex_phylip.l \ fast_parsimony.c \ stepwise.c \ -random.c +random.c \ +phylip.c libpll_la_CFLAGS = $(AM_CFLAGS) diff --git a/src/lex_phylip.l b/src/lex_phylip.l deleted file mode 100644 index 58fe93c..0000000 --- a/src/lex_phylip.l +++ /dev/null @@ -1,96 +0,0 @@ -/* - Copyright (C) 2015 Tomas Flouri - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as - published by the Free Software Foundation, either version 3 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see . - - Contact: Tomas Flouri , - Heidelberg Institute for Theoretical Studies, - Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany -*/ -%{ -#include "pll.h" -#include "parse_phylip.h" - -static size_t string_length = 0; - -static char * xstrndup(const char * s, size_t len) -{ - char * p = (char *)malloc(len+1); - if (!p) - { - pll_errno = PLL_ERROR_MEM_ALLOC; - snprintf(pll_errmsg, 200, "Unable to allocate enough memory."); - return PLL_FAILURE; - } - strncpy(p,s,len); - p[len] = 0; - return p; -} - -static char * append(size_t * dstlen, const char * src, size_t srclen) -{ - char * mem = (char *)realloc(pll_phylip_lval.s, (*dstlen + srclen + 1)); - if (!mem) - { - pll_errno = PLL_ERROR_MEM_ALLOC; - snprintf(pll_errmsg, 200, "Unable to allocate enough memory."); - return PLL_FAILURE; - } - pll_phylip_lval.s = mem; - strncpy(&pll_phylip_lval.s[*dstlen], src, srclen); - pll_phylip_lval.s[*dstlen + srclen] = 0; - *dstlen += srclen; - return pll_phylip_lval.s; -} - -%} -%option noyywrap -%option prefix="pll_phylip_" -%option nounput -%option noinput -%s apos -%s quot - -%% -{ -\\\" { append(&string_length, "\"", 1); } -\' { append(&string_length, "\'", 1); } -\" { BEGIN(INITIAL); return STRING; } -} - -{ -\\\' { append(&string_length, "\'", 1); } -\" { append(&string_length, "\"", 1); } -\' { BEGIN(INITIAL); return STRING; } -} - -{ -\\n { append(&string_length, "\n", 1); } -\\t { append(&string_length, "\t", 1); } -\\\\ { append(&string_length, "\\", 1); } -([^\"\'\\]|\n)+ { append(&string_length, pll_phylip_text, pll_phylip_leng); } -} - -\" { string_length = 0; pll_phylip_lval.s = NULL; BEGIN(quot); } -\' { string_length =0; pll_phylip_lval.s = NULL; ;BEGIN(apos); } -([^\t\n\r ])+ { pll_phylip_lval.s=xstrndup(pll_phylip_text,pll_phylip_leng); - return STRING; } -[ \t\n\r]*\#[^\n]* { - ; } -[ \t\n\r] { ; } -. { snprintf(pll_errmsg, 200, - "Syntax error (%c)\n", pll_phylip_text[0]); - pll_errno = PLL_ERROR_PHYLIP_SYNTAX; - return PLL_FAILURE; } -%% diff --git a/src/maps.c b/src/maps.c index 004af7c..83398f6 100644 --- a/src/maps.c +++ b/src/maps.c @@ -101,7 +101,7 @@ const unsigned int pll_map_aa[256] = /* - map for fasta parsing + maps for fasta and phylip parsing legal symbols: ?*- 0123456789 abcdefghijklmnopqrstuvxyz (also upper case) fatal symbols: period (.), ascii 0-8, ascii 14-31 @@ -109,8 +109,37 @@ const unsigned int pll_map_aa[256] = stripped: !"#$&'()+,/:;<=>@^_`æøåÆØŧ¨´ includes both amino acid and nucleotide sequences, adapt to nt only + + TODO: Would be more suitable to create separate maps for parsing nt data, + aa data, binary, etc */ +const unsigned int pll_map_phylip[256] = + { + /* + 0=stripped, 1=legal, 2=fatal, 3=silently stripped + @ A B C D E F G H I J K L M N O + P Q R S T U V W X Y Z [ \ ] ^ _ + */ + +/* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, /* 0 */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 1 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, /* 2 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, /* 3 */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 5 */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, /* 6 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 8 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* D */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* E */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F */ + }; const unsigned int pll_map_fasta[256] = { /* diff --git a/src/parse_phylip.y b/src/parse_phylip.y deleted file mode 100644 index ec7ac9a..0000000 --- a/src/parse_phylip.y +++ /dev/null @@ -1,244 +0,0 @@ -/* - Copyright (C) 2015 Tomas Flouri - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as - published by the Free Software Foundation, either version 3 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see . - - Contact: Tomas Flouri , - Heidelberg Institute for Theoretical Studies, - Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany -*/ -%{ -#include "pll.h" - -extern int pll_phylip_lex(); -extern FILE * pll_phylip_in; -extern void pll_phylip_lex_destroy(); -extern int pll_phylip_lineno; - -static int seq_count = 0; -static int g_error = 0; - -struct sequence_s -{ - int len; - char * label; - char * sequence; -}; - -void pll_phylip_error(pll_msa_t ** msa, const char * s) -{ - fprintf(stderr, "%s.\n", s); -} - -PLL_EXPORT void pll_msa_destroy(pll_msa_t * msa) -{ - if (!msa) return; - - int i; - - if (msa->label) - { - for (i = 0; i < msa->count; ++i) - free(msa->label[i]); - free(msa->label); - } - - if (msa->sequence) - { - for (i = 0; i < msa->count; ++i) - free(msa->sequence[i]); - free(msa->sequence); - } - - free(msa); -} - -void msa_destroy(pll_msa_t * msa) -{ - if (msa) - { - msa->count = seq_count; - pll_msa_destroy(msa); - } -} -%} - -%union -{ - char * s; - pll_msa_t * msa; - struct sequence_s ** sequence; -} - -%error-verbose -%parse-param {pll_msa_t ** msa} -%destructor { msa_destroy($$); } msa - -%token STRING -%type msa -%type sequence_list -%start input -%% - -input: msa -{ - *msa = $1; -} - -msa: STRING STRING sequence_list -{ - int i; - - if (!g_error) - for (i = 0 ; $1[i]; ++i) - if (!isdigit($1[i])) - { - pll_errno = PLL_ERROR_PHYLIP_SYNTAX; - snprintf(pll_errmsg, 200, "Number of taxa must be an integer"); - g_error = 1; - break; - } - - if (!g_error) - for (i = 0 ; $2[i]; ++i) - if (!isdigit($2[i])) - { - pll_errno = PLL_ERROR_PHYLIP_SYNTAX; - snprintf(pll_errmsg, 200, "Sequence length must be an integer"); - g_error = 1; - break; - } - - if (!g_error) - if (atoi($1) != seq_count) - { - pll_errno = PLL_ERROR_PHYLIP_SYNTAX; - snprintf(pll_errmsg, 200, - "Number of sequences read is %d but expected %d\n", - seq_count, - atoi($1)); - g_error = 1; - } - - if (!g_error) - if (atoi($2) != $3[seq_count-1]->len) - { - pll_errno = PLL_ERROR_PHYLIP_SYNTAX; - snprintf(pll_errmsg, 200, - "Sequence length is %d but expected %d\n", - $3[seq_count-1]->len, - atoi($2)); - g_error = 1; - } - - $$ = NULL; - - if (!g_error) - { - $$ = (pll_msa_t *)calloc(1,sizeof(pll_msa_t)); - - $$->count = atoi($1); - $$->length = atoi($2); - $$->sequence = (char **)malloc(seq_count*sizeof(char *)); - $$->label = (char **)malloc(seq_count*sizeof(char *)); - - for (i = 0; i < seq_count; ++i) - { - $$->sequence[i] = $3[seq_count-i-1]->sequence; - $$->label[i] = $3[seq_count-i-1]->label; - free($3[seq_count-i-1]); - } - } - else - { - for (i = 0; i < seq_count; ++i) - { - free($3[seq_count-i-1]->sequence); - free($3[seq_count-i-1]->label); - free($3[seq_count-i-1]); - } - seq_count = 0; - } - - free($1); - free($2); - free($3); -} - -sequence_list: STRING STRING sequence_list -{ - $$ = (struct sequence_s **)realloc((void *)$3, - (seq_count+1)*sizeof(struct sequence_s *)); - $$[seq_count] = (struct sequence_s *)malloc(sizeof(struct sequence_s)); - $$[seq_count]->label = $1; - $$[seq_count]->sequence = $2; - $$[seq_count]->len = strlen($2); - - if ($$[seq_count]->len != $$[seq_count-1]->len) - { - pll_errno = PLL_ERROR_PHYLIP_SYNTAX; - snprintf(pll_errmsg, 200, "Lengths of sequences %s and %s differ", - $$[seq_count]->label, $$[seq_count-1]->label); - g_error = 1; - - } - seq_count++; - -} - | STRING STRING -{ - $$ = (struct sequence_s **)calloc(1,sizeof(struct sequence_s *)); - $$[seq_count] = (struct sequence_s *)malloc(sizeof(struct sequence_s)); - $$[seq_count]->label = $1; - $$[seq_count]->sequence = $2; - $$[seq_count]->len = strlen($2); - seq_count++; -} - -%% - -PLL_EXPORT pll_msa_t * pll_phylip_parse_msa(const char * filename, - unsigned int * msa_count) -{ - pll_msa_t * msa = NULL; - - pll_phylip_lineno = 1; - seq_count= 0; - g_error = 0; - - pll_phylip_in = fopen(filename, "r"); - if (!pll_phylip_in) - { - msa_destroy(msa); - pll_errno = PLL_ERROR_FILE_OPEN; - snprintf(pll_errmsg, 200, "Unable to open file (%s)", filename); - return PLL_FAILURE; - } - else if (pll_phylip_parse(&msa)) - { - msa_destroy(msa); - msa = NULL; - fclose(pll_phylip_in); - pll_phylip_lex_destroy(); - return PLL_FAILURE; - } - - if (pll_phylip_in) fclose(pll_phylip_in); - - pll_phylip_lex_destroy(); - - *msa_count = seq_count; - - return msa; -} diff --git a/src/pll.h b/src/pll.h index 8f0c770..79d990c 100644 --- a/src/pll.h +++ b/src/pll.h @@ -118,26 +118,30 @@ #define PLL_ERROR_FASTA_UNPRINTABLECHAR 104 #define PLL_ERROR_FASTA_INVALIDHEADER 105 #define PLL_ERROR_PHYLIP_SYNTAX 106 -#define PLL_ERROR_NEWICK_SYNTAX 107 -#define PLL_ERROR_MEM_ALLOC 108 -#define PLL_ERROR_PARAM_INVALID 109 -#define PLL_ERROR_TIPDATA_ILLEGALSTATE 110 -#define PLL_ERROR_TIPDATA_ILLEGALFUNCTION 111 -#define PLL_ERROR_TREE_CONVERSION 112 -#define PLL_ERROR_INVAR_INCOMPAT 113 -#define PLL_ERROR_INVAR_PROPORTION 114 -#define PLL_ERROR_INVAR_PARAMINDEX 115 -#define PLL_ERROR_INVAR_NONEFOUND 116 -#define PLL_ERROR_AB_INVALIDMETHOD 117 -#define PLL_ERROR_AB_NOSUPPORT 118 -#define PLL_ERROR_SPR_TERMINALBRANCH 119 -#define PLL_ERROR_SPR_NOCHANGE 120 -#define PLL_ERROR_NNI_INVALIDMOVE 121 -#define PLL_ERROR_NNI_TERMINALBRANCH 122 -#define PLL_ERROR_STEPWISE_STRUCT 123 -#define PLL_ERROR_STEPWISE_TIPS 124 -#define PLL_ERROR_STEPWISE_UNSUPPORTED 125 -#define PLL_ERROR_EINVAL 126 +#define PLL_ERROR_PHYLIP_LONGSEQ 107 +#define PLL_ERROR_PHYLIP_NONALIGNED 108 +#define PLL_ERROR_PHYLIP_ILLEGALCHAR 109 +#define PLL_ERROR_PHYLIP_UNPRINTABLECHAR 110 +#define PLL_ERROR_NEWICK_SYNTAX 111 +#define PLL_ERROR_MEM_ALLOC 112 +#define PLL_ERROR_PARAM_INVALID 113 +#define PLL_ERROR_TIPDATA_ILLEGALSTATE 114 +#define PLL_ERROR_TIPDATA_ILLEGALFUNCTION 115 +#define PLL_ERROR_TREE_CONVERSION 116 +#define PLL_ERROR_INVAR_INCOMPAT 117 +#define PLL_ERROR_INVAR_PROPORTION 118 +#define PLL_ERROR_INVAR_PARAMINDEX 119 +#define PLL_ERROR_INVAR_NONEFOUND 120 +#define PLL_ERROR_AB_INVALIDMETHOD 121 +#define PLL_ERROR_AB_NOSUPPORT 122 +#define PLL_ERROR_SPR_TERMINALBRANCH 123 +#define PLL_ERROR_SPR_NOCHANGE 124 +#define PLL_ERROR_NNI_INVALIDMOVE 125 +#define PLL_ERROR_NNI_TERMINALBRANCH 126 +#define PLL_ERROR_STEPWISE_STRUCT 127 +#define PLL_ERROR_STEPWISE_TIPS 128 +#define PLL_ERROR_STEPWISE_UNSUPPORTED 129 +#define PLL_ERROR_EINVAL 130 /* utree specific */ @@ -243,6 +247,22 @@ typedef struct pll_fasta long stripped[256]; } pll_fasta_t; +/* Simple structure for handling PHYLIP parsing */ +typedef struct pll_phylip_s +{ + FILE * fp; + char * line; + size_t line_size; + size_t line_maxsize; + char buffer[PLL_LINEALLOC]; + const unsigned int * chrstatus; + long no; + long filesize; + long lineno; + long stripped_count; + long stripped[256]; +} pll_phylip_t; + /* Simple unrooted and rooted tree structure for parsing newick */ typedef struct pll_unode_s @@ -410,6 +430,7 @@ PLL_EXPORT extern const unsigned int pll_map_bin[256]; PLL_EXPORT extern const unsigned int pll_map_nt[256]; PLL_EXPORT extern const unsigned int pll_map_aa[256]; PLL_EXPORT extern const unsigned int pll_map_fasta[256]; +PLL_EXPORT extern const unsigned int pll_map_phylip[256]; PLL_EXPORT extern const double pll_aa_rates_dayhoff[190]; PLL_EXPORT extern const double pll_aa_rates_lg[190]; @@ -695,13 +716,21 @@ PLL_EXPORT void pll_utree_create_pars_buildops(pll_unode_t * const* trav_buffer, pll_pars_buildop_t * ops, unsigned int * ops_count); -/* functions in parse_phylip.y */ - -PLL_EXPORT pll_msa_t * pll_phylip_parse_msa(const char * filename, - unsigned int * msa_count); +/* functions in phylip.c */ PLL_EXPORT void pll_msa_destroy(pll_msa_t * msa); +PLL_EXPORT pll_phylip_t * pll_phylip_open(const char * filename, + const unsigned int * map); + +PLL_EXPORT int pll_phylip_rewind(pll_phylip_t * fd); + +PLL_EXPORT void pll_phylip_close(pll_phylip_t * fd); + +PLL_EXPORT pll_msa_t * pll_phylip_parse_interleaved(pll_phylip_t * fd); + +PLL_EXPORT pll_msa_t * pll_phylip_parse_sequential(pll_phylip_t * fd); + /* functions in rtree.c */ PLL_EXPORT void pll_rtree_show_ascii(const pll_rnode_t * root, int options);