diff --git a/examples/newick-phylip-unrooted/newick-phylip-unrooted.c b/examples/newick-phylip-unrooted/newick-phylip-unrooted.c
index a6b8469..97198f4 100644
--- a/examples/newick-phylip-unrooted/newick-phylip-unrooted.c
+++ b/examples/newick-phylip-unrooted/newick-phylip-unrooted.c
@@ -93,7 +93,6 @@ int main(int argc, char * argv[])
{
unsigned int i;
unsigned int tip_nodes_count, inner_nodes_count, nodes_count, branch_count;
- unsigned int sequence_count;
unsigned int matrix_count, ops_count;
unsigned int * matrix_indices;
double * branch_lengths;
@@ -104,7 +103,7 @@ int main(int argc, char * argv[])
/* we accept only two arguments - the newick tree (unrooted binary) and the
alignment in the form of FASTA reads */
if (argc != 3)
- fatal(" syntax: %s [newick] [fasta]", argv[0]);
+ fatal(" syntax: %s [newick] [phylip]", argv[0]);
/* parse the unrooted binary tree in newick format, and store the number
of tip nodes in tip_nodes_count */
@@ -162,12 +161,18 @@ int main(int argc, char * argv[])
}
/* read PHYLIP alignment */
- pll_msa_t * msa = pll_phylip_parse_msa(argv[2], &sequence_count);
+ pll_phylip_t * fd = pll_phylip_open(argv[2], pll_map_phylip);
+ if (!fd)
+ fatal(pll_errmsg);
+
+ pll_msa_t * msa = pll_phylip_parse_interleaved(fd);
if (!msa)
fatal(pll_errmsg);
+ pll_phylip_close(fd);
+
/* compress site patterns */
- if (sequence_count != tip_nodes_count)
+ if ((unsigned int)(msa->count) != tip_nodes_count)
fatal("Number of sequences does not match number of leaves in tree");
#ifdef COMPRESS
diff --git a/examples/parsimony/npr-pars.c b/examples/parsimony/npr-pars.c
index 871ffa7..0175629 100644
--- a/examples/parsimony/npr-pars.c
+++ b/examples/parsimony/npr-pars.c
@@ -66,7 +66,6 @@ int main(int argc, char * argv[])
{
unsigned int i,j,n;
unsigned int tip_nodes_count, inner_nodes_count, nodes_count, branch_count;
- unsigned int sequence_count;
unsigned int ops_count;
pll_parsimony_t * pars;
pll_pars_buildop_t * operations;
@@ -129,11 +128,17 @@ int main(int argc, char * argv[])
}
/* read PHYLIP alignment */
- pll_msa_t * msa = pll_phylip_parse_msa(argv[2], &sequence_count);
+ pll_phylip_t * fd = pll_phylip_open(argv[2], pll_map_phylip);
+ if (!fd)
+ fatal(pll_errmsg);
+
+ pll_msa_t * msa = pll_phylip_parse_interleaved(fd);
if (!msa)
fatal(pll_errmsg);
- if (sequence_count != tip_nodes_count)
+ pll_phylip_close(fd);
+
+ if ((unsigned int)(msa->count) != tip_nodes_count)
fatal("Number of sequences does not match number of leaves in tree");
/* create the PLL parsimony instance
diff --git a/src/Makefile.am b/src/Makefile.am
index 30f21fb..7f7ee8b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -31,13 +31,12 @@ core_pmatrix.c \
core_likelihood.c \
parse_utree.y \
parse_rtree.y \
-parse_phylip.y \
lex_utree.l \
lex_rtree.l \
-lex_phylip.l \
fast_parsimony.c \
stepwise.c \
-random.c
+random.c \
+phylip.c
libpll_la_CFLAGS = $(AM_CFLAGS)
diff --git a/src/lex_phylip.l b/src/lex_phylip.l
deleted file mode 100644
index 58fe93c..0000000
--- a/src/lex_phylip.l
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- Copyright (C) 2015 Tomas Flouri
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as
- published by the Free Software Foundation, either version 3 of the
- License, or (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see .
-
- Contact: Tomas Flouri ,
- Heidelberg Institute for Theoretical Studies,
- Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
-*/
-%{
-#include "pll.h"
-#include "parse_phylip.h"
-
-static size_t string_length = 0;
-
-static char * xstrndup(const char * s, size_t len)
-{
- char * p = (char *)malloc(len+1);
- if (!p)
- {
- pll_errno = PLL_ERROR_MEM_ALLOC;
- snprintf(pll_errmsg, 200, "Unable to allocate enough memory.");
- return PLL_FAILURE;
- }
- strncpy(p,s,len);
- p[len] = 0;
- return p;
-}
-
-static char * append(size_t * dstlen, const char * src, size_t srclen)
-{
- char * mem = (char *)realloc(pll_phylip_lval.s, (*dstlen + srclen + 1));
- if (!mem)
- {
- pll_errno = PLL_ERROR_MEM_ALLOC;
- snprintf(pll_errmsg, 200, "Unable to allocate enough memory.");
- return PLL_FAILURE;
- }
- pll_phylip_lval.s = mem;
- strncpy(&pll_phylip_lval.s[*dstlen], src, srclen);
- pll_phylip_lval.s[*dstlen + srclen] = 0;
- *dstlen += srclen;
- return pll_phylip_lval.s;
-}
-
-%}
-%option noyywrap
-%option prefix="pll_phylip_"
-%option nounput
-%option noinput
-%s apos
-%s quot
-
-%%
-{
-\\\" { append(&string_length, "\"", 1); }
-\' { append(&string_length, "\'", 1); }
-\" { BEGIN(INITIAL); return STRING; }
-}
-
-{
-\\\' { append(&string_length, "\'", 1); }
-\" { append(&string_length, "\"", 1); }
-\' { BEGIN(INITIAL); return STRING; }
-}
-
-{
-\\n { append(&string_length, "\n", 1); }
-\\t { append(&string_length, "\t", 1); }
-\\\\ { append(&string_length, "\\", 1); }
-([^\"\'\\]|\n)+ { append(&string_length, pll_phylip_text, pll_phylip_leng); }
-}
-
-\" { string_length = 0; pll_phylip_lval.s = NULL; BEGIN(quot); }
-\' { string_length =0; pll_phylip_lval.s = NULL; ;BEGIN(apos); }
-([^\t\n\r ])+ { pll_phylip_lval.s=xstrndup(pll_phylip_text,pll_phylip_leng);
- return STRING; }
-[ \t\n\r]*\#[^\n]* {
- ; }
-[ \t\n\r] { ; }
-. { snprintf(pll_errmsg, 200,
- "Syntax error (%c)\n", pll_phylip_text[0]);
- pll_errno = PLL_ERROR_PHYLIP_SYNTAX;
- return PLL_FAILURE; }
-%%
diff --git a/src/maps.c b/src/maps.c
index 004af7c..83398f6 100644
--- a/src/maps.c
+++ b/src/maps.c
@@ -101,7 +101,7 @@ const unsigned int pll_map_aa[256] =
/*
- map for fasta parsing
+ maps for fasta and phylip parsing
legal symbols: ?*- 0123456789 abcdefghijklmnopqrstuvxyz (also upper case)
fatal symbols: period (.), ascii 0-8, ascii 14-31
@@ -109,8 +109,37 @@ const unsigned int pll_map_aa[256] =
stripped: !"#$&'()+,/:;<=>@^_`æøåÆØŧ¨´
includes both amino acid and nucleotide sequences, adapt to nt only
+
+ TODO: Would be more suitable to create separate maps for parsing nt data,
+ aa data, binary, etc
*/
+const unsigned int pll_map_phylip[256] =
+ {
+ /*
+ 0=stripped, 1=legal, 2=fatal, 3=silently stripped
+ @ A B C D E F G H I J K L M N O
+ P Q R S T U V W X Y Z [ \ ] ^ _
+ */
+
+/* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 2, /* 0 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 1 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, /* 2 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, /* 3 */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 5 */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, /* 6 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 8 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* D */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* E */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F */
+ };
const unsigned int pll_map_fasta[256] =
{
/*
diff --git a/src/parse_phylip.y b/src/parse_phylip.y
deleted file mode 100644
index ec7ac9a..0000000
--- a/src/parse_phylip.y
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- Copyright (C) 2015 Tomas Flouri
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as
- published by the Free Software Foundation, either version 3 of the
- License, or (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see .
-
- Contact: Tomas Flouri ,
- Heidelberg Institute for Theoretical Studies,
- Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
-*/
-%{
-#include "pll.h"
-
-extern int pll_phylip_lex();
-extern FILE * pll_phylip_in;
-extern void pll_phylip_lex_destroy();
-extern int pll_phylip_lineno;
-
-static int seq_count = 0;
-static int g_error = 0;
-
-struct sequence_s
-{
- int len;
- char * label;
- char * sequence;
-};
-
-void pll_phylip_error(pll_msa_t ** msa, const char * s)
-{
- fprintf(stderr, "%s.\n", s);
-}
-
-PLL_EXPORT void pll_msa_destroy(pll_msa_t * msa)
-{
- if (!msa) return;
-
- int i;
-
- if (msa->label)
- {
- for (i = 0; i < msa->count; ++i)
- free(msa->label[i]);
- free(msa->label);
- }
-
- if (msa->sequence)
- {
- for (i = 0; i < msa->count; ++i)
- free(msa->sequence[i]);
- free(msa->sequence);
- }
-
- free(msa);
-}
-
-void msa_destroy(pll_msa_t * msa)
-{
- if (msa)
- {
- msa->count = seq_count;
- pll_msa_destroy(msa);
- }
-}
-%}
-
-%union
-{
- char * s;
- pll_msa_t * msa;
- struct sequence_s ** sequence;
-}
-
-%error-verbose
-%parse-param {pll_msa_t ** msa}
-%destructor { msa_destroy($$); } msa
-
-%token STRING
-%type msa
-%type sequence_list
-%start input
-%%
-
-input: msa
-{
- *msa = $1;
-}
-
-msa: STRING STRING sequence_list
-{
- int i;
-
- if (!g_error)
- for (i = 0 ; $1[i]; ++i)
- if (!isdigit($1[i]))
- {
- pll_errno = PLL_ERROR_PHYLIP_SYNTAX;
- snprintf(pll_errmsg, 200, "Number of taxa must be an integer");
- g_error = 1;
- break;
- }
-
- if (!g_error)
- for (i = 0 ; $2[i]; ++i)
- if (!isdigit($2[i]))
- {
- pll_errno = PLL_ERROR_PHYLIP_SYNTAX;
- snprintf(pll_errmsg, 200, "Sequence length must be an integer");
- g_error = 1;
- break;
- }
-
- if (!g_error)
- if (atoi($1) != seq_count)
- {
- pll_errno = PLL_ERROR_PHYLIP_SYNTAX;
- snprintf(pll_errmsg, 200,
- "Number of sequences read is %d but expected %d\n",
- seq_count,
- atoi($1));
- g_error = 1;
- }
-
- if (!g_error)
- if (atoi($2) != $3[seq_count-1]->len)
- {
- pll_errno = PLL_ERROR_PHYLIP_SYNTAX;
- snprintf(pll_errmsg, 200,
- "Sequence length is %d but expected %d\n",
- $3[seq_count-1]->len,
- atoi($2));
- g_error = 1;
- }
-
- $$ = NULL;
-
- if (!g_error)
- {
- $$ = (pll_msa_t *)calloc(1,sizeof(pll_msa_t));
-
- $$->count = atoi($1);
- $$->length = atoi($2);
- $$->sequence = (char **)malloc(seq_count*sizeof(char *));
- $$->label = (char **)malloc(seq_count*sizeof(char *));
-
- for (i = 0; i < seq_count; ++i)
- {
- $$->sequence[i] = $3[seq_count-i-1]->sequence;
- $$->label[i] = $3[seq_count-i-1]->label;
- free($3[seq_count-i-1]);
- }
- }
- else
- {
- for (i = 0; i < seq_count; ++i)
- {
- free($3[seq_count-i-1]->sequence);
- free($3[seq_count-i-1]->label);
- free($3[seq_count-i-1]);
- }
- seq_count = 0;
- }
-
- free($1);
- free($2);
- free($3);
-}
-
-sequence_list: STRING STRING sequence_list
-{
- $$ = (struct sequence_s **)realloc((void *)$3,
- (seq_count+1)*sizeof(struct sequence_s *));
- $$[seq_count] = (struct sequence_s *)malloc(sizeof(struct sequence_s));
- $$[seq_count]->label = $1;
- $$[seq_count]->sequence = $2;
- $$[seq_count]->len = strlen($2);
-
- if ($$[seq_count]->len != $$[seq_count-1]->len)
- {
- pll_errno = PLL_ERROR_PHYLIP_SYNTAX;
- snprintf(pll_errmsg, 200, "Lengths of sequences %s and %s differ",
- $$[seq_count]->label, $$[seq_count-1]->label);
- g_error = 1;
-
- }
- seq_count++;
-
-}
- | STRING STRING
-{
- $$ = (struct sequence_s **)calloc(1,sizeof(struct sequence_s *));
- $$[seq_count] = (struct sequence_s *)malloc(sizeof(struct sequence_s));
- $$[seq_count]->label = $1;
- $$[seq_count]->sequence = $2;
- $$[seq_count]->len = strlen($2);
- seq_count++;
-}
-
-%%
-
-PLL_EXPORT pll_msa_t * pll_phylip_parse_msa(const char * filename,
- unsigned int * msa_count)
-{
- pll_msa_t * msa = NULL;
-
- pll_phylip_lineno = 1;
- seq_count= 0;
- g_error = 0;
-
- pll_phylip_in = fopen(filename, "r");
- if (!pll_phylip_in)
- {
- msa_destroy(msa);
- pll_errno = PLL_ERROR_FILE_OPEN;
- snprintf(pll_errmsg, 200, "Unable to open file (%s)", filename);
- return PLL_FAILURE;
- }
- else if (pll_phylip_parse(&msa))
- {
- msa_destroy(msa);
- msa = NULL;
- fclose(pll_phylip_in);
- pll_phylip_lex_destroy();
- return PLL_FAILURE;
- }
-
- if (pll_phylip_in) fclose(pll_phylip_in);
-
- pll_phylip_lex_destroy();
-
- *msa_count = seq_count;
-
- return msa;
-}
diff --git a/src/pll.h b/src/pll.h
index 8f0c770..79d990c 100644
--- a/src/pll.h
+++ b/src/pll.h
@@ -118,26 +118,30 @@
#define PLL_ERROR_FASTA_UNPRINTABLECHAR 104
#define PLL_ERROR_FASTA_INVALIDHEADER 105
#define PLL_ERROR_PHYLIP_SYNTAX 106
-#define PLL_ERROR_NEWICK_SYNTAX 107
-#define PLL_ERROR_MEM_ALLOC 108
-#define PLL_ERROR_PARAM_INVALID 109
-#define PLL_ERROR_TIPDATA_ILLEGALSTATE 110
-#define PLL_ERROR_TIPDATA_ILLEGALFUNCTION 111
-#define PLL_ERROR_TREE_CONVERSION 112
-#define PLL_ERROR_INVAR_INCOMPAT 113
-#define PLL_ERROR_INVAR_PROPORTION 114
-#define PLL_ERROR_INVAR_PARAMINDEX 115
-#define PLL_ERROR_INVAR_NONEFOUND 116
-#define PLL_ERROR_AB_INVALIDMETHOD 117
-#define PLL_ERROR_AB_NOSUPPORT 118
-#define PLL_ERROR_SPR_TERMINALBRANCH 119
-#define PLL_ERROR_SPR_NOCHANGE 120
-#define PLL_ERROR_NNI_INVALIDMOVE 121
-#define PLL_ERROR_NNI_TERMINALBRANCH 122
-#define PLL_ERROR_STEPWISE_STRUCT 123
-#define PLL_ERROR_STEPWISE_TIPS 124
-#define PLL_ERROR_STEPWISE_UNSUPPORTED 125
-#define PLL_ERROR_EINVAL 126
+#define PLL_ERROR_PHYLIP_LONGSEQ 107
+#define PLL_ERROR_PHYLIP_NONALIGNED 108
+#define PLL_ERROR_PHYLIP_ILLEGALCHAR 109
+#define PLL_ERROR_PHYLIP_UNPRINTABLECHAR 110
+#define PLL_ERROR_NEWICK_SYNTAX 111
+#define PLL_ERROR_MEM_ALLOC 112
+#define PLL_ERROR_PARAM_INVALID 113
+#define PLL_ERROR_TIPDATA_ILLEGALSTATE 114
+#define PLL_ERROR_TIPDATA_ILLEGALFUNCTION 115
+#define PLL_ERROR_TREE_CONVERSION 116
+#define PLL_ERROR_INVAR_INCOMPAT 117
+#define PLL_ERROR_INVAR_PROPORTION 118
+#define PLL_ERROR_INVAR_PARAMINDEX 119
+#define PLL_ERROR_INVAR_NONEFOUND 120
+#define PLL_ERROR_AB_INVALIDMETHOD 121
+#define PLL_ERROR_AB_NOSUPPORT 122
+#define PLL_ERROR_SPR_TERMINALBRANCH 123
+#define PLL_ERROR_SPR_NOCHANGE 124
+#define PLL_ERROR_NNI_INVALIDMOVE 125
+#define PLL_ERROR_NNI_TERMINALBRANCH 126
+#define PLL_ERROR_STEPWISE_STRUCT 127
+#define PLL_ERROR_STEPWISE_TIPS 128
+#define PLL_ERROR_STEPWISE_UNSUPPORTED 129
+#define PLL_ERROR_EINVAL 130
/* utree specific */
@@ -243,6 +247,22 @@ typedef struct pll_fasta
long stripped[256];
} pll_fasta_t;
+/* Simple structure for handling PHYLIP parsing */
+typedef struct pll_phylip_s
+{
+ FILE * fp;
+ char * line;
+ size_t line_size;
+ size_t line_maxsize;
+ char buffer[PLL_LINEALLOC];
+ const unsigned int * chrstatus;
+ long no;
+ long filesize;
+ long lineno;
+ long stripped_count;
+ long stripped[256];
+} pll_phylip_t;
+
/* Simple unrooted and rooted tree structure for parsing newick */
typedef struct pll_unode_s
@@ -410,6 +430,7 @@ PLL_EXPORT extern const unsigned int pll_map_bin[256];
PLL_EXPORT extern const unsigned int pll_map_nt[256];
PLL_EXPORT extern const unsigned int pll_map_aa[256];
PLL_EXPORT extern const unsigned int pll_map_fasta[256];
+PLL_EXPORT extern const unsigned int pll_map_phylip[256];
PLL_EXPORT extern const double pll_aa_rates_dayhoff[190];
PLL_EXPORT extern const double pll_aa_rates_lg[190];
@@ -695,13 +716,21 @@ PLL_EXPORT void pll_utree_create_pars_buildops(pll_unode_t * const* trav_buffer,
pll_pars_buildop_t * ops,
unsigned int * ops_count);
-/* functions in parse_phylip.y */
-
-PLL_EXPORT pll_msa_t * pll_phylip_parse_msa(const char * filename,
- unsigned int * msa_count);
+/* functions in phylip.c */
PLL_EXPORT void pll_msa_destroy(pll_msa_t * msa);
+PLL_EXPORT pll_phylip_t * pll_phylip_open(const char * filename,
+ const unsigned int * map);
+
+PLL_EXPORT int pll_phylip_rewind(pll_phylip_t * fd);
+
+PLL_EXPORT void pll_phylip_close(pll_phylip_t * fd);
+
+PLL_EXPORT pll_msa_t * pll_phylip_parse_interleaved(pll_phylip_t * fd);
+
+PLL_EXPORT pll_msa_t * pll_phylip_parse_sequential(pll_phylip_t * fd);
+
/* functions in rtree.c */
PLL_EXPORT void pll_rtree_show_ascii(const pll_rnode_t * root, int options);