diff --git a/data/COVID_19.fa b/data/COVID_19.fa index 8fa02d5..a2b2a53 100644 --- a/data/COVID_19.fa +++ b/data/COVID_19.fa @@ -1,4 +1,4 @@ ->kraken:taxid|2697049|NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome +>taxid|2697049|NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAA AATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGG ACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT diff --git a/data/FluA_H1N1.fa b/data/FluA_H1N1.fa index 119b0d2..9a72e15 100644 --- a/data/FluA_H1N1.fa +++ b/data/FluA_H1N1.fa @@ -1,4 +1,4 @@ ->kraken:taxid|211044|NC_002023.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 1, complete sequence +>taxid|211044|NC_002023.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 1, complete sequence AGCGAAAGCAGGTCAATTATATTCAATATGGAAAGAATAAAAGAACTAAGAAATCTAATGTCGCAGTCTCGCACCCGCGA GATACTCACAAAAACCACCGTGGACCATATGGCCATAATCAAGAAGTACACATCAGGAAGACAGGAGAAGAACCCAGCAC TTAGGATGAAATGGATGATGGCAATGAAATATCCAATTACAGCAGACAAGAGGATAACGGAAATGATTCCTGAGAGAAAT @@ -29,7 +29,7 @@ GAGTGGAGTCCGCTGTTCTGAGGGGATTCCTCATTCTGGGCAAAGAAGACAGGAGATATGGGCCAGCATTAAGCATCAAT GAACTGAGCAACCTTGCGAAAGGAGAGAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAGTGTCGAATAGTTT AAAAACGACCTTGTTTCTACT ->kraken:taxid|211044|NC_002021.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 2, complete sequence +>taxid|211044|NC_002021.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 2, complete sequence AGCGAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTTTTCTTAAAAGTGCCAGCACAAAATGCTATAAG CACAACTTTCCCTTATACCGGAGACCCTCCTTACAGCCATGGGACAGGAACAGGATACACCATGGATACTGTCAACAGGA CACATCAGTACTCAGAAAAGGCAAGATGGACAACAAACACCGAAACTGGAGCACCGCAACTCAACCCGATTGATGGGCCA @@ -60,7 +60,7 @@ AACAAATGTACCAAAGGTGCTGCAATTTATTTGAAAAATTCTTCCCCAGCAGTTCATACAGAAGACCAGTCGGGATATCC AGTATGGTGGAGGCTATGGTTTCCAGAGCCCGAATTGATGCACGGATTGATTTCGAATCTGGAAGGATAAAGAAAGAAGA GTTCACTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCTCAGACGGCAAAAATAGTGAATTTAGCTTGTCCTTCATG AAAAAATGCCTTGTTCCTACT ->kraken:taxid|211044|NC_002022.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 3, complete sequence +>taxid|211044|NC_002022.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 3, complete sequence AGCGAAAGCAGGTACTGATCCAAAATGGAAGATTTTGTGCGACAATGCTTCAATCCGATGATTGTCGAGCTTGCGGAAAA AACAATGAAAGAGTATGGGGAGGACCTGAAAATCGAAACAAACAAATTTGCAGCAATATGCACTCACTTGGAAGTATGCT TCATGTATTCAGATTTCCACTTCATCAATGAGCAAGGCGAGTCAATAATCGTAGAACTTGGTGATCCTAATGCACTTTTG @@ -89,7 +89,7 @@ ATTGGGAAGGTCTGCAGGACTTTATTAGCAAAGTCGGTATTTAACAGCTTGTATGCATCTCCACAACTAGAAGGATTTTC AGCTGAATCAAGAAAACTGCTTCTTATCGTTCAGGCTCTTAGGGACAATCTGGAACCTGGGACCTTTGATCTTGGGGGGC TATATGAAGCAATTGAGGAGTGCCTAATTAATGATCCCTGGGTTTTGCTTAATGCTTCTTGGTTCAACTCCTTCCTTACA CATGCATTGAGTTAGTTGTGGCAGTGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTGTTTCTACT ->kraken:taxid|211044|NC_002017.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 4, complete sequence +>taxid|211044|NC_002017.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 4, complete sequence AGCAAAAGCAGGGGAAAATAAAAACAACCAAAATGAAGGCAAACCTACTGGTCCTGTTATGTGCACTTGCAGCTGCAGAT GCAGACACAATATGTATAGGCTACCATGCGAACAATTCAACCGACACTGTTGACACAGTGCTCGAGAAGAATGTGACAGT GACACACTCTGTTAACCTGCTCGAAGACAGCCACAACGGAAAACTATGTAGATTAAAAGGAATAGCCCCACTACAATTGG @@ -113,7 +113,7 @@ AGAAATGGGACTTATGATTATCCCAAATATTCAGAAGAGTCAAAGTTGAACAGGGAAAAGGTAGATGGAGTGAAATTGGA ATCAATGGGGATCTATCAGATTCTGGCGATCTACTCAACTGTCGCCAGTTCACTGGTGCTTTTGGTCTCCCTGGGGGCAA TCAGTTTCTGGATGTGTTCTAATGGATCTTTGCAGTGCAGAATATGCATCTGAGATTAGAATTTCAGAAATATGAGGAAA AACACCCTTGTTTCTACT ->kraken:taxid|211044|NC_002019.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 5, complete sequence +>taxid|211044|NC_002019.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 5, complete sequence AGCAAAAGCAGGGTAGATAATCACTCACTGAGTGACATCAAAATCATGGCGTCCCAAGGCACCAAACGGTCTTACGAACA GATGGAGACTGATGGAGAACGCCAGAATGCCACTGAAATCAGAGCATCCGTCGGAAAAATGATTGGTGGAATTGGACGAT TCTACATCCAAATGTGCACAGAACTTAAACTCAGTGATTATGAGGGACGGTTGATCCAAAACAGCTTAACAATAGAGAGA @@ -134,7 +134,7 @@ CTCAGTACAGAGAAATCTCCCTTTTGACAGAACAACCGTTATGGCAGCATTCACTGGGAATACAGAGGGGAGAACATCTG ACATGAGGACCGAAATCATAAGGATGATGGAAAGTGCAAGACCAGAAGATGTGTCTTTCCAGGGGCGGGGAGTCTTCGAG CTCTCGGACGAAAAGGCAGCGAGCCCGATCGTGCCTTCCTTTGACATGAGTAATGAAGGATCTTATTTCTTCGGAGACAA TGCAGAGGAGTACGACAATTAAAGAAAAATACCCTTGTTTCTACT ->kraken:taxid|211044|NC_002018.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 6, complete sequence +>taxid|211044|NC_002018.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 6, complete sequence AGCGAAAGCAGGGGTTTAAAATGAATCCAAATCAGAAAATAATAACCATTGGATCAATCTGTCTGGTAGTCGGACTAATT AGCCTAATATTGCAAATAGGGAATATAATCTCAATATGGATTAGCCATTCAATTCAAACTGGAAGTCAAAACCATACTGG AATATGCAACCAAAACATCATTACCTATAAAAATAGCACCTGGGTAAAGGACACAACTTCAGTGATATTAACCGGCAATT @@ -153,7 +153,7 @@ GACTGATAGTAAGTTCTCTGTGAGGCAAGATGTTGTGGCAATGACTGATTGGTCAGGGTATAGCGGGAGTTTCGTTCAAC ATCCTGAGCTAACAGGGCTAGACTGTATAAGGCCGTGCTTCTGGGTTGAATTAATCAGGGGACGACCTAAAGAAAAAACA ATCTGGACTAGTGCGAGCAGCATTTCTTTTTGTGGCGTGAATAGTGATACTGTAGATTGGTCTTGGCCAGACGGTGCTGA GTTGCCATTCACCATTGACAAGTAGTCTGTTCAAAAAACTCCTTGTTTCTACT ->kraken:taxid|211044|NC_002016.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 7, complete sequence +>taxid|211044|NC_002016.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 7, complete sequence AGCGAAAGCAGGTAGATATTGAAAGATGAGTCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTATCATCCCGTCAGGCC CCCTCAAAGCCGAGATCGCACAGAGACTTGAAGATGTCTTTGCAGGGAAGAACACCGATCTTGAGGTTCTCATGGAATGG CTAAAGACAAGACCAATCCTGTCACCTCTGACTAAGGGGATTTTAGGATTTGTGTTCACGCTCACCGTGCCCAGTGAGCG @@ -167,7 +167,7 @@ TCTTCTTGAAAATTTGCAGGCCTATCAGAAACGAATGGGGGTGCAGATGCAACGGTTCAAGTGATCCTCTCGCTATTGCC GCAAATATCATTGGGATCTTGCACTTGATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCATTTACCGTCGCTTTAA ATACGGACTGAAAGGAGGGCCTTCTACGGAAGGAGTGCCAAAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAGTG CTGTGGATGCTGACGATGGTCATTTTGTCAGCATAGAGCTGGAGTAAAAAACTACCTTGTTTCTACT ->kraken:taxid|211044|NC_002020.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 8, complete sequence +>taxid|211044|NC_002020.1 Influenza A virus (A/Puerto Rico/8/1934(H1N1)) segment 8, complete sequence AGCAAAAGCAGGGTGACAAAGACATAATGGATCCAAACACTGTGTCAAGCTTTCAGGTAGATTGCTTTCTTTGGCATGTC CGCAAACGAGTTGCAGACCAAGAACTAGGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAAATCCCTAAGAGG AAGGGGCAGCACTCTTGGTCTGGACATCGAGACAGCCACACGTGCTGGAAAGCAGATAGTGGAGCGGATTCTGAAAGAAG diff --git a/data/FluA_H2N2.fa b/data/FluA_H2N2.fa index a8856d3..460db7d 100644 --- a/data/FluA_H2N2.fa +++ b/data/FluA_H2N2.fa @@ -1,4 +1,4 @@ ->kraken:taxid|488241|NC_007378.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 1, complete sequence +>taxid|488241|NC_007378.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 1, complete sequence AGCAAAAGCAGGTCAATTATATTCAATATGGAAAGAATAAAAGAACTACGGAATCTGATGTCGCAGTCTCGCACTCGCGA GATACTAACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCAGGGAGACAGGAAAAGAACCCGTCAC TTAGGATGAAATGGATGATGGCAATGAAATATCCAATTACAGCTGACAAGAGGATAACAGAAATGGTTCCTGAGAGAAAT @@ -29,7 +29,7 @@ GAGTGGAGTCCGCTGTTCTGAGAGGATTCCTCATTCTGGGCAAGGAAGATAGAAGATATGGACCAGCATTAAGCATCAAT GAACTGAGTACCCTTGCAAAAGGAGAAAAGGCTAATGTACTAATTGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAATGTTGAATAGTTT AAAAACGACCTTGTTTCTACT ->kraken:taxid|488241|NC_007375.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 2, complete sequence +>taxid|488241|NC_007375.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 2, complete sequence AGCAAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACCTTACTTTTCTTGAAAGTTCCAGCGCAAAATGCCATAAG TACTACATTCCCTTATACTGGAGATCCTCCATACAGCCATGGGACAGGAACAGGATACACCATGGACACAGTCAACAGAA CACATCAATATTCAGAAAAGGGGAAGTGGACAACAAACACGGAAACTGGAGCGCCCCAACTTAACCCAATTGATGGACCA @@ -60,7 +60,7 @@ AACAGATGTATCAGAAGTGTTGCAATCTATTTGAGAAATTCTTCCCTAGCAGTTCGTACAGGAGACCAGTTGGAATTTCC AGCATGGTGGAGGCCATGGTGTCTAGGGCTCGGATTGATGCACGGATTGACTTCGAGTCTGGACGGATTAAGAAAGAGGA GTTCGCTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCTCAGACGGCAAAAATAGTGAATTTAGCTTGTCCTTCATG AAAAAATGCCTTGTTTCTACT ->kraken:taxid|488241|NC_007376.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 3, complete sequence +>taxid|488241|NC_007376.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 3, complete sequence AGCAAAAGCAGGTACTGATTCGAAATGGAAGATTTTGTGCGACAATGCTTCAATCCGATGATTGTCGAACTTGCGGAAAA GGCAATGAAAGAGTATGGAGAAGATCTGAAAATCGAAACAAACAAATTTGCAGCAATATGCACTCACTTGGAAGTATGCT TCATGTATTCAGATTTTCATTTCATCAATGAGCAAGGCGAGTCAATAATGGTAGAGCTTGATGATCCAAATGCACTTTTG @@ -89,7 +89,7 @@ ATTGGGAAGGTCTGCAGGACTTTATTAGCCAAGTCGGTATTCAATAGCCTGTATGCATCCCCACAATTAGAAGGATTTTC AGCTGAATCAAGAAAACTGCTTCTTGTCGTTCAGGCTCTTAGGGACAATCTTGAACCTGGAACCTTTGATCTTGGGGGGC TATATGAAGCAATTGAGGAGTGCCTGATTAATGATCCCTGGGTTTTGCTTAATGCGTCTTGGTTCAACTCCTTCCTAACA CATGCATTAAGATAGTTGTGGCAATGCTACTATTTGCTATCCATACTGTCCAAAAAAGTACCTTGTTTCTACT ->kraken:taxid|488241|NC_007374.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 4, complete sequence +>taxid|488241|NC_007374.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 4, complete sequence AGCAAAAGCAGGGGTTATACCATAGACAACCAAAAGCATAACAATGGCCATCATTTATCTCATACTCCTGTTCACAGCAG TGAGGGGGGACCAGATATGCATTGGATACCATGCCAATAATTCCACAGAAAAGGTCGACACAATTCTAGAGCGGAATGTC ACTGTGACTCATGCCAAGGACATCCTTGAGAAGACCCATAACGGAAAGCTATGCAAACTAAACGGAATCCCTCCACTTGA @@ -113,7 +113,7 @@ AAAACGGGACATATGATTATCCCAAGTATGAAGAAGAATCTAAACTAAATAGAAATGAAATCAAAGGGGTAAAATTGAGC AGCATGGGGGTTTATCAAATCCTTGCCATTTATGCTACAGTAGCAGGTTCTCTGTCACTGGCAATCATGATGGCTGGGAT CTCTTTCTGGATGTGCTCCAACGGGTCTCTGCAGTGCAGAATCTGCATATGATTGTAAGTCATTTTATAATTAAAAACAC CCTTGTTTCCTGA ->kraken:taxid|488241|NC_007381.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 5, complete sequence +>taxid|488241|NC_007381.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 5, complete sequence ATGGCGTCCCAAGGCACCAAACGGTCTTATGAACAGATGGAAACTGATGGGGAACGCCAGAATGCAACTGAGATCAGAGC ATCCGTCGGGAAGATGATTGATGGAATTGGACGATTCTACATCCAAATGTGCACCGAACTTAAACTCAGTGATTATGAGG GGCGACTGATCCAGAACAGCTTAACAATAGAGAGAATGGTGCTCTCTGCTTTTGACGAGAGAAGGAATAAATATCTGGAA @@ -133,7 +133,7 @@ GCCTCTGCAGGTCAAATCAGTGTACAACCTGCATTTTCTGTGCAAAGAAACCTCCCATTTGACAAACCAACCATCATGGC AGCATTCACTGGGAATACAGAGGGAAGAACATCAGACATGAGGGCAGAAATCATAAGGATGATGGAAGGTGCAAAACCAG AAGAAATGTCCTTCCAGGGGCGGGGAGTCTTCGAGCTCTCGGACGAAAAGGCAACGAACCCGATCGTGCCCTCTTTTGAC ATGAGTAATGAAGGATCTTATTTCTTCGGAGACAATGCAGAGGAGTACGACAATTAA ->kraken:taxid|488241|NC_007382.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 6, complete sequence +>taxid|488241|NC_007382.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 6, complete sequence ATGAATCCAAATCAAAAGATAATAACAATTGGCTCTGTCTCTCTCACCATTGCAACAGTATGCTTCCTCATGCAGATTGC CATCCTGGTAACTACTGTGACATTGCATTTTAAGCAACATGAGTGCGACTCCCCCGCGAGCAACCAAGTAATGCCGTGTG AACCAATAATAATAGAAAGGAACATAACAGAGATAGTGTATTTGAATAACACCACCATAGAGAAAGAGATCTGCCCCGAA @@ -152,7 +152,7 @@ ATGAAACTTTCAAAGTCATTGGTGGTTGGTCCACACCTAATTCCAAATCGCAGATCAATAGACAGGTCATAGTTGACAGC AATAATTGGTCAGGTTACTCTGGTATTTTCTCTGTTGAGGGCAAAAGATGCATCAATAGGTGCTTTTATGTGGAGTTGAT AAGGGGAAGGCAACAGGAGACTAGAGTATGGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACTTCAGGTACTTATG GAACAGGCTCATGGCCTGATGGGGCGAACATCAATTTCATGCCTATATAA ->kraken:taxid|488241|NC_007377.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 7, complete sequence +>taxid|488241|NC_007377.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 7, complete sequence AGCAAAAGCAGGTAGATATTGAAAGATGAGCCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTATCGTCCCGTCAGGCC CCCTCAAAGCCGAGATCGCACAGAGACTTGAAGATGTCTTTGCTGGGAAGAACACAGATCTTGAGGCTCTCATGGAATGG CTAAAGACAAGACCAATCCTGTCACCTCTGACTAAGGGGATTTTGGGATTTGTATTCACGCTCACCGTGCCAAGTGAGCG @@ -166,7 +166,7 @@ TCTTCTTGAAAATTTGCAGGCCTATCAGAAACGAATGGGGGTGCAGATGCAACGATTCAAGTGACCCCCTTGTTGTTGCT GCGAGTATCATTGGGATCTTGCACTTTATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCATTTATCGCTTCTTTAA ACACGGTCTGAAAAGAGGGCCTTCTACGGAAGGAGTACCTGAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAGTG CTGTGGATGCTGACGATAGTCATTTTGTCAGCATAGAGCTGGAGTAAAAAACTACCTTGTTTCTACT ->kraken:taxid|488241|NC_007380.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 8, complete sequence +>taxid|488241|NC_007380.1 Influenza A virus (A/Korea/426/1968(H2N2)) segment 8, complete sequence ATGGATTCTAACACTGTGTCAAGTTTTCAGGTAGATTGCTTCCTTTGGCATGTCCGAAAACAAGTTGTAGACCAAGAACT AGGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAAGTCCCTAAGGGGAAGAGGCAGCACTCTCGATCTAGACA TCGAAGCAGCCACCCGTGTTGGAAAGCAGATAGTAGAGAGGATTCTGAAGGAAGAATCCGATGAGGCACTTAAAATGACC diff --git a/data/FluA_H3N2.fa b/data/FluA_H3N2.fa index 092391b..8709d1a 100644 --- a/data/FluA_H3N2.fa +++ b/data/FluA_H3N2.fa @@ -1,4 +1,4 @@ ->kraken:taxid|335341|NC_007373.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 1, complete sequence +>taxid|335341|NC_007373.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 1, complete sequence AGCAAAAGCAGGTCAATTATATTCAGTATGGAAAGAATAAAAGAACTACGGAACCTGATGTCGCAGTCTCGCACTCGCGA GATACTGACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCGGGGAGACAGGAAAAGAACCCGTCAC TTAGGATGAAATGGATGATGGCAATGAAATACCCAATCACTGCTGACAAAAGGATAACAGAAATGGTTCCGGAGAGAAAT @@ -29,7 +29,7 @@ GAGTGGAGTCCGCCGTCTTGAGAGGGTTTCTCATTATAGGTAAGGAAGACAGAAGATACGGACCAGCATTAAGCATCAAT GAACTGAGTAACCTTGCAAAAGGGGAAAAGGCTAATGTGCTAATCGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAA ACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAATGTTGAATAGTTT AAAAACGACCTTGTTTCTACT ->kraken:taxid|335341|NC_007372.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 2, complete sequence +>taxid|335341|NC_007372.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 2, complete sequence AGCAAAAGCAGGCAAACCATTTGAATGGATGTCAATCCGACTCTACTGTTCCTAAAGGTTCCAGCGCAAAATGCCATAAG CACCACATTCCCTTATACTGGAGATCCTCCATACAGCCATGGAACAGGAACAGGATACACCATGGACACAGTCAACAGAA CACACCAATATTCAGAGAAGGGGAAGTGGACGACAAATACAGAAACTGGGGCACCCCAACTCAACCCAATTGATGGACCA @@ -60,7 +60,7 @@ AACAGATGTACCAAAAGTGCTGCAACTTGTTCGAGAAATTTTTCCCTAGTAGTTCATATAGGAGACCGATTGGAATTTCT AGCATGGTGGAGGCCATGGTGTCTAGGGCCCGGATTGATGCCAGAATTGACTTCGAGTCTGGACGGATTAAGAAGGAAGA GTTCTCTGAGATCATGAAGATCTGTTCCACCATTGAAGAACTCAGACGGCAAAAATAATGAATTTAGCTTGTCCTTCATG AAAAAATGCCTTGTTTCTACT ->kraken:taxid|335341|NC_007371.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 3, complete sequence +>taxid|335341|NC_007371.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 3, complete sequence AGCAAAAGCAGGTACTGATTCGAAATGGAAGATTTTGTGCGACAATGCTTCAACCCGATGATTGTCGAACTTGCAGAAAA AGCAATGAAAGAGTATGGAGAGGATCTGAAAATTGAAACAAACAAATTTGCAGCAATATGCACCCACTTGGAGGTATGTT TCATGTATTCAGATTTTCATTTCATCAATGAACAAGGCGAATCAATAGTGGTAGAACTTGATGATCCAAATGCACTGTTA @@ -89,7 +89,7 @@ ATTGGGAAAGTCTGTAGGACTCTATTGGCTAAGTCAGTGTTCAATAGCCTGTATGCATCACCACAATTGGAAGGATTTTC AGCGGAGTCAAGAAAACTGCTTCTTGTTGTTCAGGCTCTTAGGGACAACCTCGAACCTGGGACCTTTGATCTCGGGGGGC TATATGAAGCAATTGAGGAGTGCCTGATTAATGATCCCTGGGTTTTGCTCAATGCATCTTGGTTCAACTCCTTCCTGACA CATGCATTAAAATAGTTATGGCAGTGCTACTATTTGTTATCCGTACTGTCCAAAAAAGTACCTTGTTTCTACT ->kraken:taxid|335341|NC_007366.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 4, complete sequence +>taxid|335341|NC_007366.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 4, complete sequence AGCAAAAGCAGGGGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAA AAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAAC AATCACGAATGACCAAATTGAAGTCACTAATGCTACTGAACTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTC @@ -113,7 +113,7 @@ AGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGTGTTGAGTTGAA GTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCA TCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGAGTGCATTAATTAAAAACACCCTTGTTTCTA CT ->kraken:taxid|335341|NC_007369.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 5, complete sequence +>taxid|335341|NC_007369.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 5, complete sequence AGCAAAAGCAGGGTTAATAATCACTCACCGAGTGACATCAAAATCATGGCGTCCCAAGGCACCAAACGGTCTTATGAACA GATGGAAACTGATGGGGATCGCCAGAATGCAACTGAGATTAGGGCATCCGTCGGGAAGATGATTGATGGAATTGGGAGAT TCTACATCCAAATGTGCACTGAACTTAAACTCAGTGATCATGAAGGGCGGTTGATCCAGAACAGCTTGACAATAGAGAAA @@ -134,7 +134,7 @@ TTCTGTACAAAGAAACCTCCCATTTGAAAAGTCAACCATCATGGCAGCATTCACTGGAAATACGGAGGGAAGGACTTCAG ACATGAGGGCAGAAATCATAAGAATGATGGAAGGTGCAAAACCAGAAGAAGTGTCATTCCGGGGGAGGGGAGTTTTCGAG CTCTCAGACGAGAAGGCAACGAACCCGATCGTGCCCTCTTTTGATATGAGTAATGAAGGATCTTATTTCTTCGGAGACAA TGCAGAAGAGTACGACAATTAAGGAAAAAATACCCTTGTTTCTACT ->kraken:taxid|335341|NC_007368.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 6, complete sequence +>taxid|335341|NC_007368.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 6, complete sequence AGCAAAAGCAGGAGTAAAGATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATAT GCTTCTTCATGCAAATTGCCATCCTGATAACCACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAAC AACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGA @@ -154,7 +154,7 @@ GCAAGTCATAGTTGACAGAGGTAATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGT GCTTTTATGTGGAGTTGATAAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGT GGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACATCAATCTCATGCCTATATAAGCTTTCGCAAT TTTAGAAAAAAACTCCTTGTTTCTACT ->kraken:taxid|335341|NC_007367.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 7, complete sequence +>taxid|335341|NC_007367.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 7, complete sequence AGCAAAAGCAGGTAGATATTGAAAGATGAGCCTTCTAACCGAGGTCGAAACGTATGTTCTCTCTATCGTTCCATCAGGCC CCCTCAAAGCCGAGATCGCGCAGAGACTTGAAGATGTCTTTGCTGGGAAAAACACAGATCTTGAGGCTCTCATGGAATGG CTAAAGACAAGACCAATTCTGTCACCTCTGACTAAGGGGATTTTGGGGTTTGTGTTCACGCTCACCGTGCCCAGTGAGCG @@ -168,7 +168,7 @@ TCTTCTTGAAAATTTGCAGACCTATCAGAAACGAATGGGGGTGCAGATGCAACGATTCAAGTGACCCGCTTGTTGTTGCC GCGAGTATCATTGGGATCTTGCACTTGATATTGTGGATTCTTGATCGTCTTTTTTTCAAATGCGTCTATCGACTCTTCAA ACACGGCCTTAAAAGAGGCCCTTCTACGGAAGGAGTACCTGAGTCTATGAGGGAAGAATATCGAAAGGAACAGCAGAATG CTGTGGATGCTGACGACAGTCATTTTGTCAGCATAGAGTTGGAGTAAAAAACTACCTTGTTTCTACT ->kraken:taxid|335341|NC_007370.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 8, complete sequence +>taxid|335341|NC_007370.1 Influenza A virus (A/New York/392/2004(H3N2)) segment 8, complete sequence AGCAAAAGCAGGGTGACAAAGACATAATGGATTCCAACACTGTGTCAAGTTTCCAGGTAGATTGCTTTCTTTGGCATATC CGGAAACAAGTTGTAGACCAAGAACTGAGTGATGCCCCATTCCTTGATCGGCTTCGCCGAGATCAGAGGTCCCTAAGGGG AAGAGGCAATACTCTCGGTCTAGACATCAAAGCAGCCACCCATGTTGGAAAGCAAATTGTAGAAAAGATTCTGAAAGAAG diff --git a/data/FluB.fa b/data/FluB.fa index 24250dc..10ec115 100644 --- a/data/FluB.fa +++ b/data/FluB.fa @@ -1,4 +1,4 @@ ->kraken:taxid|518987|NC_002204.1 Influenza B virus RNA 1, complete sequence +>taxid|518987|NC_002204.1 Influenza B virus RNA 1, complete sequence AGCAGAAGCGGAGCTTTAAGATGAATATAAATCCATATTTTCTTTTCATAGATGTACCTATACAGGCAGCAATTTCAACA ACATTCCCATACACCGGTGTTCCCCCTTATTCTCATGGAACGGGAACAGGCTACACAATAGACACCGTGATTAGAACACA CGAGTACTCAAACAAGGGAAAACAATACATTTCTGATGTTACAGGATGTGTAATGGTAGATCCAACAAATGGGCCATTAC @@ -29,7 +29,7 @@ CTACGCTAAGTGTTGCAACCTTTTTGAGGCTTGCTTTAACAGTGCGTCATACAGGAAACCAGTAGGCCAGCACAGCATGC TTGAAGCTATGGCCCACAGATTAAGAATGGATGCACGACTGGACTATGAGTCAGGAAGGATGTCAAAAGAGGATTTCGAA AAAGCAATGGCTCACCTTGGTGAGATTGGGTACATGTAAGCTCCGGAAATGTCTATGGGGTTATTGGTCATCGTTGAATA CATGCGGTGCACAAATGATTAAAATGAAAAAAGGCTCGTGTTTCTACT ->kraken:taxid|518987|NC_002205.1 Influenza B virus (B/Lee/1940) segment 2, complete sequence +>taxid|518987|NC_002205.1 Influenza B virus (B/Lee/1940) segment 2, complete sequence ATGACGTTGGCTAAAATTGAACTACTAAAGCAGCTGTTAAGGGACAATGAAGCCAAAACGGTGTTGAGACAGACAACGGT AGACCAATACAACATAATAAGAAAATTCAATACATCAAGAATTGAAAAGAACCCTTCATTAAGAATGAAGTGGGCCATGT GTTCCAATTTTCCCTTAGCTCTGACCAAGGGTGATATGGCAAATCGAATCCCCTTGGAATACAAGGGAATACAACTTAAA @@ -59,7 +59,7 @@ CCTAACTATATGCGGCAGAATGATGTCATTAAAAGGAAAAATTGAGGATGAAGAAAGAAATAGATCAATGGGGAATGCAG TACTGGCAGGCTTTCTTGTTAGTGGCAAATATGACCCTGATCTTGGAGATTTCAAAACCATTGAGGAACTTGAAAGACTA AAACCGGGAGAAAAAGCCAACATCTTACTTTACCAAGGAAAGCCCGTTAAAGTAGTTAAAAGGAAAAGATATAGTGCTTT ATCCAATGATATTTCACAAGGGATTAAGAGACAAAGAATGACAGTTGAGTCCATGGGGTGGGCCTTGAGCTAA ->kraken:taxid|518987|NC_002206.1 Influenza B virus (B/Lee/1940) segment 3, complete sequence +>taxid|518987|NC_002206.1 Influenza B virus (B/Lee/1940) segment 3, complete sequence ATGGATACTTTTATTACAAAGAATTTCCAGACTACAATAATACAAAAGGCCAAAAACACAATGGCAGAATTTAGTGAAGA TCCTGAATTACAGCCAGCAGTACTATTCAACATCTGCGTCCATCTGGAGGTCTGCTATGTAATAAGTGATATGAACTTTC TTGATGAGGAAGGAAAGACATATACAGCATTAGAAGGACAAGGAAAAGAGCAAAATTTGAGACCACAGTATGAAGTGATT @@ -88,7 +88,7 @@ TTGATGCATTATGTATTTGGAAATGCTCAATTGGAGGGGTTTAGTGCCGAATCTAGGAGACTTCTACTGTTAATTCAGGC ATTAAAAGACAGGAAGGGCCCTTGGGTATTTGACTTGGAGGGAATGTACTTTGGAGTAGAGGAATGTATTAGTAACAATC CTTGGGTAATACAGAGTGCATACTGGTTTAATGAATGGTTGGGCATTGAAAAAGAAGGAAGTAAAGTGTTAGAATCAATA GATGAAATAATGGATGAATGAACGAAGGGCATAGCGCTCAATTT ->kraken:taxid|518987|NC_002207.1 Influenza B virus (B/Lee/1940) segment 4, complete sequence +>taxid|518987|NC_002207.1 Influenza B virus (B/Lee/1940) segment 4, complete sequence AGCAGAAGCGTTGCATTTTCTAATATCCACAAAATGAAGGCAATAATTGTACTACTCATGGTAGTAACATCCAATGCAGA TCGAATCTGCACTGGGATAACATCGTCAAACTCACCTCATGTGGTTAAAACTGCCACTCAAGGGGAAGTCAATGTGACTG GTGTGATACCACTAACAACAACACCTACCAAATCTCATTTTGCAAATCTCAAAGGAACACAGACCAGAGGAAAACTATGC @@ -113,7 +113,7 @@ TTTCTCTTCCCACTTTTGATTCATTAAACATTACTGCTGCATCTTTAAATGATGATGGCTTGGATAATCATACTATACTG CTCTACTACTCAACTGCTGCTTCTAGCTTGGCTGTAACATTAATGATAGCTATCTTCATTGTCTACATGGTCTCCAGAGA CAATGTTTCTTGTTCCATCTGTCTGTGAGGGAGATTAAGCCCTGTGTTTTCCTTTACTGTAGTGCTCATTTGCTTGTCAC CATTACAAAGAAACGTTATTGAAAAATGCTCTTGTTACTACT ->kraken:taxid|518987|NC_002208.1 Influenza B virus (B/Lee/1940) segment 5, complete sequence +>taxid|518987|NC_002208.1 Influenza B virus (B/Lee/1940) segment 5, complete sequence GGCAGAAGCACAGCATTTTCTTGTGAGCTTCGAGCACTAATAAAACTGAAAATCAAAATGTCCAACATGGATATTGACAG TATAAATACCGGAACAATCGATAAAACACCAGAAGAACTGACTCCCGGAACCAGTGGGGCAACCAGACCAATCATCAAGC CAGCAACCCTTGCTCCGCCAAGCAACAAACGAACCCGAAATCCATCTCCAGAAAGGACAACCACAAGCAGTGAAACCGAT @@ -138,7 +138,7 @@ ATGCTTTCATTGGGAAGAAAATGTTTCAAATATCAGACAAAAACAAAGTCAATCCCATTGAGATTCCAATTAAGCAGACC ATCCCCAATTTCTTCTTTGGGAGGGACACAGCAGAGGATTATGATGACCTCGATTATTAAAGCAATAAAATAGACACTAT GGCTGTGACTGTTTCAGTACGTTTGGGATGTGGGTGTTTACTCTTATTGAAATAAATGTAAAAAATGCTGTTGTTTCTAC T ->kraken:taxid|518987|NC_002209.1 Influenza B virus (B/Lee/1940) segment 6, complete sequence +>taxid|518987|NC_002209.1 Influenza B virus (B/Lee/1940) segment 6, complete sequence AGCAGAAGCAGAGCATATTCTTAGAACTGAAGTGAACAGGCCAAAAATGAACAATGCTACCTTCAACTGTACAAACATTA ACCCTATTACTCACATCAGGGGGAGTATTATTATCACTATATGTGTCAGCCTCATTGTCATACTTATTGTATTCGGATGT ATTGCTAAAATTTTCATCAACAAAAACAACTGCACCAACAATGTCATTAGAGTGCACAAACGCATCAAATGCCCAGACTG @@ -159,7 +159,7 @@ TCTTTTGGCTTCGAAATAAAGGACAAGAAATGTGATGTCCCTTGTATTGGGATAGAGATGGTACACGATGGTGGAAAAGA TACTTGGCATTCAGCTGCAACAGCCATTTACTGTTTGATGGGCTCAGGACAATTGCTATGGGACACTGTCACAGGCGTTG ATATGGCTTTATAATAGAGGAATGGTTGGATCTGTTCTAAACCCTTTGTTCCTATTTTATTTGAACAGTTGTTCTTACTA GATTTAATTGTTTCTGAAAAATGCTCTTGTTACTACT ->kraken:taxid|518987|NC_002210.1 Influenza B virus (B/Lee/1940) segment 7, complete sequence +>taxid|518987|NC_002210.1 Influenza B virus (B/Lee/1940) segment 7, complete sequence AGCAGAAGCACGCACTTTCTTAAAATGTCGCTGTTTGGAGACACAATTGCCTACCTGCTTTCACTAATAGAAGATGGAGA AGGCAAAGCAGAACTAGCTGAAAAATTACACTGTTGGTTCGGTGGGAAAGAATTTGACCTAGATTCTGCTTTGGAATGGA TAAAAAACAAAAGGTGCCTAACTGATATACAAAAAGCACTAATTGGTGCCTCTATATGCTTTTTAAAACCCAAAGACCAA @@ -175,7 +175,7 @@ GAAAATACAAATAAGGAATCCAAATAAGGAGGCAATAAACAGAGAGGTGTCAATTCTGAGACACAATTACCAAAAGGAAA TCCAAGCCAAAGAAACAATGAAGAAAATACTCTCTGACAACATGGAAGTATTGGGTGACCACATAGTAGTTGAAGGGCTT TCAACTGATGAGATAATAAAAATGGGTGAAACAGTTTTGGAGGTGGAAGAATTGCAATGAGCCCAATTTTCACTGTATTT CTTACTATGCATTTAAGCAAATTGTAATCAATGTCAGTGAATAAAACTGGAAAAAGTGCGTTGTTTCTACT ->kraken:taxid|518987|NC_002211.1 Influenza B virus (B/Lee/1940) segment 8, complete sequence +>taxid|518987|NC_002211.1 Influenza B virus (B/Lee/1940) segment 8, complete sequence CGCAGAAGCAGAGGATTTATTTAGTCACTGGCAAACGGAAAGATGGCGGACAACATGACCACAACACAAATTGAGGTGGG TCCGGGAGCAACCAATGCCACTATAAACTTTGAAGCAGGAATTCTGGAGTGCTATGAAAGGTTTTCATGGCAAAGAGCCC TTGACTATCCTGGTCAAGACCGCCTACACAGACTAAAACGAAAATTAGAATCAAGAATAAAGACTCACAACAAGAGTGAG diff --git a/data/HIV_1.fna b/data/HIV_1.fna index 6ed3f29..a5ac864 100644 --- a/data/HIV_1.fna +++ b/data/HIV_1.fna @@ -1,4 +1,4 @@ ->kraken:taxid|11676|NC_001802.1 Human immunodeficiency virus 1, complete genome +>taxid|11676|NC_001802.1 Human immunodeficiency virus 1, complete genome GGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGC TTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTC AGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAG diff --git a/data/MERS.fa b/data/MERS.fa index 18de91c..6572154 100644 --- a/data/MERS.fa +++ b/data/MERS.fa @@ -1,4 +1,4 @@ ->kraken:taxid|1335626|NC_019843.3 Middle East respiratory syndrome coronavirus, complete genome +>taxid|1335626|NC_019843.3 Middle East respiratory syndrome coronavirus, complete genome GATTTAAGTGAATAGCTTGGCTATCTCACTTCCCCTCGTTCTCTTGCAGAACTTTGATTTTAACGAACTTAAATAAAAGC CCTGTTGTTTAGCGTATCGTTGCACTTGTCTGGTGGGATTGTGGCATTAATTTGCCTGCTCATCTAGGCAGTGGACATAT GCTCAACACTGGGTATAATTCTAATTGAATACTATTTTTCAGTTAGAGCGTCGTGTCTCTTGTACGTCTCGGTCACAATA diff --git a/kr2r/src/bin/kun.rs b/kr2r/src/bin/kun.rs index d8801cd..e21f2d4 100644 --- a/kr2r/src/bin/kun.rs +++ b/kr2r/src/bin/kun.rs @@ -48,6 +48,10 @@ struct BuildArgs { /// between 0 and 1). #[clap(long, long, default_value_t = 0.7)] load_factor: f64, + + /// library fna temp file max size + #[arg(long = "max-file-size", value_parser = parse_size, default_value = "2G")] + pub max_file_size: usize, } #[derive(Parser, Debug)] @@ -126,6 +130,7 @@ impl From for merge_fna::Args { Self { download_dir: item.download_dir, database: item.build.database, + max_file_size: item.max_file_size, } } } diff --git a/kr2r/src/bin/merge_fna.rs b/kr2r/src/bin/merge_fna.rs index 1d243ce..710c868 100644 --- a/kr2r/src/bin/merge_fna.rs +++ b/kr2r/src/bin/merge_fna.rs @@ -1,12 +1,14 @@ use clap::Parser; use flate2::read::GzDecoder; +use kr2r::args::parse_size; use kr2r::db::generate_taxonomy; use kr2r::utils::{find_files, open_file, read_id_to_taxon_map}; use rayon::prelude::*; +use std::collections::HashMap; use std::fs::{create_dir_all, File, OpenOptions}; use std::io::{BufRead, BufReader, BufWriter, Read, Result, Write}; use std::path::PathBuf; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Instant; @@ -20,9 +22,74 @@ pub struct Args { /// ncbi library fna database directory #[arg(long = "db", required = true)] pub database: PathBuf, - // /// seqid2taxid.map file path, default = $database/seqid2taxid.map - // #[arg(short = 'm', long)] - // pub id_to_taxon_map_filename: Option, + + /// library fna temp file max size + #[arg(long = "max-file-size", value_parser = parse_size, default_value = "2G")] + pub max_file_size: usize, +} + +struct SizedWriter { + writer: BufWriter, + bytes_written: u64, + thread_index: usize, + file_suffix: AtomicUsize, + library_dir: PathBuf, + max_file_size: u64, +} + +impl SizedWriter { + fn new(library_dir: &PathBuf, thread_index: usize, max_file_size: u64) -> Result { + let file_suffix = AtomicUsize::new(0); + let path = Self::get_file_path(library_dir, thread_index, 0); + let file = OpenOptions::new() + .create(true) + .append(true) + .write(true) + .open(&path)?; + let writer = BufWriter::new(file); + + Ok(Self { + writer, + bytes_written: 0, + thread_index, + file_suffix, + library_dir: library_dir.to_path_buf(), + max_file_size, + }) + } + + fn get_file_path(library_dir: &PathBuf, thread_index: usize, suffix: usize) -> PathBuf { + library_dir.join(format!("library_{}_{}.fna", thread_index, suffix)) + } + + fn is_kraken_taxid_start(buf: &[u8]) -> bool { + buf.starts_with(b">taxid") + } + + fn write(&mut self, buf: &[u8]) -> Result { + if Self::is_kraken_taxid_start(buf) + && self.bytes_written + buf.len() as u64 > self.max_file_size + { + self.writer.flush()?; + let new_suffix = self.file_suffix.fetch_add(1, Ordering::SeqCst) + 1; + let new_path = Self::get_file_path(&self.library_dir, self.thread_index, new_suffix); + let new_file = OpenOptions::new() + .create(true) + .append(true) + .write(true) + .open(&new_path)?; + self.writer = BufWriter::new(new_file); + self.bytes_written = 0; + } + + let bytes_written = self.writer.write(buf)?; + self.bytes_written += bytes_written as u64; + Ok(bytes_written) + } + + fn flush(&mut self) -> Result<()> { + self.writer.flush() + } } fn parse_assembly_fna(assembly_file: &PathBuf, site: &str) -> Result> { @@ -68,7 +135,7 @@ fn parse_assembly_fna(assembly_file: &PathBuf, site: &str) -> Result, - fna_writer: &mut BufWriter, + fna_writer: &mut SizedWriter, fna_start: ®ex::Regex, taxid: &str, ) -> Result<()> { @@ -83,8 +150,14 @@ fn process_gz_file( while reader.read_line(&mut line)? != 0 { if let Some(caps) = fna_start.captures(&line) { let seqid = &caps[1]; - map_buffer.push_str(&format!("kraken:taxid|{}|{}\t{}\n", taxid, seqid, taxid)); - fna_buffer.push_str(&format!(">kraken:taxid|{}|{}", taxid, &line[1..])); + map_buffer.push_str(&format!("taxid|{}|{}\t{}\n", taxid, seqid, taxid)); + + if !fna_buffer.is_empty() { + fna_writer.write(fna_buffer.as_bytes())?; + fna_buffer.clear(); + } + + fna_buffer.push_str(&format!(">taxid|{}|{}", taxid, &line[1..])); } else { fna_buffer.push_str(&line); } @@ -96,7 +169,7 @@ fn process_gz_file( } if fna_buffer.len() > 10000 { - fna_writer.write_all(fna_buffer.as_bytes())?; + fna_writer.write(fna_buffer.as_bytes())?; fna_buffer.clear(); } @@ -109,7 +182,7 @@ fn process_gz_file( } if !fna_buffer.is_empty() { - fna_writer.write_all(fna_buffer.as_bytes())?; + fna_writer.write(fna_buffer.as_bytes())?; } fna_writer.flush()?; @@ -125,12 +198,15 @@ fn merge_fna_parallel( assembly_files: &Vec, database: &PathBuf, library_dir: &PathBuf, + max_file_size: u64, ) -> Result<()> { let pattern = format!(r"{}_(\S+)\.{}", PREFIX, SUFFIX); let file_site = regex::Regex::new(&pattern).unwrap(); let fna_start: regex::Regex = regex::Regex::new(r"^>(\S+)").unwrap(); let is_empty = AtomicBool::new(true); + let writers: Arc>> = Arc::new(Mutex::new(HashMap::new())); + for assembly_file in assembly_files { if let Some(caps) = file_site.captures(assembly_file.to_string_lossy().as_ref()) { if let Some(matched) = caps.get(1) { @@ -142,19 +218,14 @@ fn merge_fna_parallel( // eprintln!("{} does not exist", gz_file.to_string_lossy()); return; } + let thread_index = rayon::current_thread_index().unwrap_or(0); - let library_fna_path = - library_dir.join(format!("library_{}.fna", thread_index)); + let mut writers = writers.lock().unwrap(); + let mut fna_writer = writers.entry(thread_index).or_insert_with(|| { + SizedWriter::new(&library_dir, thread_index, max_file_size).unwrap() + }); let seqid2taxid_path = database.join(format!("seqid2taxid_{}.map", thread_index)); - let mut fna_writer = BufWriter::new( - OpenOptions::new() - .create(true) - .append(true) - .write(true) - .open(&library_fna_path) - .unwrap(), - ); let mut map_writer = BufWriter::new( OpenOptions::new() .create(true) @@ -164,18 +235,19 @@ fn merge_fna_parallel( .unwrap(), ); - process_gz_file( + if let Err(e) = process_gz_file( &gz_file, &mut map_writer, &mut fna_writer, &fna_start, &taxid, - ) - .unwrap(); - - fna_writer.flush().unwrap(); - map_writer.flush().unwrap(); - is_empty.fetch_and(false, Ordering::Relaxed); + ) { + eprintln!("process_gz_file error: {}", e); + } else { + fna_writer.flush().unwrap(); + map_writer.flush().unwrap(); + is_empty.fetch_and(false, Ordering::Relaxed); + } }); } } @@ -228,6 +300,7 @@ pub fn run(args: Args) -> Result<()> { println!("merge fna start..."); let download_dir = args.download_dir; let database = &args.database; + let max_file_size = &args.max_file_size; let dst_tax_dir = database.join("taxonomy"); create_dir_all(&dst_tax_dir)?; @@ -264,7 +337,12 @@ pub fn run(args: Args) -> Result<()> { } let assembly_files = find_files(&download_dir, &PREFIX, &SUFFIX); - merge_fna_parallel(&assembly_files, &args.database, &library_dir)?; + merge_fna_parallel( + &assembly_files, + &args.database, + &library_dir, + *max_file_size as u64, + )?; let id_to_taxon_map_filename = args.database.join("seqid2taxid.map"); let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?; diff --git a/ncbi/src/fna.rs b/ncbi/src/fna.rs index 1d9c5f4..b96c7b8 100644 --- a/ncbi/src/fna.rs +++ b/ncbi/src/fna.rs @@ -170,7 +170,7 @@ pub async fn write_to_fna( while reader.read_line(&mut line).await? != 0 { if let Some(caps) = re.captures(&line) { let seqid = &caps[1]; - let full_tax_id = format!("kraken:taxid|{}", taxid); + let full_tax_id = format!("taxid|{}", taxid); map_writer .write_all(format!("TAXID\t{}|{}\t{}\n", full_tax_id, seqid, taxid).as_bytes()) .await?;