diff --git a/io/fastq/data/nanosavseq.fastq b/io/fastq/data/nanosavseq.fastq new file mode 100644 index 00000000..c1d451c9 --- /dev/null +++ b/io/fastq/data/nanosavseq.fastq @@ -0,0 +1,16 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 
+AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC ++ +%&&%$&(()')23-+.:)''.10355=DEAE@E;--&+:<115924-0CJ:<>DE?=6,.+,.//0*123,*7//&'&'%#&$$)*631-/0&%($),&%)(+/0-/29;88=;8EGFHFJFEFFFB===C@?;((426?=<5&':;<8&8()%76:?5.'2-,'()/&20.-3>8+$#'&.1186EBA@B>C;:-/)+...0-+%1-/3*&.)$'(&'$'1&,466663)5+<6)++,.7999;;92;9:977$+61)-124.5970<,8=:-.1--,+'*++(-***,,12@??9:2/61-)&## diff --git a/io/fastq/data/nanosavseq.fastq.gz b/io/fastq/data/nanosavseq.fastq.gz new file mode 100644 index 00000000..986183fd Binary files /dev/null and b/io/fastq/data/nanosavseq.fastq.gz differ diff --git a/io/fastq/data/nanosavseq_emptyseq.fastq b/io/fastq/data/nanosavseq_emptyseq.fastq new file mode 
100644 index 00000000..cf6c2494 --- /dev/null +++ b/io/fastq/data/nanosavseq_emptyseq.fastq @@ -0,0 +1,16 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 
+AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 + ++ +%&&%$&(()')23-+.:)''.10355=DEAE@E;--&+:<115924-0CJ:<>DE?=6,.+,.//0*123,*7//&'&'%#&$$)*631-/0&%($),&%)(+/0-/29;88=;8EGFHFJFEFFFB===C@?;((426?=<5&':;<8&8()%76:?5.'2-,'()/&20.-3>8+$#'&.1186EBA@B>C;:-/)+...0-+%1-/3*&.)$'(&'$'1&,466663)5+<6)++,.7999;;92;9:977$+61)-124.5970<,8=:-.1--,+'*++(-***,,12@??9:2/61-)&## diff --git a/io/fastq/data/nanosavseq_noidentifier.fastq b/io/fastq/data/nanosavseq_noidentifier.fastq new file mode 100644 index 00000000..64a72914 --- /dev/null +++ b/io/fastq/data/nanosavseq_noidentifier.fastq @@ -0,0 +1,16 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 
+GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ 
+%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC ++ +%&&%$&(()')23-+.:)''.10355=DEAE@E;--&+:<115924-0CJ:<>DE?=6,.+,.//0*123,*7//&'&'%#&$$)*631-/0&%($),&%)(+/0-/29;88=;8EGFHFJFEFFFB===C@?;((426?=<5&':;<8&8()%76:?5.'2-,'()/&20.-3>8+$#'&.1186EBA@B>C;:-/)+...0-+%1-/3*&.)$'(&'$'1&,466663)5+<6)++,.7999;;92;9:977$+61)-124.5970<,8=:-.1--,+'*++(-***,,12@??9:2/61-)&## diff --git a/io/fastq/data/nanosavseq_noplus.fastq b/io/fastq/data/nanosavseq_noplus.fastq new file mode 100644 index 00000000..8e9e6dc3 --- /dev/null +++ b/io/fastq/data/nanosavseq_noplus.fastq @@ -0,0 +1,14 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 
+GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ 
+%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC diff --git a/io/fastq/data/nanosavseq_noquality.fastq b/io/fastq/data/nanosavseq_noquality.fastq new file mode 100644 index 00000000..cefeab0e --- /dev/null +++ b/io/fastq/data/nanosavseq_noquality.fastq @@ -0,0 +1,13 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ 
+$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 diff --git a/io/fastq/data/nanosavseq_noquality2.fastq b/io/fastq/data/nanosavseq_noquality2.fastq new file mode 100644 index 00000000..9cb5fe9e --- /dev/null +++ b/io/fastq/data/nanosavseq_noquality2.fastq @@ -0,0 +1,15 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ 
+$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 
+GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC ++ diff --git a/io/fastq/data/nanosavseq_noseq.fastq b/io/fastq/data/nanosavseq_noseq.fastq new file mode 100644 index 00000000..491749c4 --- /dev/null +++ b/io/fastq/data/nanosavseq_noseq.fastq @@ -0,0 +1,13 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ 
+$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 diff --git a/io/fastq/example_test.go b/io/fastq/example_test.go new 
file mode 100644 index 00000000..498b1e38 --- /dev/null +++ b/io/fastq/example_test.go @@ -0,0 +1,64 @@ +package fastq_test + +import ( + _ "embed" + "fmt" + "os" + "strings" + + "github.com/TimothyStiles/poly/io/fastq" +) + +//go:embed data/nanosavseq.fastq +var baseFastq string + +// ExampleRead shows basic usage for Read. +func ExampleRead() { + fastqs, _ := fastq.Read("data/nanosavseq.fastq") + fmt.Println(fastqs[0].Identifier) + //Output: + //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 +} + +// ExampleReadGz shows basic usage for ReadGz. +func ExampleReadGz() { + fastqs, _ := fastq.ReadGz("data/nanosavseq.fastq.gz") + fmt.Println(fastqs[0].Identifier) + //Output: + //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 +} + +// ExampleWrite shows basic usage of the writer. +func ExampleWrite() { + fastqs, _ := fastq.Read("data/nanosavseq.fastq") // get example data + _ = fastq.Write(fastqs, "data/test.fastq") // write it out again + testSequence, _ := fastq.Read("data/test.fastq") // read it in again + + os.Remove("data/test.fastq") // getting rid of test file + + fmt.Println(testSequence[0].Identifier) + fmt.Println(testSequence[0].Sequence) + fmt.Println(testSequence[0].Quality) + //Output: + //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 + //GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT + 
/******************************************************************************
March 22, 2023

Fastq Parser begins here

I basically stole everything from the fasta parser, and added a few bits
for parsing out additional data from fastq nanopore files. Mwhahaha, stealing
code!

Keoni

******************************************************************************/

// Function indirections so the file/gzip/build steps can be swapped out.
var (
	gzipReaderFn = gzip.NewReader
	openFn       = os.Open
	buildFn      = Build
)

// Fastq is a struct representing a single Fastq file element with an Identifier, its corresponding sequence, its quality score, and any optional pieces of data.
type Fastq struct {
	Identifier string            `json:"identifier"`
	Optionals  map[string]string `json:"optionals"` // Nanopore, for example, carries along data like: read=13956 ch=53 start_time=2020-11-11T01:49:01Z
	Sequence   string            `json:"sequence"`
	Quality    string            `json:"quality"`
}

// Parse parses everything readable from r into a slice of Fastq structs.
// It is a convenience wrapper around NewParser and ParseAll that uses a
// 64kB line buffer.
func Parse(r io.Reader) ([]Fastq, error) {
	// 32kB is a magic number often used by the Go stdlib for parsing. We multiply it by two.
	const maxLineSize = 2 * 32 * 1024
	parser := NewParser(r, maxLineSize)
	return parser.ParseAll()
}

// Parser is a flexible parser that provides ample
// control over reading fastq-formatted sequences.
// It is initialized with NewParser.
type Parser struct {
	// reader keeps state of current reader.
	reader bufio.Reader
	// line is the number of lines consumed so far (1-based number of the
	// most recently read line).
	line uint
}

// NewParser returns a Parser that uses r as the source
// from which to parse fastq formatted sequences.
// maxLineSize is the size of the internal line buffer; any single line
// (newline included) longer than that makes parsing fail.
func NewParser(r io.Reader, maxLineSize int) *Parser {
	return &Parser{
		reader: *bufio.NewReaderSize(r, maxLineSize),
	}
}

// ParseAll parses all sequences in underlying reader only returning non-EOF errors.
// It returns all valid fastq sequences up to error if encountered.
func (parser *Parser) ParseAll() ([]Fastq, error) {
	return parser.ParseN(math.MaxInt)
}

// ParseN parses up to maxSequences fastq sequences from the Parser's underlying reader.
// ParseN does not return EOF if encountered.
// If an non-EOF error is encountered it returns it and all correctly parsed sequences up to then.
func (parser *Parser) ParseN(maxSequences int) (fastqs []Fastq, err error) {
	for counter := 0; counter < maxSequences; counter++ {
		record, _, parseErr := parser.ParseNext()
		if parseErr != nil {
			if errors.Is(parseErr, io.EOF) {
				parseErr = nil // EOF not treated as parsing error.
			}
			return fastqs, parseErr
		}
		fastqs = append(fastqs, record)
	}
	return fastqs, nil
}

// ParseNext reads the next fastq record from the underlying reader and
// returns it together with the number of bytes consumed by this call.
//
// A record is exactly four lines: an '@' header, the sequence, a '+'
// separator and the quality string. The header is split on single spaces:
// the first token (without '@') becomes the Identifier and every further
// token is stored in Optionals, split at its first '='. Tokens without '='
// are stored with an empty value.
//
// ParseNext returns an error if:
//   - The reader is already exhausted; the reader's error (normally io.EOF)
//     is returned unchanged.
//   - A line does not fit in the buffer (wraps bufio.ErrBufferFull).
//   - The record is malformed: missing '@' header, empty sequence, missing
//     '+' separator, empty quality, or input ending in the middle of a record.
//
// On error the returned Fastq is always empty. The final quality line may
// omit its trailing newline.
//
// The amount of bytes read is always right up to before the next fastq
// starts, which means this function can effectively be used to index where
// fastqs start in a file or string.
func (parser *Parser) ParseNext() (Fastq, int64, error) {
	if _, err := parser.reader.Peek(1); err != nil {
		// Early return on error. Probably will be EOF.
		return Fastq{}, 0, err
	}

	var totalRead int64

	// readLine consumes one line and returns it without the newline
	// delimiter. The returned slice points into the reader's buffer and is
	// only valid until the next readLine call, so callers must copy what
	// they keep. allowEOF accepts a non-empty last line lacking a newline.
	readLine := func(allowEOF bool) ([]byte, error) {
		line, err := parser.reader.ReadSlice('\n')
		totalRead += int64(len(line))
		parser.line++
		switch {
		case err == nil:
			return line[:len(line)-1], nil
		case errors.Is(err, bufio.ErrBufferFull):
			// Buffer size too small to read fastq line.
			return nil, fmt.Errorf("line %d too large for buffer, use larger maxLineSize: %w", parser.line, err)
		case errors.Is(err, io.EOF):
			if allowEOF && len(line) > 0 {
				return line, nil
			}
			// Deliberately not wrapping io.EOF: a truncated record is a
			// parse failure, not a clean end of input.
			return nil, fmt.Errorf("line %d failed: unexepcted EOF encountered", parser.line)
		default:
			return nil, err
		}
	}

	// parse identifier
	header, err := readLine(false)
	if err != nil {
		return Fastq{}, totalRead, err
	}
	if len(header) == 0 || header[0] != '@' {
		return Fastq{}, totalRead, fmt.Errorf("did not find fastq start '@' on line %d", parser.line)
	}
	// string(...) copies, detaching the header from the reader's buffer.
	headerFields := strings.Split(string(header[1:]), " ")
	seqIdentifier := headerFields[0]
	optionals := make(map[string]string)
	for _, optionalDatum := range headerFields[1:] {
		if optionalDatum == "" {
			continue // produced by consecutive spaces
		}
		keyValue := strings.SplitN(optionalDatum, "=", 2)
		optionalValue := ""
		if len(keyValue) == 2 {
			optionalValue = keyValue[1]
		}
		optionals[keyValue[0]] = optionalValue
	}

	// parse sequence
	line, err := readLine(false)
	if err != nil {
		return Fastq{}, totalRead, err
	}
	if len(line) == 0 {
		return Fastq{}, totalRead, fmt.Errorf("empty fastq sequence for %q, got to line %d", seqIdentifier, parser.line)
	}
	// Copy now: the next read may overwrite or shift the reader's buffer.
	sequence := append([]byte(nil), line...)

	// skip +
	line, err = readLine(false)
	if err != nil {
		return Fastq{}, totalRead, err
	}
	if len(line) == 0 || line[0] != '+' {
		return Fastq{}, totalRead, fmt.Errorf("expected '+' separator for %q on line %d", seqIdentifier, parser.line)
	}

	// parse quality
	line, err = readLine(true)
	if err != nil {
		return Fastq{}, totalRead, err
	}
	if len(line) == 0 {
		return Fastq{}, totalRead, fmt.Errorf("empty quality sequence for %q, got to line %d", seqIdentifier, parser.line)
	}
	quality := string(line)

	fastq := Fastq{
		Identifier: seqIdentifier,
		Optionals:  optionals,
		Quality:    quality,
		// sequence is a private copy that is never written to again, so the
		// zero-copy conversion (as done by strings.Builder.String) is safe.
		Sequence: *(*string)(unsafe.Pointer(&sequence)),
	}
	return fastq, totalRead, nil
}

// Reset discards all data in buffer and resets state.
func (parser *Parser) Reset(r io.Reader) {
	parser.reader.Reset(r)
	parser.line = 0
}

/******************************************************************************

Start of Read functions

******************************************************************************/

// ReadGz reads a gzipped file into an array of Fastq structs.
// It fails if the file cannot be opened or is not valid gzip.
func ReadGz(path string) ([]Fastq, error) {
	file, err := openFn(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	reader, err := gzipReaderFn(file)
	if err != nil {
		return nil, err
	}
	defer reader.Close()
	return Parse(reader)
}

// Read reads a file into an array of Fastq structs
func Read(path string) ([]Fastq, error) {
	file, err := openFn(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	return Parse(file)
}

/******************************************************************************

Start of Write functions

******************************************************************************/

// Build converts a Fastqs array into a byte array to be written to a file.
// Each record is emitted as "@Identifier", the sequence, "+" and the quality,
// one per line. Optionals are not written. The returned error is always nil.
func Build(fastqs []Fastq) ([]byte, error) {
	var fastqString bytes.Buffer
	for _, fastq := range fastqs {
		fastqString.WriteString("@")
		fastqString.WriteString(fastq.Identifier)
		fastqString.WriteString("\n")

		// fastq doesn't limit at 80 characters, since it is
		// mainly reading big ole' sequencing files without
		// human input.
		fastqString.WriteString(fastq.Sequence)
		fastqString.WriteString("\n+\n")
		fastqString.WriteString(fastq.Quality)
		fastqString.WriteString("\n")
	}
	return fastqString.Bytes(), nil
}

// Write writes a fastq array to a file.
+func Write(fastqs []Fastq, path string) error { + fastqBytes, _ := buildFn(fastqs) // fastq.Build returns only nil errors. + return os.WriteFile(path, fastqBytes, 0644) +} diff --git a/io/fastq/fastq_test.go b/io/fastq/fastq_test.go new file mode 100644 index 00000000..cc0f4350 --- /dev/null +++ b/io/fastq/fastq_test.go @@ -0,0 +1,66 @@ +package fastq + +import ( + "os" + "testing" +) + +func TestParseNLow(t *testing.T) { + file, err := os.Open("data/nanosavseq.fastq") + if err != nil { + t.Errorf("Failed to read nanosavseq.fastq. Got error: %s", err) + } + const maxLineSize = 2 * 32 * 1024 + parser := NewParser(file, maxLineSize) + _, err = parser.ParseN(0) + if err != nil { + t.Errorf("Failed to parse 0 fastqs. Got error: %s", err) + } +} + +func TestParseSmallLine(t *testing.T) { + file, _ := os.Open("data/nanosavseq.fastq") + parser := NewParser(file, 10) + _, err := parser.ParseAll() + if err == nil { + t.Errorf("Should have encountered a maxLine error") + } + parser.Reset(file) +} + +func TestRead(t *testing.T) { + _, err := Read("data/doesntexist.fastq") + if err == nil { + t.Errorf("Should have failed to read non-existent file") + } + _, err = ReadGz("data/doesntexist.fastq.gz") + if err == nil { + t.Errorf("Should have failed to read non-existent gz file") + } + _, err = ReadGz("data/nanosavseq.fastq") + if err == nil { + t.Errorf("Should have failed to read a file that is not gz'ed") + } +} + +func testException(t *testing.T, filePath string, errorString string) { + file, err := os.Open(filePath) + if err != nil { + t.Errorf("Failed to read %s. 
Got error: %s", filePath, err) + } + const maxLineSize = 2 * 32 * 1024 + parser := NewParser(file, maxLineSize) + _, err = parser.ParseAll() + if err == nil { + t.Errorf("%s parser should have gotten error: %s", filePath, errorString) + } +} + +func TestParseExceptions(t *testing.T) { + testException(t, "data/nanosavseq_noseq.fastq", "no seq") + testException(t, "data/nanosavseq_noquality.fastq", "no quality") + testException(t, "data/nanosavseq_noidentifier.fastq", "no identifier") + testException(t, "data/nanosavseq_emptyseq.fastq", "empty seq") + testException(t, "data/nanosavseq_noplus.fastq", "no plus EOF") + testException(t, "data/nanosavseq_noquality2.fastq", "no quality EOF") +}