diff --git a/data/test_PE1_2.fq b/data/test_PE1_2.fq new file mode 100644 index 0000000000000000000000000000000000000000..17ccf78a57985bc6c9816eedf04c298ca639563e --- /dev/null +++ b/data/test_PE1_2.fq @@ -0,0 +1,160 @@ +@NS500443:42:H3MH2AFXX:4:21612:18494:19326/1 +GGTTCTGTTGGCTCAAACGCCGTCACTTCATTGATCAAAAGCTTATAATGCGTGCCAAAGTCCGCCATCGAGACGACTACGCCTTCCCCTGCTTTCCCGTCAAAAACGAGTCTTGCCGGATCTTCACGGTCTCCCCTCGAAAGCGGCGAAA ++ +6A/AAEEEEAEE6EEEAEEEE/EEAE/EE/EEA/EEEEE6AEAEAEAEEEEE/EEEEEE/EEEEEEEAEE/EEE//AEEEEEEEEAEEA/EAE/EA<<6/AEEA//A//AEEEA6AA<EE/AEEE/A///AEEE//EEEEEEEEAAEAAEA +@NS500443:42:H3MH2AFXX:4:21612:18284:19327/1 +GCGTTTACCCATTCTTCCTATGTCAATGAACATCGTAAAGAACATTTAAAAGACAATGAAAGATTAGAATTTCTAGGAGATGCGGTATTAGAGTTAACTGTATCTGATTATTTATTTAAAAAGTATCCGGACATGGCGGAAGGTCAAATGA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEEEEAEEEEEEEAEEEEEEEEEEEAE<EEEAEAAEEEAEAEEAEE/ +@NS500443:42:H3MH2AFXX:4:21612:12448:19327/1 +CACGATGGGTGCCTTTTTGCATGTGAATACGGACAAATTAGATGTGCGGCGCTATCTTGAGATGGTGGAGTCGAGCAGTCCGTCTTATATGGTGATGGCTTCGCTTGATGTGGCGCGGCGCTATGTGGCGCTTTATTCGGATGAGGATTTT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEE/EEEEEEEAEEEEEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:3226:19328/1 +CTTGAAGCGTCATGATAAGGTGTTTTCGGAAATAGATGAATATATTGCGGACTTGCGGAAAAATAACGCCAAGCTTGCGGATTTGATGCAAGAAATTTGGGACGAGATTCAGGAAAATCCAGAAGCGTATCGTGATAAAAATGTATCGA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEAEEEEEA<EEEEEEAEEEEEEEEEEE< +@NS500443:42:H3MH2AFXX:4:21612:21221:19328/1 +GTGTTAACCAAATTCGTGCACAAATGGAAGAAACTACTTCTGATTTCGATAAAGAAAAATTACAAGAACGTTTGGCAAAACTTGCTGGCGGCGTAGCTGTCATTAAAGTAGGTGCTGCAACTGAAACAGAACTAAAAGAACGTAAACTTCG ++ +AAAAAEAEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEE<EAEEEEEEEEEEEEEEEEEEEEEEAEAEE/EEE +@NS500443:42:H3MH2AFXX:4:21612:11696:19332/1 +ATGTGATACACCGGAGAATATACTAAAAAATCCTGAAACTGATTTTGTTCGTAATTTTCTTGAGTCTGGCAATCTTTTACCAAAAACATTATTTGACCGTTCAATCAAGATTTCTGATTTAGTCACTAAAAATTTCTGTACTCAGGCACAA ++ +AAAAAEEEE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEAEEEEEEEEEEEEE<AEEEEEAEEEEEEEEEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:8548:19332/1 +TAACATGGAAGCTTACTATCAGGAAGCGGGAAGAGCTGGCCGGGATGGACTGCCAAGTGATTGTGTGCTGCTTTTTTCACCACAAGATGCACATCTACAGCATTACTTGATTGAGCAGTCAGAACTTCCTGAAGATCGTAAGGAAAACGAA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEAEA +@NS500443:42:H3MH2AFXX:4:21612:5092:19335/1 +ACCTACTTCTTGCAACATTTCCATGTCATTTAGACCATCGCCGAATGCATAAGTATCTTCCATTGAAAAGCCTAATTTCTTGATCATATTACGAATTCCGCGAGCTTTAGAGCCATCACTTGGACAAACATCAACAGAAACATCATGCCAG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEAEEEEEE/EEEEEEAAEE<EEEEA6AAEAEEEEEAEEEAEEEEEE6EEEEEAEA +@NS500443:42:H3MH2AFXX:4:21612:3939:19336/1 +GGTGGAAAACATGAACGGAAAAAATATTCTACTTGCAGTGTCTGGCGGCATTGCAGTTTATAAAGCAGTAGCGCTAACGAGTAAACTCACGCAAGCAGGTGCCAATGTAAAAGTGATGATGACACGTCACGCACAGGAATTTGTTCCACCG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEAE//EEEEEEEEEEEEEEEEEEEEEE<EEEEEEEEEEAEE/AEEEEE/EEEAEEEEEEEEEEAEEEAEE/ +@NS500443:42:H3MH2AFXX:4:21612:7738:19336/1 +TTCTTTCCCAGAAAGCCCTACAAGTTGAATCAATTCTTGCACACGTTCTTTTCGTTCTGCTCGTTTCACTCCGGCAATTTCAAGCGGAAAAGCAATGTTTTCAAATACTGTTCGCGACCAGAGCAAGTTGAAGTGCTGGAAAATCATCCCG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEAAE<EEEEEEEAEEEEAEEEEEEEEEEEEEEEEE<EEEEAAAEEEEEEEAEEEEEAEEEEEEEEEAEEEEEEEEEAEEEEA< +@NS500443:42:H3MH2AFXX:4:21612:11553:19339/1 +GTGTGAATACTACCGTTTGTTTATGATAACCGATTCCTTCACCGTTTATCCCCATTCTTTTAATTTCAAGCGGAAAAGTCTGTCCCACTTTTATCTTTGCTCCGTCCAACCACATCTCCTCCGTTTCAATTCTTCATCCATACTATCAAAA ++ +AAAAAEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEAE<EEEEEEAEEEEEEEEEEEAAEEEEEEEAEEEEEEEEEEEEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:20775:19341/1 +GCTTTTTCATTTTTATATATTCCATTTCGCGTAAAAAGAATTTATCAATAAAAATCTTGATTTTTTCTTACAACAAAAGACGCTGGCAGATTTGAAAAATCAATCTACCAGCGTCTTTTGTTCGGCTAAAAACTATATATTAAGCATCTC ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEAEEAEEEEEEEEEEEEEEEEEEEEEEEEEAEAEEEE<EEEEEEEE<EEE<EEEEEEEEEEEEEEEEAEEEEE +@NS500443:42:H3MH2AFXX:4:21612:2150:19342/1 +GCTCTTTTTCCGCCGCGATTTCCCTCAGCTTCGCATAAAGCGGCATGTGGAAAATGGTCATAGCTTCCATAACCACAACCTTTTTTTCGCGAGAAACGCGGAGAACATCCTCGAGTTCACGGGAACTCACCGTAATCGCCTTCTCCACCAA ++ +AAAAAAEEEEAEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEA/EEAE//EEEAEAEEEEEEEEEA/EAEEEE/EAAEEEAAA<EE<EAEEEAEEEAEAE/EEEAE/<EEEE<EEEAEEEEE/ +@NS500443:42:H3MH2AFXX:4:21612:2100:19343/1 +AGATGACATTATTGGAGGAAGCGGCAACCCCAAGGTTACTAGCCTTCGTCCAACCAACTAAATTCGATCCATCTTTCATACGTTGCCAAACGACGCCATCAACAGTTACTTCACGGTCAACAGTAAGGTCTTTGCCATACACGTTCTTCAA ++ +AAAAA/EEEEEAEEEEEEEEE/AEEEAEE/AEE/AEAEE/EEAE//EEEEEEEEEAEE/EEE6EAEEE/AEAEEEEEEEEEAEE/<EEEEAE/</EE/EEEE<EE/A/AEE/<///AEE/EEEEEEE<EAEEA//AEEEA<<</E<AAEAE +@NS500443:42:H3MH2AFXX:4:21612:11502:19343/1 +GTTTATGTAGCTTCACAAGACGAAGAAATGAACAAAAAAGCGATGGCAATTATTGAAGATATCGTTCGCGAAGCAAAAGTGGGCGAAATCTACACAGGGAAAGTTCGTCGAATCGAAAAATTTGGGGCTTTCGTTGAATTGTTTAAAGGAA ++ +AAAAAEEEEEEEEEAAEEAEEEEEAAEEEEEEEEEEEEEEEEEEE/AEEEEEEEAEEAEAEEEEEEAAEEEEEEEEEEEEEE6EEEEEEEEE<AEEEEEEEEEEE</EAEAEEEAEEEEEEEEAEEAAEEAEAEEEEE/EE/EEEEE<E/E +@NS500443:42:H3MH2AFXX:4:21612:24968:19344/1 +CGCTTATTTGATACGGCAGCTGTTTATAATAATGAACGTGAAGTTGGCGAAGCGATTCGCGCGGGAGGGGTTTTGCGAGAGGAACTTTTCATCAGTTCAAAAATTTGGAACGGTGACCAGGGCTACGATGAAACACTATTTGCCTTTGAGA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEE/EEEEEEEEEEEEEEEEEEE6AEEEEAAEEEE</AEEEEEE/AEEEEEE/A6<EEEEEE<AEEAEEEE/ +@NS500443:42:H3MH2AFXX:4:21612:6916:19344/1 +GTTTCCGAACGACTTGCTAAAGAATACGGGCTTAAAACGATTTCTGATCTGCGCCCAGTGGAGCAACAAGTAAAAGTGGGCTTTACGGCAGAATTTGCTGACCGTGGTGATGGCTATAAGGGGCTAAAAGAAAAATATGGAATCACCTTTG ++ +AAAAAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEAEEEEEEEEEEEEEEEEEEAEEEEEEEE//EEEEEEA/EEEEEEAEAEEAEEEEEAEEEEEEEEEEEEEEEE6EEEEEE</<EEEEA +@NS500443:42:H3MH2AFXX:4:21612:15429:19346/1 +AGCTTATACAAGCCGCGTTACTTGAGAAGTAAACATCTACAATATTCCCTTTAATCGCACCACCAGTATCCCCAGCGATGGCTTCACCATATCCAGAAACGTGAACGCGCGAGCCAAGTGGAATAACACTTGGATCTACGGCAACTACTTT ++ +AAAAAEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEAEEEEEEEEEEEEEEAE<EEAEAAEEEEEEEEEEAEEEEEEEEEEA/EEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEE<EEAEAEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:1024:19346/1 +TCCTTCCGAAGCGTCTTCGCANTAGTCAAATTCCCATTATGNGCAAGAGCAAGTGAAGAATCTTGAAAATGAAACAAAAACGGTTGAACATTTCCAAGGCTTTTTCCGCCAGCTGTTGCATAGCGCACGTGACCAATTGCGCCACTTC ++ +AAAAAEEEAEEEEEEEEEEAE#EEEEEEEEEAEEEEAEEEE#E/EEEEEEEEEEEEE6EEEEEEEEE6EEAEEEAEEEEEE/E/EE/E/EEEE<EEAEEAEAEEEEAEAA<EEEE<EAE/EEEAEE//EEEEEEEEE/E6EEE/E/EE +@NS500443:42:H3MH2AFXX:4:21612:4182:19348/1 +GCATGGCAATGAAGTTAAATAACGCTTCCGGCAAATAGCCTAAGTCGCGATACTGCTCGATAAATTGGATAATCGAACCATCACGCTTACTTAATTTACGGCGACTTTCATTGACAATCAACGTCATATGACCAAAAGTTGGCGGTTCCCA ++ +A6AAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEAEEEEEEEEEEEEAEEEEEEAEEEEEE<E<AEEEEEEEEEAEEEEEAEE<A/EEAEEEEEEEEEE<EEEEE/EEAAEAAEEEEEEE6<EAAE</</A +@NS500443:42:H3MH2AFXX:4:21612:26730:19348/1 +ATCTCAACGGGGGGAACAGGAATTTCGAAACGGGATGTAACTTTTGAAGCCCTATTCCAACAAAAACACCAGGAGATTCCAGGGTTTGGGGAGTTGTTTCGAATGAACAGCTATCAAGAAATCGGATCAAAGGCAATGGCATCCCGCGCTA ++ +/AAAAEAE/E/EE6E6AAEEE//E6EA//A//6EEEAEEA6/EE/6E66<EE/AE/EE/EEEE////A/EEEEE/E//EE/A//EEEA//AAEEE////E/EEEA//E//6/EA6AE////A/A/AEAEEE/EAAE/<//EE</<<//AE/ +@NS500443:42:H3MH2AFXX:4:21612:25889:19349/1 +GAGCGAATCAAGTGCCTTGGAAAGAGCGGATTGCATATGCAATGAGTGACTTTGGCTGTAATACAGTTTTTCAAATTTTAGGAACGTATTTTCTTGTATTCTGTACGGATACACTTGGCGTTGCGGCAGCAGCGACTGCTGGACTTTTCG ++ +AAAAAE/EEEEEEEEEEEAEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEAEAEAEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEE/EEEEEEAEEEAEEE/AEEEAA<EEE6E6EEEEEEEEEEAA/<EE</EEEE<6 +@NS500443:42:H3MH2AFXX:4:21612:6539:19353/1 +GATGCAGTTGTTTCTAAACGGGGCTCCTGTTCAACCCGTTTTAGACCGCTTAAAATCTATTTTTGGCATTCAATCTTTTAGTCCGGTTATCAAAACAGCATTGGATGTTGAAGAAGTGAAAGAAGCTGCCTTTCAACTTGTGAATGACA ++ +A/AAAEEEEEEEEEEEEEEEEEEEEEEEEEEEE6EEEEEEEEEE/EEEEEE/EEEEAEEEAEEEEAAEEEEAAEEEEEEEEEEEEEEE/EEE/AEEEEEEEEEEEEAEEEEEEEEEEEE/EE<E<EEEEA<AEEEAEEEEEEEEEA/EA +@NS500443:42:H3MH2AFXX:4:21612:21998:19353/1 +TCGGTAAGCTTTCTAAATCGAGTATTGTAGAACTTTTTGTGACAACTTTGAAAGAAGAAAATTAGCAGGAGGGGAATTATGAAGCAACCTGTCGTTATTTATGTAATTTCGGATGCGATTGGAGAAACGGCTCAACATACTATTCGAGCCG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEAEE/EAEEEEAE6EEEAEEEEEEEEEEEE<EEEEAEAEEEEEEAEEEEAEEAEE/EEAEEEEEE<AEEEEA +@NS500443:42:H3MH2AFXX:4:21612:9349:19354/1 +CCGCGATACTCCGTTCGAGTAGACACTGCGTCACGGGGCTAAGTTTCAAGCCACTTGCAACAATTTCTTCGAGTTCTGTATCGTTATGGCAAATCCCACCGCCAGACCCGCCAAGTGTATAAGCAGGGCGAACAATGACAGGATAGCCGAT ++ +AAAAAEEEAEEEEEEEEE/EEA/EEE/EAAEEEE/EAEEEAEEEEEE/E/EEEEEE6EEEEEEEEEEEEEEE/EEEEE6E<E/AE/EEEEEEEEE<EEA6/EEAEEE/A<AAEEAEE/EEEEEE/EEEA</<EE/AEE<AEAEAE<EEAA/ +@NS500443:42:H3MH2AFXX:4:21612:10100:19357/1 +CGTTGGATCCACTTCCATCATATGGGATTCCAAAATTGCTTCCTTTCCTGCCGCTAGCTCGTAGGTATAATCTTCCATAAAACCAGTATTTTCGTTATGAGCCATCACTTTCATCAACCGGTCGATCGCCGCCGTTTTCCAGTCTCCTTC ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEAEEEEEEEEEEAEAEEEEEEEEEEEEEEEAEE +@NS500443:42:H3MH2AFXX:4:21612:5905:19357/1 +CACCTGCTACACTCGCTTCATAAAATAAATCGCAGTGGTTTTCTTGCGCCACTGCTACGAGTTCATCTCCGTGAATAGCAATTAAATCTTTATTTGCGGTTACAACACTTTTCCCTGCTTTCAGTGCATGCAAAATGTAATGATGTGCGGT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEE/EEEEEEEEEEEEEEEAEEEEEEEAEEEE +@NS500443:42:H3MH2AFXX:4:21612:22823:19360/1 +GGATGGGCAAGCGGGAAGAGTTGGAGCGGATTAGCCAAATTGAGCCGCTTGCTGCACGTTACCGGGATGAGGTACTTGAAAAGCTTAGGAAAATGGGATGAAAAAAGAGCCGTTCGGAAGAACGGCTCTTGGCGTGTTTTGCGTTTAAAAT ++ +AA6AAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAE<EEE/EEEEAAEAEEEEA</EEEEEEA<EEEEEE +@NS500443:42:H3MH2AFXX:4:21612:17904:19360/1 +GTAACGGTAAAAGTTATGAACGTTGGGGATGACGGTAAGATTGGTCTTTCAATCCGTAAAGCGGTTGATCGTCCGCGTCCAGAACGGAATTATGATCGGAAACCGAAGTATAACAAGAAACCAGCTGCTCAGTCCAAACAGCCGGAAGATT ++ +AAAAAEEEEEEEEEEEEEEEEEE6EEEEEEEEEEEEEEEEAEEEEEEEEEAEEEEEEEEEEAEEEAEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEA<EEEEAEEEEEAEE<EEE/EEEAEEEEA6/EEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:5361:19361/1 +CGCATTACGCGCATTCGTCTGACCCGCTTCTACCTTCTTTAATTTTAAATCTTGATTTACTTCTTCGTCTGACTCTTGTAACGTAACGCCGATGACATAAGCATATTTATTTTCAATACTTATGCCCTTATCTTGCTTTTGACTCGTTGCA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEAEEEEEEEEEEEEE6<EEEEEEEE< +@NS500443:42:H3MH2AFXX:4:21612:10976:19365/1 +TCACTAAATTTAATATCTTTTGACATAATTTAATTCCCTCCAATTTATATCGCTTTTGTTTTCTATTTTTACTTCGTAACTGCTAAAATATCACTTTCGCGAATAATTAACAGATCTTTTCCTTCATAAGTTACTTCTGTTCCAGAATATT ++ +AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEAEEEEEEEEAEEEEEEEEEEEEE6EEEEEEEEEAEAEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEA +@NS500443:42:H3MH2AFXX:4:21612:22942:19365/1 +ATCCATGAATAAATCCACAATATCTTTTAAACGGCAGGTTTGTTTATTGATCAAAAATTCGCTATCGCCATTTCGGTAAATTCGTCTTGTCACCACTACTTCACTATAATCAAGCGGCAAGTAGCGATCATCATTGGCAAGTACGAGCGAT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEAEAAEEEEEEEA/EEEAA +@NS500443:42:H3MH2AFXX:4:21612:14114:19366/1 +CCTTGGAAACATCACCTATGTGTTGATGTATAACCTAGGACTTGAGCTCGGGAATGCCTTTCACGCTTCTACTGAAACCGCACAAACCATTGCCTCTGTTTTTGCGCGAATCACGGGGCTCTCTATGTTCCTTGCTTATACAGGTGCATTT ++ +AAAAAEEEEEAEE6EEEEEEEEEEEEEEEAEEAEEEEEEEEEEEEEEEEEEEEEEEAEEEEE/EEEEEEEEEEEEEEEAEE6EEEEEEEEEEE/EEEEEEEEEEEEAEEEEEEEEEEEEEEEEAEEEEEEE/EE/EAEEEEEEEEEEE<E< +@NS500443:42:H3MH2AFXX:4:21612:13612:19370/1 +GCATCTAAAATCGGATGTGAATAGTCTTCCTGTACTGTTGTTTTTAATGCATCAATTGCTGTATTTCCCGTTACAAAAACGGTCTCCGCCTTCTTATTTTCACGCAGTAAATTTTCTTTTGCCGTATCCGTTGGCGCAAAGTGCAGATCAG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEE/EEEEEEEEEEEEEEEE/EEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEA +@NS500443:42:H3MH2AFXX:4:21612:1049:19371/1 +ATGCTTGTTATGCACTATAAATTGAAAAAGATTGACATGATCGAAGCGCTCAAATCAGTTGAGTAACAAGAGACACTCCTCTTCTAGCCAAAGAGGGGTGTTTTTTGGTTTACTTTTTGTCTAAAAAGCGATAGAATGAAAAATAGTTCAG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEE6EEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEE/EEEEEEEEEEEEEEEEE<EEEEE<EEEEEAEEEEEEEAEEEEEEEEAEEAEE<EEEEEAEE/A</A +@NS500443:42:H3MH2AFXX:4:21612:9601:19373/1 +TCCAAAAGCTGATGGATTGCAGAAAAAGCCTCTTCACTCCCAATCATCGCTTTCATCAATAGATATTCAGAAGCATGGTAAGCTTGAAGCGTTTGTCTAACTGGCTGAGCACTTACTAAGCCACCATAAAAATCATCAGGAGGTGGCGCTA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEAEEEEEEEAAEEEEEEEEEEEAAEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEA<EE<EEEE<EE<EEEEEEEA/EEAEEAEEEEEEAEEEEAEEEEAAEEEEE<A<A<E<<EAAEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:20393:19375/1 +CAGAGATTGATCGCGTACTAAACACATTGCTAGAAACGAGAAAGCCGGTTTACCTTAACTTGCCAATTGATGTTGCGGAAAGCCCCACCTCAAAACCAAGCAGGAAATTATCTGGTTCGACGGAATTAACGAGCGCAGATCAAAATTTG ++ +AAAAAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEE6EEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEAEEEEEAEEEEEEAEEEA/EE/EEEAAEE/EEAAA/AEE/EEE/EEEEEAE/AEE/<E/EEEAEEAEEE +@NS500443:42:H3MH2AFXX:4:21612:18713:19376/1 +CCTTAGAAGAAATTATTTTTCCTCACTAGGAACAAAAGTATTTTTGAATTCATCCAGTGCGCTGTTCAATTTTGCTTCATCTGGAAGAGATTTTGTAGTCCGAATTTCGTCTAGTAATTCTGCATGATTATGATCAAACCACGTATTCATT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE<EAEEEEEAEAAEAEEEEEEEAAEEEEEEEEEEAEEA/EEEEA/A/EEEEE +@NS500443:42:H3MH2AFXX:4:21612:16612:19377/1 +AATTGCGCCTTGAATAATAAAGCTTCGTGGAGTTTCCTTATTGTTTAATTCAGCTAAAAGAGCATTTAAAGTTTCCTTAAACTGCCACTTATCCGAATCAGCAATTTGTCCGGCATCAATTTTCATCAGCAAGTCTTCCACCAAATTTTT ++ +AAAAAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEAEEE/EEEEEEEEEEEEAEEEEA/EEE/EEEEE/EEEEEEEEEEEEE6EEEEEEEEEEEEEEEE/EEEAEEAEEE<<EEEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:21100:19378/1 +ATTGTGGGGTTCCTTTTTGTAGCATTGGAATGGAAATTAAAAATGGGGCTTCAGGATGCCCGCTCCATTATTTAATTCCAGAATGTAACGATGCTGTTTACCGGGGGGACTGGAAAGATGCACTTGAGCTTTTGATTAAAACAAATAACAT ++ +AAAA/EEAAE///EAEEEEAEEAAEE/EEA6AEAEEEEEEEEEE666//////AAAEE////EE<EEA/EEA/EAEEEEAE//E//E//E///E/66E<E////EE/6/AEEEEA/AA//AEEEEAAE//E//</EAE/AEEEEEEEEE// diff --git a/data/test_PE2_2.fq b/data/test_PE2_2.fq new file mode 100644 index 0000000000000000000000000000000000000000..c6cf6339e936d8ac8a7eca64a51c45c65ba1f812 --- /dev/null +++ b/data/test_PE2_2.fq @@ -0,0 +1,160 @@ +@NS500443:42:H3MH2AFXX:4:21612:18494:19326/2 +TCTTAGAGGAAGGTGGATATAATGCCGTCACATCGAACTTTGAAGATCTATACGGCATGCAGCAGCTTCCAGGTCTTGCGGTGCAACGTTTAATGGCAGATGGCTACGGTTTTGCGGGGGAAGGAGACTGGAAAACGGCGGCGATCGACCG ++ +AAA6AE/EEAAEA/EEE/EAEEAEE//EE/E/EEE/AA//EEAE//E/E6//EE6AE//E/AEAE//AE/EEE/AEE/E/E/E/EAEE/EAEE//AEEEEEEEAEEEEA/AAEAE/EA<A/A/AEEAE/EEA/A/E///</6E/EEA/AE/ +@NS500443:42:H3MH2AFXX:4:21612:18284:19327/2 +CTAAAAATTGGATGACTTTTGCAAGCCCATTATCGAGATAAAGCGCTCCGATAAAAGCTTCGAAAACATCGGCAAGCAGAGCTGGTCGATTTCTACCACCCGCTTTTTCTTCTCCTTTTCCAAGTCGAACATATTTGGAAAAATGAACG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEE/EEEAEEEEEEEEEAAAEEEEEEEEEEAEEEEEEAEE/E<EEAEEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:12448:19327/2 +GGATAGAATCCAGCTTGTTCCAATTGTTCAGCCAACCTATAACCCGAAAAGCCTTGTTTTCGAACAATCAATTTCAGCGGATCATCCGGACAAATCACTTCGAGTTCTTGTTTCTCCAAAAAACGTAGCCATTTCGCTCGCATTTTCCAAA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEAEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEE<EAEAEEEEEAEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEE<EEAEEEEEEEAEEE +@NS500443:42:H3MH2AFXX:4:21612:3226:19328/2 +TCTTGAAAGACTAGCTATATTTCTACTTCTAGAACCTTACTCTCTTCGTCGTAGCGGTAAAATTTCTTTTTCCACCACATCTTTTAAATCTAATCGAGCTGCTTTCCAGTACTTCAACTTCGGTAGAGGATTTTCTGTTTTCTTTTTATAG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEE/EEEEEEEEEE<EEEEEEEEEEEEEEEEAEEA<EEEE//AAEEEEEEEEEEEAEEEE<E<<6EEE<EEEEAEE +@NS500443:42:H3MH2AFXX:4:21612:21221:19328/2 +GATTGGCGCTTCAAGTGAACGTAGAACAATATTTACACCTGTTTCTTCATCTCCAGTAGCTTCGATTTCTGCTACTTTCTTATAAACATTGACAAGAGCCGTTCCACCACCGGAAACAATACCTTCTTCAACAGCTGCACGTGTCGAGTTT ++ +AAAAAEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAAEE<AEEEEEEEEA/EEEAEEEAEEEEEAE/EAEEAEEEEEAEEEA +@NS500443:42:H3MH2AFXX:4:21612:11696:19332/2 +ATTTGGATGTGCTCAATTAAAGCGGTAAGCAAATCGTCTTTTCGGTCTAGGAATGTCTCGATGAATGTATTCATGGAGTGACACCTTCTTTCAGGTTACTCGACAAGAAGGCAAGTAAATCATTCGTTGTGATAGTTGCAACTAGTAGAC ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEAEEEEEEEEEEEEEAEEEEEEEEEEEEEEEE<EAEEEEEEEEEEAEEEEEAAEE<<EEEAEEE/E +@NS500443:42:H3MH2AFXX:4:21612:8548:19332/2 +CTTTAGCAACCATAACTTTGCCGAAGCGCTCTCCCATTCGCTTCACACAAGAAAAGACTTGCTGAGCTAGAATGGTCACATCTGTCTCAGTACCTGTATCTAAACAATTGCTGCAGTTTCCGCAATCCTCACATTCTTCCCCGAAATAGTG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:5092:19335/2 +TTATAACGGTCAGTATGTGATTTTTGAAGGAGAAGTCTTATTGAAAAATCCACTTCCGCAAGAATCACTTGAACGCCTAATCACTGTAGCAACAGAAAACGATCATCCTATTGTCTTCTCGGCAAGTGATACAATGCGTGTTAATTTGCCG ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEAEEAEEEEEEEEAEEEEEEAEAEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEAEEAEEEEEEEEEEEEE/E/E/EEEEEEEEAEEEEEEEEEEE +@NS500443:42:H3MH2AFXX:4:21612:3939:19336/2 +GCTACCCACACAGGGGCTTCCGTTGCAAGAATGGTTGTTGTGACCATATCGTCAGCAATGCCATTCGCCATTTTCCCAATGACATTGGCGGTAGCTGGTGCAACGATAACAAGATCCGCCCAGTCCGCTAAATCAATGTGCGCAACAACGG ++ +AAAAAEEEE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEEEE<EEEEEAEE//EE/EEEEEEEA<EEEEEEAEEEEEEAEEEAE<EEE<EEEE/EEA<EEA +@NS500443:42:H3MH2AFXX:4:21612:7738:19336/2 +GTCGTGGGCTATTCCGGGGCAGGAAAAAGTAGTTTGATCCGAATGTTTAACGGCCTCGACCTACCAAATGCAGGCAATGTAATATTGGACAACAAAAAAAACAGACAAATTAGAGGCCGAGAACTTAGAAAAGAACGGAAAAAAAACGGGA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEE/E6/////E///<E/EE/E///E//EEE//E//6/AA/A/E</AEEEEEE6/////EE/A/</E<A/A///A//E/A/EEEE///AE//EEEEEA/</A// +@NS500443:42:H3MH2AFXX:4:21612:11553:19339/2 +TAGATGCTTCGATCTAAGACGTTATTTTCGTCACGCAACGCAGCTTTAATACTGCGAAAACGTGTGTTTAAAAAATAAATTTGTAAGGCAAAGGCCCAGCGAGCTGGGTCATCGTAAAACTTTTCGAGAATTCGGTTGTCCGTGATACTTT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEE6EEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEE<EEEE/AEEEEAEAEEEAEEEEE<AAEEA<AEEEEEEAA +@NS500443:42:H3MH2AFXX:4:21612:20775:19341/2 +CATCCCCACTATCCCCCCATCCTTGTTAATTGTAAATAAGTTAACATCAAGACTACCTGAACCATTTTTTTGATGGTAAAGCACTATTCTTTGATCTTCTTGCGTCATGTTTTCCGTCAAGCTTTTTGAAGCATTTACAAAGGTGGATCGG ++ +AAAAAEEEEEEEEEEEEEAEAEAEEEEAEEEEEAEEE/EEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEAEEEE</EEAE<EA<EE<AAEE/EEEEEEEE<EEAEEEAEEEEEEEEEEEEAEEEEE/EEEEEEEEAEEEEEEEE/EE<E +@NS500443:42:H3MH2AFXX:4:21612:2150:19342/2 +AGTATATACTTTTTGACAACGATTTCAAGAAAACATATAATAAATAACATAGAAGGCAAGAGGAAAGGGTGTCACTTATGAGTAATTTGAAATGGGCTATCGTTGGACCGGGCGGAATCGCTCACCAATTTGCAGAAGCTATGGCAAAGGA ++ +AAAAAEEEEEEEEEEAEAEEEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEE<A/<AEEAEEEEEEEAEEEEEEE/AEEAEEE/EEEEEEEEEAEEEE<E/AE/<A<EEEAE/6EEEAAEEEEEAEE/ +@NS500443:42:H3MH2AFXX:4:21612:2100:19343/2 +CCTATTTATTCTGCACCTGTCGGCTCCGCCGCTGCCACATCTGTTGGGAATTTATCGAGCTATTCCGGTAAACAAATGCGCCTCGTTCGCGAAGCAAAAGCCGGCAGTACCGTTTCTTCACAAATTAGTGTCGATGGAAAAGTGATCGG ++ +A/AAAEEEEEEEE/EE/E/EAEAEEEEEE/EEEA/AEEEAEEEEEE/EEEE/EEEEAEEEEE/E/A/AEEE/EEEEAEE//EEEEEEE/EEEEAE/EEEEEEEEEAEE/<//<EE/EEAEEEEEEEE6//EEEEEEEE/EEAE/AE/AA +@NS500443:42:H3MH2AFXX:4:21612:11502:19343/2 +GTGTACGCGGTTTGCGAGTGCGGTCACCGCCACCAAAGTGTTTCTTGTGTTCTTTATCTTGTGATTTCTTTTCGTCTAACCCTGCAGGTTTCTCAAGCAAAGCTTTACGAGATAAATTGATCCGACCTTGGTTATCGATCTCGATCACTTT ++ +AAAAAEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEE<EEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEEEEE/<EEEA<E</AEEE//EE<AEEE<AEEE<EEE6/<<<E</< +@NS500443:42:H3MH2AFXX:4:21612:24968:19344/2 +ATTTTGCATAAGCGTCCCCTTCGCAAGAGGCGACCAAGCCGCATGAGCAATCCCATGTTTTTTCAGGAACGTTCGTAGTTCATTTTGTGGGAGAAGTGGATGCGTTTCTACTTGGTTAAGTACAGGTAATTCATTCGCAACGGCAAGCAAA ++ +AAAAAEEAEEAEEEEEEEEEEEEEEEEEEEEE/EE/EEEEEEAEE/EEEEEEEEEEEEEEEEEE/EE6AEEEEEE<EE/A/EAEEEEEEEE/E/AE6EE//EEE<E6A6AEEAAAEE<AEA///A<6/EAEE/EA<<E/AEEEE//EEE// +@NS500443:42:H3MH2AFXX:4:21612:6916:19344/2 +GGCAACGACTTCGGCAGACTTACCATCAACGTTGACTTGATAATTCATTTTTCGCATTTCTGCATCCGTAATTTTCCCGGCCAGTTTATTTAGTGGGGCTTTCAGTTCAGGATACTTTTGCAATGTCTTAGTTAACATTAAAGGAGCGCC ++ +AAAA/EE/EEEEAEEEEAEAEEEEEEEEEEEE6E/EEEEEEEEEEE/AEEEEEEEEEEEEEEAEE/EE/EEEEEEEAEEEEEEAAAE/EAEEEA/AEEA/A<EAEA<<EEEEE/EAEA6AAE/<EAEAE6A6E<AEE<E/AAAEEEE<EE +@NS500443:42:H3MH2AFXX:4:21612:15429:19346/2 +ATCTTTTAACTGATAGTAAAAATAAAGTTGTGGCACGCGGTACGAAGCAAGTTCAAGTAGCGACAGTCAGCCATGAAAATTCGAGTGAGAAAACTGTTGCAACAGCACCTGAGCAATCCAGTTCAGCATCAAGTAGCAGTAGTAGCGCAC ++ +AAAAAEEEEEEAEEEEEEAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEEEEEEEEEEEEEEEEE/EA/EEEEEEEEEEEEEE/EEEEE<AEAEAEEEEEEEEAEEEE/E/E +@NS500443:42:H3MH2AFXX:4:21612:1024:19346/2 +GTATTATGGCTTACACAGTTTACAACATCGCGGGCAAGAAGGTGCGGGAATTGTTGCAACAGACGGCAAAGTGTTACGCGGGCACCGCAACTTAGGTTTGCTAGCAGATGTATTTAAACATGGGGAACTGGATGGTTTACCAGGAAGTGGG ++ +A6AAAAEEEAEEEEEEAEAEE/EEAEEEEEAEEEEEEEE6/EEEEEEEEEEEEEEEEEAE6E6E6EE/EEEEEEE/EEEEEEE/EAEE/EEE<AEE6EEE<EEEEAE/EEEEEEEEE/A/66EAE//EEAEA/<A/EEEAAA/EEAEEE</ +@NS500443:42:H3MH2AFXX:4:21612:4182:19348/2 +CCGTCAATCGGAAAGAACGCATATTTATGACCCGCTAATTGAACAACTTTTAGATGAAGGAAAAGCTTATAAATGTTATTGTACGGAAGAAGAACTAGAAGCAGAACGTGAAGCGCAAAAAGCCCGCGGTGAAATGCCTCGTTAGAGCGGA ++ +A/A/AEAEEEEEEEEE6EEEEEEEE/EEE6EEEEEEEEEEEE6AAEEAAEEAEEEE6EEAAE/EEEAEEEEEEEEEEA<6E<E/AEA/A/<EE/E/EEEEAE/E//E6EE//<AEE/E/A/<E/AAEEE6E////6E<//EA/<//A<E// +@NS500443:42:H3MH2AFXX:4:21612:26730:19348/2 +ATCCACGAAACATCACAACACAAATGGCTAACTACGCATCTAAAACACTCAATTAAACAAAACCCGCACTCGAAAAGACTTATCCACTACCACCGATGGGAAAACCAGAAGACGTAACGAAGCACCATCACTTACAGCCTTCACATAAAT ++ +AAAA66EA6EEEE66E/EEAEE6/////AE/EE/EEE//E//E/EEEE/A/EEE/6AAEE/A//A//AEAA/////<EAE/E/EEEE//6E/A///AAEE/////<EE/E/E/<///EAE/E////<A<AEEA/E/A///6E/AEA//// +@NS500443:42:H3MH2AFXX:4:21612:25889:19349/2 +CTTTAGGATTAGAGGTGAGTGCTGGTAAAATTGAAGTAACGGGAATATTAATAGCCGAATAAAGAACTTTCGCGCCAATATAGGCGATATAAATCCATACAATTTTCGCTGTTTCGCTGCCCATAATAAAGCTTGGAGCTGAAAAAGCCAT ++ +AAAAAEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAEEEEEEEAEEEEEEEAEEEEEAEEEEAAAEEE/EEEEEEEEEA/AEEAEEAE<EEEEEEEAAEA6<EEEEEA/EEA<<AEAA<AEEEEEEEEEEAE<EEE<EEEEE +@NS500443:42:H3MH2AFXX:4:21612:6539:19353/2 +GTATAAGGCGGACTATGAAAATGGACGGCTTCCACCTCAACGCCTCGCTTCATTGCTAAATAACCAGCTACTGGGCTATCAATTCCGCCAGATAGCATGAGCATTGCCTTTCCTGACGAACTAACAGGTAGCCCCCCAGCGCCTTGAATGA ++ +AAAAAEEE/EEAEEAEEEEEEEEEEEEEEAAEE/EE6EAEEEEEEEEEEEEEEEEEEEEEEEEEE6EEEEEEEEEEEEEEE/EEEEEEEEAAE/EEAEE/EEEEEEAEEEEEEAA/EE//E</<AEEE//AEEEEEE/EEEEE/EEEA<</ +@NS500443:42:H3MH2AFXX:4:21612:21998:19353/2 +TCTTTGCAGTCATCATATTTCACAGCAAATTCAATTGCGGCAATCCGGTCGAAGTAAGCGCTGTCAAGACGACGCATATTTCCCGGATCTTCTTTTGAATGGAGCCCTGTTTTTCCTTCAATTGCTTTCGTAAGATGGGTAAGTAAATCAA ++ +AAAAAEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEAEEAEEEEE<EEE6E6EEAEE/EEAEEEEEAEEAEEEEAEEAEEEAEAEEEEEA<AEEEEEEEA/AEEEEA<EE<E</EEEEE +@NS500443:42:H3MH2AFXX:4:21612:9349:19354/2 +GGGCTTAATATGGCGATGGAACTAGCTCGAGCTGGCATTTTAGAAGAAATGAATGTCGAAGTCCTTGGAACAGATTTAAGAGCGTTTCAAAAAGCGGAAGATCGCGAAGAATTCCGGAACTTAATGAATGAGCTCGGGGAACCCGTCCCAG ++ +AAAAAE/EEEE/EEEEEEAEEEE/EEAEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEA/<EEEA/E/E6E/AEEEEA/<EE//EEEEEEE/EEEEEEEAAE/EEAE/EE/6E/EA/EEAEEEAEAAEEEE/6EE/A/AEE///EE</ +@NS500443:42:H3MH2AFXX:4:21612:10100:19357/2 +ATTTGAAGAGTACCGGAAGCTTTATGAGTTTGATTATGGCGATTATGAACAGTCTAAGTGGGAAGCACACGTGAAAGTCCAAGCACGTCAGGAAATTGGCATTCGGCGCTTCTTAGAGGAAGGTGGATATAATGCCTTCACATCGAACTTT ++ +AAAAAEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEAEEEEEEEEEAE6AE +@NS500443:42:H3MH2AFXX:4:21612:5905:19357/2 +GTGACGGGCTATGATATCACCGTAAAGAAAGTACTTGTACGCGATTTAGAAAAGAATCGCCGTTACGAGGCAGAAGGTTTTACCTTAACAACCGAACCAAAAGACGTGCTAGATGATCCAGAAATCGCTGTTATTGTAGAAGTAATGGGCA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEA<EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEEAEA +@NS500443:42:H3MH2AFXX:4:21612:22823:19360/2 +GAAAACGGCAGCTTTATTTGGCCCGAAAACGTACAAGATTACGTTCAGTATTTCCACCGAATCGCAGCTGGCATCCTTGTTATTTGGATTTTATATGTGACTTGGCTCGTTTTCCGTGACTACAGACATTATCGCGTGTTAACGTATGGCA ++ +AAAA/EEEEEEEAEEAEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEAEEEEAEEEEEEAEEEEEE/EEEEEEEEEEEEEEEEEEE/EAAEEEEEEE<EEEEEEEAEAEEE/EEEEEEEEEEE/EE6EEEEEEE<E<A +@NS500443:42:H3MH2AFXX:4:21612:17904:19360/2 +TCCTTGTAGCGCTTAAGCAAGAAATCCAGCAACGCGAAAGAATCCGGTCCTCCGGAAACTGCCACAAGCAATTTCTCATCTGGCCGGATTAAATCATGCTTTAGAATAAAGTTTTCGACACGCCTTGAAAATTCATCCATTCTAAAAAC ++ +AAAAAEEEEEEEE/EEEEEEEEE6AEE/EEAEEEEEE/EEEEEEEEEEEEEEEEEE/EEEEEEEEE/EAEEEEEEEEEEEEEAAEEEEEEEEEEEEEEE/EEEEEAEEEEEEAEAEE/EAEEE<EEAEEEEEE<EEEEE<AEEEEE//E +@NS500443:42:H3MH2AFXX:4:21612:5361:19361/2 +ATACGGTGAAGCGAATGTTCAAAGAGATAAGACCTTACCTTATTCATTAAGGGATTTAATAAAGGTGGAAAAGAAAATTACTGTTCCTGCTAAAAAGAGTGTAGATTTGAAGTTAGCTGTTACTATGCCGGATAAACCATATGCGGGAGTA ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEAEEEEEEAEEEEEEEEAEEEAEEEEEEEAEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEE<EEEEEEEEEEEEAAEEEEEEEEEAEEEEEEEEEEEEEEEEEAEEEEE/ +@NS500443:42:H3MH2AFXX:4:21612:10976:19365/2 +CCATTAGGAGATCGTGTTGTTATTGAAGTAATTGAAGCAGAAGAAAAAACAGCGAGCGGCATCGTTTTGCCAGATTCGGCGAAAGAAAAACCGCAGGAAGGAAAGGTTGTTGCGGTAGGTTCTGGTCGTGTACTTGATAGTGGCACTAAAG ++ +AAAAAEEEEEAEEEEEEEEAE/EEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEAEEEEEEEEAEEEEEAA/EEEEEE6AEEEEAEEEEEEA/EEAEEEAE +@NS500443:42:H3MH2AFXX:4:21612:22942:19365/2 +CGAAAAAACAAGCCGAACAAAATGCAGCGGAGTTTGCAGTTCGGCAGATGATGCATAAGTAGGAACGGAAGGAGAGAAGCAGTTTGCTATTAAAACGACTGGAAATGAATGGATTTAAATCATTTGCAGATAAAGTAGCAATTGATTTTGT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE6AEEEEEEAE/EEEAEEEEEEE<EAEEEEEEEEAEEAEEEEEEEEE//AE<AEEE<A +@NS500443:42:H3MH2AFXX:4:21612:14114:19366/2 +AAATAAGGAAGCGTCATCGAGACATTCGCCATCAGCGTCAGTTTATTGTAGAAGGCAGAGGCATTGCTTCCGCCAAAAGAGGCGATCAAAATAATGACAACAACGATAAGACACTGAATCCACATTGCAAACTCTGGCATTCCTGCCTTGT ++ +AAAAAEEEEEEEEEEEEE6EEAEEEEEEEEEEEE6EEEEEEEEEAEEEAEEEAEEEEEEE6EEEEEEEEEEEEEEEEEE/EEEEEEEE<E/E/EEE/E/EEEEEEE/EEEEEEAEEEE/AEEEEEEEE///EEEEEEEAAAEEE/EEE/<< +@NS500443:42:H3MH2AFXX:4:21612:13612:19370/2 +GATCAGGTTATGCAGATTTTTAATATCGAAGCAGATGTAGATTTAGATATTATGAAAGCAGGTCAAACGCTAAGTGATATAACATCACGAGTTTTATCTGGTCTAGTCGAAACAATCAAGCAGGAACAACCAGACATTGTGCTTGTCCACG ++ +AAAAAEEEEEEEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEE<EEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEAEEA<AEEE<EEEEEEEEE<E<<EEEE +@NS500443:42:H3MH2AFXX:4:21612:1049:19371/2 +CTCCCTCACCGGCGCGCTTTAATTCTCGTAACACAAAAAATTGTTCCGTTGAAATTCGCTTTCCGCCAATCTCCACTTCTTCAATCGCTTGATCAATTTCTTTTAATGCAAATAAATACGTCCGTGTATAACCATCAATAAAGCGATCTAA ++ +AAAAAEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEEAE<EEEEE/EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEAAEEEEAEEEEAEEEEEEEEEEEEEEEAEEE/ +@NS500443:42:H3MH2AFXX:4:21612:9601:19373/2 +CTATTTGCATGAGCGAATGACTTGGACTGCTTATAAGCTCTTTTATTTGCGGAAAAAGCGAAACTTGCAAAATGATTCAGAACGATTGCAATATATGGATGAAGCGCTGACTGAAATTGGGAAATTAAGCCAAGCGGTTGAGCGTGAACTT ++ +AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEE/EEEEAEEEEEAEEEAEEAE///EEEEEEEEEEAE<AEEEEEEEAEEEEEEEEA +@NS500443:42:H3MH2AFXX:4:21612:20393:19375/2 +GTCTCAAGCGACTTGATCAGTGGTTCAAAATGATGATTTTGACTTATTTTCCCGTAAAGAGTGACTTCATCCCCATTGACTGCAATCACGTTTTCTTCACTAAAACCTTGGCTGAATGTTCCAGTTGCAGAATCGGATAATTTAACGCCTA ++ +AAAAAEEEEE/E/EEEEEEEEE6EEEEEEEEEEEEAEEEEE/AEAEAEEEEEE6AEEEEAEEE/EEEE/EEE/EEEAEEEE6EEAAEA//A/<E/6/EEEAEEE<AAEA/AA/AE/A/<EE/A/<AE/AAE/EE/EE/EEE/EE/AEEEAE +@NS500443:42:H3MH2AFXX:4:21612:18713:19376/2 +GCTTATATCCCAACAAATGTAATCTCCATAACTGACGGACAGATTTTCTTACAGTCTGATTTGTTCTTCTCTGGGGTTCGTCCAGCGATTAATGCTGGTTTATCTGTATCACGTGTTGGTGGTTCCGCACAAATCAAAGCAATGAAGAAAG ++ +AAAAAEEEEEEEEEEEEEEEE/EEEEEEEEEAEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEA<EEEEEEEEEAEAEEEEAEE6EEA<EAEEEEEEEEEE/AEE/EEEE/AA/AE +@NS500443:42:H3MH2AFXX:4:21612:16612:19377/2 +CTTGATGATGACTTTAATCAATTCCTTCTTAGTGTGCGTTCGAAAAGTGTTAAATGGATAAAGGAACAGATGCTACTAGTATATGGGAAAGATACCGAACCTTATTTGAATGATCTTGCAATGATGTTAACGGGAATGATTCTTCTTTTTG ++ +AAAAAEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEAEEE/EEEE/EEEEEEE/AEEEEEEEEAEEEEEEEEEAEEE/EEEAEAE<EEAEAAEEEEE/ +@NS500443:42:H3MH2AFXX:4:21612:21100:19378/2 +GCCCAGAACCAATCACTGCAATTTTTTTATCCCACCGCACTATCGGTGGAGTCGGCATGAACCAACCTAAACCAAACCCACGGTCAATAATAGCCCGTTCAATCGAATTAATACCCACAGCAGGATCAGAAATTGCAACCGTACAACTTC ++ +AAAAAE/E/EEEEEEAAAEEE////EEE/AEA/////E/////E//E66EA/AEEE/EEE//////AE////AEA/EE//AE/EE/EEEEE//E/E//EEAAA//A/EEE/A/A/6A</A///E/EAE/E<A//E//////6/E///6<< diff --git a/src/FqAuxBackend.cpp b/src/FqAuxBackend.cpp index 26912250b7e89aa6cc47f66329b976f637bd2013..ef58d76807b11edfefc8444d77a6a54e64538623 100644 --- a/src/FqAuxBackend.cpp +++ b/src/FqAuxBackend.cpp @@ -13,6 +13,7 @@ #include <errno.h> #include <err.h> #include <stdlib.h> +#include "FqBaseBackend.h" #include "FqAuxBackend.h" @@ -125,8 +126,8 @@ int FqAuxBackend::processBuffer(rinfo * p_nr) { void FqAuxBackend::readBuffer() { - if ((nread=read(f_pe2,buf,bufsize))!=0) { - cur_offset=ftell(fp2); + if ((nread=read(f_desc,buf,bufsize))!=0) { + cur_offset=ftell(f_stream); pos_in_buf=0; } } @@ -134,37 +135,20 @@ void FqAuxBackend::readBuffer() { * Opens file and performs the fist read operation. */ void FqAuxBackend::openFile(char * ficname, unsigned char id) { - int st,s; - // unsigned long cur_offset; - mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - - closeFile(); - filename=ficname; - f_id=id; - - f_pe2=open(filename,O_RDONLY,mode); - if (f_pe2==-1) { - err(errno,"cannot open file: %s.",filename); - } - - fp2=fdopen(f_pe2,"r"); - if (fp2==NULL) { - err(errno,"cannot open file: %s.",filename); - } - buf=(char *) malloc(bufsize*sizeof(char)); - if (buf==NULL) { - err(errno,"cannot allocate memory: %lu bytes.",bufsize); - } - pos_in_buf=bufsize; // do that to force first reading in buffer. + FqBaseBackend::openFile(ficname,id); + buf=(char *) malloc(bufsize*sizeof(char)); + if (buf==NULL) { + err(errno,"cannot allocate memory: %lu bytes.",bufsize); } + pos_in_buf=bufsize; // do that to force first reading in buffer. +} void FqAuxBackend::closeFile() { - if (filename!=NULL) { - close(f_pe2); - filename=NULL; - free(buf); - buf=NULL; - } + if (filename!=NULL) { + FqBaseBackend::closeFile(); + free(buf); + buf=NULL; } +} diff --git a/src/FqAuxBackend.h b/src/FqAuxBackend.h index badb2ee563ba93f36bef92477f88ba4cf08f6875..b52805d9e7b7ef02108106fb4b792cdb06e5b7c0 100644 --- a/src/FqAuxBackend.h +++ b/src/FqAuxBackend.h @@ -9,15 +9,13 @@ #define FQAUXBACKEND_H_ #include "srp.h" +#include "FqBaseBackend.h" + const size_t bufsize=6048000; // It seems that I can't have a much bigger value than that or else my objects construction throws exception. Strange. // const size_t bufsize=500000; //try that in valgrind -class FqAuxBackend { - char * filename; - unsigned char f_id; +class FqAuxBackend:public FqBaseBackend { size_t nread; - int f_pe2; // for calling read - FILE * fp2; // for calling ftell unsigned long cur_offset; char * buf; @@ -30,17 +28,12 @@ public: char current_id[50]; FqAuxBackend() { - filename=NULL; - f_pe2=-1; - fp2=NULL; - f_id=0; cur_offset=0; nread=0; buf=NULL; + pos_in_buf=bufsize; } - void openFile(char * ficname, unsigned char id); - void closeFile(); int getNextRead(rinfo *); diff --git a/src/FqBaseBackend.cpp b/src/FqBaseBackend.cpp new file mode 100644 index 0000000000000000000000000000000000000000..73d4ba9c6482c474b411fb90e24a21399627f112 --- /dev/null +++ b/src/FqBaseBackend.cpp @@ -0,0 +1,75 @@ +/* + * FqBaseBackend.cpp + * + * Created on: Jan 20, 2016 + * Author: vlegrand + */ +//#include <stdio.h> +#include <fcntl.h> +//#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <err.h> + +#include "FqBaseBackend.h" + + +void FqBaseBackend::openFile(char * ficname, unsigned char id) { + int st,s; + // unsigned long cur_offset; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + + closeFile(); + filename=ficname; + f_id=id; + + f_desc=open(filename,O_RDONLY,mode); + if (f_desc==-1) { + err(errno,"cannot open file: %s.",filename); + } + + f_stream=fdopen(f_desc,"r"); + if (f_stream==NULL) { + err(errno,"cannot open file: %s.",filename); + } +} + +void FqBaseBackend::closeFile() { + if (filename!=NULL) { + close(f_desc); + filename=NULL; + f_desc=-1; + f_stream=NULL; + /*free(buf); + buf=NULL;*/ + } +} + +/* + * use C style char* rather than std::string for a matter of performance. + * use read instead of fread for the same reason. + * + * Note: It is up to the caller to determine where the end of the record is (5th \n character in the fq_record string). + */ +void FqBaseBackend::getRead(const long& offset, char * fq_record) { + + + //char fq_record[MAX_FQ_RECORD_LENGTH]; + + int nread,i,nb_lines; + char * pchar; + + if (f_desc==-1) err(0,"No open file currently associated to this backend"); + // if (f_stream==NULL) err(0,"No open file currently associated to this backend"); + + int res=lseek(f_desc,offset,SEEK_SET); + if (res==-1) err(errno,"fseek problem when trying to retrieve dna string."); + nread=read(f_desc,fq_record,MAX_FQ_RECORD_LENGTH); +/* + i=0; + pchar=fq_record; + while (i<=nread-1) { + + }*/ + +} diff --git a/src/FqBaseBackend.h b/src/FqBaseBackend.h new file mode 100644 index 0000000000000000000000000000000000000000..a5bec4a0c9fd75da9b67ea691f7f788debde969d --- /dev/null +++ b/src/FqBaseBackend.h @@ -0,0 +1,43 @@ +/* + * FqBaseBackend.h + * + * Created on: Jan 20, 2016 + * Author: vlegrand + */ + +#ifndef FQBASEBACKEND_H_ +#define FQBASEBACKEND_H_ + +#include "FqConstants.h" +#include "srp.h" + + + + +class FqBaseBackend { + + // TODO: this class will also be associated to writing output "filtered" fastq files. + +protected: + char * filename; + unsigned char f_id; + int f_desc; // for calling read + FILE * f_stream; // for calling ftell + +public: + + FqBaseBackend() { + filename=NULL; + f_desc=-1; + f_stream=NULL; + f_id=0; + } + + void openFile(char * ficname, unsigned char id); + void closeFile(); + void getRead(const long&,char *); +}; + + + +#endif /* FQBASEBACKEND_H_ */ diff --git a/src/FqConstants.h b/src/FqConstants.h index e41463a17efac020316b8347839baf4ee6f7df9f..1fb1ea6b878d7678200e6f7fcf9f73b42054edaf 100644 --- a/src/FqConstants.h +++ b/src/FqConstants.h @@ -12,4 +12,10 @@ #define k_read_qual_start '+' #define k_phred_32 33 +const unsigned char mask=0x0F; // used to retrieve file identifier in the case of PE reads. + +#define MAX_READ_LENGTH 3000 // set this arbitrary for the moment. This will likely become a parameter of the program later. + +#define MAX_FQ_RECORD_LENGTH 3000 + #endif /* FQCONSTANTS_H_ */ diff --git a/src/FqMainBackend.cpp b/src/FqMainBackend.cpp index 30ae15071c4d0747bfa0ff1e1520fd9d3b25b83b..d79f22de1064695cf50dd021abf238f49cf0e8f6 100644 --- a/src/FqMainBackend.cpp +++ b/src/FqMainBackend.cpp @@ -15,31 +15,16 @@ #include <assert.h> #include <stdlib.h> -// for debug only. TODO : remove that. +//#define DEBUG + +#ifdef DEBUG #include <iostream> using namespace std; +#endif #include "FqMainBackend.h" #include "srp.h" -//#define DEBUG - -rpos init_rpos(unsigned char f_id, unsigned long rstart_offset) { - rpos rp; - rp.fileid=f_id <<4; - rp.read_a1=rstart_offset%INT_MAX; - return rp; -} - -/* - * Add Paired read information to already existing rpos structure. - */ -void update_rpos(unsigned char f_id,unsigned long rstart_offset,unsigned long j, rpos * rp) { - rp->fileid=rp->fileid|f_id; - rp->read_a2=rstart_offset-INT_MAX*j; -} - - FqMainBackend::FqMainBackend(srp * io_sr) { p_auxFqProcessor=NULL; @@ -87,8 +72,6 @@ void FqMainBackend::processFile(char * filename,unsigned char f_id) { void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned long cur_offset) { int cnt=0; - // debug stuff, TODO: remove it - // static int nb_read_PE1=0; unsigned int s; static unsigned int st; static int num_l_in_rec; /* counter to know on which line inside the fastq record we are */ @@ -133,7 +116,6 @@ void FqMainBackend::processBuf(char * buf,int nread,unsigned char f_id,unsigned // std::cout<<"BE1 position in buffer "<<cnt+1<<endl; //add 1 to compare value with BE2. qual_score=0;/* end of fastq record */ rpos rp=init_rpos(f_id,rstart_offset); - // nb_read_PE1++; // TODO: remove, this is debug stuff if (p_auxFqProcessor!=NULL) { rinfo pe2info; int eof=p_auxFqProcessor->getNextRead(&pe2info); diff --git a/src/FqMainBackend.h b/src/FqMainBackend.h index 38a9bf30f70da4f1b823c018c89f8a535917dadf..c8b2b1aab351014426b87c256cf3119207987693 100644 --- a/src/FqMainBackend.h +++ b/src/FqMainBackend.h @@ -10,11 +10,13 @@ #ifndef FQMAINBACKEND_H_ #define FQMAINBACKEND_H_ +#include "FqBaseBackend.h" #include "FqAuxBackend.h" #include "FqConstants.h" -class FqMainBackend { + +class FqMainBackend : public FqBaseBackend { FqAuxBackend * p_auxFqProcessor; /* Another fastq processor component is necessary for handling the case of PE reads.*/ srp * p_scoreReadStruct; /* Where we store information about the reads. */ char current_id[50]; diff --git a/src/Makefile.am b/src/Makefile.am index 6763ca1aa7b40c095d98e690d0768270db99cb38..631aeea2b7ca59e68be5d9acaaed610433c11cf8 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -3,7 +3,8 @@ LINTDEFS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) AM_CPPFLAGS = -DDATADIR=\"$(pkgdatadir)\" bin_PROGRAMS=rock -noinst_PROGRAMS = unit_test_fqreader +noinst_PROGRAMS = unit_test_fqreader perf_test_fqreader +## noinst_PROGRAMS = noinst_LIBRARIES = librock.a noinst_HEADERS = $(HDR) @@ -14,7 +15,10 @@ rock_LDADD = librock.a unit_test_fqreader_SOURCES=unit_test_fqreader.cpp unit_test_fqreader_LDADD=librock.a +perf_test_fqreader_SOURCES=perf_test_fqreader.cpp +perf_test_fqreader_LDADD=librock.a + librock_a_SOURCES = $(SRC) -SRC = fqreader.cpp FqAuxBackend.cpp FqMainBackend.cpp -HDR = srp.h fqreader.h FqConstants.h FqAuxBackend.h FqMainBackend.h \ No newline at end of file +SRC = fqreader.cpp FqBaseBackend.cpp FqAuxBackend.cpp FqMainBackend.cpp read_utils.cpp +HDR = srp.h fqreader.h FqConstants.h FqBaseBackend.h FqAuxBackend.h FqMainBackend.h rock_types.h read_utils.h \ No newline at end of file diff --git a/src/fqreader.cpp b/src/fqreader.cpp index 7b8169f1536e35aba574015fc8e20199c18b3dc9..3615555a2a9f07eb6af60e45f9d7f9a107564758 100644 --- a/src/fqreader.cpp +++ b/src/fqreader.cpp @@ -18,20 +18,6 @@ - -/* -typedef struct { - char buf[bufsize]; - unsigned char f_id; - int nread; - unsigned long cur_offset; -}fastq_buf;*/ - - - - - - /* * Processes 1 file containing single reads */ @@ -52,9 +38,34 @@ void processPEFiles(char * fq_1, unsigned char f_id1,char * fq_2, unsigned char } /* - * This function processes the content of an rpos structure and returns the character string "ATCG" corresponding to the read record. + * This function processes the content of an rpos structure and returns the character string "ATCG..." corresponding to the read record. * PE reads are concatenated. */ -void getRead(char ** fname_id_array, const rpos& sr, unsigned int j, char ** dna_read){ -} +/*void getRead(fq_file_map& fic_map,const rpos& sr, unsigned int j, char * dna_read){ + unsigned char f_id1; + unsigned char f_id2=2; + int is_PE=0; + f_id1=sr.fileid >>4; + f_id2=sr.fileid &mask; + + if (f_id2!=0) is_PE=1; + + fq_file struc_f1=fic_map[f_id1]; + if (is_PE) fq_file struc_f2=fic_map[f_id2]; + + if (struc_f1.fic_desc==0) { // file is not yet open + struc_f1.fic_desc=open(filename,O_RDONLY,mode); + if (f_pe2==-1) { + err(errno,"cannot open file: %s.",filename); + } + + fp2=fdopen(f_pe2,"r"); + if (fp2==NULL) { + err(errno,"cannot open file: %s.",filename); + } + + } + + +}*/ diff --git a/src/fqreader.h b/src/fqreader.h index bd823d944facebd88ffdf2ebe43d2b4fe4516925..f5f0eb1266785671541e1445b1cd4216f441b487 100644 --- a/src/fqreader.h +++ b/src/fqreader.h @@ -8,11 +8,9 @@ #define FQREADER_H #include "srp.h" -const int read_length=1000; // this will become a parameter later. - /*void processBuf(char * buf,int nread,int cur_offset);*/ void processSingleFile(char *, unsigned char, srp*); void processPEFiles(char * fq_1, unsigned char f_id1,char * gq_2, unsigned char f_id2,srp *io_sr ); -void getRead(char ** fname_id_array, const rpos& sr, unsigned int j, char ** read); +// void getRead(fq_file_map&, const rpos& sr, const unsigned int& j, char * read); #endif diff --git a/src/read_utils.cpp b/src/read_utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..79e6808d1530266f8853b29d2b2580d478c91d33 --- /dev/null +++ b/src/read_utils.cpp @@ -0,0 +1,82 @@ +/* + * read_utils.c + * + * Created on: Jan 20, 2016 + * Author: vlegrand + * * Utility methods for processing DNA reads : decomposition in overlapping k-mers, converting a k-mer into a number, + * reverse complement and so on... + */ + +/* + * This function processes the content of an rpos structure and returns the character string "ATCG..." corresponding to the read record. + * PE reads are concatenated. + */ +#include "FqConstants.h" +#include "rock_types.h" +#include "srp.h" +#include "read_utils.h" + +void getFqRecord(FqBaseBackend* fq_files_be[],const unsigned char& f_id,const long& offset, + char * fq_record) { + FqBaseBackend * be=fq_files_be[f_id]; + be->getRead(offset,fq_record); +} + + +void init_DnaSeqStr(DnaSeqStr * dna_seq) { + dna_seq->fq_record_buf[0]='\0'; + dna_seq->length=0; + dna_seq->start_idx=0; +} + +void processFqRecord(DnaSeqStr * dna_seq) { + char * pchar=dna_seq->fq_record_buf; + int cnt=0; + int nb_l=0; +#ifdef DEBUG + assert(*pchar==k_read_id_start); +#endif + while (cnt<=MAX_FQ_RECORD_LENGTH-1) { + if (*pchar=='\n') { + nb_l++; + if (nb_l==1) dna_seq->start_idx=cnt+1; + if (nb_l==2) { + dna_seq->length=cnt-dna_seq->start_idx; + break; + } + } + pchar++; + cnt++; + } +#ifdef DEBUG + assert(nb_l>=2); +#endif +} + + +void getDNASeqstr(FqBaseBackend* fq_files_be[], + const rpos& sr, + unsigned int j, + DnaSeqStr * dna_seqs) +{ + unsigned char f_id1; + unsigned char f_id2; + DnaSeqStr * p_dna_seqs=dna_seqs; + + unsigned char fid_stored=sr.fileid; + + f_id1=fid_stored >>4; + f_id2=fid_stored &mask; + + long offset1=j*INT_MAX+sr.read_a1; + getFqRecord(fq_files_be,f_id1,offset1,dna_seqs[0].fq_record_buf); + processFqRecord(p_dna_seqs); + + if (f_id2!=0) { // case of PE reads. + p_dna_seqs+=1; + long offset2=sr.read_a2+j*INT_MAX; + getFqRecord(fq_files_be,f_id2,offset2,dna_seqs[1].fq_record_buf); + processFqRecord(p_dna_seqs); + } + +} diff --git a/src/read_utils.h b/src/read_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..6bde5d3ecb4999312fe2c8e8560523768113a59b --- /dev/null +++ b/src/read_utils.h @@ -0,0 +1,33 @@ +/* + * read_utils.h + * + * Created on: Jan 20, 2016 + * Author: vlegrand + */ + +#ifndef READ_UTILS_H_ +#define READ_UTILS_H_ + +#include "FqConstants.h" +#include "rock_types.h" + +/* + * Here, for performance matters, do not want to have to do a memmove nor strcpy or strcat to extract only DNA sequence for all + * the fastq records that we'll have to process (potentially millions). So, use a structure to store start position of DNA sequence in buffer and its length. + * We will not process more than 2 fastq records at a time (case of he PE reads) so it will not take too much space in memory. + */ + +typedef struct { + char fq_record_buf[MAX_FQ_RECORD_LENGTH]; + int start_idx; + int length; +}DnaSeqStr; + +void getDNASeqstr(FqBaseBackend* [], + const rpos&, + unsigned int, + DnaSeqStr *); + +void init_DnaSeqStr(DnaSeqStr * dna_seq); + +#endif /* READ_UTILS_H_ */ diff --git a/src/rock.cpp b/src/rock.cpp index dea30b58558f02e3ce68eddadee5de87274693a6..5cfb275d6c491835ecf093ffcb819ad550017058 100644 --- a/src/rock.cpp +++ b/src/rock.cpp @@ -16,7 +16,7 @@ #include <err.h> #include <string.h> -#include "srp.h" +// #include "srp.h" #include "fqreader.h" #define k_max_input_files 15 diff --git a/src/rock_types.h b/src/rock_types.h new file mode 100644 index 0000000000000000000000000000000000000000..fe80bac75f833b822b98c723fa7dfcbb596c7c8a --- /dev/null +++ b/src/rock_types.h @@ -0,0 +1,29 @@ +/* + * rock_types2.h + * + * Created on: Jan 20, 2016 + * Author: vlegrand + * Gather here typedefs and structures useful for the main program and "inter-modules" data exchange. + * I divided rock into 4 "modules": + * 1- fqreader (parses fastq and fills 3D array of structures reprensenting all reads). + * 2- read-utils (utility functions for DNA reads processing) + * 3- CMS (fill and use). + * 4- fqwriter (writes fq filtered by CMS). + */ + +#ifndef ROCK_TYPES_H_ +#define ROCK_TYPES_H_ + +#include <map> +#include "FqBaseBackend.h" + +/*typedef struct { + FqBaseBackend& fq_be; + +}fq_io;*/ + +// typedef std::map<unsigned char,FqBaseBackend&> fq_file_map; + +#define k_max_input_files 15 + +#endif /* ROCK_TYPES_H_ */ diff --git a/src/srp.h b/src/srp.h index 488cd23e05825bf54aaaa11a988b72c7825eb874..b4b2b22d2743ad1a52305fe64d7f73af0a141990 100644 --- a/src/srp.h +++ b/src/srp.h @@ -20,9 +20,6 @@ typedef struct { /* Here store read offset in files whose ids are stored in the }rpos; - - - typedef struct { unsigned int score; // total quality score for read. unsigned long rstart_offset; // read offset in file @@ -34,4 +31,20 @@ typedef std::map<unsigned long,k_dim> i_dim; typedef std::map<unsigned long,i_dim> srp; +inline rpos init_rpos(unsigned char f_id, unsigned long rstart_offset) { + rpos rp; + rp.fileid=f_id <<4; + rp.read_a1=rstart_offset%INT_MAX; + return rp; +} + +/* + * Add Paired read information to already existing rpos structure. + */ +inline void update_rpos(unsigned char f_id,unsigned long rstart_offset,unsigned long j, rpos * rp) { + rp->fileid=rp->fileid|f_id; + rp->read_a2=rstart_offset-INT_MAX*j; +} + + #endif /* SRP_H_ */ diff --git a/src/unit_test_fqreader.cpp b/src/unit_test_fqreader.cpp index 64160bd84da0ceb0180be6714267a6ed1ffb1145..41887435bd5b10b1f0793912931462eefd27be5d 100644 --- a/src/unit_test_fqreader.cpp +++ b/src/unit_test_fqreader.cpp @@ -11,9 +11,12 @@ #include <limits.h> #include <assert.h> #include <stdlib.h> +#include "FqConstants.h" #include "srp.h" #include "fqreader.h" +// TODO : Add test case where @ character is inside quality score. + using namespace std; void test_processSingleFile() { @@ -146,13 +149,132 @@ void test_processAllFiles() { assert(cnt_read==9); } +#include "read_utils.h" +#include "rock_types.h" +#include "FqMainBackend.h" +//#include "srp.h" + +void test_getReadSingle() { // TODO : move this to another test file. + rpos my_struct1,my_struct2; + my_struct1=init_rpos(4,639); + my_struct2=init_rpos(4,1228); + + //std::cout<<my_struct1.read_a1<<endl; + + assert(my_struct1.read_a1==639); + assert(my_struct2.read_a1==1228); + srp io_sr; + unsigned int j=0; + DnaSeqStr a_seqs; + char dna_read[MAX_READ_LENGTH]; + + char * fq_single2=(char *) "../data/test_single2.fq"; + FqMainBackend be_fq=FqMainBackend(&io_sr); // TODO, remove argument from constructor + be_fq.openFile(fq_single2, 4); + + + FqBaseBackend * fic_map[k_max_input_files]; + fic_map[4]=&be_fq; + + init_DnaSeqStr(&a_seqs); + getDNASeqstr(fic_map, my_struct1, 0, &a_seqs); + char * tmp=(char *) a_seqs.fq_record_buf; + tmp+=a_seqs.start_idx; + memcpy(dna_read,tmp,a_seqs.length); + // std::cout<<dna_read<<endl; + assert(strcmp(dna_read,"TTTTTAGGTGCTACCATAACACCAACTGTTTTCACCATAATTTTAAAATCAAGCATTAGAGACGCTTCTCTAATGTATTGCAAATCTAGTTCTACCATTTGATGAAAATCTAATTCATTTCTTCCACTAACCTGCCATAATCCAGTACAACCTGGTATAACGGTCAAACGCTTTTTATCATAGGAACTGTATTCTCCTACCTCACGTGGCAAAGGAGGTCTTGGACCAACAATTGCCATGTCTCCTTTAACCACATTCCAAAGCTGTGGTA")==0); + + + init_DnaSeqStr(&a_seqs); + assert(strcmp(a_seqs.fq_record_buf,"")==0); + assert(a_seqs.length==0); + assert(a_seqs.start_idx==0); + + getDNASeqstr(fic_map, my_struct2, 0, &a_seqs); + tmp=(char *) a_seqs.fq_record_buf; + tmp+=a_seqs.start_idx; + memcpy(dna_read,tmp,a_seqs.length); + if (a_seqs.length<MAX_READ_LENGTH) dna_read[a_seqs.length]='\0'; + std::cout<<dna_read<<endl; + assert(strcmp(dna_read,"ACCCAAACTTGCCAGACTTGTGTAGAACGTCCAATATGTATCGGCATCGCTTCCACATGAATGAATCCTTGTTCCACACTTTTTATATGATTCGCATTAATTTCTTGTCCGAAAATCAACTGATTTTTTGCAACATTTTCTCCCGCTCCAAGACTGGCTGCATGTTCTGCAAGCGCAACAGAAACACCACCATGCAAGTAGCCAAAGGGTTGTTTGACCTGATCTGTTATTTCAAGCGCCAGTTCCACTC")==0); + be_fq.closeFile(); +} + +void getDnaStr(FqBaseBackend * fic_map[],rpos my_struct,DnaSeqStr* a_seqs,char * dna_read) { // Auxilliary method for test_getReadPE(). + getDNASeqstr(fic_map, my_struct, 0, a_seqs); + + char * tmp=(char *) a_seqs[0].fq_record_buf; + tmp+=a_seqs[0].start_idx; + + memcpy(dna_read,tmp,a_seqs[0].length); + + tmp=(char *) a_seqs[1].fq_record_buf; + tmp+=a_seqs[1].start_idx; + + char * b=(char *) dna_read+a_seqs[0].length; + memcpy(b,tmp,a_seqs[1].length); + + if (a_seqs[0].length+a_seqs[1].length<MAX_READ_LENGTH) dna_read[a_seqs[0].length+a_seqs[1].length]='\0'; +} + +void test_getReadPE() { // TODO: move this function to a separate test file : unit_tests_read_uils.cpp when I will have enough tests for this component. + rpos my_struct1,my_struct2; + char * fq_PE1=(char *) "../data/test_PE1_2.fq"; + char * fq_PE2=(char *) "../data/test_PE2_2.fq"; + srp io_sr; + unsigned int j=0; + char dna_read[MAX_READ_LENGTH]; + + FqMainBackend be_fq1=FqMainBackend(&io_sr); + FqAuxBackend be_fq2; + be_fq1.setAuxProcessor(&be_fq2); + + be_fq1.openFile(fq_PE1, 4); + be_fq2.openFile(fq_PE2, 4); + + FqBaseBackend * fic_map[k_max_input_files]; + fic_map[4]=&be_fq1; + fic_map[5]=&be_fq2; + + DnaSeqStr a_seqs[2]; + + init_DnaSeqStr(&a_seqs[0]); + init_DnaSeqStr(&a_seqs[1]); + + my_struct1=init_rpos(4,0); + update_rpos(5,0,j,&my_struct1); + + my_struct2=init_rpos(4,13648); + update_rpos(5,13654,j,&my_struct2); + + assert(my_struct1.read_a1==0); + assert(my_struct1.read_a2==0); + assert(my_struct2.read_a1==13648); + assert(my_struct2.read_a2==13654); + + getDnaStr(fic_map,my_struct1,a_seqs,dna_read); + std::cout<<dna_read<<endl; + + assert(strcmp(dna_read,"GGTTCTGTTGGCTCAAACGCCGTCACTTCATTGATCAAAAGCTTATAATGCGTGCCAAAGTCCGCCATCGAGACGACTACGCCTTCCCCTGCTTTCCCGTCAAAAACGAGTCTTGCCGGATCTTCACGGTCTCCCCTCGAAAGCGGCGAAATCTTAGAGGAAGGTGGATATAATGCCGTCACATCGAACTTTGAAGATCTATACGGCATGCAGCAGCTTCCAGGTCTTGCGGTGCAACGTTTAATGGCAGATGGCTACGGTTTTGCGGGGGAAGGAGACTGGAAAACGGCGGCGATCGACCG")==0); + + getDnaStr(fic_map,my_struct2,a_seqs,dna_read); + assert(strcmp(dna_read,"ATTGTGGGGTTCCTTTTTGTAGCATTGGAATGGAAATTAAAAATGGGGCTTCAGGATGCCCGCTCCATTATTTAATTCCAGAATGTAACGATGCTGTTTACCGGGGGGACTGGAAAGATGCACTTGAGCTTTTGATTAAAACAAATAACATGCCCAGAACCAATCACTGCAATTTTTTTATCCCACCGCACTATCGGTGGAGTCGGCATGAACCAACCTAAACCAAACCCACGGTCAATAATAGCCCGTTCAATCGAATTAATACCCACAGCAGGATCAGAAATTGCAACCGTACAACTTC")==0); + + be_fq1.closeFile(); + be_fq2.closeFile(); + cout<<"done"<<endl; + +} + int main(int argc, char **argv) { - cout<<"sizeof(char)"<<sizeof(char); - /*char* fq_s_test="data/test_single.fq";*/ cout<<"test for single file"<<endl; test_processSingleFile(); cout<<"test for PE files"<<endl; test_processPEFiles(); - cout<<"test for both single and PE files"; + cout<<"test for both single and PE files"<<endl; test_processAllFiles(); /* mix PE together with single; nearly as in real life.*/ + cout<<"test getting single reads."<<endl; + test_getReadSingle(); + cout<<"test getting PE reads."<<endl; + test_getReadPE(); }