diff --git a/build/splitFasta.py b/build/splitFasta.py
new file mode 100644
index 0000000000000000000000000000000000000000..06fc3d30615a7f240cac0088a063a3b75e7e140e
--- /dev/null
+++ b/build/splitFasta.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+# Split a fasta file into chunks of roughly equal total sequence length
+# (number of "sites"), writing each chunk to <output><n><suffix>.
+
+import argparse
+from math import ceil
+from lib import fasta
+from statistics import mean
+
+parser = argparse.ArgumentParser()
+parser.add_argument("input", help="fasta file")
+parser.add_argument("output", help="prefix of outputed fastas")
+parser.add_argument("--nbChunks", help="number of chunks", default=10, type=int)
+parser.add_argument("--suffix", help="extension with dot. Default: .fasta", default=".fasta")
+parser.add_argument("--maxChunkSize", type=int, help="maximum number of sequences per chunk. If defined, will ignore nbChunks parameter")
+parser.add_argument("--maxSitesPerChunk", type=int, help="maximum position number per chunk")
+
+args = parser.parse_args()
+
+# lengths of every sequence; used both to size the chunks and to find the
+# shortest sequence below
+seqLen = tuple(map(lambda s: len(s[1]), fasta(args.input, True)))
+print("Read {} sites.".format(sum(seqLen)))
+
+if args.maxChunkSize:
+    # we have to guess the equivalent number of positions per chunk
+    maxSitesPerChunk = int(mean(seqLen) * args.maxChunkSize)
+elif args.maxSitesPerChunk:
+    maxSitesPerChunk = args.maxSitesPerChunk
+else:
+    # neither size option given: fall back on an even split into --nbChunks
+    maxSitesPerChunk = ceil(sum(seqLen) / args.nbChunks)
+
+# a chunk with less remaining room than this can never accept another sequence
+shortestSequenceSize = min(seqLen)
+
+print("Max number of sites per chunk: {}. Now computing best distribution.".format(maxSitesPerChunk))
+
+# chunks = [[remainingSites, fileHandler]]; fileHandler stays None until the
+# first sequence is written to the chunk
+chunks = [[maxSitesPerChunk, None]]
+
+lastFileNumber = 0
+for name, seq in fasta(args.input, True):
+    # discard (and close) chunks that cannot accept any remaining sequence;
+    # iterate over a copy so removal does not disturb the iteration
+    for chunk in [c for c in chunks if c[0] < shortestSequenceSize]:
+        if chunk[1]:
+            chunk[1].close()
+        chunks.remove(chunk)
+    for chunk in chunks:
+        if len(seq) <= chunk[0]:
+            if not chunk[1]:
+                chunk[1] = open("{}{}{}".format(args.output, lastFileNumber, args.suffix), "w")
+                lastFileNumber += 1
+            chunk[0] -= len(seq)
+            chunk[1].write(">{}\n{}\n".format(name, seq))
+            break
+    # always keep one completely fresh chunk available for the next sequence
+    if chunks[-1][0] != maxSitesPerChunk:
+        chunks.append([maxSitesPerChunk, None])
+
+# close every chunk file still open
+for chunk in chunks:
+    if chunk[1]:
+        chunk[1].close()
diff --git a/pipeline/Snakefile b/pipeline/Snakefile
index 0f8a80d972b5a6d67e8a3da1775c558eefe71730..2eaf9eaabd8e9e20d0c9ef91feba6f4edb5ac84d 100644
--- a/pipeline/Snakefile
+++ b/pipeline/Snakefile
@@ -55,7 +55,7 @@ rule Profiles_hmmbuild:
         hmm="hmm/{part}.hmm",
         log="hmm/{part}.log"
     run:
-        shell("module load hmmer/3.2 ; hmmbuild --informat afa -n {wildcards.part} -o {output.log} {output.hmm} {input}")
+        shell("module load hmmer/3.2.1 ; hmmbuild --informat afa -n {wildcards.part} -o {output.log} {output.hmm} {input}")

rule Profiles_align:
    input:
@@ -77,7 +77,7 @@ rule Profiles_clustersToFamilies:
    params:
        minSize=4
    shell:
-        "python3 {workflow.basedir}/../build/makeFastaFromCluster.py --minSize {params.minSize} {input.fasta} {input.clusters} familiesRaw/"
+        "python3 {workflow.basedir}/../build/makeFastaFromCluster.py --minSize {params.minSize} {input.fasta} {input.clusters} families/"


rule Profiles_blastToClusters:
@@ -100,7 +100,7 @@ rule Profiles_splitProtFasta:
        chunkSize=1500,
        suffix=".fasta",
    shell:
-        "python3 /pasteur/homes/tbigot/dev/idPathogenes/scripts/splitFasta.py {input} prot.split/ --suffix {params.suffix} --maxChunkSize {params.chunkSize}"
+        "python3 {workflow.basedir}/../build/splitFasta.py {input} prot.split/ --suffix {params.suffix} --maxChunkSize {params.chunkSize}"

rule Profiles_collapse:
    input: