Added SplitFasta and fixed directory for generated families

b9d75543 · Thomas BIGOT · b785aac9 · b9d75543 · b9d75543
Commit b9d75543 authored 6 years ago by Thomas BIGOT
--- a/build/splitFasta.py
+++ b/build/splitFasta.py
+#!/usr/bin/env python3
+import argparse
+from math import ceil
+from lib import fasta
+from statistics import mean
+parser = argparse.ArgumentParser()
+parser.add_argument("input", help="fasta file")
+parser.add_argument("output", help="prefix of outputed fastas")
+parser.add_argument("--nbChunks", help="number of chunks", default=10, type=int)
+parser.add_argument("--suffix", help="extension with dot. Default: .fastq", default=".fasta")
+parser.add_argument("--maxChunkSize", type=int, help="minimum number of sequences per chunk. If defined, will ignore nbChunk parameter")
+parser.add_argument("--maxSitesPerChunk", type=int, help="maximum position number per chunk")
+args = parser.parse_args()
+if args.maxChunkSize:
+    # determining number and size of queries
+    seqLen = tuple(map(lambda s: len(s[1]),fasta(args.input,True)))
+    print("Read {} sites.".format(sum(seqLen)))
+    # we have to guess the equivalent number of positions
+    maxSitesPerChunk = int(mean(seqLen) * args.maxChunkSize)
+else:
+    maxSitesPerChunk = args.maxSitesPerChunk
+# searching for the smallest sequence size
+shortestSequenceSize = None
+for name,seq in fasta(args.input,True):
+    if not shortestSequenceSize or len(seq) < shortestSequenceSize:
+        shortestSequenceSize = len(seq)
+print("Max number of sites per chunk: {}. Now computing best distribution.".format(maxSitesPerChunk))
+# chunks = [[remainingSites,fileHandler]]
+chunks = [[maxSitesPerChunk,None]]
+lastFileNumber=0
+for name,seq in fasta(args.input,True):
+    for j in range(len(chunks)):
+        if chunks[j][0] < shortestSequenceSize:
+            chunks[j][1].close()
+            del(chunks[j])
+        if len(seq) <= chunks[j][0] :
+            if not chunks[j][1]:
+                chunks[j][1] = open("{}{}{}".format(args.output,lastFileNumber,args.suffix),"w")
+                lastFileNumber+=1
+            chunks[j][0] -= len(seq)
+            chunks[j][1].write(">{}\n{}\n".format(name,seq))
+            break
+    if chunks[-1][0] != maxSitesPerChunk:
+        chunks.append([maxSitesPerChunk,None])
--- a/pipeline/Snakefile
+++ b/pipeline/Snakefile
@@ -55,7 +55,7 @@ rule Profiles_hmmbuild:
        hmm="hmm/{part}.hmm",
        log="hmm/{part}.log"
    run:
-        shell("module load hmmer/3.2 ; hmmbuild --informat afa -n {wildcards.part} -o {output.log} {output.hmm} {input}")
+        shell("module load hmmer/3.2.1 ; hmmbuild --informat afa -n {wildcards.part} -o {output.log} {output.hmm} {input}")
 rule Profiles_align:
    input:
@@ -77,7 +77,7 @@ rule Profiles_clustersToFamilies:
    params:
        minSize=4
    shell:
-        "python3 {workflow.basedir}/../build/makeFastaFromCluster.py --minSize {params.minSize} {input.fasta} {input.clusters} familiesRaw/"
+        "python3 {workflow.basedir}/../build/makeFastaFromCluster.py --minSize {params.minSize} {input.fasta} {input.clusters} families/"
 rule Profiles_blastToClusters:
@@ -100,7 +100,7 @@ rule Profiles_splitProtFasta:
        chunkSize=1500,
        suffix=".fasta",
    shell:
-        "python3 /pasteur/homes/tbigot/dev/idPathogenes/scripts/splitFasta.py {input} prot.split/ --suffix {params.suffix} --maxChunkSize {params.chunkSize}"
+        "python3 {workflow.basedir}/../build/splitFasta.py {input} prot.split/ --suffix {params.suffix} --maxChunkSize {params.chunkSize}"
 rule Profiles_collapse:
    input: