Skip to content
Snippets Groups Projects
Commit b9d75543 authored by Thomas  BIGOT's avatar Thomas BIGOT
Browse files

Added SplitFasta and fixed directory for generated families

parent b785aac9
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
import argparse
from math import ceil
from lib import fasta
from statistics import mean
parser = argparse.ArgumentParser()
parser.add_argument("input", help="fasta file")
parser.add_argument("output", help="prefix of outputed fastas")
parser.add_argument("--nbChunks", help="number of chunks", default=10, type=int)
parser.add_argument("--suffix", help="extension with dot. Default: .fastq", default=".fasta")
parser.add_argument("--maxChunkSize", type=int, help="minimum number of sequences per chunk. If defined, will ignore nbChunk parameter")
parser.add_argument("--maxSitesPerChunk", type=int, help="maximum position number per chunk")
args = parser.parse_args()
if args.maxChunkSize:
# determining number and size of queries
seqLen = tuple(map(lambda s: len(s[1]),fasta(args.input,True)))
print("Read {} sites.".format(sum(seqLen)))
# we have to guess the equivalent number of positions
maxSitesPerChunk = int(mean(seqLen) * args.maxChunkSize)
else:
maxSitesPerChunk = args.maxSitesPerChunk
# searching for the smallest sequence size
shortestSequenceSize = None
for name,seq in fasta(args.input,True):
if not shortestSequenceSize or len(seq) < shortestSequenceSize:
shortestSequenceSize = len(seq)
print("Max number of sites per chunk: {}. Now computing best distribution.".format(maxSitesPerChunk))
# chunks = [[remainingSites,fileHandler]]
chunks = [[maxSitesPerChunk,None]]
lastFileNumber=0
for name,seq in fasta(args.input,True):
for j in range(len(chunks)):
if chunks[j][0] < shortestSequenceSize:
chunks[j][1].close()
del(chunks[j])
if len(seq) <= chunks[j][0] :
if not chunks[j][1]:
chunks[j][1] = open("{}{}{}".format(args.output,lastFileNumber,args.suffix),"w")
lastFileNumber+=1
chunks[j][0] -= len(seq)
chunks[j][1].write(">{}\n{}\n".format(name,seq))
break
if chunks[-1][0] != maxSitesPerChunk:
chunks.append([maxSitesPerChunk,None])
...@@ -55,7 +55,7 @@ rule Profiles_hmmbuild: ...@@ -55,7 +55,7 @@ rule Profiles_hmmbuild:
hmm="hmm/{part}.hmm", hmm="hmm/{part}.hmm",
log="hmm/{part}.log" log="hmm/{part}.log"
run: run:
shell("module load hmmer/3.2 ; hmmbuild --informat afa -n {wildcards.part} -o {output.log} {output.hmm} {input}") shell("module load hmmer/3.2.1 ; hmmbuild --informat afa -n {wildcards.part} -o {output.log} {output.hmm} {input}")
rule Profiles_align: rule Profiles_align:
input: input:
...@@ -77,7 +77,7 @@ rule Profiles_clustersToFamilies: ...@@ -77,7 +77,7 @@ rule Profiles_clustersToFamilies:
params: params:
minSize=4 minSize=4
shell: shell:
"python3 {workflow.basedir}/../build/makeFastaFromCluster.py --minSize {params.minSize} {input.fasta} {input.clusters} familiesRaw/" "python3 {workflow.basedir}/../build/makeFastaFromCluster.py --minSize {params.minSize} {input.fasta} {input.clusters} families/"
rule Profiles_blastToClusters: rule Profiles_blastToClusters:
...@@ -100,7 +100,7 @@ rule Profiles_splitProtFasta: ...@@ -100,7 +100,7 @@ rule Profiles_splitProtFasta:
chunkSize=1500, chunkSize=1500,
suffix=".fasta", suffix=".fasta",
shell: shell:
"python3 /pasteur/homes/tbigot/dev/idPathogenes/scripts/splitFasta.py {input} prot.split/ --suffix {params.suffix} --maxChunkSize {params.chunkSize}" "python3 {workflow.basedir}/../build/splitFasta.py {input} prot.split/ --suffix {params.suffix} --maxChunkSize {params.chunkSize}"
rule Profiles_collapse: rule Profiles_collapse:
input: input:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment