diff --git a/annotate/sqlAnnotToTxt.py b/annotate/sqlAnnotToTxt.py index 146b0db869ef902568be5e0ae599ac60ff02d45c..0525509b410a1159a500487e2811b5d5f9fd55b6 100644 --- a/annotate/sqlAnnotToTxt.py +++ b/annotate/sqlAnnotToTxt.py @@ -24,6 +24,7 @@ ccf = conn.cursor() ckw = conn.cursor() for currFamR in ccf.execute('SELECT * FROM family'): + print("b") currF,currSize,currNbseq,currLCA = currFamR o = open("{dir}/{prefix}{id}.txt".format(dir=args.txtDir,prefix=args.prefix,id=currF),"w") lineage = tdb_taxid.lineage_name(int(currLCA), reverse=True) @@ -35,6 +36,8 @@ for currFamR in ccf.execute('SELECT * FROM family'): for line in f: if line.startswith(">"): o.write(line[1:]) + o.close() + print("e") diff --git a/build/makeFastaFromCluster.py b/build/makeFastaFromCluster.py index 0077063422c43609beaeae1d83d6d8d4ff21da36..5a691af5d08fb9c4573deb24b0e33c808a6b6694 100644 --- a/build/makeFastaFromCluster.py +++ b/build/makeFastaFromCluster.py @@ -7,22 +7,24 @@ parser = argparse.ArgumentParser() parser.add_argument("master", help="fasta containing all sequences") parser.add_argument("silix", help="result of Silix computation") parser.add_argument("destination", help="directory in which create new fasta files (must exist, should be empty)") +parser.add_argument("--minSize", help="minimum number of sequences to be considered as a cluster",type=int,default=2) args = parser.parse_args() # load clusters in memory -clusters = open(args.silix) -seqToFam = {} -for seq in clusters: - ids = seq.strip().split() - seqToFam[ids[1]] = ids[0] -clusters.close() +clusters = map(lambda l: l.strip().split(),open(args.silix)) +seqToFam = {l[1]: l[0] for l in clusters} +from collections import Counter +famSize = Counter(seqToFam.values()) + def addSeqToFamilyFile(name,sequence): if len(name) == 0: return + elif famSize[seqToFam[name.split()[0]]] < args.minSize: + return family = seqToFam[name.split()[0]] ofasta = open(args.destination + "/" + family + ".fasta","a") ofasta.write(">" + name + "\n" + "".join(sequence)) diff --git a/pipeline/Snakefile b/pipeline/Snakefile index b9ec267095c5a3a738c753137ba6fcc5dbf18182..75ffcece390d8d171bcb7d996da763d4fc4492b2 100644 --- a/pipeline/Snakefile +++ b/pipeline/Snakefile @@ -1,38 +1,39 @@ -localrules: ConvProt_splitInNames, ConvProt_mergeFasta, Profiles_splitProtFasta, Profiles_makeBlastDB, Profiles_blastToClusters, Profiles_clustersToFamilies, Profiles_filterNonUnique, Pofiles_mergeBlast, Profiles_genAnnot, Profiles_mergeHmm, origNucl,Profiles_genTxtAnnot +localrules: ConvProt_splitInNames, ConvProt_mergeFasta, Profiles_splitProtFasta, Profiles_makeBlastDB, Profiles_blastToClusters, Profiles_clustersToFamilies, Pofiles_mergeBlast, Profiles_genAnnot, Profiles_mergeHmm, origNucl,Profiles_genTxtAnnot workdir: "/pasteur/scratch/tbigot/rvdb/profiles/" jobname="rvdb" shell("mkdir -p logs/cluster") +sampleName = "rvdb" + rule all: input: "prot.hmm", - "prot-hmm.sqlite3", - dynamic("annot/{nuId}.txt") + "prot-hmm.sqlite", + dynamic("annot/{famId2}.txt") rule Profiles_genTxtAnnot: input: - sqlite = "prot-hmm.sqlite3", - txtDir = "annot/", + sqlite = "prot-hmm.sqlite", taxadb = "/local/databases/rel/taxadb/current/db/taxadb_full.sqlite", params: - prefix = "" + txtDir = "annot/", output: - dynamic("annot/{nuId}.txt") + dynamic("annot/{famId2}.txt") shell: - "module load Python/3.6.0 ; python3 {workflow.basedir}/../annotate/sqlAnnotToTxt.py {input.sqlite} {input.txtDir} {input.taxadb} --prefix {params.prefix} " + "module load Python/3.6.0 ; python3 {workflow.basedir}/../annotate/sqlAnnotToTxt.py {input.sqlite} {params.txtDir} {input.taxadb}" rule Profiles_genAnnot: input: - dynamic("hmm/{nuId}.hmm") + dynamic("hmm/{famId}.hmm") output: - "prot-hmm.sqlite3" + "prot-hmm.sqlite" params: hmmDir = "hmm", alignedDir = "aligned", taxaDB = "/local/databases/rel/taxadb/current/db/taxadb_full.sqlite", - schema = "{workflow.basedir}/../annotate/annotations_schema.sql" + schema = "{}/../annotate/annotations_schema.sql".format(workflow.basedir) shell: "module load Python/3.6.0 ; python3 {workflow.basedir}/../annotate/makeAnnotationsIntoSQL.py {params.hmmDir} {params.alignedDir} {params.taxaDB} {params.schema} {output}" @@ -40,7 +41,7 @@ rule Profiles_genAnnot: rule Profiles_mergeHmm: input: - dynamic("hmm/{nuId}.hmm") + dynamic("hmm/{famId}.hmm") output: "prot.hmm" run: @@ -54,8 +55,7 @@ rule Profiles_hmmbuild: hmm="hmm/{part}.hmm", log="hmm/{part}.log" run: - famName = wildcards["part"] - shell("module load hmmer/3.1b2 && hmmbuild --informat afa -n {famName} -o {output.log} {output.hmm} {input}") + shell("module load hmmer/3.1b2 ; hmmbuild --informat afa -n {wildcards.part} -o {output.log} {output.hmm} {input}") rule Profiles_align: input: @@ -63,35 +63,21 @@ rule Profiles_align: output: "aligned/{part}.fasta" threads: - 28 + 1 shell: "module load fasta mafft && mafft --thread {threads} --anysymbol --auto {input} > {output}" -rule Profiles_filterNonUnique: - input: - dynamic("familiesRaw/{famId}.fasta") - output: - dynamic("families/{nuId}.fasta") - run: - for raw in input: - cr = open(raw) - nbSeq = 0 - for l in cr: - if l.startswith(">"): - nbSeq+=1 - if nbSeq > 1: - break - if nbSeq > 1: - shell("cd families && ln -s ../{raw} .") rule Profiles_clustersToFamilies: input: clusters="prot.fnodes", fasta="prot_collapsed.fasta" output: - dynamic("familiesRaw/{famId}.fasta") + dynamic("families/{famId}.fasta") + params: + minSize=4 shell: - "python3 {workflow.basedir}/../build/makeFastaFromCluster.py {input.fasta} {input.clusters} familiesRaw/" + "python3 {workflow.basedir}/../build/makeFastaFromCluster.py --minSize {params.minSize} {input.fasta} {input.clusters} familiesRaw/" rule Profiles_blastToClusters: