Commit 5fc82e9e authored by Thomas Bigot

Init
#!/usr/bin/env python3
import argparse
import fastaHmmr

parser = argparse.ArgumentParser()
parser.add_argument("fastaFile", help="original set of sequences")
parser.add_argument("--fractionID", help="fraction identity", type=float, default=1.0)
parser.add_argument("--fractionCov", help="fraction coverage", type=float, default=None)
parser.add_argument("--cores", help="number of cores to use", type=int, default=1)
args = parser.parse_args()

fastaHmmr.collapseSequence(args.fastaFile, args.fractionID, args.fractionCov, args.cores)
[This diff is collapsed; the file's contents are not shown.]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import argparse
#from taxadb.schema import *
#from taxadb import accession
#from taxadb import taxid

parser = argparse.ArgumentParser()
parser.add_argument("hmmDB", help="hmm file")
parser.add_argument("msaDir", help="directory containing .msa.faa")
parser.add_argument("taxadb", help="taxadb file")
args = parser.parse_args()

# open the RefSeqViral HMM file
allHMMfile = open(args.hmmDB)

# for each HMM, extract its name and length into a dict
print("Reading RefSeqViral.hmm")
currLength = 0
allHMM = {}
currID = 0
#limit = 10
for line in allHMMfile:
    if line.startswith("LENG"):
        currLength = int(re.search(r'LENG\s+([0-9]+)', line).group(1))
    if line.startswith("NAME"):
        currID = re.search(r'NAME.*(FAM[0-9]+)\.hmm', line).group(1)
    if line.startswith("//"):
        allHMM[currID] = {"name": currID, "length": currLength}
        #limit -= 1
        #if limit == 0:
        #    break
allHMMfile.close()
print("Done!")

print("\nReading each alignment file")
# for each alignment, list the species and "functions" from the FASTA titles
for currCluster in allHMM.keys():
    currMSA = open(args.msaDir + "/" + str(currCluster) + ".msa.faa")
    allHMM[currCluster]["annotations"] = []
    for line in currMSA:
        if line.startswith(">"):
            allHMM[currCluster]["annotations"].append(line.rstrip())
    currMSA.close()
print("Done!")

print("\nReading acc number from gathered data")
# get the accession numbers, then map them to taxids and families
for currCluster in allHMM.keys():
    accs = []
    for ca in allHMM[currCluster]["annotations"]:
        accs.append(ca.split("|")[3].split(".")[0])
    #allHMM[currCluster]["taxid"] = accession.taxid(accs, args.taxadb, Prot)
    allHMM[currCluster]["taxid"] = ["1234"]  # placeholder while the taxadb lookup is disabled
    # from taxid to family
    allHMM[currCluster]["families"] = {}
    for ct in allHMM[currCluster]["taxid"]:
        #lineage = taxid.lineage_name(ct[1], args.taxadb)
        lineage = ("1234", "ABCD", "unknownFamily")  # placeholder lineage; the family is taken at index 2
        family = lineage[2]
        if family not in allHMM[currCluster]["families"]:
            allHMM[currCluster]["families"][family] = 1
        else:
            allHMM[currCluster]["families"][family] += 1
print("Done!")

print("\nWriting files")
# write one annotation file per cluster
for currCluster in allHMM.keys():
    currAnnot = open("annotations/" + str(currCluster) + "_annotations.txt", "w")
    currAnnot.write("LENGTH\t" + str(allHMM[currCluster]["length"]) + "\n")
    currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n")
    currAnnot.write("FASTA SEQUENCE TITLES:\n")
    for currST in allHMM[currCluster]["annotations"]:
        currAnnot.write(currST + "\n")
    currAnnot.close()
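For reference, a HMMER3 text profile starts with a header of TAG/value lines terminated by "//", and the two regexes above pull the model name and length out of it. A quick illustration on made-up header lines (the FAM number and length are invented):

import re

sample = ["NAME  FAM004242.hmm\n", "LENG  318\n"]
for line in sample:
    if line.startswith("NAME"):
        print(re.search(r'NAME.*(FAM[0-9]+)\.hmm', line).group(1))  # FAM004242
    if line.startswith("LENG"):
        print(int(re.search(r'LENG\s+([0-9]+)', line).group(1)))    # 318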
#!/usr/bin/env python3
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("master", help="fasta containing all sequences")
parser.add_argument("silix", help="result of Silix computation")
parser.add_argument("destination", help="directory in which to create the new fasta files (must exist, should be empty)")
args = parser.parse_args()

# load the clusters in memory: map each sequence id to its family
clusters = open(args.silix)
seqToFam = {}
for seq in clusters:
    ids = seq.strip().split()
    seqToFam[ids[1]] = ids[0]
clusters.close()

def addSeqToFamilyFile(name, sequence):
    if len(name) == 0:
        return
    family = seqToFam[name.split()[0]]
    ofasta = open(args.destination + "/" + family + ".fasta", "a")
    ofasta.write(">" + name + "\n" + "".join(sequence))
    ofasta.close()

# now sort the sequences of the master fasta into per-family files
currName = ""
currSequence = []
sequences = open(args.master)
for line in sequences:
    if line.startswith(">"):
        addSeqToFamilyFile(currName, currSequence)
        currName = line.strip()[1:]
        currSequence = []
    else:
        currSequence.append(line)
# flush the last sequence, which the loop above never writes
addSeqToFamilyFile(currName, currSequence)
sequences.close()
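SiliX reports its clustering as one family/sequence-id pair per line, which the loader above inverts into the seqToFam dict. A small self-contained illustration with made-up family and sequence names:

silixLines = ["FAM000001\tseqA\n", "FAM000001\tseqB\n", "FAM000002\tseqC\n"]
seqToFam = {}
for seq in silixLines:
    ids = seq.strip().split()
    seqToFam[ids[1]] = ids[0]
print(seqToFam)  # {'seqA': 'FAM000001', 'seqB': 'FAM000001', 'seqC': 'FAM000002'}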
#!/usr/bin/env python3
import subprocess
import argparse
from io import StringIO

parser = argparse.ArgumentParser()
parser.add_argument("list", help="file containing a list of nucleic accession numbers")
parser.add_argument("dbName", help="prefix for output files")
parser.add_argument("p", help="process the p-th percent slice of the file (from (p-1)%% to p%%)")
args = parser.parse_args()
p = args.p
ntf = open(args.list)
dbName = args.dbName
of = open(dbName + "_" + p + "_prot.fasta", "w")
nucAccI = open(dbName + "_" + p + "_included.txt", "w")
nucAccX = open(dbName + "_" + p + "_excluded.txt", "w")
allList = ntf.readlines()
# task p handles the slice from (p-1)% to p% of the accession list
accList = allList[round(len(allList) * (int(p) - 1) / 100.0):round(len(allList) * int(p) / 100.0)]
ntf.close()

def addCDStoDict(cdsString, cdsDict, organism):
    # parse one accumulated CDS feature block: product, protein_id, translation
    if len(cdsString) != 0 and cdsString[0].strip().startswith("CDS"):
        currProduct = ""
        currSequence = []
        currAcc = ""
        inTranslation = False
        for cc in cdsString:
            cc = cc.strip()
            if not inTranslation:
                if cc.startswith("/product"):
                    currProduct = cc.split("=")[1].strip("\"")
                elif cc.startswith("/protein_id"):
                    currAcc = cc.split("=")[1].strip("\"")
                elif cc.startswith("/translation"):
                    currSequence.append(cc.split("=")[1])
                    inTranslation = True
            else:
                currSequence.append(cc)
        if currProduct != "":
            cdsDict[currAcc + "| " + currProduct + " [" + organism + "]"] = "".join(currSequence).strip("\"")
    del cdsString[:]

def extractProducts(geneEntry):
    prots = {}
    currCDSstr = []
    organism = "--unknown organism"
    inFeatures = False
    for line in geneEntry:
        # before the FEATURES section, only pick up the organism name
        if not inFeatures:
            if line.startswith("FEATURES"):
                inFeatures = True
            elif line.strip().startswith("ORGANISM"):
                organism = " ".join(line.split()[1:])
            continue
        # a line that is non-blank in column 1 ends the FEATURES section
        if not line.startswith(" "):
            addCDStoDict(currCDSstr, prots, organism)
            break
        # a new feature key (column 6 non-blank) closes the previous block
        if line[5] != " ":
            addCDStoDict(currCDSstr, prots, organism)
        currCDSstr.append(line.rstrip())
    return prots

for acc in accList:
    acc = acc.strip().split(".")[0]
    # RefSeq accessions contain an underscore; everything else is queried as GenBank
    goldenResult = subprocess.check_output(['golden', "-a", ("refseq:" if "_" in acc else "genbank:") + acc])
    if len(goldenResult) != 0:
        nucAccI.write(acc + "\n")
        prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
        for name, seq in prots.items():
            of.write(">" + name + "\n" + seq + "\n")
    else:
        nucAccX.write(acc + "\n")

of.close()
nucAccI.close()
nucAccX.close()
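The two functions above walk a GenBank flat file: a feature key sits in column 6, its qualifiers are indented further, and a line that is non-blank in column 1 (here ORIGIN) ends the FEATURES section. A runnable illustration reusing extractProducts on a made-up record fragment (accession, organism, product and translation are all invented):

from io import StringIO

record = StringIO("""LOCUS       XX000001
  ORGANISM  Fakevirus examplei
FEATURES             Location/Qualifiers
     CDS             1..99
                     /product="hypothetical protein"
                     /protein_id="YP_000001.1"
                     /translation="MSTART"
ORIGIN
""")
print(extractProducts(record))
# {'YP_000001.1| hypothetical protein [Fakevirus examplei]': 'MSTART'}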
#!/usr/bin/env python3
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("fasta", help="fasta sequences")
parser.add_argument("p", help="process the p-th percent slice of the file (from (p-1)%% to p%%)")
args = parser.parse_args()
p = args.p
fastaf = open(args.fasta)
tempFasta = open("partial/part_" + p + ".fasta", "w")
allList = fastaf.readlines()
fastaf.close()
totalSeq = len(allList)
# nominal slice bounds; the loop below shifts them to sequence boundaries
firstLine = round(totalSeq * (int(p) - 1) / 100.0)
lastLine = round(totalSeq * int(p) / 100.0)
cursor = firstLine
inSeq = False
lastSeq = False
while cursor < totalSeq:
    line = allList[cursor]
    # start copying at the first sequence header at or after firstLine
    if not inSeq:
        if line.startswith(">"):
            inSeq = True
    # once past lastLine, stop at the next sequence header
    if lastSeq:
        if line.startswith(">"):
            break
    if cursor == lastLine:
        lastSeq = True
    if inSeq:
        tempFasta.write(line)
    cursor += 1
tempFasta.close()
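Several of these scripts share the same partitioning idiom: task p (for p from 1 to 100) takes the items from round(N*(p-1)/100) up to round(N*p/100). Since consecutive tasks share a boundary index and round() is monotonic, the hundred slices tile the input exactly, with no gap and no overlap. A quick self-check:

N = 1234  # arbitrary example length
covered = []
for p in range(1, 101):
    covered.extend(range(round(N * (p - 1) / 100.0), round(N * p / 100.0)))
assert covered == list(range(N))  # no gaps, no overlaps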
#!/usr/bin/env python3
import subprocess
import argparse
import glob
import os

parser = argparse.ArgumentParser()
parser.add_argument("dir", help="directory containing alignments")
parser.add_argument("p", help="process the p-th percent slice of the file list (from (p-1)%% to p%%)")
args = parser.parse_args()
p = args.p
fastas = glob.glob(args.dir + "/*.msa.faa")
log = open(p + ".log", "w")
nbFiles = len(fastas)
firstFile = int(round(nbFiles * (int(p) - 1) / 100.0))
lastFile = int(round(nbFiles * int(p) / 100.0))
for cfp in fastas[firstFile:lastFile]:
    familyName = '.'.join(os.path.basename(cfp).split(".")[:-2])
    outp = familyName + ".hmm"
    logp = familyName + ".log"
    # note: the HMM is deliberately named "FAM....hmm" via -n, which the
    # annotation script's NAME regex relies on
    log.write(subprocess.check_output("hmmbuild --informat afa -n %s -o %s %s %s" % (outp, logp, outp, cfp), shell=True).decode("utf-8"))
log.close()
#!/usr/bin/env python3
import subprocess
import argparse
import glob
import os

parser = argparse.ArgumentParser()
parser.add_argument("dir", help="directory containing fasta files")
parser.add_argument("p", help="process the p-th percent slice of the file list (from (p-1)%% to p%%)")
args = parser.parse_args()
p = args.p
fastas = glob.glob(args.dir + "/*.fasta")
log = open(p + ".log", "w")
nbFiles = len(fastas)
firstFile = int(round(nbFiles * (int(p) - 1) / 100.0))
lastFile = int(round(nbFiles * int(p) / 100.0))
for cfp in fastas[firstFile:lastFile]:
    outp = os.path.splitext(os.path.basename(cfp))[0] + ".msa.faa"
    logp = os.path.splitext(os.path.basename(cfp))[0] + ".log"
    log.write(subprocess.check_output("muscle -in %s -out %s -log %s" % (cfp, outp, logp), shell=True).decode("utf-8"))
log.close()
#!/usr/bin/env python3
import argparse
import fastaHmmr

parser = argparse.ArgumentParser()
parser.add_argument("fastaFile", help="original set of sequences")
parser.add_argument("minSequenceLength", help="minimum sequence length", type=int)
args = parser.parse_args()

fastaHmmr.filterFASTA(args.fastaFile, args.minSequenceLength)
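filterFASTA lives in the fastaHmmr module, whose diff is collapsed above, so its real implementation is not shown. As a hypothetical stand-in only, here is a minimal sketch under the assumption that it simply drops sequences shorter than minSequenceLength (the output file naming is also invented):

# Hypothetical stand-in for fastaHmmr.filterFASTA; the real code is in the
# collapsed diff above. Assumes: keep sequences >= minSequenceLength residues.
def filterFASTA(fastaFile, minSequenceLength):
    name, seq = None, []
    with open(fastaFile) as fin, open(fastaFile + ".filtered", "w") as fout:
        def flush():
            if name is not None and len("".join(seq)) >= minSequenceLength:
                fout.write(">" + name + "\n" + "".join(seq) + "\n")
        for line in fin:
            if line.startswith(">"):
                flush()
                name, seq = line.strip()[1:], []
            else:
                seq.append(line.strip())
        flush()  # the last sequence is not followed by a header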
#! /bin/sh
#SBATCH --cores=4
date >> log.txt
echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
srun python3 /pasteur/homes/tbigot/bank/scripts/partialBlast.py /pasteur/homes/tbigot/bank/rVDB_prot/rVDB_prot2_plus_noDupes_minL1_s100.fasta ${SLURM_ARRAY_TASK_ID}
srun blastp -query partial/part_${SLURM_ARRAY_TASK_ID}.fasta -out result/part_${SLURM_ARRAY_TASK_ID}.out -db /pasteur/homes/tbigot/bank/rVDB_prot/rVDB_prot2_plus_noDupes_minL1_s100.fasta -outfmt 6 -num_threads 4
exit 0
#! /bin/sh
#SBATCH --qos=fast
#SBATCH --cores=1
date >> log.txt
echo "HMMBUILD ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
srun python3 /pasteur/homes/tbigot/bank/scripts/partialHmmbuild.py /pasteur/homes/tbigot/bank/rVDB_prot/aligned ${SLURM_ARRAY_TASK_ID}
exit 0
#! /bin/sh
#SBATCH --qos=fast
#SBATCH --cores=1
date >> log.txt
echo "MUSCLE ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
srun python3 /pasteur/homes/tbigot/bank/scripts/partialMuscle.py /pasteur/homes/tbigot/bank/rVDB_prot/clusters ${SLURM_ARRAY_TASK_ID}
exit 0
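All three job scripts pass ${SLURM_ARRAY_TASK_ID} through as the p argument, so they are presumably meant to be submitted as 100-task array jobs (along the lines of sbatch --array=1-100 <script>), each task handling one 1% slice of its input.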