Skip to content
Snippets Groups Projects
Commit 58ea43bf authored by Thomas  BIGOT's avatar Thomas BIGOT
Browse files

updates

parent f489a96b
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import glob,re,argparse import glob,re,argparse,os
from taxadb.schema import * from taxadb.schema import *
from taxadb import accession from taxadb import accession
from taxadb import taxid from taxadb import taxid
def getClusterInfo(HMMpath):
    """Read the NAME/LENG metadata of the (single) profile stored in *HMMpath*.

    Returns a dict ``{"name": FAMxxx, "length": model_length}``; returns an
    empty dict when the file contains no ``HMM`` table header.
    """
    clusterInfo = {}
    currID = None
    currLength = 0
    with open(HMMpath) as HMMfile:
        for line in HMMfile:
            if line.startswith("LENG"):
                currLength = int(re.search(r'LENG\s+([0-9]+)', line).group(1))
            if line.startswith("NAME"):
                currID = re.search(r'NAME.*(FAM[0-9]+).hmm', line).group(1)
            # BUG FIX: startswith("HMM") also matches the "HMMER3/f" first
            # line, which made the loop break before NAME/LENG were read (and
            # left the "//" branch unreachable).  Match the "HMM " column
            # header instead: NAME and LENG both precede it in HMMER3 files,
            # so the metadata is complete here and the bulk probability table
            # is never scanned.
            if line.startswith("HMM "):
                clusterInfo = {"name": currID, "length": currLength}
                break
    return clusterInfo
print("Done !")
print("\nReading each alignement file")
#On ouvre le fichier d alignements
# pour chaque alignement, on liste les espèces et les «fonctions»
def extractSeqNames(MSApath):
    """Return the FASTA title lines of *MSApath*.

    A title line is any line starting with ">"; lines are returned verbatim,
    trailing newline included.
    """
    # `with` guarantees the handle is closed even if reading raises,
    # unlike the previous explicit open()/close() pair.
    with open(MSApath) as currMSA:
        return [line for line in currMSA if line.startswith(">")]
print("\nReading acc number from gathered data") def getTaxInfo(annotations):
#get acc taxInfo = {}
for currCluster in allHMM.keys():
accs = [] accs = []
for ca in allHMM[currCluster]["annotations"]: for ca in annotations:
accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0]) accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0])
allHMM[currCluster]["taxid"] = [] taxInfo["taxid"] = []
currAcc = 0 currAcc = 0
# proceeding by lots, to avoid `too many SQL variables` issue # proceeding by lots, to avoid `too many SQL variables` issue
while currAcc < len(accs): while currAcc < len(accs):
allHMM[currCluster]["taxid"].append(accession.taxid(accs[currAcc:(min(currAcc+50,len(accs)-1))], args.taxadb, Prot)) currAccs=accs[currAcc:(min(currAcc+249,len(accs)))]
currAcc += 50 print("currAccs = %s"%str(currAccs))
taxInfo["taxid"].append(accession.taxid(currAcc, args.taxadb, Prot))
currAcc += 249
# from taxid family # from taxid family
allHMM[currCluster]["families"] = {} taxInfo["families"] = {}
LCAs = []
# deloting # deloting
for ctgen in allHMM[currCluster]["taxid"]: for ctgen in taxInfo["taxid"]:
for ct in ctgen: for ct in ctgen:
lineage = taxid.lineage_name(ct[1], args.taxadb) print("plop %s"%ct[1])
family = lineage[2] lineage = taxid.lineage_name(ct[1], args.taxadb, reverse=True)
if family not in allHMM[currCluster]["families"].keys(): #managing LCA
allHMM[currCluster]["families"][family] = 1 if len(LCAs) == 0:
LCAs = lineage
print("LCA: %s"% (str(LCAs)))
else:
print("LCA: %s"% (str(LCAs)))
for index,taxon in enumerate(LCAs):
if lineage[index] != taxon:
LCAs = LCAs[:index]
break
family = lineage[-3]
if family not in taxInfo["families"].keys():
taxInfo["families"][family] = 1
else: else:
allHMM[currCluster]["families"][family] += 1 taxInfo["families"][family] += 1
taxInfo["LCA"] = LCAs[-1]
return taxInfo
def writeAnnotations(annotationPath, clusterInfo, taxInfo, annotations):
    """Write one cluster's annotation summary file to *annotationPath*.

    The file lists the model length, the per-family sequence counts, the
    lowest common ancestor, then every FASTA title line of the cluster.
    """
    # BUG FIX: the file handle was never closed, so the last buffered writes
    # could be lost; `with` closes (and flushes) it deterministically.
    with open(annotationPath, "w") as annotFile:
        annotFile.write("LENGTH\t" + str(clusterInfo["length"]) + "\n")
        annotFile.write("FAMILIES\t" + str(taxInfo["families"]) + "\n")
        annotFile.write("LCA\t%s\n" % (taxInfo["LCA"]))
        annotFile.write("FASTA SEQUENCE TITLES:\n")
        annotFile.writelines(annotations)
parser = argparse.ArgumentParser()
parser.add_argument("hmmDir", help="hmm dir")
parser.add_argument("msaDir", help="directory containing .msa.faa")
parser.add_argument("taxadb", help="taxadb file")
parser.add_argument("annotationDir", help="directory where annotation files have to be written")
args = parser.parse_args()

# one annotation file per FAMxxx.hmm profile found in hmmDir
clusterList = glob.glob(args.hmmDir + "/*.hmm")
for currCluster in clusterList:
    # family name = file name without its ".hmm" extension
    familyName = os.path.basename(currCluster)[:-4]
    print(familyName)
    clusterInfo = getClusterInfo("%s/%s.hmm" % (args.hmmDir, familyName))
    annotations = extractSeqNames("%s/%s.msa.faa" % (args.msaDir, familyName))
    taxInfo = getTaxInfo(annotations)
    # BUG FIX: writeAnnotations takes four arguments; it was called with only
    # the output path, raising TypeError on the very first cluster.
    writeAnnotations("%s/%s.txt" % (args.annotationDir, familyName),
                     clusterInfo, taxInfo, annotations)
print("\nWriting files")
#on output tout par cluster
for currCluster in allHMM.keys():
currAnnot = open("annotations/" + str(currCluster)+ "_annotations.txt","w")
currAnnot.write("LENGTH\t" + str(allHMM[currCluster]["length"])+"\n")
currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n")
currAnnot.write("FASTA SEQUENCE TITLES:\n")
for currST in allHMM[currCluster]["annotations"]:
currAnnot.write(currST)
...@@ -2,25 +2,36 @@ ...@@ -2,25 +2,36 @@
import sys,subprocess,argparse import sys,subprocess,argparse
import Golden
from io import StringIO from io import StringIO
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("list", help="file containing list of nucleic accession numbers") parser.add_argument("list", help="file containing list of nucleic accession numbers")
parser.add_argument("dbName", help="prefix for output files") parser.add_argument("dbName", help="prefix for output files")
parser.add_argument("p", help="proceed p percent of the file (from 0 to p percent)") parser.add_argument("--partial","-p", help="proceed p percent of the file (from p-1 to p percent)")
args = parser.parse_args() args = parser.parse_args()
p = args.p p = args.partial
ntf = open(args.list) ntf = open(args.list)
dbName = args.dbName dbName = args.dbName
of = open(dbName+"_"+ p +"_prot.fasta","w")
nucAccI = open(dbName+"_"+ p +"_included.txt","w")
nucAccX = open(dbName+"_"+ p +"_excluded.txt","w")
allList = ntf.readlines() allList = ntf.readlines()
# Optionally restrict processing to one chunk of the accession list; the
# chunk index also tags the output file names so parallel jobs don't collide.
partialString = ""
if p:
    chunk = int(p)
    # NOTE(review): the bounds divide by 10.0 (tenths of the list) while this
    # message and the --partial help text say "percent" (the previous code
    # divided by 100.0) — confirm which is intended.
    print("Will proceed from %s to %s percent" % (str(chunk - 1), str(chunk)))
    lowerBound = int(round(len(allList) * (chunk - 1) / 10.0))
    higherBound = int(round(len(allList) * chunk / 10.0))
    accList = allList[lowerBound:higherBound]
    partialString = "_" + p
else:
    accList = allList

of = open(dbName + partialString + "_prot.fasta", "w")
nucAccI = open(dbName + partialString + "_included.txt", "w")
nucAccX = open(dbName + partialString + "_excluded.txt", "w")
ntf.close() ntf.close()
...@@ -59,7 +70,7 @@ def extractProducts(geneEntry): ...@@ -59,7 +70,7 @@ def extractProducts(geneEntry):
continue continue
else: else:
if line.strip().startswith("ORGANISM"): if line.strip().startswith("ORGANISM"):
organism = line.split()[1] organism = " ".join(line.split()[1:])
continue continue
if inFeatures: if inFeatures:
if not line.startswith(" "): if not line.startswith(" "):
...@@ -75,8 +86,8 @@ cpt = 0 ...@@ -75,8 +86,8 @@ cpt = 0
for acc in accList: for acc in accList:
acc = acc.strip().split(".")[0] acc = acc.strip().split(".")[0]
cpt+=1 cpt+=1
goldenResult = subprocess.check_output(['golden',"-a",("refseq:" if "_" in acc else "genbank:")+acc]) goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc)
if len(goldenResult) != 0: if goldenResult:
nucAccI.write(acc + "\n") nucAccI.write(acc + "\n")
prots = extractProducts(StringIO(goldenResult.decode("utf-8"))) prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
for name,seq in prots.items(): for name,seq in prots.items():
......
#! /bin/sh #! /bin/sh
#SBATCH --qos=fast #SBATCH --qos=hubbioit --partition=dedicated
echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
srun python3 /pasteur/homes/tbigot/bank/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/accession_rVDB.txt rVDB_prot2 ${SLURM_ARRAY_TASK_ID} srun python2 /pasteur/homes/tbigot/dev/meloit_6333/viroumenet/build/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/rVDB/current/list_acc_nucl.txt rVDB_prot -p ${SLURM_ARRAY_TASK_ID}
exit 0 exit 0
...@@ -50,8 +50,20 @@ def readHmmer(pathIn,pathOut,vfamDir,verbose=False): ...@@ -50,8 +50,20 @@ def readHmmer(pathIn,pathOut,vfamDir,verbose=False):
continue continue
subjectData = (line.strip().split()) subjectData = (line.strip().split())
# # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc # # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc
newLine = currSubject+ "," + subjectData[5]+ "," + subjectData[6]+ "," + subjectData[7]+ "," + subjectData[9]+ "," + subjectData[10]+ "," + currClusterInfo["id"] + "," + str(currClusterInfo["length"])+ "," + str(currClusterInfo["families"]).replace(","," ") + "," + str(currClusterInfo["functions"]).replace(","," ") + "," + str(currClusterInfo["nbSeq"]) + "\n" newLine = [
out.write(newLine) currSubject, #subject
subjectData[5], #evalue
subjectData[6], #hmfrom
subjectData[7], #hmmto
subjectData[9], #alifrom
subjectData[10], #alito
currClusterInfo["id"], #clusterID
str(currClusterInfo["length"]), #clusterLength
str(currClusterInfo["families"]).replace(","," "), #families
str(currClusterInfo["functions"]).replace(","," "), #functions
str(currClusterInfo["nbSeq"]) #clusterNbSeq
]
out.write(",".join(newLine)+"\n")
if(verbose): if(verbose):
print(newLine) print(newLine)
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment