From 58ea43bf9e84d7139c58615741426722736fc7fb Mon Sep 17 00:00:00 2001
From: Thomas Bigot <thomas.bigot@pasteur.fr>
Date: Tue, 16 May 2017 16:46:29 +0200
Subject: [PATCH] updates

---
 annotate/makeAnotations.py             | 148 ++++++++++++++-----------
 build/nucl2ProtGolden.py               |  29 +++--
 build/serialGolden.sh                  |   4 +-
 decodeResults/hmmerAnnotationDecode.py |  16 ++-
 4 files changed, 119 insertions(+), 78 deletions(-)

diff --git a/annotate/makeAnotations.py b/annotate/makeAnotations.py
index 6a9728d..35cef11 100644
--- a/annotate/makeAnotations.py
+++ b/annotate/makeAnotations.py
@@ -1,88 +1,106 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-import glob,re,argparse
+import glob,re,argparse,os
 
 from taxadb.schema import *
 from taxadb import accession
 from taxadb import taxid
 
-parser = argparse.ArgumentParser()
-parser.add_argument("hmmDB", help="hmm file")
-parser.add_argument("msaDir", help="directory containing .msa.faa")
-parser.add_argument("taxadb", help="taxadb file")
-
-args = parser.parse_args()
-
+def getClusterInfo(HMMpath):
+    clusterInfo = {}
+    with open(HMMpath) as HMMfile:
+        for line in HMMfile:
+            if line.startswith("LENG"):
+                currLength = int(re.search('LENG\s+([0-9]+)',line).group(1))
+            if line.startswith("NAME"):
+                currID = re.search('NAME.*(FAM[0-9]+).hmm',line).group(1)
+            if line.startswith("//"):
+                clusterInfo = {"name": currID, "length": currLength}
+            if line.startswith("HMM"):
+                break
+    return clusterInfo
 
-#on ouvre le fichier refseq viral
-allHMMfile = open(args.hmmDB)
-
-# pour chaque hmm, on extrait les infos dans un dico
-print("Reading RefSeqViral.hmm")
-currLength = 0
-allHMM = {}
-currID = 0
-#limit = 10
-for line in allHMMfile:
-    if line.startswith("LENG"):
-        currLength = int(re.search('LENG\s+([0-9]+)',line).group(1))
-    if line.startswith("NAME"):
-        currID = re.search('NAME.*(FAM[0-9]+).hmm',line).group(1)
-    if line.startswith("//"):
-        allHMM[currID] = {"name": currID, "length": currLength}
-        #limit -=1
-    #if limit == 0:
-        #break
-    
-print("Done !")
 
-
-print("\nReading each alignement file")
-#On ouvre le fichier d alignements
-# pour chaque alignement, on liste les espèces et les «fonctions»
-
-for currCluster in allHMM.keys():
-    currMSA = open(args.msaDir + "/" + str(currCluster) + ".msa.faa")
-    allHMM[currCluster]["annotations"] = []
+def extractSeqNames(MSApath):
+    currMSA = open(MSApath)
+    annotations = []
     for line in currMSA:
         if line.startswith(">"):
-            allHMM[currCluster]["annotations"].append(line)
+            annotations.append(line)
     currMSA.close()
-print("Done !")
-    
-print("\nReading acc number from gathered data")
-#get acc
-for currCluster in allHMM.keys():
+    return annotations
+
+def getTaxInfo(annotations):
+    taxInfo = {}
     accs = []
-    for ca in allHMM[currCluster]["annotations"]:
+    for ca in annotations:
         accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0])
-    allHMM[currCluster]["taxid"] = []
+    taxInfo["taxid"] = []
     currAcc = 0
     # proceeding by lots, to avoid `too many SQL variables` issue
     while currAcc < len(accs):
-        allHMM[currCluster]["taxid"].append(accession.taxid(accs[currAcc:(min(currAcc+50,len(accs)-1))], args.taxadb, Prot))
-        currAcc += 50
+        currAccs=accs[currAcc:(min(currAcc+249,len(accs)))]
+        print("currAccs = %s"%str(currAccs))
+        taxInfo["taxid"].append(accession.taxid(currAcc, args.taxadb, Prot))
+        currAcc += 249
     # from taxid family
-    allHMM[currCluster]["families"] = {}
+    taxInfo["families"] = {}
+    
+    LCAs = []
+    
     # deloting
-    for ctgen in allHMM[currCluster]["taxid"]:
+    for ctgen in taxInfo["taxid"]:
         for ct in ctgen:
-            lineage = taxid.lineage_name(ct[1], args.taxadb)
-            family = lineage[2]
-            if family not in allHMM[currCluster]["families"].keys():
-                allHMM[currCluster]["families"][family] = 1
+            print("plop %s"%ct[1])
+            lineage = taxid.lineage_name(ct[1], args.taxadb, reverse=True)
+            #managing LCA
+            if len(LCAs) == 0:
+                LCAs = lineage
+                print("LCA: %s"% (str(LCAs)))
+            else:
+                print("LCA: %s"% (str(LCAs)))
+                for index,taxon in enumerate(LCAs):
+                    if lineage[index] != taxon:
+                        LCAs = LCAs[:index]
+                        break
+            family = lineage[-3]
+            if family not in taxInfo["families"].keys():
+                taxInfo["families"][family] = 1
             else:
-                allHMM[currCluster]["families"][family] += 1
-        
-print("Done !")
+                taxInfo["families"][family] += 1
+    
+    taxInfo["LCA"] = LCAs[-1]
+    return taxInfo
+
+
+def writeAnnotations(annotationPath,clusterInfo,taxInfo,annotations):
+    annotFile = open(annotationPath,"w")
+    annotFile.write("LENGTH\t" + str(clusterInfo["length"])+"\n")
+    annotFile.write("FAMILIES\t" + str(taxInfo["families"]) + "\n")
+    annotFile.write("LCA\t%s\n"%(taxInfo["LCA"]))
+    annotFile.write("FASTA SEQUENCE TITLES:\n")
+    for currST in annotations:
+        annotFile.write(currST)
+
+
+
+
 
-print("\nWriting files")
+parser = argparse.ArgumentParser()
+parser.add_argument("hmmDir", help="hmm dir")
+parser.add_argument("msaDir", help="directory containing .msa.faa")
+parser.add_argument("taxadb", help="taxadb file")
+parser.add_argument("annotationDir", help="directory where annotation files have to be written")
 
-#on output tout par cluster
-for currCluster in allHMM.keys():
-    currAnnot = open("annotations/" + str(currCluster)+ "_annotations.txt","w")
-    currAnnot.write("LENGTH\t" + str(allHMM[currCluster]["length"])+"\n")
-    currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n")
-    currAnnot.write("FASTA SEQUENCE TITLES:\n")
-    for currST in allHMM[currCluster]["annotations"]:
-        currAnnot.write(currST)
+args = parser.parse_args()
+
+clusterList = glob.glob(args.hmmDir+"/*.hmm")
+for currCluster in clusterList:
+    familyName = os.path.basename(currCluster)[:-4]
+    print(familyName)
+    clusterInfo = getClusterInfo("%s/%s.hmm"%(args.hmmDir,familyName))
+    annotations = extractSeqNames("%s/%s.msa.faa"%(args.msaDir,familyName))
+    taxInfo = getTaxInfo(annotations)
+    writeAnnotations("%s/%s.txt"%(args.annotationDir,familyName))
+
+    
diff --git a/build/nucl2ProtGolden.py b/build/nucl2ProtGolden.py
index 0be6ae5..a7f18c7 100644
--- a/build/nucl2ProtGolden.py
+++ b/build/nucl2ProtGolden.py
@@ -2,25 +2,36 @@
 
 
 import sys,subprocess,argparse
+import Golden
 from io import StringIO
 
 parser = argparse.ArgumentParser()
 parser.add_argument("list", help="file containing list of nucleic accession numbers")
 parser.add_argument("dbName", help="prefix for output files")
-parser.add_argument("p", help="proceed p percent of the file (from 0 to p percent)")
+parser.add_argument("--partial","-p", help="proceed p percent of the file (from p-1 to p percent)")
 args = parser.parse_args()
 
-p = args.p
+p = args.partial
 
 ntf = open(args.list)
 dbName = args.dbName
-of = open(dbName+"_"+ p +"_prot.fasta","w")
-nucAccI = open(dbName+"_"+ p +"_included.txt","w")
-nucAccX = open(dbName+"_"+ p +"_excluded.txt","w")
+
 
 allList = ntf.readlines()
-accList = allList[round(len(allList)*(int(p)-1)/100.0):round(len(allList)*int(p)/100.0)]
+partialString = ""
+
+if p:
+    print("Will proceed from %s to %s percent" % (str(int(p)-1),str(int(p))))
+    lowerBound = int(round(len(allList)*(int(p)-1)/10.0))
+    higherBound = int(round(len(allList)*int(p)/10.0))
+    accList = allList[lowerBound:higherBound]
+    partialString = "_" + p
+else:
+    accList = allList    
 
+of = open(dbName+ partialString +"_prot.fasta","w")
+nucAccI = open(dbName+ partialString +"_included.txt","w")
+nucAccX = open(dbName+ partialString +"_excluded.txt","w")
 
 ntf.close()
 
@@ -59,7 +70,7 @@ def extractProducts(geneEntry):
                 continue
             else:
                 if line.strip().startswith("ORGANISM"):
-                    organism = line.split()[1]
+                    organism = " ".join(line.split()[1:])
                 continue
         if inFeatures:
             if not line.startswith("     "):
@@ -75,8 +86,8 @@ cpt = 0
 for acc in accList:
     acc = acc.strip().split(".")[0]
     cpt+=1
-    goldenResult = subprocess.check_output(['golden',"-a",("refseq:" if "_" in acc else "genbank:")+acc])
-    if len(goldenResult) != 0:
+    goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc)
+    if goldenResult:
         nucAccI.write(acc + "\n")
         prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
         for name,seq in prots.items():
diff --git a/build/serialGolden.sh b/build/serialGolden.sh
index 9def896..11dd697 100644
--- a/build/serialGolden.sh
+++ b/build/serialGolden.sh
@@ -1,5 +1,5 @@
 #! /bin/sh
-#SBATCH --qos=fast
+#SBATCH --qos=hubbioit --partition=dedicated
 echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
-srun python3 /pasteur/homes/tbigot/bank/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/accession_rVDB.txt rVDB_prot2 ${SLURM_ARRAY_TASK_ID}
+srun python2 /pasteur/homes/tbigot/dev/meloit_6333/viroumenet/build/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/rVDB/current/list_acc_nucl.txt rVDB_prot -p ${SLURM_ARRAY_TASK_ID}
 exit 0
diff --git a/decodeResults/hmmerAnnotationDecode.py b/decodeResults/hmmerAnnotationDecode.py
index 678f977..6620a34 100644
--- a/decodeResults/hmmerAnnotationDecode.py
+++ b/decodeResults/hmmerAnnotationDecode.py
@@ -50,8 +50,20 @@ def readHmmer(pathIn,pathOut,vfamDir,verbose=False):
                     continue
                 subjectData = (line.strip().split())
                 #  #    score  bias  c-Evalue  i-Evalue hmmfrom  hmm to    alifrom  ali to    envfrom  env to     acc
-                newLine = currSubject+ "," + subjectData[5]+ "," + subjectData[6]+ "," + subjectData[7]+ "," + subjectData[9]+ "," + subjectData[10]+ ","  + currClusterInfo["id"] + ","  + str(currClusterInfo["length"])+ ","  + str(currClusterInfo["families"]).replace(","," ") + "," + str(currClusterInfo["functions"]).replace(","," ") + "," + str(currClusterInfo["nbSeq"]) + "\n"
-                out.write(newLine)
+                newLine = [ 
+                    currSubject,            #subject
+                    subjectData[5],         #evalue
+                    subjectData[6],         #hmfrom
+                    subjectData[7],         #hmmto
+                    subjectData[9],         #alifrom
+                    subjectData[10],        #alito
+                    currClusterInfo["id"],  #clusterID
+                    str(currClusterInfo["length"]),         #clusterLength
+                    str(currClusterInfo["families"]).replace(","," "),  #families
+                    str(currClusterInfo["functions"]).replace(","," "), #functions
+                    str(currClusterInfo["nbSeq"])               #clusterNbSeq
+                ]
+                out.write(",".join(newLine)+"\n")
                 if(verbose):
                     print(newLine)
 
-- 
GitLab