From 58ea43bf9e84d7139c58615741426722736fc7fb Mon Sep 17 00:00:00 2001
From: Thomas Bigot <thomas.bigot@pasteur.fr>
Date: Tue, 16 May 2017 16:46:29 +0200
Subject: [PATCH] updates
---
annotate/makeAnotations.py | 148 ++++++++++++++-----------
build/nucl2ProtGolden.py | 29 +++--
build/serialGolden.sh | 4 +-
decodeResults/hmmerAnnotationDecode.py | 16 ++-
4 files changed, 119 insertions(+), 78 deletions(-)
diff --git a/annotate/makeAnotations.py b/annotate/makeAnotations.py
index 6a9728d..35cef11 100644
--- a/annotate/makeAnotations.py
+++ b/annotate/makeAnotations.py
@@ -1,88 +1,106 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-import glob,re,argparse
+import glob,re,argparse,os
from taxadb.schema import *
from taxadb import accession
from taxadb import taxid
-parser = argparse.ArgumentParser()
-parser.add_argument("hmmDB", help="hmm file")
-parser.add_argument("msaDir", help="directory containing .msa.faa")
-parser.add_argument("taxadb", help="taxadb file")
-
-args = parser.parse_args()
-
+def getClusterInfo(HMMpath):
+ clusterInfo = {}
+ with open(HMMpath) as HMMfile:
+ for line in HMMfile:
+ if line.startswith("LENG"):
+ currLength = int(re.search('LENG\s+([0-9]+)',line).group(1))
+ if line.startswith("NAME"):
+ currID = re.search('NAME.*(FAM[0-9]+).hmm',line).group(1)
+ if line.startswith("//"):
+ clusterInfo = {"name": currID, "length": currLength}
+ if line.startswith("HMM"):
+ break
+ return clusterInfo
-#on ouvre le fichier refseq viral
-allHMMfile = open(args.hmmDB)
-
-# pour chaque hmm, on extrait les infos dans un dico
-print("Reading RefSeqViral.hmm")
-currLength = 0
-allHMM = {}
-currID = 0
-#limit = 10
-for line in allHMMfile:
- if line.startswith("LENG"):
- currLength = int(re.search('LENG\s+([0-9]+)',line).group(1))
- if line.startswith("NAME"):
- currID = re.search('NAME.*(FAM[0-9]+).hmm',line).group(1)
- if line.startswith("//"):
- allHMM[currID] = {"name": currID, "length": currLength}
- #limit -=1
- #if limit == 0:
- #break
-
-print("Done !")
-
-print("\nReading each alignement file")
-#On ouvre le fichier d alignements
-# pour chaque alignement, on liste les espèces et les «fonctions»
-
-for currCluster in allHMM.keys():
- currMSA = open(args.msaDir + "/" + str(currCluster) + ".msa.faa")
- allHMM[currCluster]["annotations"] = []
+def extractSeqNames(MSApath):
+ currMSA = open(MSApath)
+ annotations = []
for line in currMSA:
if line.startswith(">"):
- allHMM[currCluster]["annotations"].append(line)
+ annotations.append(line)
currMSA.close()
-print("Done !")
-
-print("\nReading acc number from gathered data")
-#get acc
-for currCluster in allHMM.keys():
+ return annotations
+
+def getTaxInfo(annotations):
+ taxInfo = {}
accs = []
- for ca in allHMM[currCluster]["annotations"]:
+ for ca in annotations:
accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0])
- allHMM[currCluster]["taxid"] = []
+ taxInfo["taxid"] = []
currAcc = 0
# proceeding by lots, to avoid `too many SQL variables` issue
while currAcc < len(accs):
- allHMM[currCluster]["taxid"].append(accession.taxid(accs[currAcc:(min(currAcc+50,len(accs)-1))], args.taxadb, Prot))
- currAcc += 50
+ currAccs=accs[currAcc:(min(currAcc+249,len(accs)))]
+ print("currAccs = %s"%str(currAccs))
+ taxInfo["taxid"].append(accession.taxid(currAcc, args.taxadb, Prot))
+ currAcc += 249
# from taxid family
- allHMM[currCluster]["families"] = {}
+ taxInfo["families"] = {}
+
+ LCAs = []
+
# deloting
- for ctgen in allHMM[currCluster]["taxid"]:
+ for ctgen in taxInfo["taxid"]:
for ct in ctgen:
- lineage = taxid.lineage_name(ct[1], args.taxadb)
- family = lineage[2]
- if family not in allHMM[currCluster]["families"].keys():
- allHMM[currCluster]["families"][family] = 1
+ print("plop %s"%ct[1])
+ lineage = taxid.lineage_name(ct[1], args.taxadb, reverse=True)
+ #managing LCA
+ if len(LCAs) == 0:
+ LCAs = lineage
+ print("LCA: %s"% (str(LCAs)))
+ else:
+ print("LCA: %s"% (str(LCAs)))
+ for index,taxon in enumerate(LCAs):
+ if lineage[index] != taxon:
+ LCAs = LCAs[:index]
+ break
+ family = lineage[-3]
+ if family not in taxInfo["families"].keys():
+ taxInfo["families"][family] = 1
else:
- allHMM[currCluster]["families"][family] += 1
-
-print("Done !")
+ taxInfo["families"][family] += 1
+
+ taxInfo["LCA"] = LCAs[-1]
+ return taxInfo
+
+
+def writeAnnotations(annotationPath,clusterInfo,taxInfo,annotations):
+ annotFile = open(annotationPath,"w")
+ annotFile.write("LENGTH\t" + str(clusterInfo["length"])+"\n")
+ annotFile.write("FAMILIES\t" + str(taxInfo["families"]) + "\n")
+ annotFile.write("LCA\t%s\n"%(taxInfo["LCA"]))
+ annotFile.write("FASTA SEQUENCE TITLES:\n")
+ for currST in annotations:
+ annotFile.write(currST)
+
+
+
+
-print("\nWriting files")
+parser = argparse.ArgumentParser()
+parser.add_argument("hmmDir", help="hmm dir")
+parser.add_argument("msaDir", help="directory containing .msa.faa")
+parser.add_argument("taxadb", help="taxadb file")
+parser.add_argument("annotationDir", help="directory where annotation files have to be written")
-#on output tout par cluster
-for currCluster in allHMM.keys():
- currAnnot = open("annotations/" + str(currCluster)+ "_annotations.txt","w")
- currAnnot.write("LENGTH\t" + str(allHMM[currCluster]["length"])+"\n")
- currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n")
- currAnnot.write("FASTA SEQUENCE TITLES:\n")
- for currST in allHMM[currCluster]["annotations"]:
- currAnnot.write(currST)
+args = parser.parse_args()
+
+clusterList = glob.glob(args.hmmDir+"/*.hmm")
+for currCluster in clusterList:
+ familyName = os.path.basename(currCluster)[:-4]
+ print(familyName)
+ clusterInfo = getClusterInfo("%s/%s.hmm"%(args.hmmDir,familyName))
+ annotations = extractSeqNames("%s/%s.msa.faa"%(args.msaDir,familyName))
+ taxInfo = getTaxInfo(annotations)
+ writeAnnotations("%s/%s.txt"%(args.annotationDir,familyName))
+
+
diff --git a/build/nucl2ProtGolden.py b/build/nucl2ProtGolden.py
index 0be6ae5..a7f18c7 100644
--- a/build/nucl2ProtGolden.py
+++ b/build/nucl2ProtGolden.py
@@ -2,25 +2,36 @@
import sys,subprocess,argparse
+import Golden
from io import StringIO
parser = argparse.ArgumentParser()
parser.add_argument("list", help="file containing list of nucleic accession numbers")
parser.add_argument("dbName", help="prefix for output files")
-parser.add_argument("p", help="proceed p percent of the file (from 0 to p percent)")
+parser.add_argument("--partial","-p", help="proceed p percent of the file (from p-1 to p percent)")
args = parser.parse_args()
-p = args.p
+p = args.partial
ntf = open(args.list)
dbName = args.dbName
-of = open(dbName+"_"+ p +"_prot.fasta","w")
-nucAccI = open(dbName+"_"+ p +"_included.txt","w")
-nucAccX = open(dbName+"_"+ p +"_excluded.txt","w")
+
allList = ntf.readlines()
-accList = allList[round(len(allList)*(int(p)-1)/100.0):round(len(allList)*int(p)/100.0)]
+partialString = ""
+
+if p:
+ print("Will proceed from %s to %s percent" % (str(int(p)-1),str(int(p))))
+ lowerBound = int(round(len(allList)*(int(p)-1)/10.0))
+ higherBound = int(round(len(allList)*int(p)/10.0))
+ accList = allList[lowerBound:higherBound]
+ partialString = "_" + p
+else:
+ accList = allList
+of = open(dbName+ partialString +"_prot.fasta","w")
+nucAccI = open(dbName+ partialString +"_included.txt","w")
+nucAccX = open(dbName+ partialString +"_excluded.txt","w")
ntf.close()
@@ -59,7 +70,7 @@ def extractProducts(geneEntry):
continue
else:
if line.strip().startswith("ORGANISM"):
- organism = line.split()[1]
+ organism = " ".join(line.split()[1:])
continue
if inFeatures:
if not line.startswith(" "):
@@ -75,8 +86,8 @@ cpt = 0
for acc in accList:
acc = acc.strip().split(".")[0]
cpt+=1
- goldenResult = subprocess.check_output(['golden',"-a",("refseq:" if "_" in acc else "genbank:")+acc])
- if len(goldenResult) != 0:
+ goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc)
+ if goldenResult:
nucAccI.write(acc + "\n")
prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
for name,seq in prots.items():
diff --git a/build/serialGolden.sh b/build/serialGolden.sh
index 9def896..11dd697 100644
--- a/build/serialGolden.sh
+++ b/build/serialGolden.sh
@@ -1,5 +1,5 @@
#! /bin/sh
-#SBATCH --qos=fast
+#SBATCH --qos=hubbioit --partition=dedicated
echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
-srun python3 /pasteur/homes/tbigot/bank/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/accession_rVDB.txt rVDB_prot2 ${SLURM_ARRAY_TASK_ID}
+srun python2 /pasteur/homes/tbigot/dev/meloit_6333/viroumenet/build/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/rVDB/current/list_acc_nucl.txt rVDB_prot -p ${SLURM_ARRAY_TASK_ID}
exit 0
diff --git a/decodeResults/hmmerAnnotationDecode.py b/decodeResults/hmmerAnnotationDecode.py
index 678f977..6620a34 100644
--- a/decodeResults/hmmerAnnotationDecode.py
+++ b/decodeResults/hmmerAnnotationDecode.py
@@ -50,8 +50,20 @@ def readHmmer(pathIn,pathOut,vfamDir,verbose=False):
continue
subjectData = (line.strip().split())
# # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc
- newLine = currSubject+ "," + subjectData[5]+ "," + subjectData[6]+ "," + subjectData[7]+ "," + subjectData[9]+ "," + subjectData[10]+ "," + currClusterInfo["id"] + "," + str(currClusterInfo["length"])+ "," + str(currClusterInfo["families"]).replace(","," ") + "," + str(currClusterInfo["functions"]).replace(","," ") + "," + str(currClusterInfo["nbSeq"]) + "\n"
- out.write(newLine)
+ newLine = [
+ currSubject, #subject
+ subjectData[5], #evalue
+ subjectData[6], #hmfrom
+ subjectData[7], #hmmto
+ subjectData[9], #alifrom
+ subjectData[10], #alito
+ currClusterInfo["id"], #clusterID
+ str(currClusterInfo["length"]), #clusterLength
+ str(currClusterInfo["families"]).replace(","," "), #families
+ str(currClusterInfo["functions"]).replace(","," "), #functions
+ str(currClusterInfo["nbSeq"]) #clusterNbSeq
+ ]
+ out.write(",".join(newLine)+"\n")
if(verbose):
print(newLine)
--
GitLab