Skip to content
Snippets Groups Projects
Commit 58ea43bf authored by Thomas  BIGOT's avatar Thomas BIGOT
Browse files

updates

parent f489a96b
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import glob,re,argparse import glob,re,argparse,os
from taxadb.schema import * from taxadb.schema import *
from taxadb import accession from taxadb import accession
from taxadb import taxid from taxadb import taxid
def getClusterInfo(HMMpath):
    """Read the NAME/LENG metadata of the (single) profile stored in *HMMpath*.

    Returns a dict ``{"name": FAMxxx, "length": model_length}``; returns an
    empty dict when the file contains no ``HMM`` table header.
    """
    clusterInfo = {}
    currID = None
    currLength = 0
    with open(HMMpath) as HMMfile:
        for line in HMMfile:
            if line.startswith("LENG"):
                currLength = int(re.search(r'LENG\s+([0-9]+)', line).group(1))
            if line.startswith("NAME"):
                currID = re.search(r'NAME.*(FAM[0-9]+).hmm', line).group(1)
            # BUG FIX: startswith("HMM") also matches the "HMMER3/f" first
            # line, which made the loop break before NAME/LENG were read (and
            # left the "//" branch unreachable).  Match the "HMM " column
            # header instead: NAME and LENG both precede it in HMMER3 files,
            # so the metadata is complete here and the bulk probability table
            # is never scanned.
            if line.startswith("HMM "):
                clusterInfo = {"name": currID, "length": currLength}
                break
    return clusterInfo
print("Done !")
print("\nReading each alignement file")
#On ouvre le fichier d alignements
# pour chaque alignement, on liste les espèces et les «fonctions»
def extractSeqNames(MSApath):
    """Return the FASTA title lines of *MSApath*.

    A title line is any line starting with ">"; lines are returned verbatim,
    trailing newline included.
    """
    # `with` guarantees the handle is closed even if reading raises,
    # unlike the previous explicit open()/close() pair.
    with open(MSApath) as currMSA:
        return [line for line in currMSA if line.startswith(">")]
print("\nReading acc number from gathered data") def getTaxInfo(annotations):
#get acc taxInfo = {}
for currCluster in allHMM.keys():
accs = [] accs = []
for ca in allHMM[currCluster]["annotations"]: for ca in annotations:
accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0]) accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0])
allHMM[currCluster]["taxid"] = [] taxInfo["taxid"] = []
currAcc = 0 currAcc = 0
# proceeding by lots, to avoid `too many SQL variables` issue # proceeding by lots, to avoid `too many SQL variables` issue
while currAcc < len(accs): while currAcc < len(accs):
allHMM[currCluster]["taxid"].append(accession.taxid(accs[currAcc:(min(currAcc+50,len(accs)-1))], args.taxadb, Prot)) currAccs=accs[currAcc:(min(currAcc+249,len(accs)))]
currAcc += 50 print("currAccs = %s"%str(currAccs))
taxInfo["taxid"].append(accession.taxid(currAcc, args.taxadb, Prot))
currAcc += 249
# from taxid family # from taxid family
allHMM[currCluster]["families"] = {} taxInfo["families"] = {}
LCAs = []
# deloting # deloting
for ctgen in allHMM[currCluster]["taxid"]: for ctgen in taxInfo["taxid"]:
for ct in ctgen: for ct in ctgen:
lineage = taxid.lineage_name(ct[1], args.taxadb) print("plop %s"%ct[1])
family = lineage[2] lineage = taxid.lineage_name(ct[1], args.taxadb, reverse=True)
if family not in allHMM[currCluster]["families"].keys(): #managing LCA
allHMM[currCluster]["families"][family] = 1 if len(LCAs) == 0:
LCAs = lineage
print("LCA: %s"% (str(LCAs)))
else:
print("LCA: %s"% (str(LCAs)))
for index,taxon in enumerate(LCAs):
if lineage[index] != taxon:
LCAs = LCAs[:index]
break
family = lineage[-3]
if family not in taxInfo["families"].keys():
taxInfo["families"][family] = 1
else: else:
allHMM[currCluster]["families"][family] += 1 taxInfo["families"][family] += 1
taxInfo["LCA"] = LCAs[-1]
return taxInfo
def writeAnnotations(annotationPath, clusterInfo, taxInfo, annotations):
    """Write one cluster's annotation summary file to *annotationPath*.

    The file lists the model length, the per-family sequence counts, the
    lowest common ancestor, then every FASTA title line of the cluster.
    """
    # BUG FIX: the file handle was never closed, so the last buffered writes
    # could be lost; `with` closes (and flushes) it deterministically.
    with open(annotationPath, "w") as annotFile:
        annotFile.write("LENGTH\t" + str(clusterInfo["length"]) + "\n")
        annotFile.write("FAMILIES\t" + str(taxInfo["families"]) + "\n")
        annotFile.write("LCA\t%s\n" % (taxInfo["LCA"]))
        annotFile.write("FASTA SEQUENCE TITLES:\n")
        annotFile.writelines(annotations)
parser = argparse.ArgumentParser()
parser.add_argument("hmmDir", help="hmm dir")
parser.add_argument("msaDir", help="directory containing .msa.faa")
parser.add_argument("taxadb", help="taxadb file")
parser.add_argument("annotationDir", help="directory where annotation files have to be written")
args = parser.parse_args()

# one annotation file per FAMxxx.hmm profile found in hmmDir
clusterList = glob.glob(args.hmmDir + "/*.hmm")
for currCluster in clusterList:
    # family name = file name without its ".hmm" extension
    familyName = os.path.basename(currCluster)[:-4]
    print(familyName)
    clusterInfo = getClusterInfo("%s/%s.hmm" % (args.hmmDir, familyName))
    annotations = extractSeqNames("%s/%s.msa.faa" % (args.msaDir, familyName))
    taxInfo = getTaxInfo(annotations)
    # BUG FIX: writeAnnotations takes four arguments; it was called with only
    # the output path, raising TypeError on the very first cluster.
    writeAnnotations("%s/%s.txt" % (args.annotationDir, familyName),
                     clusterInfo, taxInfo, annotations)
print("\nWriting files")
#on output tout par cluster
for currCluster in allHMM.keys():
currAnnot = open("annotations/" + str(currCluster)+ "_annotations.txt","w")
currAnnot.write("LENGTH\t" + str(allHMM[currCluster]["length"])+"\n")
currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n")
currAnnot.write("FASTA SEQUENCE TITLES:\n")
for currST in allHMM[currCluster]["annotations"]:
currAnnot.write(currST)
...@@ -2,25 +2,36 @@ ...@@ -2,25 +2,36 @@
import sys,subprocess,argparse import sys,subprocess,argparse
import Golden
from io import StringIO from io import StringIO
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("list", help="file containing list of nucleic accession numbers") parser.add_argument("list", help="file containing list of nucleic accession numbers")
parser.add_argument("dbName", help="prefix for output files") parser.add_argument("dbName", help="prefix for output files")
parser.add_argument("p", help="proceed p percent of the file (from 0 to p percent)") parser.add_argument("--partial","-p", help="proceed p percent of the file (from p-1 to p percent)")
args = parser.parse_args() args = parser.parse_args()
p = args.p p = args.partial
ntf = open(args.list) ntf = open(args.list)
dbName = args.dbName dbName = args.dbName
of = open(dbName+"_"+ p +"_prot.fasta","w")
nucAccI = open(dbName+"_"+ p +"_included.txt","w")
nucAccX = open(dbName+"_"+ p +"_excluded.txt","w")
allList = ntf.readlines() allList = ntf.readlines()
# Optionally restrict processing to one chunk of the accession list; the
# chunk index also tags the output file names so parallel jobs don't collide.
partialString = ""
if p:
    chunk = int(p)
    # NOTE(review): the bounds divide by 10.0 (tenths of the list) while this
    # message and the --partial help text say "percent" (the previous code
    # divided by 100.0) — confirm which is intended.
    print("Will proceed from %s to %s percent" % (str(chunk - 1), str(chunk)))
    lowerBound = int(round(len(allList) * (chunk - 1) / 10.0))
    higherBound = int(round(len(allList) * chunk / 10.0))
    accList = allList[lowerBound:higherBound]
    partialString = "_" + p
else:
    accList = allList

of = open(dbName + partialString + "_prot.fasta", "w")
nucAccI = open(dbName + partialString + "_included.txt", "w")
nucAccX = open(dbName + partialString + "_excluded.txt", "w")
ntf.close() ntf.close()
...@@ -59,7 +70,7 @@ def extractProducts(geneEntry): ...@@ -59,7 +70,7 @@ def extractProducts(geneEntry):
continue continue
else: else:
if line.strip().startswith("ORGANISM"): if line.strip().startswith("ORGANISM"):
organism = line.split()[1] organism = " ".join(line.split()[1:])
continue continue
if inFeatures: if inFeatures:
if not line.startswith(" "): if not line.startswith(" "):
...@@ -75,8 +86,8 @@ cpt = 0 ...@@ -75,8 +86,8 @@ cpt = 0
for acc in accList: for acc in accList:
acc = acc.strip().split(".")[0] acc = acc.strip().split(".")[0]
cpt+=1 cpt+=1
goldenResult = subprocess.check_output(['golden',"-a",("refseq:" if "_" in acc else "genbank:")+acc]) goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc)
if len(goldenResult) != 0: if goldenResult:
nucAccI.write(acc + "\n") nucAccI.write(acc + "\n")
prots = extractProducts(StringIO(goldenResult.decode("utf-8"))) prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
for name,seq in prots.items(): for name,seq in prots.items():
......
#! /bin/sh #! /bin/sh
#SBATCH --qos=fast #SBATCH --qos=hubbioit --partition=dedicated
echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
srun python3 /pasteur/homes/tbigot/bank/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/accession_rVDB.txt rVDB_prot2 ${SLURM_ARRAY_TASK_ID} srun python2 /pasteur/homes/tbigot/dev/meloit_6333/viroumenet/build/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/rVDB/current/list_acc_nucl.txt rVDB_prot -p ${SLURM_ARRAY_TASK_ID}
exit 0 exit 0
...@@ -50,8 +50,20 @@ def readHmmer(pathIn,pathOut,vfamDir,verbose=False): ...@@ -50,8 +50,20 @@ def readHmmer(pathIn,pathOut,vfamDir,verbose=False):
continue continue
subjectData = (line.strip().split()) subjectData = (line.strip().split())
# # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc # # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc
newLine = currSubject+ "," + subjectData[5]+ "," + subjectData[6]+ "," + subjectData[7]+ "," + subjectData[9]+ "," + subjectData[10]+ "," + currClusterInfo["id"] + "," + str(currClusterInfo["length"])+ "," + str(currClusterInfo["families"]).replace(","," ") + "," + str(currClusterInfo["functions"]).replace(","," ") + "," + str(currClusterInfo["nbSeq"]) + "\n" newLine = [
out.write(newLine) currSubject, #subject
subjectData[5], #evalue
subjectData[6], #hmfrom
subjectData[7], #hmmto
subjectData[9], #alifrom
subjectData[10], #alito
currClusterInfo["id"], #clusterID
str(currClusterInfo["length"]), #clusterLength
str(currClusterInfo["families"]).replace(","," "), #families
str(currClusterInfo["functions"]).replace(","," "), #functions
str(currClusterInfo["nbSeq"]) #clusterNbSeq
]
out.write(",".join(newLine)+"\n")
if(verbose): if(verbose):
print(newLine) print(newLine)
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment