diff --git a/annotate/makeAnotations.py b/annotate/makeAnotations.py index 6a9728d74b870a149e206feaf1c6411b49c95c74..35cef11f458a31bc260039294d9f5f10312f7eaf 100644 --- a/annotate/makeAnotations.py +++ b/annotate/makeAnotations.py @@ -1,88 +1,106 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import glob,re,argparse +import glob,re,argparse,os from taxadb.schema import * from taxadb import accession from taxadb import taxid -parser = argparse.ArgumentParser() -parser.add_argument("hmmDB", help="hmm file") -parser.add_argument("msaDir", help="directory containing .msa.faa") -parser.add_argument("taxadb", help="taxadb file") - -args = parser.parse_args() - +def getClusterInfo(HMMpath): + clusterInfo = {} + with open(HMMpath) as HMMfile: + for line in HMMfile: + if line.startswith("LENG"): + currLength = int(re.search('LENG\s+([0-9]+)',line).group(1)) + if line.startswith("NAME"): + currID = re.search('NAME.*(FAM[0-9]+).hmm',line).group(1) + if line.startswith("//"): + clusterInfo = {"name": currID, "length": currLength} + if line.startswith("HMM"): + break + return clusterInfo -#on ouvre le fichier refseq viral -allHMMfile = open(args.hmmDB) - -# pour chaque hmm, on extrait les infos dans un dico -print("Reading RefSeqViral.hmm") -currLength = 0 -allHMM = {} -currID = 0 -#limit = 10 -for line in allHMMfile: - if line.startswith("LENG"): - currLength = int(re.search('LENG\s+([0-9]+)',line).group(1)) - if line.startswith("NAME"): - currID = re.search('NAME.*(FAM[0-9]+).hmm',line).group(1) - if line.startswith("//"): - allHMM[currID] = {"name": currID, "length": currLength} - #limit -=1 - #if limit == 0: - #break - -print("Done !") - -print("\nReading each alignement file") -#On ouvre le fichier d alignements -# pour chaque alignement, on liste les espèces et les «fonctions» - -for currCluster in allHMM.keys(): - currMSA = open(args.msaDir + "/" + str(currCluster) + ".msa.faa") - allHMM[currCluster]["annotations"] = [] +def extractSeqNames(MSApath): + currMSA = open(MSApath) + annotations = [] for line in currMSA: if line.startswith(">"): - allHMM[currCluster]["annotations"].append(line) + annotations.append(line) currMSA.close() -print("Done !") - -print("\nReading acc number from gathered data") -#get acc -for currCluster in allHMM.keys(): + return annotations + +def getTaxInfo(annotations): + taxInfo = {} accs = [] - for ca in allHMM[currCluster]["annotations"]: + for ca in annotations: accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0]) - allHMM[currCluster]["taxid"] = [] + taxInfo["taxid"] = [] currAcc = 0 # proceeding by lots, to avoid `too many SQL variables` issue while currAcc < len(accs): - allHMM[currCluster]["taxid"].append(accession.taxid(accs[currAcc:(min(currAcc+50,len(accs)-1))], args.taxadb, Prot)) - currAcc += 50 + currAccs=accs[currAcc:(min(currAcc+249,len(accs)))] + print("currAccs = %s"%str(currAccs)) + taxInfo["taxid"].append(accession.taxid(currAcc, args.taxadb, Prot)) + currAcc += 249 # from taxid family - allHMM[currCluster]["families"] = {} + taxInfo["families"] = {} + + LCAs = [] + # deloting - for ctgen in allHMM[currCluster]["taxid"]: + for ctgen in taxInfo["taxid"]: for ct in ctgen: - lineage = taxid.lineage_name(ct[1], args.taxadb) - family = lineage[2] - if family not in allHMM[currCluster]["families"].keys(): - allHMM[currCluster]["families"][family] = 1 + print("plop %s"%ct[1]) + lineage = taxid.lineage_name(ct[1], args.taxadb, reverse=True) + #managing LCA + if len(LCAs) == 0: + LCAs = lineage + print("LCA: %s"% (str(LCAs))) + else: + print("LCA: %s"% (str(LCAs))) + for index,taxon in enumerate(LCAs): + if lineage[index] != taxon: + LCAs = LCAs[:index] + break + family = lineage[-3] + if family not in taxInfo["families"].keys(): + taxInfo["families"][family] = 1 else: - allHMM[currCluster]["families"][family] += 1 - -print("Done !") + taxInfo["families"][family] += 1 + + taxInfo["LCA"] = LCAs[-1] + return taxInfo + + +def writeAnnotations(annotationPath,clusterInfo,taxInfo,annotations): + annotFile = open(annotationPath,"w") + annotFile.write("LENGTH\t" + str(clusterInfo["length"])+"\n") + annotFile.write("FAMILIES\t" + str(taxInfo["families"]) + "\n") + annotFile.write("LCA\t%s\n"%(taxInfo["LCA"])) + annotFile.write("FASTA SEQUENCE TITLES:\n") + for currST in annotations: + annotFile.write(currST) + + + + -print("\nWriting files") +parser = argparse.ArgumentParser() +parser.add_argument("hmmDir", help="hmm dir") +parser.add_argument("msaDir", help="directory containing .msa.faa") +parser.add_argument("taxadb", help="taxadb file") +parser.add_argument("annotationDir", help="directory where annotation files have to be written") -#on output tout par cluster -for currCluster in allHMM.keys(): - currAnnot = open("annotations/" + str(currCluster)+ "_annotations.txt","w") - currAnnot.write("LENGTH\t" + str(allHMM[currCluster]["length"])+"\n") - currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n") - currAnnot.write("FASTA SEQUENCE TITLES:\n") - for currST in allHMM[currCluster]["annotations"]: - currAnnot.write(currST) +args = parser.parse_args() + +clusterList = glob.glob(args.hmmDir+"/*.hmm") +for currCluster in clusterList: + familyName = os.path.basename(currCluster)[:-4] + print(familyName) + clusterInfo = getClusterInfo("%s/%s.hmm"%(args.hmmDir,familyName)) + annotations = extractSeqNames("%s/%s.msa.faa"%(args.msaDir,familyName)) + taxInfo = getTaxInfo(annotations) + writeAnnotations("%s/%s.txt"%(args.annotationDir,familyName)) + + diff --git a/build/nucl2ProtGolden.py b/build/nucl2ProtGolden.py index 0be6ae57a335209cbeabc790fab2377210d65493..a7f18c7ce658bddb240b41650f6421f7b2a63abc 100644 --- a/build/nucl2ProtGolden.py +++ b/build/nucl2ProtGolden.py @@ -2,25 +2,36 @@ import sys,subprocess,argparse +import Golden from io import StringIO parser = argparse.ArgumentParser() parser.add_argument("list", help="file containing list of nucleic accession numbers") parser.add_argument("dbName", help="prefix for output files") -parser.add_argument("p", help="proceed p percent of the file (from 0 to p percent)") +parser.add_argument("--partial","-p", help="proceed p percent of the file (from p-1 to p percent)") args = parser.parse_args() -p = args.p +p = args.partial ntf = open(args.list) dbName = args.dbName -of = open(dbName+"_"+ p +"_prot.fasta","w") -nucAccI = open(dbName+"_"+ p +"_included.txt","w") -nucAccX = open(dbName+"_"+ p +"_excluded.txt","w") + allList = ntf.readlines() -accList = allList[round(len(allList)*(int(p)-1)/100.0):round(len(allList)*int(p)/100.0)] +partialString = "" + +if p: + print("Will proceed from %s to %s percent" % (str(int(p)-1),str(int(p)))) + lowerBound = int(round(len(allList)*(int(p)-1)/10.0)) + higherBound = int(round(len(allList)*int(p)/10.0)) + accList = allList[lowerBound:higherBound] + partialString = "_" + p +else: + accList = allList +of = open(dbName+ partialString +"_prot.fasta","w") +nucAccI = open(dbName+ partialString +"_included.txt","w") +nucAccX = open(dbName+ partialString +"_excluded.txt","w") ntf.close() @@ -59,7 +70,7 @@ def extractProducts(geneEntry): continue else: if line.strip().startswith("ORGANISM"): - organism = line.split()[1] + organism = " ".join(line.split()[1:]) continue if inFeatures: if not line.startswith(" "): @@ -75,8 +86,8 @@ cpt = 0 for acc in accList: acc = acc.strip().split(".")[0] cpt+=1 - goldenResult = subprocess.check_output(['golden',"-a",("refseq:" if "_" in acc else "genbank:")+acc]) - if len(goldenResult) != 0: + goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc) + if goldenResult: nucAccI.write(acc + "\n") prots = extractProducts(StringIO(goldenResult.decode("utf-8"))) for name,seq in prots.items(): diff --git a/build/serialGolden.sh b/build/serialGolden.sh index 9def896ac7042c5959af220f6e023df727bae150..11dd697c70b63a1da16edee02c70695ffb2ec97b 100644 --- a/build/serialGolden.sh +++ b/build/serialGolden.sh @@ -1,5 +1,5 @@ #! /bin/sh -#SBATCH --qos=fast +#SBATCH --qos=hubbioit --partition=dedicated echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt -srun python3 /pasteur/homes/tbigot/bank/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/accession_rVDB.txt rVDB_prot2 ${SLURM_ARRAY_TASK_ID} +srun python2 /pasteur/homes/tbigot/dev/meloit_6333/viroumenet/build/nucl2ProtGolden.py /pasteur/homes/tbigot/bank/rVDB/current/list_acc_nucl.txt rVDB_prot -p ${SLURM_ARRAY_TASK_ID} exit 0 diff --git a/decodeResults/hmmerAnnotationDecode.py b/decodeResults/hmmerAnnotationDecode.py index 678f97795ac99680cd4c6b453ce65514f12b60e5..6620a34f8ac5084072a0febf00e22f686a8b12fb 100644 --- a/decodeResults/hmmerAnnotationDecode.py +++ b/decodeResults/hmmerAnnotationDecode.py @@ -50,8 +50,20 @@ def readHmmer(pathIn,pathOut,vfamDir,verbose=False): continue subjectData = (line.strip().split()) # # score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc - newLine = currSubject+ "," + subjectData[5]+ "," + subjectData[6]+ "," + subjectData[7]+ "," + subjectData[9]+ "," + subjectData[10]+ "," + currClusterInfo["id"] + "," + str(currClusterInfo["length"])+ "," + str(currClusterInfo["families"]).replace(","," ") + "," + str(currClusterInfo["functions"]).replace(","," ") + "," + str(currClusterInfo["nbSeq"]) + "\n" - out.write(newLine) + newLine = [ + currSubject, #subject + subjectData[5], #evalue + subjectData[6], #hmfrom + subjectData[7], #hmmto + subjectData[9], #alifrom + subjectData[10], #alito + currClusterInfo["id"], #clusterID + str(currClusterInfo["length"]), #clusterLength + str(currClusterInfo["families"]).replace(","," "), #families + str(currClusterInfo["functions"]).replace(","," "), #functions + str(currClusterInfo["nbSeq"]) #clusterNbSeq + ] + out.write(",".join(newLine)+"\n") if(verbose): print(newLine)