diff --git a/annotate/annotateHmmerOutput.py b/annotate/annotateHmmerOutput.py new file mode 100644 index 0000000000000000000000000000000000000000..1ddc3215e9eb0110b91acf0f64f64df7547c1e27 --- /dev/null +++ b/annotate/annotateHmmerOutput.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import glob,re,argparse,os,sqlite3 + +from taxadb.accessionid import AccessionID +from taxadb.taxid import TaxID + +from collections import defaultdict + +def getFamAnnot(famID,annotC,tdb_taxid): + c = annotC.cursor() + c.execute("SELECT LCAtaxid FROM family WHERE id=?",(famID,)) + data = c.fetchone() + lcaTaxid = data[0] + lineage = tdb_taxid.lineage_name(lcaTaxid) + if not lineage: + print("notLineage " + str(famID)) + lineage = ["Unknown"] + + c.execute("SELECT str,freq FROM fam_kw JOIN keyword ON fam_kw.kwId = keyword.id WHERE famID=? order by freq desc",(famID,)) + keywords = c.fetchall() + return(lineage,keywords) + + + +parser = argparse.ArgumentParser() +parser.add_argument("hmmTabular", help="tabular output of a HMMER run") +parser.add_argument("taxaDB", help="taxadb file") +parser.add_argument("annotDB", help="annotation db file") +parser.add_argument("annotatedOutput", help="file to be produced") + + +args = parser.parse_args() + +conn = sqlite3.connect(args.annotDB) + +tdb_taxid = TaxID(dbtype='sqlite', dbname=args.taxaDB) + +i = open(args.hmmTabular) +o = open(args.annotatedOutput,"w") +for line in i: + if line.startswith("#"): + continue + ls = line.split() + sname = ls[0] + qname = ls[2] + evalue = ls[4] + famID = int(qname[3:-4]) + lineage,keywords = getFamAnnot(famID,conn,tdb_taxid) + + o.write("{sname};{qname};{evalue};{lineage};{keywords}\n".format( + sname=sname, + qname=qname, + evalue=evalue, + lineage=",".join(lineage), + keywords=",".join(["{}({})".format(k,w) for k,w in keywords]) + )) + + + +conn.close() diff --git a/annotate/annotations_schema.sql b/annotate/annotations_schema.sql index 144eae2576424fd7c62168814fcecea570eb65dc..ae46593440ecffc49048570d5799cf9a69d2a3c6 100644 --- a/annotate/annotations_schema.sql +++ b/annotate/annotations_schema.sql @@ -10,10 +10,19 @@ CREATE TABLE keyword( str CHAR(30) UNIQUE ); +CREATE TABLE fam_kw_unsorted( + famId INTEGER, + kwId INTEGER, + freq INTEGER, + FOREIGN KEY(famId) REFERENCES family(id), + FOREIGN KEY(kwId) REFERENCES keyword(id), + PRIMARY KEY(famId,kwId) +); + CREATE TABLE fam_kw( famId INTEGER, kwId INTEGER, - frequency INTEGER, + freq INTEGER, FOREIGN KEY(famId) REFERENCES family(id), FOREIGN KEY(kwId) REFERENCES keyword(id), PRIMARY KEY(famId,kwId) diff --git a/annotate/makeAnotationsIntoSQL.py b/annotate/makeAnnotationsIntoSQL.py similarity index 81% rename from annotate/makeAnotationsIntoSQL.py rename to annotate/makeAnnotationsIntoSQL.py index 718b513178087ae937ac2bb6a5c138e49cad045e..bb45d6dc3de939883c9d960fa2a3967ea6be9384 100644 --- a/annotate/makeAnotationsIntoSQL.py +++ b/annotate/makeAnnotationsIntoSQL.py @@ -14,7 +14,7 @@ def getClusterInfo(HMMpath): if line.startswith("LENG"): currLength = int(re.search('LENG\s+([0-9]+)',line).group(1)) if line.startswith("NAME"): - currID = re.search('NAME.*FAM([0-9]+).hmm',line).group(1) + currID = re.search('NAME.*FAM([0-9]+)',line).group(1) if line.startswith("//"): clusterInfo = {"id": currID, "length": currLength} if line.split() == ("HMM"): @@ -35,16 +35,17 @@ def extractSeqNames(MSApath): def wordCount(annotList): wc = defaultdict(int) for currAnnot in annotList: - words = currAnnot.strip().split("|")[-1].split() + words = re.findall(r"\w{3,}",currAnnot.split("|")[5]) for word in words: wc[word] += 1 return wc def getTaxInfo(tdb_accession,tdb_taxid,annotations): - taxInfo = {} + taxInfo = {"nbSeq":len(annotations)} accs = [] for ca in annotations: - accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0]) + #accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0]) + accs.append(ca.split("|")[2].split(".")[0]) taxInfo["taxid"] = [] currAcc = 0 # proceeding by lots, to avoid `too many SQL variables` issue @@ -86,7 +87,7 @@ def writeAnnotations(dbConn,famID,clusterInfo,taxInfo,annotations): c.execute("INSERT INTO family VALUES ({id},{size},{nbSeq},{LCAtaxid});".format( id=famID, size=clusterInfo["length"], - nbSeq=len(taxInfo["taxon"]), + nbSeq=taxInfo["nbSeq"], LCAtaxid=taxInfo["LCA"] )) c.executemany("INSERT INTO fam_tax VALUES ({},?,?);".format(famID),taxInfo["taxon"].items()) @@ -100,7 +101,7 @@ def writeAnnotations(dbConn,famID,clusterInfo,taxInfo,annotations): cwid = c.lastrowid else: cwid = data[0] - c.execute("INSERT INTO fam_kw VALUES (?,?,?);",(famID,cwid,currCount)) + c.execute("INSERT INTO fam_kw_unsorted VALUES (?,?,?);",(famID,cwid,currCount)) dbConn.commit() @@ -109,10 +110,11 @@ def writeAnnotations(dbConn,famID,clusterInfo,taxInfo,annotations): parser = argparse.ArgumentParser() parser.add_argument("hmmDir", help="hmm dir") -parser.add_argument("msaDir", help="directory containing .msa.faa") +parser.add_argument("alignedDir", help="directory containing .msa.faa") parser.add_argument("taxadb", help="taxadb file") -parser.add_argument("annotDB", help="db to create in which annotation will be stored") parser.add_argument("schema", help="sql schema to format the db") +parser.add_argument("annotDB", help="db to create in which annotation will be stored") + args = parser.parse_args() @@ -127,8 +129,17 @@ clusterList = glob.glob(args.hmmDir+"/*.hmm") for currCluster in clusterList: familyName = os.path.basename(currCluster)[:-4] clusterInfo = getClusterInfo("%s/%s.hmm"%(args.hmmDir,familyName)) - annotations = extractSeqNames("%s/%s.msa.faa"%(args.msaDir,familyName)) + if not clusterInfo: + print("Failed to read {}".format(familyName)) + continue + annotations = extractSeqNames("%s/%s.fasta"%(args.alignedDir,familyName)) taxInfo = getTaxInfo(tdb_accession,tdb_taxid,annotations) writeAnnotations(conn,clusterInfo["id"],clusterInfo,taxInfo,annotations) +# now we need to sort the table fam_kw_unsorted +c = conn.cursor() +c.execute("INSERT INTO fam_kw SELECT * FROM fam_kw_unsorted ORDER BY freq DESC;") +c.execute("DROP TABLE fam_kw_unsorted;") +conn.commit() + conn.close() diff --git a/annotate/makeAnotations.py b/annotate/makeAnotations.py deleted file mode 100644 index 35cef11f458a31bc260039294d9f5f10312f7eaf..0000000000000000000000000000000000000000 --- a/annotate/makeAnotations.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -import glob,re,argparse,os - -from taxadb.schema import * -from taxadb import accession -from taxadb import taxid - -def getClusterInfo(HMMpath): - clusterInfo = {} - with open(HMMpath) as HMMfile: - for line in HMMfile: - if line.startswith("LENG"): - currLength = int(re.search('LENG\s+([0-9]+)',line).group(1)) - if line.startswith("NAME"): - currID = re.search('NAME.*(FAM[0-9]+).hmm',line).group(1) - if line.startswith("//"): - clusterInfo = {"name": currID, "length": currLength} - if line.startswith("HMM"): - break - return clusterInfo - - -def extractSeqNames(MSApath): - currMSA = open(MSApath) - annotations = [] - for line in currMSA: - if line.startswith(">"): - annotations.append(line) - currMSA.close() - return annotations - -def getTaxInfo(annotations): - taxInfo = {} - accs = [] - for ca in annotations: - accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0]) - taxInfo["taxid"] = [] - currAcc = 0 - # proceeding by lots, to avoid `too many SQL variables` issue - while currAcc < len(accs): - currAccs=accs[currAcc:(min(currAcc+249,len(accs)))] - print("currAccs = %s"%str(currAccs)) - taxInfo["taxid"].append(accession.taxid(currAcc, args.taxadb, Prot)) - currAcc += 249 - # from taxid family - taxInfo["families"] = {} - - LCAs = [] - - # deloting - for ctgen in taxInfo["taxid"]: - for ct in ctgen: - print("plop %s"%ct[1]) - lineage = taxid.lineage_name(ct[1], args.taxadb, reverse=True) - #managing LCA - if len(LCAs) == 0: - LCAs = lineage - print("LCA: %s"% (str(LCAs))) - else: - print("LCA: %s"% (str(LCAs))) - for index,taxon in enumerate(LCAs): - if lineage[index] != taxon: - LCAs = LCAs[:index] - break - family = lineage[-3] - if family not in taxInfo["families"].keys(): - taxInfo["families"][family] = 1 - else: - taxInfo["families"][family] += 1 - - taxInfo["LCA"] = LCAs[-1] - return taxInfo - - -def writeAnnotations(annotationPath,clusterInfo,taxInfo,annotations): - annotFile = open(annotationPath,"w") - annotFile.write("LENGTH\t" + str(clusterInfo["length"])+"\n") - annotFile.write("FAMILIES\t" + str(taxInfo["families"]) + "\n") - annotFile.write("LCA\t%s\n"%(taxInfo["LCA"])) - annotFile.write("FASTA SEQUENCE TITLES:\n") - for currST in annotations: - annotFile.write(currST) - - - - - -parser = argparse.ArgumentParser() -parser.add_argument("hmmDir", help="hmm dir") -parser.add_argument("msaDir", help="directory containing .msa.faa") -parser.add_argument("taxadb", help="taxadb file") -parser.add_argument("annotationDir", help="directory where annotation files have to be written") - -args = parser.parse_args() - -clusterList = glob.glob(args.hmmDir+"/*.hmm") -for currCluster in clusterList: - familyName = os.path.basename(currCluster)[:-4] - print(familyName) - clusterInfo = getClusterInfo("%s/%s.hmm"%(args.hmmDir,familyName)) - annotations = extractSeqNames("%s/%s.msa.faa"%(args.msaDir,familyName)) - taxInfo = getTaxInfo(annotations) - writeAnnotations("%s/%s.txt"%(args.annotationDir,familyName)) - - diff --git a/annotate/sqlAnnotToTxt.py b/annotate/sqlAnnotToTxt.py new file mode 100644 index 0000000000000000000000000000000000000000..146b0db869ef902568be5e0ae599ac60ff02d45c --- /dev/null +++ b/annotate/sqlAnnotToTxt.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import glob,re,argparse,os,sqlite3 + +from taxadb.accessionid import AccessionID +from taxadb.taxid import TaxID + +from collections import defaultdict + +parser = argparse.ArgumentParser() +parser.add_argument("sqlite", help="sqlite DB") +parser.add_argument("txtDir", help="directory in which produce annotations") +parser.add_argument("taxadb", help="taxadb file") +parser.add_argument("--prefix", help="file prefix. Eg: “FAM”, default none",type=str,default="") + + +args = parser.parse_args() + + +conn = sqlite3.connect(args.sqlite) +tdb_taxid = TaxID(dbtype='sqlite', dbname=args.taxadb) + +ccf = conn.cursor() +ckw = conn.cursor() + +for currFamR in ccf.execute('SELECT * FROM family'): + currF,currSize,currNbseq,currLCA = currFamR + o = open("{dir}/{prefix}{id}.txt".format(dir=args.txtDir,prefix=args.prefix,id=currF),"w") + lineage = tdb_taxid.lineage_name(int(currLCA), reverse=True) + o.write("LENGTH\t{length}\nLCA\t{lca}\nNBSEQ\t{nbseq}\nKEYWORDS:\n".format(length=currSize,lca=("::".join(tdb_taxid.lineage_name(int(currLCA), reverse=True)) if lineage else ""),nbseq=currNbseq)) + for currFreq,currKw in ckw.execute('select freq,str from fam_kw JOIN keyword ON fam_kw.kwId = keyword.id WHERE fam_kw.famID=? order by freq desc',(currF,)): + o.write("{kw}\t{count}\n".format(kw=currKw.strip("[]()"),count=currFreq)) + o.write("SEQUENCES:\n") + with open("aligned/FAM{famName}.fasta".format(famName = str(currF).zfill(6))) as f: + for line in f: + if line.startswith(">"): + o.write(line[1:]) + + + + + +conn.close() diff --git a/build/makeFastaFromCluster.py b/build/makeFastaFromCluster.py index fa51a88a0e836389ed300c95e8dff783c804d742..0077063422c43609beaeae1d83d6d8d4ff21da36 100644 --- a/build/makeFastaFromCluster.py +++ b/build/makeFastaFromCluster.py @@ -39,3 +39,4 @@ for line in sequences: currSequence = [] else: currSequence.append(line) +addSeqToFamilyFile(currName,currSequence) diff --git a/build/nucl2ProtGolden.py b/build/nucl2ProtGolden.py index c2e910aaa6d0960eccf5848131dd7924e9f2d197..8ff6f3e363038f354cbd37efad662d3bb9c578dc 100644 --- a/build/nucl2ProtGolden.py +++ b/build/nucl2ProtGolden.py @@ -29,14 +29,15 @@ if p: else: accList = allList -of = open(dbName+ partialString +"_prot.fasta","w") +of = open(dbName+ partialString,"w") nucAccI = open(dbName+ partialString +"_included.txt","w") nucAccX = open(dbName+ partialString +"_excluded.txt","w") ntf.close() -def addCDStoDict(cdsString,cdsDict,organism,nuclAcc): +def addCDStoDict(cdsString,cdsDict,organism,nuclAcc,origBank): if len(cdsString)!=0 and cdsString[0].strip().startswith("CDS"): + bank = "REFSEQ" if origBank == "REFSEQ" else "GENBANK" currProduct = "" currSequence = [] currAcc = "" @@ -54,11 +55,11 @@ def addCDStoDict(cdsString,cdsDict,organism,nuclAcc): else: currSequence.append(cc) if currProduct != "" and len(currSequence) != 0: - entryName = ("{}| {} [{}]".format(currAcc,currProduct,organism)) + entryName = ("acc|{bank}|{currAcc}|{origBank}|{origAcc}|{currProduct} [{organism}]".format(currAcc=currAcc,currProduct=currProduct,organism=organism,origBank=origBank,origAcc=nuclAcc,bank=bank)) cdsDict[entryName] = "".join(currSequence).strip("\"") del cdsString[:] -def extractProducts(geneEntry,acc): +def extractProducts(geneEntry,acc,origBank): prots = {} currCDSstr = [] organism = "--unknown organism" @@ -75,22 +76,27 @@ def extractProducts(geneEntry,acc): continue if inFeatures: if not line.startswith(" "): - addCDStoDict(currCDSstr,prots,organism,acc) + addCDStoDict(currCDSstr,prots,organism,acc,origBank) break # here we are in Features : if(line[5] != " "): - addCDStoDict(currCDSstr,prots,organism,acc) + addCDStoDict(currCDSstr,prots,organism,acc,origBank) currCDSstr.append(line.rstrip()) return prots cpt = 0 for acc in accList: - acc = acc.strip().split(".")[0] + als = acc.strip().split("|") + acc = als[1].split(".")[0] + origBank = als[0] cpt+=1 - goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc) + try: + goldenResult = Golden.access(("refseq" if origBank=="REFSEQ" else "genbank"),acc) + except UnicodeDecodeError: + print("UnicodeDecodeError: {}".format(acc)) if goldenResult: nucAccI.write(acc + "\n") - prots = extractProducts(StringIO(goldenResult),acc) + prots = extractProducts(StringIO(goldenResult),acc,origBank) for name,seq in prots.items(): of.write(">" + name + "\n" + seq + "\n") else: