Select Git revision
sqlAnnotToTxt.py
sqlAnnotToTxt.py 2.03 KiB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import glob,re,argparse,os,sqlite3
from taxadb.accessionid import AccessionID
from taxadb.taxid import TaxID
from collections import defaultdict
# Command-line interface: three positional paths plus an optional prefix that
# is prepended to every generated annotation file name.
parser = argparse.ArgumentParser()
parser.add_argument(
    "sqlite",
    help="sqlite DB",
)
parser.add_argument(
    "txtDir",
    help="directory in which produce annotations",
)
parser.add_argument(
    "taxadb",
    help="taxadb file",
)
parser.add_argument(
    "--prefix",
    help="file prefix. Eg: “FAM”, default none",
    type=str,
    default="",
)
args = parser.parse_args()

# Open the family database and the taxadb handle used for lineage lookups.
conn = sqlite3.connect(args.sqlite)
tdb_taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)

# Two cursors on the same connection: `ccf` walks the family table while
# `ckw` runs the per-family keyword queries, so the nested queries do not
# reset the outer iteration.
ccf = conn.cursor()
ckw = conn.cursor()
# Write one annotation file per family row: a header (length, LCA lineage,
# number of sequences), the keywords from the reference database, the keywords
# derived from sequence names, and the names of the aligned sequences.
try:
    for currFamR in ccf.execute('SELECT * FROM family'):
        # The row is unpacked positionally, so the family table must yield
        # exactly these four columns in this order.
        currF, currSize, currNbseq, currLCA = currFamR
        out_path = "{dir}/{prefix}{id}.txt".format(
            dir=args.txtDir, prefix=args.prefix, id=currF
        )
        # The context manager closes the output file even when a query, the
        # taxonomy lookup or the FASTA read below raises.
        with open(out_path, "w") as o:
            # Look the lineage up once and reuse it; a falsy result yields an
            # empty LCA field.
            lineage = tdb_taxid.lineage_name(int(currLCA), reverse=True)
            o.write(
                "LENGTH\t{length}\nLCA\t{lca}\nNBSEQ\t{nbseq}\nKEYWORDS:\n".format(
                    length=currSize,
                    lca="::".join(lineage) if lineage else "",
                    nbseq=currNbseq,
                )
            )

            # keywords from reference database, most frequent first; "--"
            # marks a family without any reference keyword
            kw_string = [
                f"{currKw}\t{currFreq}"
                for currFreq, currKw in ckw.execute(
                    'select freq,str from fam_kw_ref JOIN keyword ON fam_kw_ref.kwId = keyword.id WHERE fam_kw_ref.famID=? order by freq desc',
                    (currF,),
                )
            ]
            o.write("\n".join(kw_string) if kw_string else "--")

            # keywords from sequence names; surrounding brackets and
            # parentheses are stripped from each keyword
            o.write("\nKEYWORDS FROM SEQUENCES:\n")
            for currFreq, currKw in ckw.execute(
                'select freq,str from fam_kw_seqnames JOIN keyword ON fam_kw_seqnames.kwId = keyword.id WHERE fam_kw_seqnames.famID=? order by freq desc',
                (currF,),
            ):
                o.write("{kw}\t{count}\n".format(kw=currKw.strip("[]()"), count=currFreq))

            # Sequence names are the FASTA header lines with the leading ">"
            # removed.
            # NOTE(review): the alignment path is relative to the current
            # working directory and always uses the "FAM" prefix with a
            # six-digit zero-padded id, independent of --prefix -- confirm
            # this is intended.
            o.write("SEQUENCES:\n")
            with open("aligned/FAM{famName}.fasta".format(famName=str(currF).zfill(6))) as f:
                o.writelines(line[1:] for line in f if line.startswith(">"))
finally:
    # Release the SQLite connection whether or not the export completed.
    conn.close()