Skip to content
Snippets Groups Projects
Select Git revision
  • 1f71fbf86ace29512c3016415145606f829dfd9d
  • master default protected
  • v19.0.1
  • v19.0.0
  • v15.1.0
5 results

sqlAnnotToTxt.py

Blame
  • sqlAnnotToTxt.py 2.03 KiB
    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    import glob,re,argparse,os,sqlite3
    
    from taxadb.accessionid import AccessionID
    from taxadb.taxid import TaxID
    
    from collections import defaultdict
    
    # Command-line interface: three required positional arguments and one
    # optional output-file prefix (empty string when not given).
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "sqlite",
        help="sqlite DB",
    )
    parser.add_argument(
        "txtDir",
        help="directory in which produce annotations",
    )
    parser.add_argument(
        "taxadb",
        help="taxadb file",
    )
    parser.add_argument(
        "--prefix",
        type=str,
        default="",
        help="file prefix. Eg: “FAM”, default none",
    )

    # Parsed from the process command line; exits with a usage message when
    # a required positional argument is missing.
    args = parser.parse_args()
    
    
    # Export one plain-text annotation file per row of the `family` table.
    # Each file holds the family length, the LCA lineage, the sequence count,
    # the reference-database keywords, the sequence-name keywords and the
    # FASTA header lines of the family alignment.
    conn = sqlite3.connect(args.sqlite)
    try:
        tdb_taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)

        # Two cursors: `ccf` walks the families while `ckw` runs the
        # per-family keyword queries, so the inner queries never touch the
        # result set the outer loop is iterating over.
        ccf = conn.cursor()
        ckw = conn.cursor()

        # The unpacking requires `family` rows to have exactly four columns.
        for currF, currSize, currNbseq, currLCA in ccf.execute('SELECT * FROM family'):
            # Look the lineage up once and reuse it; a falsy result is
            # written as an empty LCA field.
            lineage = tdb_taxid.lineage_name(int(currLCA), reverse=True)
            lca = "::".join(lineage) if lineage else ""

            out_path = "{dir}/{prefix}{id}.txt".format(dir=args.txtDir, prefix=args.prefix, id=currF)
            # The alignment is read relative to the current working directory
            # and always uses the "FAM" + zero-padded (6 digits) naming,
            # independently of --prefix.
            fasta_path = "aligned/FAM{famName}.fasta".format(famName=str(currF).zfill(6))

            # `with` guarantees the output file is flushed and closed even if
            # a query or the FASTA read raises part-way through.
            with open(out_path, "w") as o:
                o.write("LENGTH\t{length}\nLCA\t{lca}\nNBSEQ\t{nbseq}\nKEYWORDS:\n".format(length=currSize, lca=lca, nbseq=currNbseq))

                # keywords from reference database ("--" when there are none)
                kw_string = [
                    f"{currKw}\t{currFreq}"
                    for currFreq, currKw in ckw.execute('select freq,str from fam_kw_ref JOIN keyword ON fam_kw_ref.kwId = keyword.id WHERE fam_kw_ref.famID=? order by freq desc', (currF,))
                ]
                o.write("\n".join(kw_string) if kw_string else "--")

                # keywords from sequence names, with surrounding brackets and
                # parentheses stripped
                o.write("\nKEYWORDS FROM SEQUENCES:\n")
                for currFreq, currKw in ckw.execute('select freq,str from fam_kw_seqnames JOIN keyword ON fam_kw_seqnames.kwId = keyword.id WHERE fam_kw_seqnames.famID=? order by freq desc', (currF,)):
                    o.write("{kw}\t{count}\n".format(kw=currKw.strip("[]()"), count=currFreq))

                # sequence identifiers: every FASTA header line, minus the
                # leading ">" (the line's own newline is kept)
                o.write("SEQUENCES:\n")
                with open(fasta_path) as f:
                    for line in f:
                        if line.startswith(">"):
                            o.write(line[1:])
    finally:
        # Release the SQLite connection even when the export fails.
        conn.close()