nucl2ProtGolden.py

#!/usr/bin/env python3


import sys,subprocess,argparse
import Golden
from io import StringIO

parser = argparse.ArgumentParser()
parser.add_argument("list", help="file containing list of nucleic accession numbers")
parser.add_argument("dbName", help="prefix for output files")
parser.add_argument("--partial","-p", help="proceed p percent of the file (from p-1 to p percent)")
args = parser.parse_args()

p = args.partial

ntf = open(args.list)
dbName = args.dbName


allList = ntf.readlines()
partialString = ""

if p:
    print("Will proceed from %s to %s percent" % (str(int(p)-1),str(int(p))))
    lowerBound = int(round(len(allList)*(int(p)-1)/10.0))
    higherBound = int(round(len(allList)*int(p)/10.0))
    accList = allList[lowerBound:higherBound]
    partialString = "_" + p
else:
    accList = allList

of = open(dbName+ partialString +"_prot.fasta","w")
nucAccI = open(dbName+ partialString +"_included.txt","w")
nucAccX = open(dbName+ partialString +"_excluded.txt","w")

ntf.close()

def addCDStoDict(cdsString,cdsDict,organism):
    if len(cdsString)!=0 and cdsString[0].strip().startswith("CDS"):
        currProduct = ""
        currSequence = []
        currAcc = ""
        inTranslation=False
        for cc in cdsString:
            cc = cc.strip()
            if not inTranslation:
                if cc.startswith("/product"):
                    currProduct = cc.split("=")[1].strip("\"")
                elif cc.startswith("/protein_id"):
                    currAcc = cc.split("=")[1].strip("\"")
                elif cc.startswith("/translation"):
                    currSequence.append(cc.split("=")[1])
                    inTranslation = True
            else:
                currSequence.append(cc)
        if currProduct != "":
            cdsDict[(currAcc + "| " + currProduct + " ["+organism+"]")] = "".join(currSequence).strip("\"")
    del cdsString[:]

def extractProducts(geneEntry):
    prots = {}
    currCDSstr = []
    organism = "--unknown organism"
    inFeatures = False
    for line in geneEntry:
        # control the presence in features
        if not inFeatures:
            if line.startswith("FEATURES"):
                inFeatures = True
                continue
            else:
                if line.strip().startswith("ORGANISM"):
                    organism = " ".join(line.split()[1:])
                continue
        if inFeatures:
            if not line.startswith("     "):
                addCDStoDict(currCDSstr,prots,organism)
                break
        # here we are in Features :
        if(line[5] != " "):
            addCDStoDict(currCDSstr,prots,organism)
        currCDSstr.append(line.rstrip())
    return prots

cpt = 0
for acc in accList:
    acc = acc.strip().split(".")[0]
    cpt+=1
    goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc)
    if goldenResult:
        nucAccI.write(acc + "\n")
        prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
        for name,seq in prots.items():
            of.write(">" + name + "\n" + seq + "\n")
    else:
        nucAccX.write(acc + "\n")