Select Git revision
nucl2ProtGolden.py
nucl2ProtGolden.py 3.05 KiB
#!/usr/bin/env python3
import sys,subprocess,argparse
import Golden
from io import StringIO
parser = argparse.ArgumentParser()
parser.add_argument("list", help="file containing list of nucleic accession numbers")
parser.add_argument("dbName", help="prefix for output files")
parser.add_argument("--partial","-p", help="proceed p percent of the file (from p-1 to p percent)")
args = parser.parse_args()
p = args.partial
ntf = open(args.list)
dbName = args.dbName
allList = ntf.readlines()
partialString = ""
if p:
print("Will proceed from %s to %s percent" % (str(int(p)-1),str(int(p))))
lowerBound = int(round(len(allList)*(int(p)-1)/10.0))
higherBound = int(round(len(allList)*int(p)/10.0))
accList = allList[lowerBound:higherBound]
partialString = "_" + p
else:
accList = allList
of = open(dbName+ partialString +"_prot.fasta","w")
nucAccI = open(dbName+ partialString +"_included.txt","w")
nucAccX = open(dbName+ partialString +"_excluded.txt","w")
ntf.close()
def addCDStoDict(cdsString,cdsDict,organism):
if len(cdsString)!=0 and cdsString[0].strip().startswith("CDS"):
currProduct = ""
currSequence = []
currAcc = ""
inTranslation=False
for cc in cdsString:
cc = cc.strip()
if not inTranslation:
if cc.startswith("/product"):
currProduct = cc.split("=")[1].strip("\"")
elif cc.startswith("/protein_id"):
currAcc = cc.split("=")[1].strip("\"")
elif cc.startswith("/translation"):
currSequence.append(cc.split("=")[1])
inTranslation = True
else:
currSequence.append(cc)
if currProduct != "":
cdsDict[(currAcc + "| " + currProduct + " ["+organism+"]")] = "".join(currSequence).strip("\"")
del cdsString[:]
def extractProducts(geneEntry):
prots = {}
currCDSstr = []
organism = "--unknown organism"
inFeatures = False
for line in geneEntry:
# control the presence in features
if not inFeatures:
if line.startswith("FEATURES"):
inFeatures = True
continue
else:
if line.strip().startswith("ORGANISM"):
organism = " ".join(line.split()[1:])
continue
if inFeatures:
if not line.startswith(" "):
addCDStoDict(currCDSstr,prots,organism)
break
# here we are in Features :
if(line[5] != " "):
addCDStoDict(currCDSstr,prots,organism)
currCDSstr.append(line.rstrip())
return prots
cpt = 0
for acc in accList:
acc = acc.strip().split(".")[0]
cpt+=1
goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc)
if goldenResult:
nucAccI.write(acc + "\n")
prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
for name,seq in prots.items():
of.write(">" + name + "\n" + seq + "\n")
else:
nucAccX.write(acc + "\n")