Skip to content
Snippets Groups Projects
Select Git revision
  • 58ea43bf9e84d7139c58615741426722736fc7fb
  • master default protected
  • v19.0.1
  • v19.0.0
  • v15.1.0
5 results

nucl2ProtGolden.py

Blame
  • nucl2ProtGolden.py 3.05 KiB
    #!/usr/bin/env python3
    
    
    import sys,subprocess,argparse
    import Golden
    from io import StringIO
    
    parser = argparse.ArgumentParser()
    parser.add_argument("list", help="file containing list of nucleic accession numbers")
    parser.add_argument("dbName", help="prefix for output files")
    parser.add_argument("--partial","-p", help="proceed p percent of the file (from p-1 to p percent)")
    args = parser.parse_args()
    
    p = args.partial
    
    ntf = open(args.list)
    dbName = args.dbName
    
    
    allList = ntf.readlines()
    partialString = ""
    
    if p:
        print("Will proceed from %s to %s percent" % (str(int(p)-1),str(int(p))))
        lowerBound = int(round(len(allList)*(int(p)-1)/10.0))
        higherBound = int(round(len(allList)*int(p)/10.0))
        accList = allList[lowerBound:higherBound]
        partialString = "_" + p
    else:
        accList = allList    
    
    of = open(dbName+ partialString +"_prot.fasta","w")
    nucAccI = open(dbName+ partialString +"_included.txt","w")
    nucAccX = open(dbName+ partialString +"_excluded.txt","w")
    
    ntf.close()
    
    def addCDStoDict(cdsString,cdsDict,organism):
        if len(cdsString)!=0 and cdsString[0].strip().startswith("CDS"):
            currProduct = ""
            currSequence = []
            currAcc = ""
            inTranslation=False
            for cc in cdsString:
                cc = cc.strip()
                if not inTranslation:
                    if cc.startswith("/product"):
                        currProduct = cc.split("=")[1].strip("\"")
                    elif cc.startswith("/protein_id"):
                        currAcc = cc.split("=")[1].strip("\"")
                    elif cc.startswith("/translation"):
                        currSequence.append(cc.split("=")[1])
                        inTranslation = True
                else:
                    currSequence.append(cc)
            if currProduct != "":
                cdsDict[(currAcc + "| " + currProduct + " ["+organism+"]")] = "".join(currSequence).strip("\"")
        del cdsString[:]
    
    def extractProducts(geneEntry):
        prots = {}
        currCDSstr = []
        organism = "--unknown organism"
        inFeatures = False
        for line in geneEntry:
            # control the presence in features
            if not inFeatures:
                if line.startswith("FEATURES"):
                    inFeatures = True
                    continue
                else:
                    if line.strip().startswith("ORGANISM"):
                        organism = " ".join(line.split()[1:])
                    continue
            if inFeatures:
                if not line.startswith("     "):
                    addCDStoDict(currCDSstr,prots,organism)
                    break
            # here we are in Features :
            if(line[5] != " "):
                addCDStoDict(currCDSstr,prots,organism)
            currCDSstr.append(line.rstrip())
        return prots
    
    cpt = 0
    for acc in accList:
        acc = acc.strip().split(".")[0]
        cpt+=1
        goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc)
        if goldenResult:
            nucAccI.write(acc + "\n")
            prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
            for name,seq in prots.items():
                of.write(">" + name + "\n" + seq + "\n")
        else:
            nucAccX.write(acc + "\n")