diff --git a/makeAnotations.py b/annotate/makeAnotations.py similarity index 69% rename from makeAnotations.py rename to annotate/makeAnotations.py index 5da26cfaa1e3700a5c4c3725fb5d14244fb724be..6a9728d74b870a149e206feaf1c6411b49c95c74 100644 --- a/makeAnotations.py +++ b/annotate/makeAnotations.py @@ -2,9 +2,9 @@ # -*- coding: utf-8 -*- import glob,re,argparse -#from taxadb.schema import * -#from taxadb import accession -#from taxadb import taxid +from taxadb.schema import * +from taxadb import accession +from taxadb import taxid parser = argparse.ArgumentParser() parser.add_argument("hmmDB", help="hmm file") @@ -55,20 +55,24 @@ print("\nReading acc number from gathered data") for currCluster in allHMM.keys(): accs = [] for ca in allHMM[currCluster]["annotations"]: - print(ca) - accs.append(ca.split("|")[3].split(".")[0]) - #allHMM[currCluster]["taxid"] = accession.taxid(accs, args.taxadb, Prot) - allHMM[currCluster]["taxid"] = "1234" + accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0]) + allHMM[currCluster]["taxid"] = [] + currAcc = 0 + # proceeding by lots, to avoid `too many SQL variables` issue + while currAcc < len(accs): + allHMM[currCluster]["taxid"].append(accession.taxid(accs[currAcc:(min(currAcc+50,len(accs)-1))], args.taxadb, Prot)) + currAcc += 50 # from taxid family allHMM[currCluster]["families"] = {} - for ct in allHMM[currCluster]["taxid"]: - #lineage = taxid.lineage_name(ct[1], args.taxadb) - lineage = (1234,"ABCD") - family = lineage[2] - if family not in allHMM[currCluster]["families"].keys(): - allHMM[currCluster]["families"][family] = 1 - else: - allHMM[currCluster]["families"][family] += 1 + # deloting + for ctgen in allHMM[currCluster]["taxid"]: + for ct in ctgen: + lineage = taxid.lineage_name(ct[1], args.taxadb) + family = lineage[2] + if family not in allHMM[currCluster]["families"].keys(): + allHMM[currCluster]["families"][family] = 1 + else: + allHMM[currCluster]["families"][family] += 1 print("Done !") @@ -81,4 +85,4 @@ for currCluster in allHMM.keys(): currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n") currAnnot.write("FASTA SEQUENCE TITLES:\n") for currST in allHMM[currCluster]["annotations"]: - currAnnot.write(currST+"\n") + currAnnot.write(currST) diff --git a/collapse.py b/build/collapse.py similarity index 100% rename from collapse.py rename to build/collapse.py diff --git a/fastaHmmr.py b/build/fastaHmmr.py similarity index 100% rename from fastaHmmr.py rename to build/fastaHmmr.py diff --git a/makeFastaFromCluster.py b/build/makeFastaFromCluster.py similarity index 100% rename from makeFastaFromCluster.py rename to build/makeFastaFromCluster.py diff --git a/nucl2ProtGolden.py b/build/nucl2ProtGolden.py similarity index 100% rename from nucl2ProtGolden.py rename to build/nucl2ProtGolden.py diff --git a/partialBlast.py b/build/partialBlast.py similarity index 100% rename from partialBlast.py rename to build/partialBlast.py diff --git a/partialHmmbuild.py b/build/partialHmmbuild.py similarity index 100% rename from partialHmmbuild.py rename to build/partialHmmbuild.py diff --git a/partialMuscle.py b/build/partialMuscle.py similarity index 100% rename from partialMuscle.py rename to build/partialMuscle.py diff --git a/removeDup.py b/build/removeDup.py similarity index 100% rename from removeDup.py rename to build/removeDup.py diff --git a/serialBlast.sh b/build/serialBlast.sh similarity index 100% rename from serialBlast.sh rename to build/serialBlast.sh diff --git a/serialGolden.sh b/build/serialGolden.sh similarity index 100% rename from serialGolden.sh rename to build/serialGolden.sh diff --git a/serialHmmbuild.sh b/build/serialHmmbuild.sh similarity index 100% rename from serialHmmbuild.sh rename to build/serialHmmbuild.sh diff --git a/serialMuscle.sh b/build/serialMuscle.sh similarity index 100% rename from serialMuscle.sh rename to build/serialMuscle.sh