From 014d24735a85795946b3d5ee580351c5e840829d Mon Sep 17 00:00:00 2001 From: Thomas Bigot <thomas.bigot@pasteur.fr> Date: Wed, 22 Feb 2017 13:31:28 +0100 Subject: [PATCH] Reorganization --- .../makeAnotations.py | 36 ++++++++++--------- collapse.py => build/collapse.py | 0 fastaHmmr.py => build/fastaHmmr.py | 0 .../makeFastaFromCluster.py | 0 .../nucl2ProtGolden.py | 0 partialBlast.py => build/partialBlast.py | 0 .../partialHmmbuild.py | 0 partialMuscle.py => build/partialMuscle.py | 0 removeDup.py => build/removeDup.py | 0 serialBlast.sh => build/serialBlast.sh | 0 serialGolden.sh => build/serialGolden.sh | 0 serialHmmbuild.sh => build/serialHmmbuild.sh | 0 serialMuscle.sh => build/serialMuscle.sh | 0 13 files changed, 20 insertions(+), 16 deletions(-) rename makeAnotations.py => annotate/makeAnotations.py (69%) rename collapse.py => build/collapse.py (100%) rename fastaHmmr.py => build/fastaHmmr.py (100%) rename makeFastaFromCluster.py => build/makeFastaFromCluster.py (100%) rename nucl2ProtGolden.py => build/nucl2ProtGolden.py (100%) rename partialBlast.py => build/partialBlast.py (100%) rename partialHmmbuild.py => build/partialHmmbuild.py (100%) rename partialMuscle.py => build/partialMuscle.py (100%) rename removeDup.py => build/removeDup.py (100%) rename serialBlast.sh => build/serialBlast.sh (100%) rename serialGolden.sh => build/serialGolden.sh (100%) rename serialHmmbuild.sh => build/serialHmmbuild.sh (100%) rename serialMuscle.sh => build/serialMuscle.sh (100%) diff --git a/makeAnotations.py b/annotate/makeAnotations.py similarity index 69% rename from makeAnotations.py rename to annotate/makeAnotations.py index 5da26cf..6a9728d 100644 --- a/makeAnotations.py +++ b/annotate/makeAnotations.py @@ -2,9 +2,9 @@ # -*- coding: utf-8 -*- import glob,re,argparse -#from taxadb.schema import * -#from taxadb import accession -#from taxadb import taxid +from taxadb.schema import * +from taxadb import accession +from taxadb import taxid parser = argparse.ArgumentParser() parser.add_argument("hmmDB", help="hmm file") @@ -55,20 +55,24 @@ print("\nReading acc number from gathered data") for currCluster in allHMM.keys(): accs = [] for ca in allHMM[currCluster]["annotations"]: - print(ca) - accs.append(ca.split("|")[3].split(".")[0]) - #allHMM[currCluster]["taxid"] = accession.taxid(accs, args.taxadb, Prot) - allHMM[currCluster]["taxid"] = "1234" + accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0]) + allHMM[currCluster]["taxid"] = [] + currAcc = 0 + # proceeding by lots, to avoid `too many SQL variables` issue + while currAcc < len(accs): + allHMM[currCluster]["taxid"].append(accession.taxid(accs[currAcc:(min(currAcc+50,len(accs)-1))], args.taxadb, Prot)) + currAcc += 50 # from taxid family allHMM[currCluster]["families"] = {} - for ct in allHMM[currCluster]["taxid"]: - #lineage = taxid.lineage_name(ct[1], args.taxadb) - lineage = (1234,"ABCD") - family = lineage[2] - if family not in allHMM[currCluster]["families"].keys(): - allHMM[currCluster]["families"][family] = 1 - else: - allHMM[currCluster]["families"][family] += 1 + # deloting + for ctgen in allHMM[currCluster]["taxid"]: + for ct in ctgen: + lineage = taxid.lineage_name(ct[1], args.taxadb) + family = lineage[2] + if family not in allHMM[currCluster]["families"].keys(): + allHMM[currCluster]["families"][family] = 1 + else: + allHMM[currCluster]["families"][family] += 1 print("Done !") @@ -81,4 +85,4 @@ for currCluster in allHMM.keys(): currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n") currAnnot.write("FASTA SEQUENCE TITLES:\n") for currST in allHMM[currCluster]["annotations"]: - currAnnot.write(currST+"\n") + currAnnot.write(currST) diff --git a/collapse.py b/build/collapse.py similarity index 100% rename from collapse.py rename to build/collapse.py diff --git a/fastaHmmr.py b/build/fastaHmmr.py similarity index 100% rename from fastaHmmr.py rename to build/fastaHmmr.py diff --git a/makeFastaFromCluster.py b/build/makeFastaFromCluster.py similarity index 100% rename from makeFastaFromCluster.py rename to build/makeFastaFromCluster.py diff --git a/nucl2ProtGolden.py b/build/nucl2ProtGolden.py similarity index 100% rename from nucl2ProtGolden.py rename to build/nucl2ProtGolden.py diff --git a/partialBlast.py b/build/partialBlast.py similarity index 100% rename from partialBlast.py rename to build/partialBlast.py diff --git a/partialHmmbuild.py b/build/partialHmmbuild.py similarity index 100% rename from partialHmmbuild.py rename to build/partialHmmbuild.py diff --git a/partialMuscle.py b/build/partialMuscle.py similarity index 100% rename from partialMuscle.py rename to build/partialMuscle.py diff --git a/removeDup.py b/build/removeDup.py similarity index 100% rename from removeDup.py rename to build/removeDup.py diff --git a/serialBlast.sh b/build/serialBlast.sh similarity index 100% rename from serialBlast.sh rename to build/serialBlast.sh diff --git a/serialGolden.sh b/build/serialGolden.sh similarity index 100% rename from serialGolden.sh rename to build/serialGolden.sh diff --git a/serialHmmbuild.sh b/build/serialHmmbuild.sh similarity index 100% rename from serialHmmbuild.sh rename to build/serialHmmbuild.sh diff --git a/serialMuscle.sh b/build/serialMuscle.sh similarity index 100% rename from serialMuscle.sh rename to build/serialMuscle.sh -- GitLab