From 014d24735a85795946b3d5ee580351c5e840829d Mon Sep 17 00:00:00 2001
From: Thomas Bigot <thomas.bigot@pasteur.fr>
Date: Wed, 22 Feb 2017 13:31:28 +0100
Subject: [PATCH] Reorganization

---
 .../makeAnotations.py                         | 36 ++++++++++---------
 collapse.py => build/collapse.py              |  0
 fastaHmmr.py => build/fastaHmmr.py            |  0
 .../makeFastaFromCluster.py                   |  0
 .../nucl2ProtGolden.py                        |  0
 partialBlast.py => build/partialBlast.py      |  0
 .../partialHmmbuild.py                        |  0
 partialMuscle.py => build/partialMuscle.py    |  0
 removeDup.py => build/removeDup.py            |  0
 serialBlast.sh => build/serialBlast.sh        |  0
 serialGolden.sh => build/serialGolden.sh      |  0
 serialHmmbuild.sh => build/serialHmmbuild.sh  |  0
 serialMuscle.sh => build/serialMuscle.sh      |  0
 13 files changed, 20 insertions(+), 16 deletions(-)
 rename makeAnotations.py => annotate/makeAnotations.py (69%)
 rename collapse.py => build/collapse.py (100%)
 rename fastaHmmr.py => build/fastaHmmr.py (100%)
 rename makeFastaFromCluster.py => build/makeFastaFromCluster.py (100%)
 rename nucl2ProtGolden.py => build/nucl2ProtGolden.py (100%)
 rename partialBlast.py => build/partialBlast.py (100%)
 rename partialHmmbuild.py => build/partialHmmbuild.py (100%)
 rename partialMuscle.py => build/partialMuscle.py (100%)
 rename removeDup.py => build/removeDup.py (100%)
 rename serialBlast.sh => build/serialBlast.sh (100%)
 rename serialGolden.sh => build/serialGolden.sh (100%)
 rename serialHmmbuild.sh => build/serialHmmbuild.sh (100%)
 rename serialMuscle.sh => build/serialMuscle.sh (100%)

diff --git a/makeAnotations.py b/annotate/makeAnotations.py
similarity index 69%
rename from makeAnotations.py
rename to annotate/makeAnotations.py
index 5da26cf..6a9728d 100644
--- a/makeAnotations.py
+++ b/annotate/makeAnotations.py
@@ -2,9 +2,9 @@
 # -*- coding: utf-8 -*-
 import glob,re,argparse
 
-#from taxadb.schema import *
-#from taxadb import accession
-#from taxadb import taxid
+from taxadb.schema import *
+from taxadb import accession
+from taxadb import taxid
 
 parser = argparse.ArgumentParser()
 parser.add_argument("hmmDB", help="hmm file")
@@ -55,20 +55,24 @@ print("\nReading acc number from gathered data")
 for currCluster in allHMM.keys():
     accs = []
     for ca in allHMM[currCluster]["annotations"]:
-        print(ca)
-        accs.append(ca.split("|")[3].split(".")[0])
-    #allHMM[currCluster]["taxid"] = accession.taxid(accs, args.taxadb, Prot)
-    allHMM[currCluster]["taxid"] = "1234"
+        accs.append(re.findall(r"[A-Z0-9\._]{5,10}",ca)[0].split(".")[0])
+    allHMM[currCluster]["taxid"] = []
+    # Query in lots of 50 to avoid the `too many SQL variables` issue. A slice
+    # end is exclusive and clamps at len(accs), so no accession is dropped.
+    for currAcc in range(0, len(accs), 50):
+        # each call contributes one lot of results to the "taxid" list
+        allHMM[currCluster]["taxid"].append(accession.taxid(accs[currAcc:currAcc+50], args.taxadb, Prot))
     # from taxid family
     allHMM[currCluster]["families"] = {}
-    for ct in allHMM[currCluster]["taxid"]:
-        #lineage = taxid.lineage_name(ct[1], args.taxadb)
-        lineage = (1234,"ABCD")
-        family = lineage[2]
-        if family not in allHMM[currCluster]["families"].keys():
-            allHMM[currCluster]["families"][family] = 1
-        else:
-            allHMM[currCluster]["families"][family] += 1
+    famCounts = allHMM[currCluster]["families"]
+    # un-lotting: every element of "taxid" holds the results of one lot
+    for ctgen in allHMM[currCluster]["taxid"]:
+        for ct in ctgen:
+            # NOTE(review): ct[1] is presumably the taxid of an (accession, taxid) pair - confirm
+            lineage = taxid.lineage_name(ct[1], args.taxadb)
+            # element 2 of the lineage is used as the family; count its occurrences
+            family = lineage[2]
+            famCounts[family] = famCounts.get(family, 0) + 1
         
 print("Done !")
 
@@ -81,4 +85,4 @@ for currCluster in allHMM.keys():
     currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n")
     currAnnot.write("FASTA SEQUENCE TITLES:\n")
     for currST in allHMM[currCluster]["annotations"]:
-        currAnnot.write(currST+"\n")
+        currAnnot.write(currST)
diff --git a/collapse.py b/build/collapse.py
similarity index 100%
rename from collapse.py
rename to build/collapse.py
diff --git a/fastaHmmr.py b/build/fastaHmmr.py
similarity index 100%
rename from fastaHmmr.py
rename to build/fastaHmmr.py
diff --git a/makeFastaFromCluster.py b/build/makeFastaFromCluster.py
similarity index 100%
rename from makeFastaFromCluster.py
rename to build/makeFastaFromCluster.py
diff --git a/nucl2ProtGolden.py b/build/nucl2ProtGolden.py
similarity index 100%
rename from nucl2ProtGolden.py
rename to build/nucl2ProtGolden.py
diff --git a/partialBlast.py b/build/partialBlast.py
similarity index 100%
rename from partialBlast.py
rename to build/partialBlast.py
diff --git a/partialHmmbuild.py b/build/partialHmmbuild.py
similarity index 100%
rename from partialHmmbuild.py
rename to build/partialHmmbuild.py
diff --git a/partialMuscle.py b/build/partialMuscle.py
similarity index 100%
rename from partialMuscle.py
rename to build/partialMuscle.py
diff --git a/removeDup.py b/build/removeDup.py
similarity index 100%
rename from removeDup.py
rename to build/removeDup.py
diff --git a/serialBlast.sh b/build/serialBlast.sh
similarity index 100%
rename from serialBlast.sh
rename to build/serialBlast.sh
diff --git a/serialGolden.sh b/build/serialGolden.sh
similarity index 100%
rename from serialGolden.sh
rename to build/serialGolden.sh
diff --git a/serialHmmbuild.sh b/build/serialHmmbuild.sh
similarity index 100%
rename from serialHmmbuild.sh
rename to build/serialHmmbuild.sh
diff --git a/serialMuscle.sh b/build/serialMuscle.sh
similarity index 100%
rename from serialMuscle.sh
rename to build/serialMuscle.sh
-- 
GitLab