diff --git a/annotate/annotateHmmerOutput.py b/annotate/annotateHmmerOutput.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ddc3215e9eb0110b91acf0f64f64df7547c1e27
--- /dev/null
+++ b/annotate/annotateHmmerOutput.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import glob,re,argparse,os,sqlite3
+
+from taxadb.accessionid import AccessionID
+from taxadb.taxid import TaxID
+
+from collections import defaultdict
+
+def getFamAnnot(famID,annotC,tdb_taxid):
+    c = annotC.cursor()
+    c.execute("SELECT LCAtaxid FROM family WHERE id=?",(famID,))
+    data = c.fetchone()
+    lcaTaxid = data[0]
+    lineage = tdb_taxid.lineage_name(lcaTaxid)
+    if not lineage:
+        print("notLineage " + str(famID))
+        lineage = ["Unknown"]
+    
+    c.execute("SELECT str,freq FROM fam_kw JOIN keyword ON fam_kw.kwId = keyword.id WHERE famID=? order by freq desc",(famID,))
+    keywords = c.fetchall()
+    return(lineage,keywords)
+
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("hmmTabular", help="tabular output of a HMMER run")
+parser.add_argument("taxaDB", help="taxadb file")
+parser.add_argument("annotDB", help="annotation db file")
+parser.add_argument("annotatedOutput", help="file to be produced")
+
+
+args = parser.parse_args()
+
+conn = sqlite3.connect(args.annotDB)
+
+tdb_taxid = TaxID(dbtype='sqlite', dbname=args.taxaDB)
+
+i = open(args.hmmTabular)
+o = open(args.annotatedOutput,"w")
+for line in i:
+    if line.startswith("#"):
+        continue
+    ls = line.split()
+    sname = ls[0]
+    qname = ls[2]
+    evalue = ls[4]
+    famID = int(qname[3:-4])
+    lineage,keywords = getFamAnnot(famID,conn,tdb_taxid)
+    
+    o.write("{sname};{qname};{evalue};{lineage};{keywords}\n".format(
+            sname=sname,
+            qname=qname,
+            evalue=evalue,
+            lineage=",".join(lineage),
+            keywords=",".join(["{}({})".format(k,w) for k,w in keywords])
+        ))
+    
+
+
+conn.close()
diff --git a/annotate/annotations_schema.sql b/annotate/annotations_schema.sql
index 144eae2576424fd7c62168814fcecea570eb65dc..ae46593440ecffc49048570d5799cf9a69d2a3c6 100644
--- a/annotate/annotations_schema.sql
+++ b/annotate/annotations_schema.sql
@@ -10,10 +10,19 @@ CREATE TABLE keyword(
     str CHAR(30) UNIQUE
 );
 
+CREATE TABLE fam_kw_unsorted(
+    famId INTEGER,
+    kwId INTEGER,
+    freq INTEGER,
+    FOREIGN KEY(famId) REFERENCES family(id),
+    FOREIGN KEY(kwId) REFERENCES keyword(id),
+    PRIMARY KEY(famId,kwId)
+);
+
 CREATE TABLE fam_kw(
     famId INTEGER,
     kwId INTEGER,
-    frequency INTEGER,
+    freq INTEGER,
     FOREIGN KEY(famId) REFERENCES family(id),
     FOREIGN KEY(kwId) REFERENCES keyword(id),
     PRIMARY KEY(famId,kwId)
diff --git a/annotate/makeAnotationsIntoSQL.py b/annotate/makeAnnotationsIntoSQL.py
similarity index 81%
rename from annotate/makeAnotationsIntoSQL.py
rename to annotate/makeAnnotationsIntoSQL.py
index 718b513178087ae937ac2bb6a5c138e49cad045e..bb45d6dc3de939883c9d960fa2a3967ea6be9384 100644
--- a/annotate/makeAnotationsIntoSQL.py
+++ b/annotate/makeAnnotationsIntoSQL.py
@@ -14,7 +14,7 @@ def getClusterInfo(HMMpath):
             if line.startswith("LENG"):
                 currLength = int(re.search('LENG\s+([0-9]+)',line).group(1))
             if line.startswith("NAME"):
-                currID = re.search('NAME.*FAM([0-9]+).hmm',line).group(1)
+                currID = re.search('NAME.*FAM([0-9]+)',line).group(1)
             if line.startswith("//"):
                 clusterInfo = {"id": currID, "length": currLength}
             if line.split() == ("HMM"):
@@ -35,16 +35,17 @@ def extractSeqNames(MSApath):
 def wordCount(annotList):
     wc = defaultdict(int)
     for currAnnot in annotList:
-        words = currAnnot.strip().split("|")[-1].split()
+        words = re.findall(r"\w{3,}",currAnnot.split("|")[5])
         for word in words:
             wc[word] += 1
     return wc
 
 def getTaxInfo(tdb_accession,tdb_taxid,annotations):
-    taxInfo = {}
+    taxInfo = {"nbSeq":len(annotations)}
     accs = []
     for ca in annotations:
-        accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0])
+        #accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0])
+        accs.append(ca.split("|")[2].split(".")[0])
     taxInfo["taxid"] = []
     currAcc = 0
     # proceeding by lots, to avoid `too many SQL variables` issue
@@ -86,7 +87,7 @@ def writeAnnotations(dbConn,famID,clusterInfo,taxInfo,annotations):
     c.execute("INSERT INTO family VALUES ({id},{size},{nbSeq},{LCAtaxid});".format(
             id=famID,
             size=clusterInfo["length"],
-            nbSeq=len(taxInfo["taxon"]),
+            nbSeq=taxInfo["nbSeq"],
             LCAtaxid=taxInfo["LCA"]
         ))
     c.executemany("INSERT INTO fam_tax VALUES ({},?,?);".format(famID),taxInfo["taxon"].items())
@@ -100,7 +101,7 @@ def writeAnnotations(dbConn,famID,clusterInfo,taxInfo,annotations):
             cwid = c.lastrowid
         else:
             cwid = data[0]
-        c.execute("INSERT INTO fam_kw VALUES (?,?,?);",(famID,cwid,currCount))
+        c.execute("INSERT INTO fam_kw_unsorted VALUES (?,?,?);",(famID,cwid,currCount))
 
     dbConn.commit()
 
@@ -109,10 +110,11 @@ def writeAnnotations(dbConn,famID,clusterInfo,taxInfo,annotations):
 
 parser = argparse.ArgumentParser()
 parser.add_argument("hmmDir", help="hmm dir")
-parser.add_argument("msaDir", help="directory containing .msa.faa")
+parser.add_argument("alignedDir", help="directory containing .msa.faa")
 parser.add_argument("taxadb", help="taxadb file")
-parser.add_argument("annotDB", help="db to create in which annotation will be stored")
 parser.add_argument("schema", help="sql schema to format the db")
+parser.add_argument("annotDB", help="db to create in which annotation will be stored")
+
 
 args = parser.parse_args()
 
@@ -127,8 +129,17 @@ clusterList = glob.glob(args.hmmDir+"/*.hmm")
 for currCluster in clusterList:
     familyName = os.path.basename(currCluster)[:-4]
     clusterInfo = getClusterInfo("%s/%s.hmm"%(args.hmmDir,familyName))
-    annotations = extractSeqNames("%s/%s.msa.faa"%(args.msaDir,familyName))
+    if not clusterInfo:
+        print("Failed to read {}".format(familyName))
+        continue
+    annotations = extractSeqNames("%s/%s.fasta"%(args.alignedDir,familyName))
     taxInfo = getTaxInfo(tdb_accession,tdb_taxid,annotations)
     writeAnnotations(conn,clusterInfo["id"],clusterInfo,taxInfo,annotations)
 
+# now we need to sort the table fam_kw_unsorted
+c = conn.cursor()
+c.execute("INSERT INTO fam_kw SELECT * FROM fam_kw_unsorted ORDER BY freq DESC;")
+c.execute("DROP TABLE fam_kw_unsorted;")
+conn.commit()
+
 conn.close()
diff --git a/annotate/makeAnotations.py b/annotate/makeAnotations.py
deleted file mode 100644
index 35cef11f458a31bc260039294d9f5f10312f7eaf..0000000000000000000000000000000000000000
--- a/annotate/makeAnotations.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-import glob,re,argparse,os
-
-from taxadb.schema import *
-from taxadb import accession
-from taxadb import taxid
-
-def getClusterInfo(HMMpath):
-    clusterInfo = {}
-    with open(HMMpath) as HMMfile:
-        for line in HMMfile:
-            if line.startswith("LENG"):
-                currLength = int(re.search('LENG\s+([0-9]+)',line).group(1))
-            if line.startswith("NAME"):
-                currID = re.search('NAME.*(FAM[0-9]+).hmm',line).group(1)
-            if line.startswith("//"):
-                clusterInfo = {"name": currID, "length": currLength}
-            if line.startswith("HMM"):
-                break
-    return clusterInfo
-
-
-def extractSeqNames(MSApath):
-    currMSA = open(MSApath)
-    annotations = []
-    for line in currMSA:
-        if line.startswith(">"):
-            annotations.append(line)
-    currMSA.close()
-    return annotations
-
-def getTaxInfo(annotations):
-    taxInfo = {}
-    accs = []
-    for ca in annotations:
-        accs.append(re.findall("[A-Z0-9\._]{5,10}",ca)[0].split(".")[0])
-    taxInfo["taxid"] = []
-    currAcc = 0
-    # proceeding by lots, to avoid `too many SQL variables` issue
-    while currAcc < len(accs):
-        currAccs=accs[currAcc:(min(currAcc+249,len(accs)))]
-        print("currAccs = %s"%str(currAccs))
-        taxInfo["taxid"].append(accession.taxid(currAcc, args.taxadb, Prot))
-        currAcc += 249
-    # from taxid family
-    taxInfo["families"] = {}
-    
-    LCAs = []
-    
-    # deloting
-    for ctgen in taxInfo["taxid"]:
-        for ct in ctgen:
-            print("plop %s"%ct[1])
-            lineage = taxid.lineage_name(ct[1], args.taxadb, reverse=True)
-            #managing LCA
-            if len(LCAs) == 0:
-                LCAs = lineage
-                print("LCA: %s"% (str(LCAs)))
-            else:
-                print("LCA: %s"% (str(LCAs)))
-                for index,taxon in enumerate(LCAs):
-                    if lineage[index] != taxon:
-                        LCAs = LCAs[:index]
-                        break
-            family = lineage[-3]
-            if family not in taxInfo["families"].keys():
-                taxInfo["families"][family] = 1
-            else:
-                taxInfo["families"][family] += 1
-    
-    taxInfo["LCA"] = LCAs[-1]
-    return taxInfo
-
-
-def writeAnnotations(annotationPath,clusterInfo,taxInfo,annotations):
-    annotFile = open(annotationPath,"w")
-    annotFile.write("LENGTH\t" + str(clusterInfo["length"])+"\n")
-    annotFile.write("FAMILIES\t" + str(taxInfo["families"]) + "\n")
-    annotFile.write("LCA\t%s\n"%(taxInfo["LCA"]))
-    annotFile.write("FASTA SEQUENCE TITLES:\n")
-    for currST in annotations:
-        annotFile.write(currST)
-
-
-
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument("hmmDir", help="hmm dir")
-parser.add_argument("msaDir", help="directory containing .msa.faa")
-parser.add_argument("taxadb", help="taxadb file")
-parser.add_argument("annotationDir", help="directory where annotation files have to be written")
-
-args = parser.parse_args()
-
-clusterList = glob.glob(args.hmmDir+"/*.hmm")
-for currCluster in clusterList:
-    familyName = os.path.basename(currCluster)[:-4]
-    print(familyName)
-    clusterInfo = getClusterInfo("%s/%s.hmm"%(args.hmmDir,familyName))
-    annotations = extractSeqNames("%s/%s.msa.faa"%(args.msaDir,familyName))
-    taxInfo = getTaxInfo(annotations)
-    writeAnnotations("%s/%s.txt"%(args.annotationDir,familyName))
-
-    
diff --git a/annotate/sqlAnnotToTxt.py b/annotate/sqlAnnotToTxt.py
new file mode 100644
index 0000000000000000000000000000000000000000..146b0db869ef902568be5e0ae599ac60ff02d45c
--- /dev/null
+++ b/annotate/sqlAnnotToTxt.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import glob,re,argparse,os,sqlite3
+
+from taxadb.accessionid import AccessionID
+from taxadb.taxid import TaxID
+
+from collections import defaultdict
+
+parser = argparse.ArgumentParser()
+parser.add_argument("sqlite", help="sqlite DB")
+parser.add_argument("txtDir", help="directory in which produce annotations")
+parser.add_argument("taxadb", help="taxadb file")
+parser.add_argument("--prefix", help="file prefix. Eg: “FAM”, default none",type=str,default="")
+
+
+args = parser.parse_args()
+
+
+conn = sqlite3.connect(args.sqlite)
+tdb_taxid = TaxID(dbtype='sqlite', dbname=args.taxadb)
+
+ccf = conn.cursor()
+ckw = conn.cursor()
+
+for currFamR in ccf.execute('SELECT * FROM family'):
+    currF,currSize,currNbseq,currLCA = currFamR
+    o = open("{dir}/{prefix}{id}.txt".format(dir=args.txtDir,prefix=args.prefix,id=currF),"w")
+    lineage = tdb_taxid.lineage_name(int(currLCA), reverse=True)
+    o.write("LENGTH\t{length}\nLCA\t{lca}\nNBSEQ\t{nbseq}\nKEYWORDS:\n".format(length=currSize,lca=("::".join(tdb_taxid.lineage_name(int(currLCA), reverse=True)) if lineage else ""),nbseq=currNbseq))
+    for currFreq,currKw in ckw.execute('select freq,str from fam_kw JOIN keyword ON fam_kw.kwId = keyword.id WHERE fam_kw.famID=? order by freq desc',(currF,)):
+        o.write("{kw}\t{count}\n".format(kw=currKw.strip("[]()"),count=currFreq))
+    o.write("SEQUENCES:\n")
+    with open("aligned/FAM{famName}.fasta".format(famName = str(currF).zfill(6))) as f:
+        for line in f:
+            if line.startswith(">"):
+                o.write(line[1:])
+    
+    
+
+
+
+conn.close()
diff --git a/build/makeFastaFromCluster.py b/build/makeFastaFromCluster.py
index fa51a88a0e836389ed300c95e8dff783c804d742..0077063422c43609beaeae1d83d6d8d4ff21da36 100644
--- a/build/makeFastaFromCluster.py
+++ b/build/makeFastaFromCluster.py
@@ -39,3 +39,4 @@ for line in sequences:
         currSequence = []
     else:
         currSequence.append(line)
+addSeqToFamilyFile(currName,currSequence)
diff --git a/build/nucl2ProtGolden.py b/build/nucl2ProtGolden.py
index c2e910aaa6d0960eccf5848131dd7924e9f2d197..8ff6f3e363038f354cbd37efad662d3bb9c578dc 100644
--- a/build/nucl2ProtGolden.py
+++ b/build/nucl2ProtGolden.py
@@ -29,14 +29,15 @@ if p:
 else:
     accList = allList    
 
-of = open(dbName+ partialString +"_prot.fasta","w")
+of = open(dbName+ partialString,"w")
 nucAccI = open(dbName+ partialString +"_included.txt","w")
 nucAccX = open(dbName+ partialString +"_excluded.txt","w")
 
 ntf.close()
 
-def addCDStoDict(cdsString,cdsDict,organism,nuclAcc):
+def addCDStoDict(cdsString,cdsDict,organism,nuclAcc,origBank):
     if len(cdsString)!=0 and cdsString[0].strip().startswith("CDS"):
+        bank = "REFSEQ" if origBank == "REFSEQ" else "GENBANK"
         currProduct = ""
         currSequence = []
         currAcc = ""
@@ -54,11 +55,11 @@ def addCDStoDict(cdsString,cdsDict,organism,nuclAcc):
             else:
                 currSequence.append(cc)
         if currProduct != "" and len(currSequence) != 0:
-            entryName = ("{}| {} [{}]".format(currAcc,currProduct,organism))
+            entryName = ("acc|{bank}|{currAcc}|{origBank}|{origAcc}|{currProduct} [{organism}]".format(currAcc=currAcc,currProduct=currProduct,organism=organism,origBank=origBank,origAcc=nuclAcc,bank=bank))
             cdsDict[entryName] = "".join(currSequence).strip("\"")
     del cdsString[:]
 
-def extractProducts(geneEntry,acc):
+def extractProducts(geneEntry,acc,origBank):
     prots = {}
     currCDSstr = []
     organism = "--unknown organism"
@@ -75,22 +76,27 @@ def extractProducts(geneEntry,acc):
                 continue
         if inFeatures:
             if not line.startswith("     "):
-                addCDStoDict(currCDSstr,prots,organism,acc)
+                addCDStoDict(currCDSstr,prots,organism,acc,origBank)
                 break
         # here we are in Features :
         if(line[5] != " "):
-            addCDStoDict(currCDSstr,prots,organism,acc)
+            addCDStoDict(currCDSstr,prots,organism,acc,origBank)
         currCDSstr.append(line.rstrip())
     return prots
 
 cpt = 0
 for acc in accList:
-    acc = acc.strip().split(".")[0]
+    als = acc.strip().split("|")
+    acc = als[1].split(".")[0]
+    origBank = als[0]
     cpt+=1
-    goldenResult = Golden.access(("refseq" if "_" in acc else "genbank"),acc)
+    try:
+        goldenResult = Golden.access(("refseq" if origBank=="REFSEQ" else "genbank"),acc)
+    except UnicodeDecodeError:
+        print("UnicodeDecodeError: {}".format(acc))
     if goldenResult:
         nucAccI.write(acc + "\n")
-        prots = extractProducts(StringIO(goldenResult),acc)
+        prots = extractProducts(StringIO(goldenResult),acc,origBank)
         for name,seq in prots.items():
             of.write(">" + name + "\n" + seq + "\n")
     else: