adding hmmerAnnotationDecode

f489a96b · Thomas BIGOT · 014d2473 · f489a96b
Commit f489a96b authored 8 years ago by Thomas BIGOT
--- a/decodeResults/hmmerAnnotationDecode.py
+++ b/decodeResults/hmmerAnnotationDecode.py
+#!/usr/bin/env python
+
+import re,ast
+
+
+def getClusterInfo(clusterID,baseURL):
+    result = dict()
+    url = baseURL + clusterID.split(".")[0] + "_annotations.txt"
+    f = open(url)
+    annotations = f.read()
+    m = re.search('FAMILIES\s(.*)',annotations)
+    result["families"] = ast.literal_eval(m.group(1))
+    m = re.search('LENGTH\s(.*)',annotations)
+    result["length"] = int(m.group(1))
+    result["id"] = clusterID
+    result["functions"] = []
+    listSeq = False
+    f.seek(0)
+    for line in f:
+        if line.startswith("FASTA SEQUENCE TITLES:"):
+            listSeq = True
+        elif listSeq:
+            result["functions"].append(line.split("|")[-1].strip())
+    result["nbSeq"] = len(result["functions"])
+    return(result)
+    
+    
+def readHmmer(pathIn,pathOut,vfamDir,verbose=False):
+    f = open(pathIn)
+    out = open(pathOut,"w")
+    out.write("subject,e-value,hmmfrom,hmmto,alifrom,alito,clusterID,clusterLength,families,functions,clusterNbSeq\n")
+    currClusterInfo = {}
+    currHits = []
+    inContigResult = False
+    for line in f:
+        if line.startswith("Query:"):
+            currHits = []
+            m = re.search(r'Query:\s+([0-9a-zA-Z_\.]+)\s+',line)
+            fam = m.group(1)
+            currClusterInfo = getClusterInfo(fam,vfamDir)
+            print(fam)
+        elif line.startswith(">>"):
+            inContigResult = True
+            currSubject = line[3:-1].strip()
+        elif inContigResult:
+            if line.startswith("\n"):
+                inContigResult = False
+            else:
+                if (line.startswith("   #") or line.startswith(" --")):
+                    continue
+                subjectData = (line.strip().split())
+                #  #    score  bias  c-Evalue  i-Evalue hmmfrom  hmm to    alifrom  ali to    envfrom  env to     acc
+                newLine = currSubject+ "," + subjectData[5]+ "," + subjectData[6]+ "," + subjectData[7]+ "," + subjectData[9]+ "," + subjectData[10]+ ","  + currClusterInfo["id"] + ","  + str(currClusterInfo["length"])+ ","  + str(currClusterInfo["families"]).replace(","," ") + "," + str(currClusterInfo["functions"]).replace(","," ") + "," + str(currClusterInfo["nbSeq"]) + "\n"
+                out.write(newLine)
+                if(verbose):
+                    print(newLine)
+
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("hmmerResults", help="file generated by hmmer")
+parser.add_argument("vfamDir", help="directory containing the annnotation info")
+parser.add_argument("output", help="csv-file to produce, containing the hmmer results annotated with vfam information")
+parser.add_argument("-v","--verbose", help="print info on screen")
+
+args = parser.parse_args()
+
+readHmmer(args.hmmerResults,args.output,args.vfamDir,args.verbose)