added a tool to get information about the fasta sequences when analysing a virome

9372c740 · Veronique Legrand · 4a5d7857 · 9372c740
Commit 9372c740 authored 1 month ago by Veronique Legrand
--- a/tools/getFastaInfos.py
+++ b/tools/getFastaInfos.py
+# this script aims at getting informations on a fasta file (length of the sequences)
+# Assume that the fasta file has teh following structure:
+# >name1
+# ATCG...
+# >name2
+# ATCG..
+#and so on
+import gzip
+import sys
+import statistics
+
+if len(sys.argv) !=2:
+    #print(len(sys.argv))
+    print("you must provide filename argument")
+    exit(1)
+
+#print(sys.argv[0])
+#print(sys.argv[1])
+filin=sys.argv[1]
+name=""
+genome_lines=""
+seq_length_arr=list()
+infile = gzip.open(filin, "rt") if filin.endswith(".gz") else open(filin, 'r')
+for line in infile:
+    if line[0] == ">":
+        if genome_lines!="":
+            l=len(genome_lines)
+            seq_length_arr.append(l)
+        name=line[1:]
+        genome_line = ""
+    else:
+        genome_lines+=line
+print("total number of sequences: ",len(seq_length_arr))
+print("length of the 20 1rst sequences",seq_length_arr[:20])
+
+print("length of the last 20 sequence",seq_length_arr[-20:])
+print("length of the smallest sequence: ",min(seq_length_arr))
+print("length of the bigest sequence",max(seq_length_arr))
+
+print("average length of the sequences: ",sum(seq_length_arr)/len(seq_length_arr))
+print("medium length of the sequences: ",statistics.median(seq_length_arr))
+
+nb_bins=max(seq_length_arr)/1000
+nb_bins=int(nb_bins)+1
+#print("nb_bins=",nb_bins)
+histo=[0]*nb_bins
+
+for l in seq_length_arr:
+    #print(l)
+    idx=l//1000
+    #print("idx=",idx)
+    histo[idx]+=1
+
+print(histo)
+