Skip to content
Snippets Groups Projects
Commit 9372c740 authored by Veronique Legrand's avatar Veronique Legrand
Browse files

added a tool to get information about the fasta sequences when analysing a virome

parent 4a5d7857
No related branches found
No related tags found
No related merge requests found
Pipeline #152938 passed with stage
in 2 hours, 3 minutes, and 16 seconds
# this script aims at getting informations on a fasta file (length of the sequences)
# Assume that the fasta file has teh following structure:
# >name1
# ATCG...
# >name2
# ATCG..
#and so on
import gzip
import sys
import statistics
if len(sys.argv) !=2:
#print(len(sys.argv))
print("you must provide filename argument")
exit(1)
#print(sys.argv[0])
#print(sys.argv[1])
filin=sys.argv[1]
name=""
genome_lines=""
seq_length_arr=list()
infile = gzip.open(filin, "rt") if filin.endswith(".gz") else open(filin, 'r')
for line in infile:
if line[0] == ">":
if genome_lines!="":
l=len(genome_lines)
seq_length_arr.append(l)
name=line[1:]
genome_line = ""
else:
genome_lines+=line
print("total number of sequences: ",len(seq_length_arr))
print("length of the 20 1rst sequences",seq_length_arr[:20])
print("length of the last 20 sequence",seq_length_arr[-20:])
print("length of the smallest sequence: ",min(seq_length_arr))
print("length of the bigest sequence",max(seq_length_arr))
print("average length of the sequences: ",sum(seq_length_arr)/len(seq_length_arr))
print("medium length of the sequences: ",statistics.median(seq_length_arr))
nb_bins=max(seq_length_arr)/1000
nb_bins=int(nb_bins)+1
#print("nb_bins=",nb_bins)
histo=[0]*nb_bins
for l in seq_length_arr:
#print(l)
idx=l//1000
#print("idx=",idx)
histo[idx]+=1
print(histo)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment