"""Report sequence-length statistics for a FASTA file.

The input is expected to have the following structure::

    >name1
    ATCG...
    >name2
    ATCG...

and so on.  A sequence may span several lines.  File names ending in
``.gz`` are read through gzip.

Usage: python getFastaInfos.py <fasta file>
"""
import gzip
import statistics
import sys

# Width, in residues, of one histogram bin.
BIN_WIDTH = 1000


def read_sequence_lengths(lines):
    """Return the length of every sequence found in *lines*.

    Args:
        lines: iterable of text lines (an open file or a list of strings).

    Returns:
        A list with one integer per sequence, in file order.  Records
        that contain no sequence data are skipped.
    """
    lengths = []
    current = 0
    for line in lines:
        if line.startswith(">"):
            # A header closes the record that was being accumulated.
            if current:
                lengths.append(current)
            current = 0
        else:
            # Strip the line terminator so it is not counted as a residue.
            current += len(line.strip())
    # The last record has no following header to close it.
    if current:
        lengths.append(current)
    return lengths


def length_histogram(lengths, bin_width=BIN_WIDTH):
    """Count how many lengths fall in each ``bin_width``-wide bin.

    Args:
        lengths: list of non-negative integer sequence lengths.
        bin_width: width of one bin; bin ``i`` covers
            ``[i * bin_width, (i + 1) * bin_width)``.

    Returns:
        A list of counts indexed by bin number; empty if *lengths* is empty.
    """
    if not lengths:
        return []
    # Integer division keeps the bin count exact for very long sequences.
    histo = [0] * (max(lengths) // bin_width + 1)
    for length in lengths:
        histo[length // bin_width] += 1
    return histo


def open_fasta(path):
    """Open *path* for text reading, transparently handling ``.gz`` files."""
    if path.endswith(".gz"):
        return gzip.open(path, "rt")
    return open(path, "r")


def main(argv=None):
    """Print length statistics for the FASTA file named in *argv*.

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        The process exit status: 0 on success, 1 on a usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        print("you must provide filename argument")
        return 1

    # The context manager guarantees the file is closed after parsing.
    with open_fasta(argv[1]) as infile:
        seq_length_arr = read_sequence_lengths(infile)

    print("total number of sequences: ", len(seq_length_arr))
    if not seq_length_arr:
        # min(), max() and the average are undefined without any sequence.
        return 0

    print("length of the 20 1rst sequences", seq_length_arr[:20])

    print("length of the last 20 sequence", seq_length_arr[-20:])
    print("length of the smallest sequence: ", min(seq_length_arr))
    print("length of the bigest sequence", max(seq_length_arr))

    print("average length of the sequences: ",
          sum(seq_length_arr) / len(seq_length_arr))
    print("medium length of the sequences: ",
          statistics.median(seq_length_arr))

    print(length_histogram(seq_length_arr))
    return 0


if __name__ == "__main__":
    sys.exit(main())