Commit 01b244e9 authored by csaveanu's avatar csaveanu

added the 25/02/2019 version of the scripts

parent ae105a36
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script to generate a graphical representation of similarities between two given sequences
with added data based on the level of conservation of residues in a multiple alignment.
The script is completely dependent on the data structure of files generated with the companion script multiplalign_to_matrix.
Usage:
--matrix, -m = matrix of values for two aligned sequences (output of the multiplalign_to_matrix.py script)
--output, -o = output file name with the graphical result, a .pdf name will output pdf, and a .svg name will output an svg
--help, -h = this help message
Cosmin Saveanu, 2019
"""
import sys, getopt
import pandas as pd
import plotnine as pn
import numpy as np
import itertools
def runs_in_pd_itertools(pdcolumn):
    """Find runs of NaN and non-NaN values in a numeric pandas column.

    Used to locate start/end indexes of the contiguous blocks of real
    values (amino-acid positions) for the rectangle drawing in the plot.

    Returns a tuple (listmin, listmax) of index lists: a run of non-NaN
    values spans listmin[i]..listmax[i] (0-based, inclusive).
    """
    # Group consecutive equal values of the NaN mask: each group is one
    # uninterrupted run of either NaN or real values.
    itergroups = itertools.groupby(np.isnan(pdcolumn))
    groupedlist = []      # length of every consecutive run
    groupedkeyslist = []  # True for a NaN run, False for a value run
    for key, group in itergroups:
        groupedlist.append(len(list(group)))
        groupedkeyslist.append(key)
    # Cumulative lengths: runs[i] points one past the end of run i.
    runs = list(np.cumsum(groupedlist))
    # list with indexes for each run boundaries
    # the indexes might be shifted by a value of 1
    # Boundaries of the even-numbered and odd-numbered runs respectively.
    list_at0 = [runs[i]-1 for i in range(0, len(runs), 2)]
    list_at1 = [runs[i]-1 for i in range(1, len(runs), 2)]
    if(groupedkeyslist[0] == False):
        # The column starts with real values (np.isnan gave False), so
        # the even-numbered runs are the value runs: they start at 0 and
        # after each odd (NaN) run boundary.
        listmin = [0]+list_at1
        listmax = list_at0
    else:
        # The column starts with NaN; the odd-numbered runs hold values.
        # NOTE(review): in this branch listmax can end up one element
        # longer than listmin when the column starts with NaN and ends
        # with values — confirm against data where the first alignment
        # position of a sequence is a gap.
        listmin = list_at0
        listmax = list_at1+[runs[-1]]
    if(listmax[-1] < listmin[-1]):
        # Close the last run when the column ends on a value run.
        listmax.append(listmin[-1])
    return((listmin, listmax))
# A few useful elements for the graph, equivalent of global variables.
# Named colors used throughout the plot.
black = '#000000'
gray = '#666666'
red = '#FF3333'
green = '#66CC00'
blue = '#3333FF'
purple = '#9933FF'
orange = '#FF8000'
yellow = '#FFFF33'
white = "#ffffff"
gray20 = "#cccccc"
gray40 = "#999999"
gray60 = "#666666"
gray80 = "#323232"
# Legend labels and grayscale fills for the conservation-percent bins.
# BUG FIX: fourcolors was assigned three times (two dead assignments,
# one of them a tuple of floats); only the final grayscale tuple was
# ever used, so the dead ones are removed.
fourlabels = ("0-25", "25-50", "50-75", "75-100")
fourcolors = (white, gray20, gray40, gray80)
# Five-bin variant (not used by the plot below).
# BUG FIX: fivecolors had only four entries for five labels; it now
# provides one fill per label.
fivelabels = ("0-20", "20-40", "40-60", "60-80", "80-100")
fivecolors = (white, gray20, gray40, gray60, gray80)
def main(argv=None):
    """Parse command-line options, read the matrix and draw the figure.

    The matrix file is the tab-separated output of the companion
    multiplalign_to_matrix.py script; the output figure (pdf or svg,
    chosen by the file extension) shows the two sequences as gray
    backbones, rectangles for the aligned blocks, and line ranges
    colored by the conservation-percent category.

    Returns 0 on success, 2 on a usage error.
    """
    global matrix_fname, graph_fname
    matrix_fname, graph_fname = "", ""
    if argv is None:
        argv = sys.argv
    try:
        opts, args = getopt.getopt(argv[1:], "hm:o:",
                                   ["help", "matrix=", "output="])
    except getopt.GetoptError as err:
        # The original only printed a message here and then crashed on
        # the unbound 'opts'; report the problem and stop cleanly.
        print("Problem reading arguments:", err)
        print("\t for help use --help")
        return 2
    # option processing
    for option, value in opts:
        if option in ("-h", "--help"):
            print(__doc__)
            return 0
        if option in ("-m", "--matrix"):
            matrix_fname = value
        # BUG FIX: getopt reports the long option as "--output" (without
        # the trailing '='), so the original tuple ("-o", "--output=")
        # never matched the long form.
        if option in ("-o", "--output"):
            graph_fname = value
    print(matrix_fname, graph_fname)
    print(opts)
    if matrix_fname == "" or graph_fname == "":
        print("No input filename, output file name or missing parameter.")
        sys.exit(2)
    try:
        dt = pd.read_csv(matrix_fname, sep='\t')
    except FileNotFoundError:
        print("Could not open the matrix file:", matrix_fname)
        return 2
    # Keep only the needed columns and shorten their names.
    basedf = dt[['idx', 'position_1', 'pcaligned_1', 'position_2', 'pcaligned_2']]
    basedf = basedf.rename({"idx": "idx", 'position_1': 'pos1',
                            'pcaligned_1': 'pc1', 'position_2': 'pos2',
                            'pcaligned_2': 'pc2'}, axis='columns')
    # Recover lists of positions that contain amino acids in each of the
    # sequences; what is in between are NaN (gaps).
    p1minmax = runs_in_pd_itertools(basedf.pos1)
    p2minmax = runs_in_pd_itertools(basedf.pos2)
    p1mindf = pd.DataFrame(basedf.idx[p1minmax[0]]).reset_index(drop=True)
    p1maxdf = pd.DataFrame(basedf.idx[p1minmax[1]]).reset_index(drop=True)
    basedf["pc1categ"] = pd.cut(basedf.pc1, bins=np.arange(-1, 101, 25))
    basedf["idxpos1"] = basedf.idx[basedf.pos1.notnull()]
    p2mindf = pd.DataFrame(basedf.idx[p2minmax[0]]).reset_index(drop=True)
    p2maxdf = pd.DataFrame(basedf.idx[p2minmax[1]]).reset_index(drop=True)
    basedf["pc2categ"] = pd.cut(basedf.pc2, bins=np.arange(-1, 101, 25))
    basedf["idxpos2"] = basedf.idx[basedf.pos2.notnull()]
    # Axis tick positions: residue 1, then every 100 residues, then the
    # last residue of each sequence, located on the shared alignment
    # ("virtual residue") coordinate.
    lastpos1 = np.nanmax(basedf.pos1)
    lastpos2 = np.nanmax(basedf.pos2)
    xbrks_list1 = list(np.arange(0, lastpos1, 100))
    xbrks_list1[0] = 1
    xbrks_list1.append(lastpos1)
    # .copy() so the "consty" assignment below writes to an independent
    # frame, not a view of basedf (avoids SettingWithCopyWarning).
    posdf1 = basedf.loc[basedf['pos1'].isin(xbrks_list1)].copy()
    posdf1 = posdf1.assign(labels=["{:.0f}".format(vlue) for vlue in posdf1.pos1])
    xbrks_list2 = list(np.arange(0, lastpos2, 100))
    xbrks_list2[0] = 1
    xbrks_list2.append(lastpos2)
    posdf2 = basedf.loc[basedf['pos2'].isin(xbrks_list2)].copy()
    posdf2 = posdf2.assign(labels=["{:.0f}".format(vlue) for vlue in posdf2.pos2])
    # Vertical layout of the two backbones.
    firsty = 9.0      # y of the first sequence block
    disty1_2 = 4.0    # vertical distance between the two sequences
    widthy = 1.0      # height of a sequence block
    brdr = 0.2        # border around the rectangles
    middley1 = firsty + widthy / 2
    secondy = firsty + disty1_2
    middley2 = middley1 + disty1_2
    posdf1["consty"] = firsty - widthy + 3 * brdr
    posdf2["consty"] = secondy - widthy + 3 * brdr
    # The x limits of the two sequences in alignment coordinates.
    x1 = np.nanmin(basedf.idxpos1)
    x1end = np.nanmax(basedf.idxpos1)
    x2 = np.nanmin(basedf.idxpos2)
    x2end = np.nanmax(basedf.idxpos2)
    pp = (pn.ggplot(basedf, pn.aes("idxpos1"))
          + pn.geom_segment(x=x1, y=middley1,
                            xend=x1end, yend=middley1, color="gray", size=2)
          + pn.geom_segment(x=x2, y=middley2,
                            xend=x2end, yend=middley2, color="gray", size=2)
          + pn.geom_rect(data=p1mindf,
                         mapping=pn.aes(xmin=p1mindf.idx, xmax=p1maxdf.idx,
                                        ymin=firsty - brdr,
                                        ymax=firsty + widthy + brdr),
                         inherit_aes=False, fill=gray20, color="black")
          + pn.geom_linerange(mapping=pn.aes(ymin=firsty, ymax=firsty + widthy,
                                             colour="pc1categ"), alpha=0.5)
          + pn.geom_rect(data=p2mindf, inherit_aes=False, fill=gray20,
                         color="black",
                         mapping=pn.aes(xmin=p2mindf.idx, xmax=p2maxdf.idx,
                                        ymin=secondy - brdr,
                                        ymax=secondy + widthy + brdr))
          + pn.geom_linerange(mapping=pn.aes(x="idxpos2", ymin=secondy,
                                             ymax=secondy + widthy,
                                             colour="pc2categ"), alpha=0.5)
          + pn.geom_text(data=posdf1,
                         mapping=pn.aes(x="idx", y=firsty - widthy,
                                        label="labels"), size=8)
          + pn.geom_text(data=posdf2,
                         mapping=pn.aes(x="idx", y=secondy - widthy,
                                        label="labels"), size=8)
          + pn.geom_point(data=posdf1, mapping=pn.aes(x="idx", y="consty"), shape='|')
          + pn.geom_point(data=posdf2, mapping=pn.aes(x="idx", y="consty"), shape='|')
          + pn.theme_classic()
          + pn.ylim([5, 15])
          + pn.ylab("") + pn.xlab("")
          + pn.scale_colour_manual(name="conservation %", values=fourcolors,
                                   labels=fourlabels)
          + pn.theme(text=pn.element_text(size=8),
                     axis_ticks_major=pn.element_blank(),
                     axis_text=pn.element_blank(),
                     axis_line=pn.element_blank(),
                     panel_background=pn.element_blank(),
                     plot_background=pn.element_blank(),
                     legend_background=pn.element_rect(fill=gray20)))
    pn.ggplot.save(pp, filename=graph_fname, width=15, height=8, units="cm")
    return 0


if __name__ == "__main__":
    sys.exit(main())
"""
This script transforms a multiple fasta alignment in a similarity matrix.
The result can be further used for visualization, in which the level of conservation of a given residue
is computed on the basis of a multiple sequence alignment - however, the final output depends only on
two chosen sequences. The input file has to be in the FASTA format, with gaps shown as "-". All sequences
must have the same length.
Usage:
--alignment, -a = fasta multiple alignment (each sequence has equal length, '-' for gaps)
--uniqueid1, -u = unique identifier for one of the sequences, e.g. "scer"
--uniqueid2, -w = unique identifier for a second sequence, e.g. "hsap"
--output, -o = output file name with the results, usable in R, for example
--help, -h = this help message
Cosmin Saveanu, 2019
"""
import sys, getopt
from operator import itemgetter
import pandas as pd
from collections import Counter
#classes defined by code copied from Stack Overflow to work with FASTA files
class Prot:
    ''' Object representing a FASTA record.

    head is the header line (without the leading '>'); seq is the
    sequence stored as a list of line strings, as produced by the
    Fasta iterator.
    '''
    def __init__(self, header, sequence):
        self.head = header
        self.seq = sequence

    def __repr__(self):
        # BUG FIX: the format string had been mangled to '[HTML]' (no %s
        # placeholder), which made repr() raise a TypeError; restore the
        # bracketed-header form.
        return '[%s]' % (self.head)

    def __str__(self, separator=''):
        # FASTA text form: header line followed by the joined sequence.
        return '>%s\n%s' % (self.head, separator.join(self.seq))

    def __len__(self):
        # Length of the full sequence, all lines joined.
        return len(''.join(self.seq))

    @property
    def sequence(self, separator=''):
        # The full sequence as a single string (lines joined directly;
        # properties take no arguments, so separator is always '').
        return separator.join(self.seq)
class Fasta:
    ''' A FASTA iterator over a file handle; yields one Prot per record. '''
    def __init__(self, handle):
        self.handle = handle

    def __repr__(self):
        # BUG FIX: same mangled '[HTML]' format string as in Prot;
        # restore the %s placeholder so repr() works.
        return '[%s]' % (self.handle)

    def __iter__(self):
        header, sequence = '', []
        for line in self.handle:
            if line[0] == '>':
                # A new record starts: emit the previous one, if any.
                if sequence: yield Prot(header, sequence)
                header = line[1:-1]
                sequence = []
            else:
                sequence.append(line.strip())
        # Emit the last record of the file.
        yield Prot(header, sequence)
# do fasta = Fasta(handle) and then for record in fasta to yield Prot objects
#main
#__________________________________
#uniqueid_str1="scer"
#uniqueid_str2="hsap"
def get_positions_from_gapped(seqwithgaps):
    """Map every column of a gapped sequence to its ungapped position.

    Returns one tuple per character of *seqwithgaps*: the first element
    is the column index in the alignment, the second the position of the
    residue in the ungapped sequence (0 is used for gap columns).
    """
    positions = []
    ungapped = 0
    for column, residue in enumerate(seqwithgaps):
        if residue == "-":
            positions.append((column, 0))
        else:
            positions.append((column, ungapped))
            ungapped += 1
    return(positions)
def get_ungapped_positions(seqwithgaps):
    """Return (alignment_column, ungapped_position) pairs, skipping gaps.

    Unlike get_positions_from_gapped, gap ('-') columns produce no entry
    at all, so the result has one tuple per real residue.
    """
    pairs = []
    ungapped = 0
    for column, residue in enumerate(seqwithgaps):
        if residue != "-":
            pairs.append((column, ungapped))
            ungapped += 1
    return(pairs)
def calculate_seq_percent(aligndata, normval):
    """Convert majority-group counts into percentages of *normval*.

    Each entry of *aligndata* is [alignment_pos, own_pos,
    (group, count), own_aa]; the corresponding output entry is
    [alignment_pos, own_pos, percent, group, own_aa], where percent is
    count/normval*100 when own_aa belongs to the majority group and
    0.0 otherwise.
    """
    percentages = []
    for apos, ownpos, (grgroup, grcount), ownaa in aligndata:
        pct = grcount / normval * 100 if ownaa in grgroup else 0.0
        percentages.append([apos, ownpos, pct, grgroup, ownaa])
    return(percentages)
def main(argv=None):
    """Parse options, read the alignment and write the similarity matrix.

    Reads a multiple FASTA alignment, computes for every alignment column
    the most represented amino-acid similarity group, then writes a
    tab-separated table with, for each of the two selected sequences, the
    per-position conservation percent, merged on the alignment position.

    Returns 0 on success, 2 on a usage error.
    """
    global alignment_fname, output_fname, uniqueid_str1, uniqueid_str2
    alignment_fname, output_fname, uniqueid_str1, uniqueid_str2 = "", "", "", ""
    if argv is None:
        argv = sys.argv
    try:
        opts, args = getopt.getopt(argv[1:], "ha:u:w:o:",
                                   ["help", "alignment=", "uniqueid1=",
                                    "uniqueid2=", "output="])
    except getopt.GetoptError as err:
        # The original only printed a message here and then crashed on
        # the unbound 'opts'; report the problem and stop cleanly.
        print("Problem reading arguments:", err)
        print("\t for help use --help")
        return 2
    # option processing
    for option, value in opts:
        if option in ("-h", "--help"):
            print(__doc__)
            return 0
        if option in ("-a", "--alignment"):
            alignment_fname = value
        # BUG FIX: getopt reports long options without the trailing '=',
        # so "--uniqueid1=", "--uniqueid2=" and "--output=" never matched
        # in the original.
        if option in ("-u", "--uniqueid1"):
            uniqueid_str1 = value
        if option in ("-w", "--uniqueid2"):
            uniqueid_str2 = value
        if option in ("-o", "--output"):
            output_fname = value
    if (alignment_fname == "" or uniqueid_str1 == "" or
            uniqueid_str2 == "" or output_fname == ""):
        print("No input filename, output file name or missing parameter.")
        sys.exit(2)
    print(alignment_fname, uniqueid_str1, uniqueid_str2, output_fname)
    print("Reading ", alignment_fname)
    print("The results will be written to a file called ", output_fname)
    # Read all the records; 'with' guarantees the handle is closed even
    # on errors (the original closed it manually).
    with open(alignment_fname, "r") as f:
        align = list(Fasta(f))
    # Put all the data in a matrix: we assume every sequence has the same
    # length (gaps included), so zip transposes to per-column residues.
    proteinsequences_list = [list(prot.sequence) for prot in align]
    poslist = [list(pos) for pos in zip(*proteinsequences_list)]
    # Count amino acids at each alignment column; Counter returns 0 for
    # absent residues, like .get(aa, 0) on a dict. The '.' characters
    # inserted by join belong to no similarity group, so they are inert.
    counts_per_position = [Counter(".".join(aa_at_pos)) for aa_at_pos in poslist]
    # For every column, sum the counts over groups of similar amino acids.
    similaraa = ["GAVLI", "FYW", "CM", "ST", "KRH", "DENQ", "P", "-"]
    res_summary = []
    for position in counts_per_position:
        sgroup_d = {sgroup: 0 for sgroup in similaraa}
        for simgroup in sgroup_d.keys():
            for aa in position.keys():
                if aa in simgroup:
                    sgroup_d[simgroup] = sgroup_d[simgroup] + position[aa]
        res_summary.append(sgroup_d)
    # Per column: (aagroup, count) tuples sorted by decreasing count, so
    # element [0] is the majority group.
    sorted_res = []
    for pos in res_summary:
        sorted_res.append(sorted(pos.items(), key=itemgetter(1), reverse=True))
    # Find the two sequences of interest by their unique identifiers.
    myseq1 = None
    myseq2 = None
    for protein in align:
        if uniqueid_str1 in protein.head:
            print(protein.head)
            myseq1 = protein.sequence
    for protein in align:
        if uniqueid_str2 in protein.head:
            print(protein.head)
            myseq2 = protein.sequence
    if myseq1 is None or myseq2 is None:
        # The original crashed with a NameError (masked by a bare except)
        # when an identifier was absent; report it explicitly instead.
        print("Could not find both identifiers in the alignment headers.")
        return 2
    # Alignment coordinates of the real (non-gap) residues only.
    pos_myseq1_short = get_ungapped_positions(myseq1)
    pos_myseq2_short = get_ungapped_positions(myseq2)
    # Ungapped versions of the two sequences (all '-' removed).
    trtable = str.maketrans(dict.fromkeys('-'))
    myseq1_noX = myseq1.translate(trtable)
    myseq2_noX = myseq2.translate(trtable)
    # Attach the majority group of each column and the sequence's own
    # amino acid: [align_pos, own_pos, (group, count), own_aa].
    align_data_specific1 = []
    for alignpos, singlepos in pos_myseq1_short:
        align_data_specific1.append([alignpos, singlepos,
                                     sorted_res[alignpos][0],
                                     myseq1_noX[singlepos]])
    align_data_specific2 = []
    for alignpos, singlepos in pos_myseq2_short:
        align_data_specific2.append([alignpos, singlepos,
                                     sorted_res[alignpos][0],
                                     myseq2_noX[singlepos]])
    # Normalize counts by the number of aligned sequences.
    normval = len(align)
    align_data_pc1 = calculate_seq_percent(align_data_specific1, normval)
    align_data_pc2 = calculate_seq_percent(align_data_specific2, normval)
    # Example entries: alignment pos, index in sequence, percent
    # similarity, most represented group, own amino acid at position:
    # [45, 0, 49.45, 'CM', 'M'] / [46, 1, 0.0, '-', 'S']
    dt1 = pd.DataFrame(align_data_pc1)
    dt1.columns = ["alignment_pos", "position", "pcaligned", "aagroup", "ownaa"]
    dt1["position"] = dt1["position"] + 1
    dt1["alignment_pos"] = dt1["alignment_pos"] + 1
    dt2 = pd.DataFrame(align_data_pc2)
    dt2.columns = ["alignment_pos", "position", "pcaligned", "aagroup", "ownaa"]
    dt2["position"] = dt2["position"] + 1
    dt2["alignment_pos"] = dt2["alignment_pos"] + 1
    # An outer merge keeps the columns present in only one sequence.
    dttwo = pd.merge(dt1, dt2, how='outer', on="alignment_pos",
                     sort=True, suffixes=("_1", "_2"))
    dttwo['idx'] = dttwo.index.values + 1
    # float_format matters: positions became float through the outer
    # merge (NaN is coded as float).
    dttwo.to_csv(output_fname, sep='\t', index=False, float_format="%.0f")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment