Commit 01b244e9 authored by csaveanu's avatar csaveanu

added the 25/02/2019 version of the scripts

parent ae105a36
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script to generate a graphical representation of similarities between two given sequences
with added data based on the level of conservation of residues in a multiple alignment.
The script is completely dependent on the data structure of files generated with the companion script multiplalign_to_matrix.py.
Usage:
--matrix, -m = matrix of values for two aligned sequences (output of the multiplalign_to_matrix.py script)
--output, -o = output file name with the graphical result, a .pdf name will output pdf, and a .svg name will output an svg
--help, -h = this help message
Cosmin Saveanu, 2019
"""
import sys, getopt
import pandas as pd
import plotnine as pn
import numpy as np
import itertools
def runs_in_pd_itertools(pdcolumn):
    """Locate the runs of non-NaN values in a numeric column.

    Useful for the graphical representation of blocks: every run of real
    values becomes one (start, end) pair of positional indexes.

    Parameters
    ----------
    pdcolumn : array-like of numbers (list, numpy array, pandas Series)
        Values to scan; NaN marks a gap.

    Returns
    -------
    tuple of two lists ``(listmin, listmax)`` of equal length.
        ``listmax[k]`` is the position of the last value of run ``k``.
        ``listmin[k]`` is the position just before the first value of run
        ``k`` (i.e. the last NaN preceding it), or 0 when the run starts at
        position 0.  This one-position shift on the left side is the
        historical convention of this function and is kept on purpose.
        Both lists are empty when the column is empty or contains only NaN.
    """
    listmin = []
    listmax = []
    position = 0  # positional index of the first element of the current group
    for is_nan, group in itertools.groupby(np.isnan(pdcolumn)):
        length = sum(1 for _ in group)
        if not is_nan:
            # keep the historical "start one position early" convention,
            # but never go below the first valid index
            listmin.append(max(position - 1, 0))
            listmax.append(position + length - 1)
        position += length
    return (listmin, listmax)
# A few useful elements for the graph, equivalent of global variables.

# Named colours (hex RGB strings).
black = '#000000'
gray = '#666666'
red = '#FF3333'
green = '#66CC00'
blue = '#3333FF'
purple = '#9933FF'
orange = '#FF8000'
yellow = '#FFFF33'
white = "#ffffff"
gray20 = "#cccccc"
gray40 = "#999999"
gray60 = "#666666"
gray80 = "#323232"

# Legend labels and the matching colours for four conservation classes.
# fourcolors used to be assigned three times in a row; only this last
# definition was ever effective, so the dead ones were removed.
fourlabels = ("0-25", "25-50", "50-75", "75-100")
fourcolors = (white, gray20, gray40, gray80)

# Labels and colours for five conservation classes.
# NOTE(review): fivecolors holds only four colours for five labels; it is not
# used in the visible part of the script, so the values are left untouched.
# A fifth colour must be added before using it in a manual colour scale.
fivelabels = ("0-20", "20-40", "40-60", "60-80", "80-100")
fivecolors = ("#ffffff", "#d4d4d4", "#adadad", "#000000")
def _parse_options(argv):
    """Parse the command line and return (matrix_fname, graph_fname, show_help).

    Raises getopt.GetoptError when an option is unknown or lacks its value.
    """
    opts, _ = getopt.getopt(argv[1:], "hm:o:", ["help", "matrix=", "output="])
    matrix, graph, show_help = "", "", False
    for option, value in opts:
        if option in ("-h", "--help"):
            show_help = True
        elif option in ("-m", "--matrix"):
            matrix = value
        # getopt reports long options without the trailing "=", so the name
        # to test is "--output" (the former "--output=" never matched)
        elif option in ("-o", "--output"):
            graph = value
    return matrix, graph, show_help


def _load_alignment(fname):
    """Read the tab-separated matrix file and add the columns used for plotting."""
    dt = pd.read_csv(fname, sep='\t')
    basedf = dt[['idx', 'position_1', 'pcaligned_1', 'position_2', 'pcaligned_2']]
    basedf = basedf.rename({'position_1': 'pos1', 'pcaligned_1': 'pc1',
                            'position_2': 'pos2', 'pcaligned_2': 'pc2'},
                           axis='columns')
    # Explicit class edges matching the "0-25" ... "75-100" legend labels.
    # The former np.arange(-1, 101, 25) produced [-1, 24, 49, 74, 99], which
    # left every value above 99 (e.g. 100 %) without a category.
    edges = [-1, 25, 50, 75, 100]
    for seq in ("1", "2"):
        basedf["pc" + seq + "categ"] = pd.cut(basedf["pc" + seq], bins=edges)
        # alignment index where the sequence has a residue, NaN in the gaps
        basedf["idxpos" + seq] = basedf["idx"].where(basedf["pos" + seq].notnull())
    return basedf


def _run_rectangles(basedf, poscol, ymin, ymax):
    """Build one rectangle (xmin, xmax, ymin, ymax) per gap-free run of poscol."""
    starts, ends = runs_in_pd_itertools(basedf[poscol])
    return pd.DataFrame({"xmin": basedf["idx"].iloc[starts].to_numpy(),
                         "xmax": basedf["idx"].iloc[ends].to_numpy(),
                         "ymin": ymin,
                         "ymax": ymax})


def _tick_rows(basedf, poscol, ticky):
    """Select the rows at residue 1, every multiple of 100 and the last residue.

    The returned frame carries a text label (residue number) and the constant
    y position ("consty") at which the tick mark is drawn.
    """
    lastpos = np.nanmax(basedf[poscol])
    breaks = [1] + list(np.arange(100, lastpos, 100)) + [lastpos]
    rows = basedf.loc[basedf[poscol].isin(breaks)]
    return rows.assign(labels=["{:.0f}".format(vlue) for vlue in rows[poscol]],
                       consty=ticky)


def _build_plot(basedf):
    """Assemble the plotnine figure showing the two aligned sequences."""
    # vertical layout of the two sequence tracks
    firsty = 9.0
    disty1_2 = 4.0
    widthy = 1.0
    brdr = 0.2
    middley1 = firsty + widthy / 2
    secondy = firsty + disty1_2
    middley2 = middley1 + disty1_2

    # blocks of residues (what is in between are gaps, i.e. NaN positions)
    rects1 = _run_rectangles(basedf, "pos1", firsty - brdr, firsty + widthy + brdr)
    rects2 = _run_rectangles(basedf, "pos2", secondy - brdr, secondy + widthy + brdr)

    # residue-number labels and tick marks under each track
    posdf1 = _tick_rows(basedf, "pos1", firsty - widthy + 3 * brdr)
    posdf2 = _tick_rows(basedf, "pos2", secondy - widthy + 3 * brdr)

    # the limits of the sequences, in alignment coordinates
    x1 = np.nanmin(basedf.idxpos1)
    x1end = np.nanmax(basedf.idxpos1)
    x2 = np.nanmin(basedf.idxpos2)
    x2end = np.nanmax(basedf.idxpos2)

    rect_aes = pn.aes(xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
    return (pn.ggplot(basedf, pn.aes("idxpos1"))
            + pn.geom_segment(x=x1, y=middley1,
                              xend=x1end, yend=middley1, color="gray", size=2)
            + pn.geom_segment(x=x2, y=middley2,
                              xend=x2end, yend=middley2, color="gray", size=2)
            + pn.geom_rect(data=rects1, mapping=rect_aes,
                           inherit_aes=False, fill=gray20, color="black")
            + pn.geom_linerange(mapping=pn.aes(ymin=firsty, ymax=firsty + widthy,
                                               colour="pc1categ"), alpha=0.5)
            + pn.geom_rect(data=rects2, mapping=rect_aes,
                           inherit_aes=False, fill=gray20, color="black")
            + pn.geom_linerange(mapping=pn.aes(x="idxpos2", ymin=secondy,
                                               ymax=secondy + widthy,
                                               colour="pc2categ"), alpha=0.5)
            + pn.geom_text(data=posdf1, mapping=pn.aes(x="idx", y=firsty - widthy,
                                                       label="labels"), size=8)
            + pn.geom_text(data=posdf2, mapping=pn.aes(x="idx", y=secondy - widthy,
                                                       label="labels"), size=8)
            + pn.geom_point(data=posdf1, mapping=pn.aes(x="idx", y="consty"), shape='|')
            + pn.geom_point(data=posdf2, mapping=pn.aes(x="idx", y="consty"), shape='|')
            + pn.theme_classic()
            + pn.ylim([5, 15])
            + pn.ylab("") + pn.xlab("")
            + pn.scale_colour_manual(name="conservation %", values=fourcolors,
                                     labels=fourlabels)
            + pn.theme(text=pn.element_text(size=8),
                       axis_ticks_major=pn.element_blank(),
                       axis_text=pn.element_blank(),
                       axis_line=pn.element_blank(),
                       panel_background=pn.element_blank(),
                       plot_background=pn.element_blank(),
                       legend_background=pn.element_rect(fill=gray20)))


def main(argv=None):
    """Command-line entry point: read the matrix file and save the graph.

    Parameters
    ----------
    argv : list of str, optional
        Full argument vector including the program name; defaults to sys.argv.

    Returns
    -------
    int
        0 when the graph was written or the help text was shown, 2 when the
        arguments are invalid/missing or the graph could not be produced.

    Side effect: the parsed file names are stored in the module-level
    variables ``matrix_fname`` and ``graph_fname``, as before.
    """
    global matrix_fname, graph_fname
    matrix_fname, graph_fname = "", ""
    if argv is None:
        argv = sys.argv

    try:
        matrix_fname, graph_fname, show_help = _parse_options(argv)
    except getopt.GetoptError as err:
        print("Problem reading arguments")
        print("\t {}".format(err))
        print("\t for help use --help")
        return 2

    if show_help:
        print(__doc__)
        return 0

    if not matrix_fname or not graph_fname:
        print("No input filename, output file name or missing parameter.")
        print("\t for help use --help")
        return 2

    try:
        basedf = _load_alignment(matrix_fname)
        pp = _build_plot(basedf)
        pp.save(filename=graph_fname, width=15, height=8, units="cm")
    except Exception as err:
        # top-level boundary: report the real cause instead of the former
        # catch-all "No arguments provided" message, keep the exit status 2
        print("Could not generate the graph: {}".format(err), file=sys.stderr)
        return 2
    return 0
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment