From 46ad7c44c821a62575dace80ba9e1b6b815d9b48 Mon Sep 17 00:00:00 2001 From: Veronique Legrand <vlegrand@pasteur.fr> Date: Thu, 5 Dec 2024 15:52:45 +0100 Subject: [PATCH] changed the loading of data required to draw the global plot; now read only 3 columns of the original hdf file. --- jass/models/plots.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/jass/models/plots.py b/jass/models/plots.py index f00b5ad6..ef60219e 100644 --- a/jass/models/plots.py +++ b/jass/models/plots.py @@ -17,10 +17,10 @@ from matplotlib import colors import matplotlib.patches as mpatches from scipy.stats import norm, chi2 import seaborn as sns -import os from pandas import DataFrame, read_hdf import pandas as pd +default_chunk_size=50 def replaceZeroes(df): """ @@ -32,30 +32,34 @@ def replaceZeroes(df): df.values[df.values == 0] = min_nonzero return df -def create_global_plot(work_file_path: str, global_plot_path: str): + +def get_info_4_global_plot(work_file_path: str): + regions = read_hdf(work_file_path, "Regions",columns=['Region','CHR','MiddlePosition']) + print(regions.dtypes) + N_reg = regions.Region.max() # Keep biggest element in Region column + binf = regions.Region.iloc[0] + chr_considered = regions.CHR.unique() + length_chr = regions.groupby("CHR").MiddlePosition.max() / 10 ** 6 + length_chr.loc[0] = 0 + return N_reg,binf,chr_considered,length_chr + +def create_global_plot(work_file_path: str, global_plot_path: str, chunk_size:int =default_chunk_size): """ create_global_plot generate genome-wide manhattan plot for a given set of phenotypes """ - regions = read_hdf(work_file_path, "Regions") - #chr_length = regions.groupby('CHR').max().position - N_reg= regions.Region.max() + N_reg,binf,chr_considered,length_chr=get_info_4_global_plot(work_file_path) maxy = 0 fig = plt.figure(figsize=(30, 12)) ax = fig.add_subplot(111) - chunk_size = 50 colors = [ '#4287f5', 'orangered' ] - binf=regions.Region.iloc[0] - bsup= binf+chunk_size - chr_considered= regions.CHR.unique() - length_chr = regions.groupby("CHR").MiddlePosition.max() / 10**6 - length_chr.loc[0] = 0 + label = "Chr"+length_chr.loc[chr_considered].index.astype("str") lab_pos = length_chr.loc[chr_considered]/2 @@ -63,7 +67,7 @@ def create_global_plot(work_file_path: str, global_plot_path: str): pos_shift.index = pos_shift.index +1 pos_shift.loc[chr_considered[0]] = 0 lab_pos = lab_pos + [pos_shift.loc[i] for i in chr_considered] - + bsup = binf + chunk_size while binf < N_reg: df = read_hdf(work_file_path, "SumStatTab", columns=["CHR","position", 'JASS_PVAL', "Region"], where = "Region >= {0} and Region < {1}".format(binf, bsup)) binf+= chunk_size -- GitLab