From 46ad7c44c821a62575dace80ba9e1b6b815d9b48 Mon Sep 17 00:00:00 2001
From: Veronique Legrand <vlegrand@pasteur.fr>
Date: Thu, 5 Dec 2024 15:52:45 +0100
Subject: [PATCH] Change the loading of data required to draw the global plot;
 it now reads only 3 columns of the original HDF file.

---
 jass/models/plots.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/jass/models/plots.py b/jass/models/plots.py
index f00b5ad6..ef60219e 100644
--- a/jass/models/plots.py
+++ b/jass/models/plots.py
@@ -17,10 +17,10 @@ from matplotlib import colors
 import matplotlib.patches as mpatches
 from scipy.stats import norm, chi2
 import seaborn as sns
-import os
 from pandas import DataFrame, read_hdf
 import pandas as pd
 
+default_chunk_size=50
 
 def replaceZeroes(df):
     """
@@ -32,30 +32,34 @@ def replaceZeroes(df):
     df.values[df.values == 0] = min_nonzero
     return df
 
-def create_global_plot(work_file_path: str, global_plot_path: str):
+
+def get_info_4_global_plot(work_file_path: str):
+    regions = read_hdf(work_file_path, "Regions",columns=['Region','CHR','MiddlePosition'])
+    print(regions.dtypes)
+    N_reg = regions.Region.max()  # Keep biggest element in Region column
+    binf = regions.Region.iloc[0]
+    chr_considered = regions.CHR.unique()
+    length_chr = regions.groupby("CHR").MiddlePosition.max() / 10 ** 6
+    length_chr.loc[0] = 0
+    return N_reg,binf,chr_considered,length_chr
+
+def create_global_plot(work_file_path: str, global_plot_path: str, chunk_size:int =default_chunk_size):
     """
     create_global_plot
     generate genome-wide manhattan plot for a given set of phenotypes
     """
 
-    regions = read_hdf(work_file_path, "Regions")
-    #chr_length = regions.groupby('CHR').max().position
-    N_reg= regions.Region.max()
+    N_reg,binf,chr_considered,length_chr=get_info_4_global_plot(work_file_path)
     maxy = 0
 
     fig = plt.figure(figsize=(30, 12))
     ax = fig.add_subplot(111)
 
-    chunk_size = 50
     colors = [
         '#4287f5',
         'orangered'
         ]
-    binf=regions.Region.iloc[0]
-    bsup= binf+chunk_size
-    chr_considered= regions.CHR.unique()
-    length_chr = regions.groupby("CHR").MiddlePosition.max() / 10**6
-    length_chr.loc[0] = 0
+
     label = "Chr"+length_chr.loc[chr_considered].index.astype("str")
 
     lab_pos = length_chr.loc[chr_considered]/2
@@ -63,7 +67,7 @@ def create_global_plot(work_file_path: str, global_plot_path: str):
     pos_shift.index = pos_shift.index +1
     pos_shift.loc[chr_considered[0]] = 0
     lab_pos = lab_pos + [pos_shift.loc[i] for i in chr_considered]
-
+    bsup = binf + chunk_size
     while binf < N_reg:
         df = read_hdf(work_file_path, "SumStatTab", columns=["CHR","position", 'JASS_PVAL', "Region"], where = "Region >= {0} and Region < {1}".format(binf, bsup))
         binf+= chunk_size
-- 
GitLab