From 61926e2fdb7a7d6615c8d24055a36109fd38be48 Mon Sep 17 00:00:00 2001 From: hanna julienne <hanna.julienne@pasteur.fr> Date: Wed, 22 Aug 2018 18:19:05 +0200 Subject: [PATCH] enriched doc string --- impute_jass/doc/source/index.rst | 6 ++++ .../impute_jass/imputation_launcher.py | 35 +++++++++++-------- impute_jass/impute_jass/ld_matrix.py | 31 ++++++++++++---- impute_jass/impute_jass/stat_models.py | 1 + 4 files changed, 52 insertions(+), 21 deletions(-) diff --git a/impute_jass/doc/source/index.rst b/impute_jass/doc/source/index.rst index e6546bb..e275b27 100644 --- a/impute_jass/doc/source/index.rst +++ b/impute_jass/doc/source/index.rst @@ -13,6 +13,12 @@ Welcome to Peppa-PIG's documentation! What is Peppa-PIG ? =================== +Dependencies +============ + +peppa-pig requires plink version + + Installation ============ diff --git a/impute_jass/impute_jass/imputation_launcher.py b/impute_jass/impute_jass/imputation_launcher.py index 85a250b..2d70452 100644 --- a/impute_jass/impute_jass/imputation_launcher.py +++ b/impute_jass/impute_jass/imputation_launcher.py @@ -7,14 +7,12 @@ import pandas as pd from .windows import ld_region_centered_window_imputation, prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel - class ImputationLauncher(object): """ - Class perform imputation of snp from summary statistic - + Class to perform imputation of snp from summary statistic """ def __init__(self, window_size=10000, buf=2500, - lamb= 0.01, pinv_rcond = 0.01 ): + lamb= 0.01, pinv_rcond = 0.01): """ Initialise the imputation object. 
Fix the windows size, the buffer size and the king of imputation employed @@ -22,12 +20,11 @@ class ImputationLauncher(object): Args: window_size (int): size of the imputation window in bp buffer (int): the size of the padding around the windows of - imputation (relevant only for batch imputation) + imputation (relevant only for batch imputation) lamb (float): size of the increment added to snp correlation - matrices to make it less singular + matrices to make it less singular pinv_rcond (float): the rcond scipy.linalg.pinv function argument. - The scipy.linalg.pinv is used to invert - the correlation matrices + The scipy.linalg.pinv is used to invert the correlation matrices """ self.window_size = window_size @@ -42,12 +39,13 @@ class ImputationLauncher(object): parameters Args: - chrom : str specifying chromosome - zscore : known zscore - ref_panel : location of the folder of reference chromosome - ld_folder: location of linkage desiquilibrium matrices - Returns - Imputed zscore dataframe + chrom (str): chromosome "chr*" + zscore (pandas dataframe): known zscore + ref_panel (str): path of the folder of reference panel + ld_folder (str): path of the folder containing linkage disequilibrium matrices + + Returns: + pandas dataframe: Imputed zscore dataframe """ pattern = "{0}/{1}_*.ld".format(ld_folder, chrom) zscore = prepare_zscore_for_imputation(ref_panel, zscore) @@ -70,8 +68,15 @@ class ImputationLauncher(object): def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder, zscore_folder, folder_output): """ - Launch imputation on all chromosome for one trait + Launch imputation on all chromosomes for one trait by calling + chromosome_imputation for each chromosome + Args: + gwas_tag (str): a short string to annotate imputed GWAS files + ref_panel_folder (str): path of the folder of reference panel + ld_folder (str): path of the folder containing linkage disequilibrium matrices + zscore_folder (str): path of the folder for input GWAS files + folder_output 
(str): path of the folder for imputed GWAS files """ for i in range(1, 23): diff --git a/impute_jass/impute_jass/ld_matrix.py b/impute_jass/impute_jass/ld_matrix.py index 95a1626..5b17f41 100644 --- a/impute_jass/impute_jass/ld_matrix.py +++ b/impute_jass/impute_jass/ld_matrix.py @@ -3,7 +3,7 @@ Function set to compute LD correlation from a reference panel in predefined Region - LD matrix are then stored to the scipy sparse matrix format + LD matrices are then transformed to the pandas sparse format """ import scipy as sc @@ -16,7 +16,15 @@ import re def launch_plink_ld(startpos, endpos, chr, reffile, folder): """ - launch plink ld + launch plink linkage disequilibrium correlation and save + the output + + Args: + startpos (int): position of the start of the window + endpos (int): position of the end of the window + chr (str): chromosome position + reffile (str): reference panel file + folder (str): output folder """ bimref = reffile + ".bim" ref_panel = pd.read_csv(bimref, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1) @@ -26,16 +34,23 @@ def launch_plink_ld(startpos, endpos, chr, reffile, folder): fo = "{0}/chr{1}_{2}_{3}".format(folder, chr, startpos, endpos) - cmd = "p-link --noweb --bfile {0} --r --ld-snp-list ./snp_list.txt --ld-window 50 --ld-window-kb 3000 --ld-window-r2 0.4 --chr {1} --out {2}".format(reffile, chr, fo) - - + cmd = "plink --bfile {0} --r --ld-snp-list ./snp_list.txt --ld-window 50 --ld-window-kb 3000 --ld-window-r2 0.4 --chr {1} --out {2}".format(reffile, chr, fo) sub.check_output(cmd, shell=True) def generate_sparse_matrix(plink_ld, ref_chr_df): """ + Extract correlation matrix from the plink correlation + file generated by ld_matrix.launch_plink_ld read plink results create a sparse dataframe LD-matrix then save it to a zipped pickle + + Args: + plink_ld (str): path to the plink correlation matrix file + ref_chr_df (str): + + Returns: + pandas.SparseDataFrame : Linkage disequilibrium matrix """ plink_ld = 
pd.read_csv(plink_ld, sep = "\s+") @@ -51,12 +66,16 @@ def generate_sparse_matrix(plink_ld, ref_chr_df): mat_ld = mat_ld.loc[re_index, re_index] mat_ld = mat_ld.to_sparse() return mat_ld - #mat_ld.to_pickle(path_ld_mat,, compression='gzip') def generate_genome_matrices(region_files, reffolder, folder_output): """ go through region files and compute LD matrix for each transform and save the results in a pandas sparse dataframe + + Args: + region_files (str) : region file containing beginning and end position + reffolder (str) : folder of reference panel + folder_output (str): folder to save plink LD correlation result files """ regions = pd.read_csv(region_files) for reg in regions.iterrows(): diff --git a/impute_jass/impute_jass/stat_models.py b/impute_jass/impute_jass/stat_models.py index 16cca0c..79f73a0 100644 --- a/impute_jass/impute_jass/stat_models.py +++ b/impute_jass/impute_jass/stat_models.py @@ -21,6 +21,7 @@ import scipy.linalg def compute_mu(sig_i_t, sig_t_inv, zt): """ Compute the estimation of z-score from neighborring snp + Args: sig_i_t (matrix?) : correlation matrix with line corresponding to unknown Snp (snp to impute) and column to known SNPs -- GitLab