Commit 61926e2f authored by Hanna  JULIENNE's avatar Hanna JULIENNE

enriched doc string

parent e47676db
......@@ -13,6 +13,12 @@ Welcome to Peppa-PIG's documentation!
What is Peppa-PIG ?
===================
Dependencies
============
peppa-pig requires plink (version to be specified)
Installation
============
......
......@@ -7,14 +7,12 @@ import pandas as pd
from .windows import ld_region_centered_window_imputation, prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel
class ImputationLauncher(object):
"""
Class perform imputation of snp from summary statistic
Class to perform imputation of snp from summary statistic
"""
def __init__(self, window_size=10000, buf=2500,
lamb= 0.01, pinv_rcond = 0.01 ):
lamb= 0.01, pinv_rcond = 0.01):
"""
Initialise the imputation object. Fix the windows size, the buffer size
and the kind of imputation employed
......@@ -22,12 +20,11 @@ class ImputationLauncher(object):
Args:
window_size (int): size of the imputation window in bp
buffer (int): the size of the padding around the windows of
imputation (relevant only for batch imputation)
imputation (relevant only for batch imputation)
lamb (float): size of the increment added to snp correlation
matrices to make it less singular
matrices to make it less singular
pinv_rcond (float): the rcond scipy.linalg.pinv function argument.
The scipy.linalg.pinv is used to invert
the correlation matrices
The scipy.linalg.pinv is used to invert the correlation matrices
"""
self.window_size = window_size
......@@ -42,12 +39,13 @@ class ImputationLauncher(object):
parameters
Args:
chrom : str specifying chromosome
zscore : known zscore
ref_panel : location of the folder of reference chromosome
ld_folder: location of linkage desiquilibrium matrices
Returns
Imputed zscore dataframe
chrom (str): chromosome "chr*"
zscore (pandas dataframe): known zscore
ref_panel (str): path of the folder of reference panel
ld_folder (str): path of the folder containing linkage disequilibrium matrices
Returns:
pandas dataframe: Imputed zscore dataframe
"""
pattern = "{0}/{1}_*.ld".format(ld_folder, chrom)
zscore = prepare_zscore_for_imputation(ref_panel, zscore)
......@@ -70,8 +68,15 @@ class ImputationLauncher(object):
def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder, zscore_folder, folder_output):
"""
Launch imputation on all chromosome for one trait
Launch imputation on all chromosomes for one trait by calling
chromosome_imputation for each chromosome
Args:
gwas_tag (str): a short string to annotate imputed GWAS files
ref_panel_folder (str): path of the folder of reference panel
ld_folder (str): path of the folder containing linkage disequilibrium matrices
zscore_folder (str): path of the folder for input GWAS files
folder_output (str): path of the folder for imputed GWAS files
"""
for i in range(1, 23):
......
......@@ -3,7 +3,7 @@
Function set to compute LD correlation from a reference panel
in predefined Region
LD matrix are then stored to the scipy sparse matrix format
LD matrix are then transformed to the pandas sparse format
"""
import scipy as sc
......@@ -16,7 +16,15 @@ import re
def launch_plink_ld(startpos, endpos, chr, reffile, folder):
"""
launch plink ld
launch plink linkage disequilibrium correlation and save
the output
Args:
startpos (int): position of the start of the window
endpos (int): position of the end of the window
chr (str): chromosome position
reffile (str): reference panel file
folder (str): output folder
"""
bimref = reffile + ".bim"
ref_panel = pd.read_csv(bimref, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
......@@ -26,16 +34,23 @@ def launch_plink_ld(startpos, endpos, chr, reffile, folder):
fo = "{0}/chr{1}_{2}_{3}".format(folder, chr, startpos, endpos)
cmd = "p-link --noweb --bfile {0} --r --ld-snp-list ./snp_list.txt --ld-window 50 --ld-window-kb 3000 --ld-window-r2 0.4 --chr {1} --out {2}".format(reffile, chr, fo)
cmd = "plink --bfile {0} --r --ld-snp-list ./snp_list.txt --ld-window 50 --ld-window-kb 3000 --ld-window-r2 0.4 --chr {1} --out {2}".format(reffile, chr, fo)
sub.check_output(cmd, shell=True)
def generate_sparse_matrix(plink_ld, ref_chr_df):
"""
Extract correlation matrix from the plink correlation
file generated by ld_matrix.launch_plink_ld
read plink results create a sparse dataframe LD-matrix
then save it to a zipped pickle
Args:
plink_ld (str): path to the plink correlation matrix file
ref_chr_df (str):
Returns:
pandas.SparseDataFrame : Linkage disequilibrium matrix
"""
plink_ld = pd.read_csv(plink_ld, sep = "\s+")
......@@ -51,12 +66,16 @@ def generate_sparse_matrix(plink_ld, ref_chr_df):
mat_ld = mat_ld.loc[re_index, re_index]
mat_ld = mat_ld.to_sparse()
return mat_ld
#mat_ld.to_pickle(path_ld_mat,, compression='gzip')
def generate_genome_matrices(region_files, reffolder, folder_output):
"""
go through the region file, compute the LD matrix for each region, then
transform it and save the result in a pandas sparse dataframe
Args:
region_files (str) : region file containing beginning and end position
reffolder (str) : folder of reference panel
folder_output (str): folder to save plink LD correlation result files
"""
regions = pd.read_csv(region_files)
for reg in regions.iterrows():
......
......@@ -21,6 +21,7 @@ import scipy.linalg
def compute_mu(sig_i_t, sig_t_inv, zt):
"""
Compute the estimation of z-score from neighboring SNPs
Args:
sig_i_t (matrix?) : correlation matrix with line corresponding to
unknown Snp (snp to impute) and column to known SNPs
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment