imputation_launcher.py 3.75 KB
Newer Older
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
1
2
3
4
5
"""
Function set to launch imputation on a complete chromosome or
on the genome
"""
import glob
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
6
import pandas as pd
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
7
from .windows import ld_region_centered_window_imputation, impg_like_imputation, realigned_zfiles_on_panel
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
8

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
9
10
11
12
13
14
15
16
17
18
19
20

def chromosome_imputation_entry_point(chrom, zscore, ref_panel, ld_folder, window_size=10000, imputation_style="online", buf=2500, lamb= 0.01, pinv_rcond = 0.01):
    """
    Thin wrapper allowing ImputationLauncher.chromosome_imputation to be
    called from an entry point.

    Args:
        chrom : chromosome identifier, forwarded to chromosome_imputation
        zscore : known zscore, forwarded to chromosome_imputation
        ref_panel : reference panel, forwarded to chromosome_imputation
        ld_folder : folder holding the LD files, forwarded to chromosome_imputation
        window_size : forwarded to the ImputationLauncher constructor
        imputation_style : forwarded to the ImputationLauncher constructor
        buf : forwarded to the ImputationLauncher constructor
        lamb : forwarded to the ImputationLauncher constructor
        pinv_rcond : forwarded to the ImputationLauncher constructor

    Returns:
        None. The imputed zscore is not handed back to the caller.
        NOTE(review): kept that way on purpose -- console-script wrappers
        usually pass the return value to sys.exit(); confirm before changing.
    """
    # Forward the caller's settings instead of re-stating the defaults:
    # hard-coded literals here would silently ignore every keyword argument.
    imputer = ImputationLauncher(
        window_size=window_size,
        imputation_style=imputation_style,
        buf=buf,
        lamb=lamb,
        pinv_rcond=pinv_rcond,
    )
    imputer.chromosome_imputation(chrom, zscore, ref_panel, ld_folder)




Hanna  JULIENNE's avatar
Hanna JULIENNE committed
21
class ImputationLauncher(object):
    """
    Perform imputation of SNP zscores from summary statistics.

    The launcher stores the imputation parameters and applies them either to
    one chromosome (chromosome_imputation) or to chromosomes 1 to 22
    (genome_imputation).
    """
    def __init__(self, window_size=10000, imputation_style="online", buf=2500, lamb= 0.01, pinv_rcond = 0.01 ):
        """
        Store the imputation parameters.

        Args:
            window_size (int): size of the imputation window in bp
            imputation_style (str): define if the windows will span the genome in a non overlapping fashion ("batch") or
                                    by being centered on each snp to impute ("online")
            buf (int): the size of the padding around the windows of imputation (relevant only for batch imputation);
                       stored as self.buffer
            lamb (float): size of the increment added to snp correlation matrices to make it less singular
            pinv_rcond (float): the rcond scipy.linalg.pinv function argument. The scipy.linalg.pinv is used to invert
                                the correlation matrices; stored as self.rcond

        """
        self.imputation_style = imputation_style
        self.window_size = window_size
        self.buffer = buf
        self.lamb = lamb
        self.rcond = pinv_rcond

    def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
        """
        Impute the panel zscore for one chromosome and with the specified parameters

        Args:
            chrom : str specifying chromosome (used as prefix of the LD file names)
            zscore : known zscore
            ref_panel : reference panel handed to the imputation helpers
            ld_folder: location of linkage disequilibrium matrices ("<chrom>*.ld" files)
        Returns
            Imputed zscore dataframe
        Raises
            ValueError: if self.imputation_style is neither "online" nor "batch"
        """
        # Fail early with a clear message: without this check an unknown style
        # leaves `imputer` undefined and surfaces as a NameError in the loop
        # (or goes unnoticed when no LD file matches).
        if self.imputation_style not in ("online", "batch"):
            raise ValueError(
                "Unknown imputation_style {0!r}: expected 'online' or 'batch'".format(self.imputation_style)
            )

        pattern = "{0}/{1}*.ld".format(ld_folder, chrom)

        # The closures below read `zscore` from the enclosing scope at call
        # time, so each region is imputed on top of the result of the
        # previous one (the loop rebinds `zscore`).
        if self.imputation_style == "online":
            def imputer(ld_file):
                return ld_region_centered_window_imputation(ld_file, ref_panel, zscore, self.window_size)
        else:
            def imputer(ld_file):
                return impg_like_imputation(ld_file, ref_panel, zscore, self.window_size, self.buffer, self.lamb, self.rcond)

        for ld_file in glob.glob(pattern):
            print("processing Region: {0}".format(ld_file))
            zscore = imputer(ld_file)

        zscore = realigned_zfiles_on_panel(ref_panel, zscore)
        return zscore

    def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder, zscore_folder, folder_output):
        """
        Launch imputation on chromosomes 1 to 22 and write one file per chromosome

        Args:
            gwas_tag : str used to build the input and output zscore file names
            ref_panel_folder : folder containing the "chr<i>.eur.1pct.bim" reference panel files
            ld_folder : location of linkage disequilibrium matrices
            zscore_folder : folder containing the known "z_<gwas_tag>_chr<i>.txt" files
            folder_output : folder where the imputed "z_<gwas_tag>_chr<i>.txt" files are written
        """
        for i in range(1, 23):
            chrom = "chr" + str(i)
            zscore_file_name = "/z_" + gwas_tag + "_" + chrom + ".txt"

            # Reference panel: tab separated, no header, second column used as index.
            ref_panel_file = ref_panel_folder + "/" + chrom + ".eur.1pct.bim"
            ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)

            known_zscore_file = zscore_folder + zscore_file_name
            known_zscore = pd.read_csv(known_zscore_file, index_col=0, sep="\t")

            z_imp = self.chromosome_imputation(chrom, known_zscore, ref_panel, ld_folder)

            imputed_zscore = folder_output + zscore_file_name
            z_imp.to_csv(imputed_zscore, sep="\t")