imputation_launcher.py 4.03 KB
Newer Older
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
1
2
3
4
5
"""
Function set to launch imputation on a complete chromosome or
on the genome
"""
import glob
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
6
import pandas as pd
7
from .windows import ld_region_centered_window_imputation, prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
8

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
9
10


Hanna  JULIENNE's avatar
Hanna JULIENNE committed
11
class ImputationLauncher(object):
    """
    Perform imputation of SNP z-scores from summary statistics.

    The launcher stores the imputation parameters and applies them either
    to a single chromosome (``chromosome_imputation``) or to chromosomes
    1 to 22 (``genome_imputation``).
    """

    # Imputation styles handled by chromosome_imputation.
    _IMPUTATION_STYLES = ("batch", "online")

    def __init__(self, window_size=10000, imputation_style="batch", buf=2500,
                 lamb=0.01, pinv_rcond=0.01):
        """
        Initialise the imputation object. Fix the window size, the buffer
        size and the kind of imputation employed.

        Args:
            window_size (int): size of the imputation window in bp
            imputation_style (str): define if the windows will span the
                genome in a non overlapping fashion ("batch") or by being
                centered on each snp to impute ("online")
            buf (int): the size of the padding around the windows of
                imputation (relevant only for batch imputation); stored
                as ``self.buffer``
            lamb (float): size of the increment added to snp correlation
                matrices to make it less singular
            pinv_rcond (float): the rcond scipy.linalg.pinv function
                argument. The scipy.linalg.pinv is used to invert the
                correlation matrices; stored as ``self.rcond``
        """
        self.imputation_style = imputation_style
        self.window_size = window_size
        self.buffer = buf
        self.lamb = lamb
        self.rcond = pinv_rcond

    def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
        """
        Impute the panel zscore for one chromosome with the parameters
        stored on the launcher.

        Args:
            chrom (str): chromosome identifier; used as the prefix of the
                LD files looked up as ``<ld_folder>/<chrom>_*.ld``
            zscore: known zscore
            ref_panel: reference panel for the chromosome
            ld_folder (str): location of the linkage disequilibrium
                matrices

        Returns:
            Imputed zscore dataframe (known and imputed rows, sorted by
            "pos" and then realigned on the reference panel).

        Raises:
            ValueError: if ``self.imputation_style`` is neither "batch"
                nor "online".
        """
        # Fail early with an explicit message: otherwise an unsupported
        # style would leave `imputer` undefined and only surface as an
        # UnboundLocalError once the first LD file is processed.
        if self.imputation_style not in self._IMPUTATION_STYLES:
            raise ValueError(
                "Unknown imputation_style {0!r}; expected one of {1}".format(
                    self.imputation_style, self._IMPUTATION_STYLES))

        pattern = "{0}/{1}_*.ld".format(ld_folder, chrom)
        zscore = prepare_zscore_for_imputation(ref_panel, zscore)

        if self.imputation_style == "online":
            def imputer(ld_file):
                return ld_region_centered_window_imputation(ld_file, ref_panel,
                                                            zscore,
                                                            self.window_size)
        else:  # "batch"
            def imputer(ld_file):
                return impg_like_imputation(ld_file, ref_panel, zscore,
                                            self.window_size, self.buffer,
                                            self.lamb, self.rcond)

        # Start from a copy of the known zscores, then gather every imputed
        # region and concatenate once (avoids recopying the growing frame
        # at each iteration).
        batches = [zscore.copy(deep=True)]
        # glob order is arbitrary; sort for a reproducible processing order.
        for ld_file in sorted(glob.glob(pattern)):
            print("processing Region: {0}".format(ld_file))
            batches.append(imputer(ld_file))
        zscore_results = pd.concat(batches)

        # sort_values returns a new frame: the result must be re-assigned,
        # otherwise the sort is silently discarded.
        zscore_results = zscore_results.sort_values(by="pos")
        zscore_results = realigned_zfiles_on_panel(ref_panel, zscore_results)
        return zscore_results

    def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder,
                          zscore_folder, folder_output):
        """
        Launch imputation on chromosomes 1 to 22 for one trait.

        For each chromosome ``i`` the method reads
        ``<ref_panel_folder>/chr<i>.eur.1pct.bim`` and
        ``<zscore_folder>/z_<gwas_tag>_chr<i>.txt``, imputes the chromosome
        and writes the result to
        ``<folder_output>/z_<gwas_tag>_chr<i>.txt`` (tab separated).

        Args:
            gwas_tag (str): tag of the trait, used in the input and output
                zscore file names
            ref_panel_folder (str): folder containing the reference panel
                ".bim" files
            ld_folder (str): location of the linkage disequilibrium
                matrices
            zscore_folder (str): folder containing the known zscore files
            folder_output (str): folder where the imputed zscore files are
                written
        """
        for i in range(1, 23):
            chrom = "chr{0}".format(i)
            zscore_name = "z_{0}_{1}.txt".format(gwas_tag, chrom)

            ref_panel_file = "{0}/{1}.eur.1pct.bim".format(ref_panel_folder,
                                                           chrom)
            # index_col=1 selects the second column (named "nothing" here)
            # as the index of the panel.
            ref_panel = pd.read_csv(
                ref_panel_file, sep="\t",
                names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'],
                index_col=1)

            known_zscore_file = "{0}/{1}".format(zscore_folder, zscore_name)
            known_zscore = pd.read_csv(known_zscore_file, index_col=0,
                                       sep="\t")

            z_imp = self.chromosome_imputation(chrom, known_zscore, ref_panel,
                                               ld_folder)

            imputed_zscore = "{0}/{1}".format(folder_output, zscore_name)
            z_imp.to_csv(imputed_zscore, sep="\t")