"""
Function set to launch SNP imputation on a complete chromosome or
on the genome
"""
import glob
import os

import pandas as pd

from .windows import ld_region_centered_window_imputation, prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel


class ImputationLauncher(object):
    """
    Class to perform imputation of snp from summary statistic

    The constructor only stores the imputation parameters; the actual work
    is done by ``chromosome_imputation`` (one chromosome) and
    ``genome_imputation`` (loop over chromosomes, reading and writing files).
    """

    def __init__(self, window_size=10000, buf=2500,
                 lamb=0.01, pinv_rcond=0.01):
        """
        Initialise the imputation object. Fix the windows size, the buffer size
        and the kind of imputation employed

        Args:
            window_size (int): size of the imputation window in bp
            buf (int): the size of the padding around the windows of
                imputation (relevant only for batch imputation).
                Stored as ``self.buffer``.
            lamb (float): size of the increment added to snp correlation
                matrices to make it less singular
            pinv_rcond (float): the rcond scipy.linalg.pinv function argument.
                The scipy.linalg.pinv is used to invert the correlation
                matrices. Stored as ``self.rcond``.
        """
        self.window_size = window_size
        self.buffer = buf
        self.lamb = lamb
        self.rcond = pinv_rcond

    def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
        """
        Impute the panel zscore score for one chromosome and with the specified
        parameters

        Every file matching ``<ld_folder>/<chrom>_*.ld`` is passed to
        ``impg_like_imputation``; the batches it returns are appended to the
        prepared known z-scores, sorted by the "pos" column and realigned on
        the reference panel.

        Args:
            chrom (str): chromosome "chr*"
            zscore (pandas dataframe): known zscore
            ref_panel (pandas dataframe): reference panel of the chromosome
                (``genome_imputation`` passes the dataframe read from the
                panel file; it is forwarded untouched to the helpers)
            ld_folder (str): path of the folder containing linkage
                disequilibrium matrices

        Returns:
            pandas dataframe: Imputed zscore dataframe
        """
        # The path string is kept in its original "<folder>/<name>" shape
        # because it is handed as-is to impg_like_imputation.
        pattern = "{0}/{1}_*.ld".format(ld_folder, chrom)
        zscore = prepare_zscore_for_imputation(ref_panel, zscore)

        # Gather the known z-scores plus one batch per LD region and
        # concatenate once: concatenating inside the loop re-copies the
        # accumulated frame for every region.
        batches = [zscore]
        # glob order is unspecified; sorting makes the processing (and log)
        # order reproducible from one run to the next.
        for ld_file in sorted(glob.glob(pattern)):
            print("processing Region: {0}".format(ld_file))
            batches.append(
                impg_like_imputation(ld_file, ref_panel, zscore,
                                     self.window_size, self.buffer,
                                     self.lamb, self.rcond))

        # pd.concat builds a new frame and sort_values returns a new one, so
        # the prepared input frame is never modified here.
        zscore_results = pd.concat(batches).sort_values(by="pos")
        return realigned_zfiles_on_panel(ref_panel, zscore_results)

    def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder,
                          zscore_folder, folder_output,
                          chromosomes=None, ref_panel_suffix=".eur.1pct.bim"):
        """
        Launch imputation on all chromosome for one trait by calling
        chromosome_imputation for each chromosome

        For each chromosome ``N`` the method reads
        ``<ref_panel_folder>/chrN<ref_panel_suffix>`` and
        ``<zscore_folder>/z_<gwas_tag>_chrN.txt``, runs the imputation and
        writes ``<folder_output>/z_<gwas_tag>_chrN.txt`` (tab separated).

        Args:
            gwas_tag (str): a short string to annotate imputed GWAS files
            ref_panel_folder (str): path of the folder of reference panel
            ld_folder (str): path of the folder containing linkage
                disequilibrium matrices
            zscore_folder (str): path of the folder for input GWAS files
            folder_output (str): path of the folder for imputed GWAS files
            chromosomes (iterable): chromosome identifiers to process;
                defaults to 1..22 when None
            ref_panel_suffix (str): suffix appended to "chrN" to build the
                reference panel file name

        Returns:
            None: results are written to ``folder_output``
        """
        if chromosomes is None:
            chromosomes = range(1, 23)

        for i in chromosomes:
            chrom = "chr{0}".format(i)

            ref_panel_file = os.path.join(ref_panel_folder,
                                          chrom + ref_panel_suffix)
            ref_panel = pd.read_csv(
                ref_panel_file, sep="\t",
                names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'],
                index_col=1)

            # Input and output share the same file name, only the folder
            # differs.
            zscore_name = "z_{0}_{1}.txt".format(gwas_tag, chrom)
            known_zscore = pd.read_csv(
                os.path.join(zscore_folder, zscore_name),
                index_col=0, sep="\t")

            z_imp = self.chromosome_imputation(chrom, known_zscore,
                                               ref_panel, ld_folder)
            z_imp.to_csv(os.path.join(folder_output, zscore_name), sep="\t")