imputation_launcher.py 3.82 KB
Newer Older
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# -*- coding: utf-8 -*-
"""Imputation launcher

Set of functions to launch SNP imputation on a complete chromosome or
on the whole genome

"""

import glob
import pandas as pd
from .windows import prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel

class ImputationLauncher(object):
    """
    Launch the imputation of SNP z-scores from summary statistics.

    The instance only stores the imputation parameters; the numerical work
    is delegated to the helpers imported from the ``windows`` module.
    """

    def __init__(self, window_size=10000, buf=2500,
                 lamb=0.01, pinv_rcond=0.01):
        """
        Initialise the imputation object. Fix the window size, the buffer
        size and the kind of imputation employed.

        Args:
            window_size (int): size of the imputation window in bp
            buf (int): the size of the padding around the windows of
                imputation (relevant only for batch imputation). Stored as
                ``self.buffer``.
            lamb (float): size of the increment added to snp correlation
                matrices to make it less singular
            pinv_rcond (float): the rcond scipy.linalg.pinv function argument.
                The scipy.linalg.pinv is used to invert the correlation
                matrices. Stored as ``self.rcond``.
        """
        self.window_size = window_size
        self.buffer = buf
        self.lamb = lamb
        self.rcond = pinv_rcond

    def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
        """
        Impute the panel z-scores for one chromosome with the parameters
        stored on the instance.

        Every file matching ``<ld_folder>/<chrom>_*.ld`` is handed to
        ``impg_like_imputation``; the resulting batches are appended to the
        prepared known z-scores, sorted by the ``pos`` column and passed
        through ``realigned_zfiles_on_panel``.

        Args:
            chrom (str): chromosome "chr*"
            zscore (pandas dataframe): known zscore
            ref_panel (pandas dataframe): reference panel of the chromosome
                (``genome_imputation`` passes the table read from the
                ``.bim`` file)
            ld_folder (str): path of the folder containing linkage
                disequilibrium matrices

        Returns:
            pandas dataframe: Imputed zscore dataframe
        """
        pattern = "{0}/{1}_*.ld".format(ld_folder, chrom)
        zscore = prepare_zscore_for_imputation(ref_panel, zscore)

        # Deep copy taken before any region is processed so that the output
        # never aliases the frame handed to the imputation helper.
        batches = [zscore.copy(deep=True)]

        # glob yields files in arbitrary order; sorting makes the processing
        # order (and the order of rows sharing a position) reproducible.
        for ld_file in sorted(glob.glob(pattern)):
            print("processing Region: {0}".format(ld_file))
            batches.append(
                impg_like_imputation(ld_file, ref_panel, zscore,
                                     self.window_size, self.buffer,
                                     self.lamb, self.rcond))

        # One concatenation instead of one per region: avoids re-copying the
        # accumulated frame at every iteration.
        zscore_results = pd.concat(batches)
        zscore_results.sort_values(by="pos", inplace=True)
        return realigned_zfiles_on_panel(ref_panel, zscore_results)

    def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder,
                          zscore_folder, folder_output,
                          chromosomes=None, ref_panel_suffix=".eur.1pct.bim"):
        """
        Launch imputation on all chromosomes for one trait by calling
        chromosome_imputation for each chromosome.

        For each chromosome number ``i`` the method reads
        ``<ref_panel_folder>/chr<i><ref_panel_suffix>`` and
        ``<zscore_folder>/z_<gwas_tag>_chr<i>.txt`` and writes
        ``<folder_output>/z_<gwas_tag>_chr<i>.txt`` (tab separated).

        Args:
            gwas_tag (str): a short string to annotate imputed GWAS files
            ref_panel_folder (str): path of the folder of reference panel
            ld_folder (str): path of the folder containing linkage
                disequilibrium matrices
            zscore_folder (str): path of the folder for input GWAS files
            folder_output (str): path of the folder for imputed GWAS files
            chromosomes (iterable of int, optional): chromosome numbers to
                process. Defaults to 1 through 22.
            ref_panel_suffix (str, optional): end of the reference panel
                file names, appended after ``chr<i>``. Defaults to
                ".eur.1pct.bim".
        """
        if chromosomes is None:
            chromosomes = range(1, 23)

        for i in chromosomes:
            chrom = "chr" + str(i)

            ref_panel_file = ref_panel_folder + "/" + chrom + ref_panel_suffix
            # The second column of the panel file is used as the index.
            ref_panel = pd.read_csv(
                ref_panel_file, sep="\t",
                names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'],
                index_col=1)

            # Input and output files share the same name pattern and differ
            # only by their folder.
            zscore_name = "/z_" + gwas_tag + "_" + chrom + ".txt"
            known_zscore = pd.read_csv(zscore_folder + zscore_name,
                                       index_col=0, sep="\t")

            z_imp = self.chromosome_imputation(chrom, known_zscore,
                                               ref_panel, ld_folder)

            z_imp.to_csv(folder_output + zscore_name, sep="\t")