windows.py 3.74 KB
Newer Older
1
2
3
4
"""
implement the imputation window is sliding along the genome:

- ImpG like: Non overlapping windows, the imputation is apply in batch to unknown snp in the window
5
- centered_window:  A sliding window centered on the Snp to impute
6
7

"""
8
9
from .stat_models import ImpG_model_batch, ImpG_model_snp
from .ld_matrix import generate_sparse_matrix
10

11
import pandas as pd
12
13
import numpy as np

14
15
16
17
18
19
20
21
22
23
24

def parse_region_position(LD_file):
    """
    Retrieve the region definition from a ld-file generated by impute_jass
    Argument :
        LD_file : A LD file generated by jass_impute

    """
    (chrom, startpos, endpos ) = LD_file.split("/")[-1].split(".")[0].split('_')
    return (chrom, startpos, endpos)

25
26

def realigned_zfiles_on_panel(ref_panel, Zscores):
27
    """
28
29
30
31
32
33
34
35
36
37
38
    Check if the counted allele is the same in the reference panel and
    the Zscore files.

    If not, the coded and other allele are inverted and the Zscores sign
    is inverted also.
    """
    allele_inverted = (ref_panel.loc[Zscores.index, 'Ref_all'] != Zscores.A0)

    Zscores.loc[allele_inverted, "A0"] = ref_panel.alt_all
    Zscores.loc[allele_inverted, "A1"] = ref_panel.Ref_all
    Zscores.loc[allele_inverted, "Z"] = - Zscores.loc[allele_inverted, "Z"]
39

40
41
    return Zscores

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def prepare_Zscore_for_imputation(ref_panel, Zscores):
    """
    Prepare the known Z score by realigning them on the reference ref_panel
    the snps that are not present in the ref panel are filtered
    """
    Zscores = realigned_zfiles_on_panel(ref_panel, Zscores)
    Zscores['Var'] = 1
    Zscores['Nsnp_to_impute'] = -1
    Zscores = Zscores.loc[Zscores.index.intersection(ref_panel.index)]
    return Zscores


def in_region(pos_vector, start, end):
    return ((start < pos_vector) & (pos_vector < end))

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
57
def Ld_region_centered_window_imputation(LD_file, ref_panel, Zscores, window_size, unknowns=pd.Series([])):
58
59
60
    """
        Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
        Argument
61
    """
62
    (chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file)
63
64
65
66


    LD_mat = generate_sparse_matrix(LD_file, ref_panel)

67
    #Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
68
    Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
69

70
    # Find Snp to impute
71
72
73
    if len(unknowns) == 0:
        unknowns = LD_mat.index.difference(Zscores.index)

74
    N_snp = len(unknowns)
75
    print("### Imputation of {0} snps ###".format(len(unknowns)))
76
    i = 0
77
78
79

    for snp_unknown in unknowns:
        # Boundary of the centered_window
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
80
81
82
        #print(((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block)))
        start_pos = max((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block))
        end_pos = min(ref_panel.loc[snp_unknown,'pos'] + window_size, float(end_ld_block))
83
84
        #print(snp_unknown, start_pos, end_pos, start_ld_block, end_ld_block)

85
        in_LD_reg_n_window =  in_region(Zscores.pos, start_pos, end_pos)
86
87

        known = Zscores.loc[in_LD_reg_n_window].index
88
89
90
91
        Sig_t = LD_mat.loc[known, known]
        Sig_i_t = LD_mat.loc[snp_unknown, known]
        Zt = Zscores.loc[known,'Z']

92
93
        if(len(known) > 0):
            imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)
94
95
96
            Zscores.loc[snp_unknown, "pos"] = ref_panel.loc[snp_unknown, 'pos']
            Zscores.loc[snp_unknown, "A0"] = ref_panel.loc[snp_unknown, "Ref_all"]
            Zscores.loc[snp_unknown, "A1"] = ref_panel.loc[snp_unknown, "alt_all"]
97
98
            Zscores.loc[snp_unknown, "Z"] = imp['mu']
            Zscores.loc[snp_unknown, "Var"] = imp['Var']
99
            Zscores.loc[snp_unknown, 'Nsnp_to_impute'] = len(known)
100
        i = i+1
101
        if i%100 == 0:
102
            print("{0}\%".format(np.round(i/N_snp,4)))
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
103
    print(Zscores.head(10))
104
    return Zscores.sort_values(by="pos")