windows.py 3.72 KB
Newer Older
1 2 3 4
"""
implement the imputation window is sliding along the genome:

- ImpG like: Non overlapping windows, the imputation is apply in batch to unknown snp in the window
5
- centered_window:  A sliding window centered on the Snp to impute
6 7

"""
8 9
from .stat_models import ImpG_model_batch, ImpG_model_snp
from .ld_matrix import generate_sparse_matrix
10

11
import pandas as pd
12 13
import numpy as np

14 15 16 17 18 19 20 21 22 23 24

def parse_region_position(LD_file):
    """
    Retrieve the region definition from a ld-file generated by impute_jass
    Argument :
        LD_file : A LD file generated by jass_impute

    """
    (chrom, startpos, endpos ) = LD_file.split("/")[-1].split(".")[0].split('_')
    return (chrom, startpos, endpos)

25 26

def realigned_zfiles_on_panel(ref_panel, Zscores):
27
    """
28 29 30 31 32 33 34 35 36 37 38
    Check if the counted allele is the same in the reference panel and
    the Zscore files.

    If not, the coded and other allele are inverted and the Zscores sign
    is inverted also.
    """
    allele_inverted = (ref_panel.loc[Zscores.index, 'Ref_all'] != Zscores.A0)

    Zscores.loc[allele_inverted, "A0"] = ref_panel.alt_all
    Zscores.loc[allele_inverted, "A1"] = ref_panel.Ref_all
    Zscores.loc[allele_inverted, "Z"] = - Zscores.loc[allele_inverted, "Z"]
39

40 41
    return Zscores

42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
def prepare_Zscore_for_imputation(ref_panel, Zscores):
    """
    Prepare the known Z score by realigning them on the reference ref_panel
    the snps that are not present in the ref panel are filtered
    """
    Zscores = realigned_zfiles_on_panel(ref_panel, Zscores)
    Zscores['Var'] = 1
    Zscores['Nsnp_to_impute'] = -1
    Zscores = Zscores.loc[Zscores.index.intersection(ref_panel.index)]
    return Zscores


def in_region(pos_vector, start, end):
    return ((start < pos_vector) & (pos_vector < end))

57
def centered_window_imputation(LD_file, ref_panel_folder, Zscores, window_size, unknowns=pd.Series([])):
58 59 60
    """
        Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
        Argument
61
    """
62
    (chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file)
63 64 65 66 67 68

    ref_panel_file = "/mnt/atlas/PCMA/1._DATA/ImpG_refpanel/{0}.eur.1pct.bim".format(chrom)
    ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)

    LD_mat = generate_sparse_matrix(LD_file, ref_panel)

69
    #Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
70
    Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
71

72
    # Find Snp to impute
73 74 75
    if len(unknowns) == 0:
        unknowns = LD_mat.index.difference(Zscores.index)

76
    N_snp = len(unknowns)
77
    print("### Imputation of {0} snps ###".format(len(unknowns)))
78
    i = 0
79 80 81 82

    for snp_unknown in unknowns:
        # Boundary of the centered_window

83 84 85 86 87 88 89
        start_pos = ref_panel.loc[snp_unknown,'pos'] - window_size
        end_pos = ref_panel.loc[snp_unknown,'pos'] + window_size
        #print(snp_unknown, start_pos, end_pos, start_ld_block, end_ld_block)

        in_LD_reg_n_window = in_region(Zscores.pos, int(start_ld_block), int(end_ld_block)) & in_region(Zscores.pos, start_pos, end_pos)

        known = Zscores.loc[in_LD_reg_n_window].index
90 91 92 93
        Sig_t = LD_mat.loc[known, known]
        Sig_i_t = LD_mat.loc[snp_unknown, known]
        Zt = Zscores.loc[known,'Z']

94 95
        if(len(known) > 0):
            imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)
96
            Zscores.loc[snp_unknown, ["pos", "A0", "A1"]] = ref_panel.loc[snp_unknown, ['pos', "Ref_all", "alt_all"]]
97 98
            Zscores.loc[snp_unknown, "Z"] = imp['mu']
            Zscores.loc[snp_unknown, "Var"] = imp['Var']
99
            Zscores.loc[snp_unknown, 'Nsnp_to_impute'] = len(known)
100 101
        i = i+1

102
        if i%100 == 0:
103 104
            print("{0}\%".format(np.round(i/N_snp,4)))

105

106
    return Zscores.sort_values(by="pos")