ld_matrix.py 2.84 KB
Newer Older
1
# coding: utf-8
2
3
4
5
6
7
8
9
10
11
"""
    Function set to compute LD correlation from a reference panel
    in predefined Region

    LD matrix are then stored to the scipy sparse matrix format
"""

import scipy as sc
import pandas as pd
import subprocess as sub
12
13
14
import pkg_resources
import numpy as np
import re
15
16
17
18
19
20


def launch_plink_ld(startpos, endpos, chr, reffile, folder):
    """
    launch plink ld
    """
21
22
23
24
25
    bimref = reffile + ".bim"
    ref_panel = pd.read_csv(bimref, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)

    ref_panel = ref_panel.loc[(ref_panel.pos > startpos) & (ref_panel.pos < endpos)]
    ref_panel.index.to_series().to_csv("./snp_list.txt", index=False)
26

27
    fo = "{0}/chr{1}_{2}_{3}".format(folder, chr, startpos, endpos)
28
29
30

    cmd = "p-link --noweb --bfile {0} --r --ld-snp-list ./snp_list.txt --ld-window 50 --ld-window-kb 3000 --ld-window-r2 0.4 --chr {1} --out {2}".format(reffile, chr, fo)
    print(cmd)
31

32
33
34
    sub.check_output(cmd, shell=True)


35
def generate_sparse_matrix(plink_ld, ref_chr_df):
36
37
38
39
40
41
    """
    read plink results create a sparse dataframe LD-matrix
    then save it to a zipped pickle
    """

    plink_ld = pd.read_csv(plink_ld, sep = "\s+")
42
43
44
45
46
    mat_ld = plink_ld.pivot(index='SNP_A', columns='SNP_B', values='R')
    un_index = mat_ld.index.union(mat_ld.columns)
    mat_ld = mat_ld.reindex(index=un_index, columns=un_index)
    mat_ld.fillna(0, inplace=True)

47
48

    sym = np.maximum(mat_ld.values,mat_ld.values.transpose())
49
50
    np.fill_diagonal(sym, 1.01)

51
52
53
    mat_ld = pd.DataFrame(sym, index=mat_ld.index, columns=mat_ld.columns)
    re_index = ref_chr_df.loc[mat_ld.index].sort_values(by="pos").index
    mat_ld = mat_ld.loc[re_index, re_index]
54
55
56
57
#    mat_ld = pd.DataFrame(np.maximum(mat_ld.values, mat_ld.values.transpose()), index=un_index, columns=un_index)
    mat_ld = mat_ld.to_sparse()
    return mat_ld
    #mat_ld.to_pickle(path_ld_mat,, compression='gzip')
58
59
60

def generate_genome_matrices(region_files, reffolder, folder_output):
    """
61
62
    go through region files and compute LD matrix for each transform and
    save the results in a pandas sparse dataframe
63
64
    """
    regions = pd.read_csv(region_files)
65
    for reg in regions.iterrows():
66
67
68
69
        print(reg[0])
        # input reference panel file
        fi_ref = "{0}/{1}.eur.1pct".format(reffolder, reg[1]['chr'])

70
        chr_int = re.search('([0-9]{1,2})', str(reg[1]['chr'])).group()
71
        # Compute the LD correlation with LD
72
        launch_plink_ld(reg[1]['start'], reg[1]['stop'], chr_int, fi_ref, folder_output)
73

74
75
        #fi_plink = "{0}/{1}_{2}_{3}.ld".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"])
        #fo_mat = "{0}/{1}_{2}_{3}.mat".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"])
76
        #transform plink output to a compressed generate_sparse_matrix
77
        #generate_sparse_matrix(fi_plink, fo_mat)