ld_matrix.py 3.4 KB
Newer Older
1
# coding: utf-8
2
3
4
5
"""
    Function set to compute LD correlation from a reference panel
    in predefined Region

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
6
    LD matrix are then transformed to the pandas sparse format
7
8
9
10
11
"""

import scipy as sc
import pandas as pd
import subprocess as sub
12
13
14
import pkg_resources
import numpy as np
import re
15
16
17
18


def launch_plink_ld(startpos, endpos, chr, reffile, folder):
    """
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
19
20
21
22
23
24
25
26
27
    launch plink linkage desiquilibrium correlation and save
    the ouput

    Args:
        startpos (int): position of the start of the window
        endpos (int): position of the end of the window
        chr (str): chromosome position
        reffile (str): reference panel file
        folder (str): output folder
28
    """
29
30
31
32
33
    bimref = reffile + ".bim"
    ref_panel = pd.read_csv(bimref, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)

    ref_panel = ref_panel.loc[(ref_panel.pos > startpos) & (ref_panel.pos < endpos)]
    ref_panel.index.to_series().to_csv("./snp_list.txt", index=False)
34

35
    fo = "{0}/chr{1}_{2}_{3}".format(folder, chr, startpos, endpos)
36

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
37
    cmd = "plink --bfile {0} --r --ld-snp-list ./snp_list.txt --ld-window 50 --ld-window-kb 3000 --ld-window-r2 0.4 --chr {1} --out {2}".format(reffile, chr, fo)
38
39
40
    sub.check_output(cmd, shell=True)


41
def generate_sparse_matrix(plink_ld, ref_chr_df):
42
    """
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
43
44
    Extract correlation matrix from the plink correlation
    file generated by ld_matrix.launch_plink_ld
45
46
    read plink results create a sparse dataframe LD-matrix
    then save it to a zipped pickle
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
47
48
49
50
51
52
53

    Args:
        plink_ld (str): path to the plink correlation matrix file
        ref_chr_df (str):

    Returns:
        pandas.SparseDataFrame : Linkage desiquilibrium matrix
54
55
56
    """

    plink_ld = pd.read_csv(plink_ld, sep = "\s+")
57
58
59
60
61
    mat_ld = plink_ld.pivot(index='SNP_A', columns='SNP_B', values='R')
    un_index = mat_ld.index.union(mat_ld.columns)
    mat_ld = mat_ld.reindex(index=un_index, columns=un_index)
    mat_ld.fillna(0, inplace=True)

62
    sym = np.maximum(mat_ld.values,mat_ld.values.transpose())
63

64
65
66
    mat_ld = pd.DataFrame(sym, index=mat_ld.index, columns=mat_ld.columns)
    re_index = ref_chr_df.loc[mat_ld.index].sort_values(by="pos").index
    mat_ld = mat_ld.loc[re_index, re_index]
67
68
    mat_ld = mat_ld.to_sparse()
    return mat_ld
69
70
71

def generate_genome_matrices(region_files, reffolder, folder_output):
    """
72
73
    go through region files and compute LD matrix for each transform and
    save the results in a pandas sparse dataframe
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
74
75
76
77
78

    Args:
        region_files (str) : region file containing beginning and end position
        reffolder (str) : folder of reference panel
        folder_output (str): folder to save plink LD correlation result files
79
80
    """
    regions = pd.read_csv(region_files)
81
    for reg in regions.iterrows():
82
83
84
85
        print(reg[0])
        # input reference panel file
        fi_ref = "{0}/{1}.eur.1pct".format(reffolder, reg[1]['chr'])

86
        chr_int = re.search('([0-9]{1,2})', str(reg[1]['chr'])).group()
87
        # Compute the LD correlation with LD
88
        launch_plink_ld(reg[1]['start'], reg[1]['stop'], chr_int, fi_ref, folder_output)
89

90
91
        #fi_plink = "{0}/{1}_{2}_{3}.ld".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"])
        #fo_mat = "{0}/{1}_{2}_{3}.mat".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"])
92
        #transform plink output to a compressed generate_sparse_matrix
93
        #generate_sparse_matrix(fi_plink, fo_mat)