ld_matrix.py 2.39 KB
Newer Older
1
# coding: utf-8
2
3
4
5
6
7
8
9
10
11
"""
    Function set to compute LD correlation from a reference panel
    in predefined Region

    LD matrix are then stored to the scipy sparse matrix format
"""

import scipy as sc
import pandas as pd
import subprocess as sub
12
13
14
import pkg_resources
import numpy as np
import re
15
16
17
18
19
20
21


def launch_plink_ld(startpos, endpos, chr, reffile, folder):
    """
    launch plink ld
    """

22
    fo = "{0}/chr{1}_{2}_{3}".format(folder, chr, startpos, endpos)
23
    cmd = "p-link --noweb --bfile {0} --r --ld-window-r2 0  --from-bp {1} --to-bp {2} --chr {3} --out {4}".format(reffile, startpos, endpos, chr, fo)
24
25
    #print(cmd)

26
27
28
    sub.check_output(cmd, shell=True)


29
def generate_sparse_matrix(plink_ld):
30
31
32
33
34
35
    """
    read plink results create a sparse dataframe LD-matrix
    then save it to a zipped pickle
    """

    plink_ld = pd.read_csv(plink_ld, sep = "\s+")
36
37
38
39
40
41
42
43
44
45
46
47
48
    mat_ld = plink_ld.pivot(index='SNP_A', columns='SNP_B', values='R')
    un_index = mat_ld.index.union(mat_ld.columns)
    mat_ld = mat_ld.reindex(index=un_index, columns=un_index)
    mat_ld.fillna(0, inplace=True)

    sym = mat_ld.values + mat_ld.values.transpose()
    np.fill_diagonal(sym, 1.01)
    mat_ld = pd.DataFrame(sym, index=mat_ld.index, columns=mat_ld.columns)

#    mat_ld = pd.DataFrame(np.maximum(mat_ld.values, mat_ld.values.transpose()), index=un_index, columns=un_index)
    mat_ld = mat_ld.to_sparse()
    return mat_ld
    #mat_ld.to_pickle(path_ld_mat,, compression='gzip')
49
50
51

def generate_genome_matrices(region_files, reffolder, folder_output):
    """
52
53
    go through region files and compute LD matrix for each transform and
    save the results in a pandas sparse dataframe
54
55
    """
    regions = pd.read_csv(region_files)
56
    for reg in regions.iterrows():
57
58
59
60
        print(reg[0])
        # input reference panel file
        fi_ref = "{0}/{1}.eur.1pct".format(reffolder, reg[1]['chr'])

61
        chr_int = re.search('([0-9]{1,2})', str(reg[1]['chr'])).group()
62
        # Compute the LD correlation with LD
63
        launch_plink_ld(reg[1]['start'], reg[1]['stop'], chr_int, fi_ref, folder_output)
64

65
66
        #fi_plink = "{0}/{1}_{2}_{3}.ld".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"])
        #fo_mat = "{0}/{1}_{2}_{3}.mat".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"])
67
        #transform plink output to a compressed generate_sparse_matrix
68
        #generate_sparse_matrix(fi_plink, fo_mat)