ld_matrix.py 1.79 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
    Function set to compute LD correlation from a reference panel
    in predefined Region

    LD matrix are then stored to the scipy sparse matrix format
"""

import scipy as sc
import pandas as pd
import subprocess as sub

sub.check_output("pwd")

LD_region = pd.read_csv('./impute_for_jass/Imputation_for_jass/impute_jass/data/Region_LD.csv')


def launch_plink_ld(startpos, endpos, chr, reffile, folder):
    """
    launch plink ld
    """

    fo = "{0}/{1}_{2}_{3}".format(folder, chr, startpos, endpos)
    cmd = "p-link --noweb --bfile {0} --r --ld-window-r2 0  --from-bp {1} --to-bp {2} --chr {3} --out {4}".format(reffile, startpos, endpos, chr, fo)
    sub.check_output(cmd, shell=True)


def generate_sparse_matrix(plink_ld, path_ld_mat):
    """
    read plink results create a sparse dataframe LD-matrix
    then save it to a zipped pickle
    """

    plink_ld = pd.read_csv(plink_ld, sep = "\s+")
    mat_ld = plink_ld.pivot(index='SNP_A', columns='SNP_B', values='R').to_sparse(fill_value=0)
    mat_ld.to_pickle(path_ld_mat)

def generate_genome_matrices(region_files, reffolder, folder_output):
    """

    """

    regions = pd.read_csv(region_files)
    for reg in region_files.iterrows():
        print(reg[0])
        # input reference panel file
        fi_ref = "{0}/{1}.eur.1pct".format(reffolder, reg[1]['chr'])

        # Compute the LD correlation with LD
        launch_plink_ld(reg[1]['start'], reg[1]['stop'], reg[1]['chr'], fi_ref, folder_output)

        fi_plink = "{0}/{1}_{2}_{3}.ld".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"])
        fo_mat = "{0}/{1}_{2}_{3}.mat".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"])
        #transform plink output to a compressed generate_sparse_matrix
        generate_sparse_matrix(fi_plink, fo_mat)