Skip to content
Snippets Groups Projects
Commit fa8b3139 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

Test compute LD matrix + write Impg model function

parent 2193ea22
Branches
Tags
No related merge requests found
import impute_jass.ld_matrix as LD
# coding: utf-8
""" """
Function set to compute LD correlation from a reference panel Function set to compute LD correlation from a reference panel
in predefined Region in predefined Region
...@@ -8,10 +9,9 @@ ...@@ -8,10 +9,9 @@
import scipy as sc import scipy as sc
import pandas as pd import pandas as pd
import subprocess as sub import subprocess as sub
import pkg_resources
sub.check_output("pwd") import numpy as np
import re
LD_region = pd.read_csv('./impute_for_jass/Imputation_for_jass/impute_jass/data/Region_LD.csv')
def launch_plink_ld(startpos, endpos, chr, reffile, folder):
    """
    Launch plink to compute pairwise LD correlations (r) for one region.

    Parameters
    ----------
    startpos, endpos :
        Region bounds, forwarded to plink's ``--from-bp`` / ``--to-bp``.
    chr :
        Chromosome identifier forwarded to ``--chr``. (The name shadows the
        builtin but is kept so keyword callers keep working.)
    reffile :
        Prefix of the plink binary fileset, forwarded to ``--bfile``.
    folder :
        Output directory; the plink output prefix is
        ``<folder>/chr<chr>_<startpos>_<endpos>``.

    Raises
    ------
    subprocess.CalledProcessError
        If the plink process exits with a non-zero status.
    """
    fo = "{0}/chr{1}_{2}_{3}".format(folder, chr, startpos, endpos)
    # The command is passed as an argument list (no shell) so that paths
    # containing spaces or shell metacharacters reach plink unchanged.
    # NOTE(review): shell expansions such as "~" or "$VAR" in reffile/folder
    # are therefore no longer performed; pass already-expanded paths.
    cmd = [
        "p-link", "--noweb",
        "--bfile", str(reffile),
        "--r",
        "--ld-window-r2", "0",
        "--from-bp", str(startpos),
        "--to-bp", str(endpos),
        "--chr", str(chr),
        "--out", fo,
    ]
    sub.check_output(cmd)
def generate_sparse_matrix(plink_ld):
    """
    Read a plink LD result table and return a symmetric sparse LD matrix.

    Parameters
    ----------
    plink_ld :
        Whitespace-separated plink ``--r`` output, passed as-is to
        ``pandas.read_csv``. It must contain the columns ``SNP_A``,
        ``SNP_B`` and ``R``.

    Returns
    -------
    pandas.DataFrame
        Square matrix indexed (rows and columns) by the sorted union of the
        SNP identifiers found in ``SNP_A`` and ``SNP_B``. Pairs absent from
        the table are 0, the diagonal is set to 1.01, and values are stored
        with a sparse dtype whose fill value is 0. The matrix is returned,
        not written to disk.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    ld_table = pd.read_csv(plink_ld, sep=r"\s+")
    half = ld_table.pivot(index='SNP_A', columns='SNP_B', values='R')

    # Put every SNP on both axes so the matrix is square before mirroring.
    snps = half.index.union(half.columns)
    half = half.reindex(index=snps, columns=snps).fillna(0)

    # Each pair is expected once in the table, so adding the transpose
    # mirrors it; force float so the diagonal value below is not truncated.
    half_values = half.values.astype(float)
    sym = half_values + half_values.transpose()
    # NOTE(review): 1.01 (not 1.0) is kept from the original code; it
    # presumably regularises the matrix -- confirm with the author.
    np.fill_diagonal(sym, 1.01)
    mat_ld = pd.DataFrame(sym, index=snps, columns=snps)

    # fill_value must be 0: with the default (NaN) fill value none of the
    # zero entries would actually be compressed.
    sparse_dtype = getattr(pd, "SparseDtype", None)
    if sparse_dtype is None:
        # Older pandas without SparseDtype still provides to_sparse().
        return mat_ld.to_sparse(fill_value=0)
    return mat_ld.astype(sparse_dtype("float", 0.0))
def generate_genome_matrices(region_files, reffolder, folder_output):
    """
    Go through the region file and run plink LD on every region.

    Parameters
    ----------
    region_files :
        CSV file read with ``pandas.read_csv``; the columns ``chr``,
        ``start`` and ``stop`` are used.
    reffolder :
        Folder holding the reference panel; the plink fileset prefix for a
        region is ``<reffolder>/<chr>.eur.1pct``.
    folder_output :
        Folder handed to ``launch_plink_ld`` for the plink output.

    Raises
    ------
    ValueError
        If a ``chr`` value contains no digits, so no chromosome number can
        be extracted for plink.

    Notes
    -----
    Only the plink step is run. Converting each plink result into a sparse
    matrix and saving it is not wired in yet (see the TODO below).
    """
    regions = pd.read_csv(region_files)
    for row_label, reg in regions.iterrows():
        print(row_label)
        # input reference panel file
        fi_ref = "{0}/{1}.eur.1pct".format(reffolder, reg['chr'])

        # plink receives only the numeric part of the chromosome label.
        chr_match = re.search(r'([0-9]{1,2})', str(reg['chr']))
        if chr_match is None:
            raise ValueError(
                "no chromosome number found in region 'chr' value: "
                "{0!r}".format(reg['chr'])
            )
        chr_int = chr_match.group()

        # Compute the LD correlation with plink
        launch_plink_ld(reg['start'], reg['stop'], chr_int, fi_ref, folder_output)

        # TODO: transform the plink output into a sparse matrix with
        # generate_sparse_matrix and save it. The earlier draft of this step
        # was disabled: it read 'startpos'/'endpos' (the columns used above
        # are 'start'/'stop') and built a file name without the "chr" prefix
        # that launch_plink_ld puts on its output.
from setuptools import setup, find_packages from setuptools import setup, find_packages
setup(name='jass_preprocessing', setup(name='impute_jass',
version='0.1', version='0.1',
description='Preprocess GWAS summary statistic for JASS', description='Preprocess GWAS summary statistic for JASS',
url='http:https://gitlab.pasteur.fr/statistical-genetics/JASS_Pre-processing', url='http:https://gitlab.pasteur.fr/statistical-genetics/JASS_Pre-processing',
...@@ -9,5 +9,5 @@ setup(name='jass_preprocessing', ...@@ -9,5 +9,5 @@ setup(name='jass_preprocessing',
license='MIT', license='MIT',
#package_dir = {'': 'jass_preprocessing'}, #package_dir = {'': 'jass_preprocessing'},
packages= ['impute_jass'], packages= ['impute_jass'],
package_data = {'impute_jass':'data/*.csv'}, package_data = {'impute_jass':'./data/*.csv'},
zip_safe=False) zip_safe=False)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment