diff --git a/impute_jass/impute_jass/__init__.py b/impute_jass/impute_jass/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..5a3b4f01b173541c42388a638240f206e4f22118 100644 --- a/impute_jass/impute_jass/__init__.py +++ b/impute_jass/impute_jass/__init__.py @@ -0,0 +1 @@ +import impute_jass.ld_matrix as LD diff --git a/impute_jass/impute_jass/ld_matrix.py b/impute_jass/impute_jass/ld_matrix.py index 3d2691c679e01cef9da9738195bf4d50ce2e736e..6d80effc727e4ab9efc50870d5cf3a1df8866576 100644 --- a/impute_jass/impute_jass/ld_matrix.py +++ b/impute_jass/impute_jass/ld_matrix.py @@ -1,3 +1,4 @@ +# coding: utf-8 """ Function set to compute LD correlation from a reference panel in predefined Region @@ -8,10 +9,9 @@ import scipy as sc import pandas as pd import subprocess as sub - -sub.check_output("pwd") - -LD_region = pd.read_csv('./impute_for_jass/Imputation_for_jass/impute_jass/data/Region_LD.csv') +import pkg_resources +import numpy as np +import re def launch_plink_ld(startpos, endpos, chr, reffile, folder): @@ -19,36 +19,50 @@ def launch_plink_ld(startpos, endpos, chr, reffile, folder): launch plink ld """ - fo = "{0}/{1}_{2}_{3}".format(folder, chr, startpos, endpos) + fo = "{0}/chr{1}_{2}_{3}".format(folder, chr, startpos, endpos) cmd = "p-link --noweb --bfile {0} --r --ld-window-r2 0 --from-bp {1} --to-bp {2} --chr {3} --out {4}".format(reffile, startpos, endpos, chr, fo) + #print(cmd) + sub.check_output(cmd, shell=True) -def generate_sparse_matrix(plink_ld, path_ld_mat): +def generate_sparse_matrix(plink_ld): """ read plink results create a sparse dataframe LD-matrix then save it to a zipped pickle """ plink_ld = pd.read_csv(plink_ld, sep = "\s+") - mat_ld = plink_ld.pivot(index='SNP_A', columns='SNP_B', values='R').to_sparse(fill_value=0) - mat_ld.to_pickle(path_ld_mat) + mat_ld = plink_ld.pivot(index='SNP_A', columns='SNP_B', values='R') + un_index = mat_ld.index.union(mat_ld.columns) + mat_ld = mat_ld.reindex(index=un_index, columns=un_index) + mat_ld.fillna(0, inplace=True) + + sym = mat_ld.values + mat_ld.values.transpose() + np.fill_diagonal(sym, 1.01) + mat_ld = pd.DataFrame(sym, index=mat_ld.index, columns=mat_ld.columns) + +# mat_ld = pd.DataFrame(np.maximum(mat_ld.values, mat_ld.values.transpose()), index=un_index, columns=un_index) + mat_ld = mat_ld.to_sparse() + return mat_ld + #mat_ld.to_pickle(path_ld_mat,, compression='gzip') def generate_genome_matrices(region_files, reffolder, folder_output): """ - + go through region files and compute LD matrix for each transform and + save the results in a pandas sparse dataframe """ - regions = pd.read_csv(region_files) - for reg in region_files.iterrows(): + for reg in regions.iterrows(): print(reg[0]) # input reference panel file fi_ref = "{0}/{1}.eur.1pct".format(reffolder, reg[1]['chr']) + chr_int = re.search('([0-9]{1,2})', str(reg[1]['chr'])).group() # Compute the LD correlation with LD - launch_plink_ld(reg[1]['start'], reg[1]['stop'], reg[1]['chr'], fi_ref, folder_output) + launch_plink_ld(reg[1]['start'], reg[1]['stop'], chr_int, fi_ref, folder_output) - fi_plink = "{0}/{1}_{2}_{3}.ld".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"]) - fo_mat = "{0}/{1}_{2}_{3}.mat".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"]) + #fi_plink = "{0}/{1}_{2}_{3}.ld".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"]) + #fo_mat = "{0}/{1}_{2}_{3}.mat".format(folder_output, reg[1]['chr'], reg[1]['startpos'], reg[1]["endpos"]) #transform plink output to a compressed generate_sparse_matrix - generate_sparse_matrix(fi_plink, fo_mat) + #generate_sparse_matrix(fi_plink, fo_mat) diff --git a/impute_jass/setup.py b/impute_jass/setup.py index d17679bc2ac2b0d5dcccf7313b5a03b9f66dce67..6e6a63473669d92d3dbd5070347e3888297cc927 100644 --- a/impute_jass/setup.py +++ b/impute_jass/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -setup(name='jass_preprocessing', +setup(name='impute_jass', version='0.1', description='Preprocess GWAS summary statistic for JASS', url='http:https://gitlab.pasteur.fr/statistical-genetics/JASS_Pre-processing', @@ -9,5 +9,5 @@ setup(name='jass_preprocessing', license='MIT', #package_dir = {'': 'jass_preprocessing'}, packages= ['impute_jass'], - package_data = {'impute_jass':'data/*.csv'}, + package_data = {'impute_jass':'./data/*.csv'}, zip_safe=False)