Commit 1fcacb74 authored by Hanna JULIENNE

change folder names

parent f1d368bc
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = Peppa-PIG
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
set SPHINXPROJ=Peppa-PIG
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
impute\_jass.imputation\_launcher
=================================
.. automodule:: impute_jass.imputation_launcher

   .. rubric:: Classes

   .. autosummary::

      ImputationLauncher
impute\_jass.ld\_matrix
=======================
.. automodule:: impute_jass.ld_matrix

   .. rubric:: Functions

   .. autosummary::

      generate_genome_matrices
      generate_sparse_matrix
      launch_plink_ld
impute\_jass.stat\_models
=========================
.. automodule:: impute_jass.stat_models

   .. rubric:: Functions

   .. autosummary::

      check_inversion
      compute_mu
      compute_var
      impg_model
      var_in_boundaries
impute\_jass.windows
====================
.. automodule:: impute_jass.windows

   .. rubric:: Functions

   .. autosummary::

      compute_window_and_size
      empty_imputed_dataframe
      format_result_df
      impg_like_imputation
      in_region
      parse_region_position
      prepare_zscore_for_imputation
      print_progression
      realigned_zfiles_on_panel
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../..'))
#print(os.path.abspath('../..'))
# -- Project information -----------------------------------------------------
project = 'RAISS'
copyright = '2018, hjulienne'
author = 'hjulienne'
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '1.0'
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.napoleon',
    'sphinx.ext.coverage',
    'sphinx.ext.mathjax',
    'sphinx.ext.viewcode',
    'sphinx.ext.autosummary',
    'sphinxcontrib.bibtex',
    'sphinxarg.ext'
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = []
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'Peppa-PIGdoc'
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'Peppa-PIG.tex', 'Peppa-PIG Documentation',
     'hjulienne', 'manual'),
]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'peppa-pig', 'Peppa-PIG Documentation',
     [author], 1)
]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'Peppa-PIG', 'Peppa-PIG Documentation',
     author, 'Peppa-PIG', 'One line description of project.',
     'Miscellaneous'),
]
# -- Extension configuration -------------------------------------------------
autoclass_content = "both" # include both class docstring and __init__
autodoc_default_flags = [
    # Make sure that any autodoc declarations show the right members
    "members",
    "inherited-members",
    "private-members",
    "show-inheritance",
]
autosummary_generate = True # Make _autosummary files and include them
.. RAISS documentation master file, created by
   sphinx-quickstart on Mon Aug 20 16:17:59 2018.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
Welcome to the Robust and Accurate Imputation from Summary Statistics (RAISS) documentation!
============================================================================================
.. toctree::
   :maxdepth: 2
   :caption: Contents:
What is RAISS?
===================

RAISS is a Python package to impute missing SNP summary statistics from
neighboring SNPs in linkage disequilibrium.
The statistical model used to perform the imputation is described in :cite:`Pasaniuc2014`.
The imputation execution time is optimized by precomputing the linkage disequilibrium between SNPs.
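As a sketch of the underlying model (the notation below is illustrative and not taken verbatim from the package), ImpG-style imputation treats z-scores as multivariate Gaussian and imputes the z-score of an untyped SNP from the typed SNPs of the same region:

.. math::

   \hat{z}_i = \Sigma_{i,t} \, (\Sigma_{t,t} + \lambda I)^{-1} \, z_t

.. math::

   \mathrm{Var}(\hat{z}_i) = \Sigma_{i,t} \, (\Sigma_{t,t} + \lambda I)^{-1} \, \Sigma_{t,i}

where :math:`z_t` are the observed z-scores, :math:`\Sigma` is the LD correlation matrix estimated from the reference panel, and :math:`\lambda` is a small regularization term (the ``--l2-regularization`` option described below).
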
Dependencies
============
RAISS requires plink version 1.9: `<https://www.cog-genomics.org/plink2>`_
Installation
============
.. code-block:: shell

   pip3 install git+https://gitlab.pasteur.fr/statistical-genetics/imputation_for_jass
Precomputation of LD correlations
=================================

The imputation is based on the linkage disequilibrium (LD)
between SNPs.
To save computation time, the LD is computed before imputation and saved in a tabular format.
To limit the number of SNP pairs, the LD is computed between pairs of
SNPs within approximately LD-independent regions. For European ancestry, you can use
the regions defined by :cite:`Berisa2015`, which are provided in the package data folder.
To compute the LD, you need to specify a reference panel split by chromosome
(bed, fam and bim formats of plink; see `PLINK formats <https://www.cog-genomics.org/plink2/formats>`_).
.. code-block:: python

   # path to the region file
   region_berisa = "/mnt/atlas/PCMA/WKD_Hanna/cleaned_jass_input/Region_LD.csv"
   # path to the reference panel
   ref_folder = "/mnt/atlas/PCMA/1._DATA/ImpG_refpanel"
   # path to the folder where the results are stored
   ld_folder_out = "/mnt/atlas/PCMA/WKD_Hanna/impute_for_jass/berisa_ld_block"

   raiss.LD.generate_genome_matrices(...)
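Under the hood, the LD of each region is computed with plink (presumably via the ``launch_plink_ld`` helper). As a hedged illustration only, a hand-run equivalent for a single region could look like the following; the coordinates and output prefix are placeholders:

.. code-block:: shell

   # illustrative only: correlation matrix of one approximately LD-independent
   # region on chromosome 1, from the per-chromosome plink reference panel
   plink --bfile /mnt/atlas/PCMA/1._DATA/ImpG_refpanel/chr1.eur.1pct \
         --chr 1 --from-bp 10583 --to-bp 1892607 \
         --r square \
         --out /mnt/atlas/PCMA/WKD_Hanna/impute_for_jass/berisa_ld_block/chr1_10583_1892607
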
Input format
=============

GWAS result files must be provided in a tab-separated tabular format, one file
per chromosome, all in the same folder, with the following columns and header:
+-----------+-------+----+----+--------+
| rsID      | pos   | A0 | A1 | Z      |
+===========+=======+====+====+========+
| rs6548219 | 30762 | A  | G  | -1.133 |
+-----------+-------+----+----+--------+
This format can be obtained with the `JASS PreProcessing package <https://gitlab.pasteur.fr/statistical-genetics/JASS_Pre-processing>`_.
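For reference, the per-chromosome z-score files are read by the package with pandas (this mirrors ``impute_jass.__main__``; the folder and GWAS names here are placeholders):

.. code-block:: python

   import pandas as pd

   # files are expected to be named z_{gwas}_{chrom}.txt inside the z-score folder
   z_file = "{0}/z_{1}_{2}.txt".format("my_zscore_folder", "consortium_trait", "chr22")
   zscore = pd.read_csv(z_file, index_col=0, sep="\t")  # the rsID column becomes the index
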
Launching imputation on one chromosome
======================================
RAISS provides a command line interface (see Command Line Usage below).
If you have access to a cluster, an efficient way to use RAISS is to launch
the imputation of each chromosome on a separate cluster node. The script
`launch_imputation_all_gwas.sh <https://gitlab.pasteur.fr/statistical-genetics/raiss/blob/master/launch_imputation_all_gwas.sh>`_
contains an example of raiss usage with a SLURM scheduler. A single-chromosome invocation is sketched below.
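A minimal sketch of a single-chromosome run, assuming the package is invoked as a Python module (all folder paths are placeholders; the options correspond to ``add_chromosome_imputation_argument`` in ``impute_jass.__main__``):

.. code-block:: shell

   python3 -m impute_jass \
       --chrom chr22 \
       --gwas consortium_trait \
       --ref-folder /path/to/ref_panel \
       --ld-folder /path/to/ld_matrices \
       --zscore-folder /path/to/zscores \
       --output-folder /path/to/imputed_zscores \
       --window-size 500000 --buffer-size 125000 \
       --l2-regularization 0.1 --eigen-treshold 0.1
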
Output
======
The raiss package outputs imputed GWAS files in a tabular format:

.. TODO: suppress complementary columns
+------------+----+----+----------------+------+-------+------------------+-------------------+----------+---------+
|            | A0 | A1 | Nsnp_to_impute | Var  | Z     | condition_number | correct_inversion | ld_score | pos     |
+============+====+====+================+======+=======+==================+===================+==========+=========+
| rs11584349 | C  | T  | 18             | 0.85 | -0.28 | 116.9            | False             | 1.34     | 1000156 |
+------------+----+----+----------------+------+-------+------------------+-------------------+----------+---------+
.. TODO: keep only the useful columns (a sketch of how to prune them follows)
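As a hedged sketch (not part of the package itself), the diagnostic columns could be dropped after imputation with pandas, keeping only the columns of the input format; the file names below are placeholders:

.. code-block:: python

   import pandas as pd

   # illustrative only: keep the core summary-statistic columns of an imputed file
   imputed = pd.read_csv("imputed/z_consortium_trait_chr22.txt", index_col=0, sep="\t")
   imputed[["pos", "A0", "A1", "Z"]].to_csv("imputed/z_consortium_trait_chr22.slim.txt", sep="\t")
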
Command Line Usage
==================
.. argparse::
   :ref: impute_jass.__main__.add_chromosome_imputation_argument
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

.. automodule:: impute_jass
   :members:

.. autosummary::
   :toctree: _autosummary

.. bibliography:: reference.bib
@article{Pasaniuc2014,
abstract = {MOTIVATION Imputation using external reference panels (e.g. 1000 Genomes) is a widely used approach for increasing power in genome-wide association studies and meta-analysis. Existing hidden Markov models (HMM)-based imputation approaches require individual-level genotypes. Here, we develop a new method for Gaussian imputation from summary association statistics, a type of data that is becoming widely available. RESULTS In simulations using 1000 Genomes (1000G) data, this method recovers 84{\%} (54{\%}) of the effective sample size for common ({\textgreater}5{\%}) and low-frequency (1-5{\%}) variants [increasing to 87{\%} (60{\%}) when summary linkage disequilibrium information is available from target samples] versus the gold standard of 89{\%} (67{\%}) for HMM-based imputation, which cannot be applied to summary statistics. Our approach accounts for the limited sample size of the reference panel, a crucial step to eliminate false-positive associations, and it is computationally very fast. As an empirical demonstration, we apply our method to seven case-control phenotypes from the Wellcome Trust Case Control Consortium (WTCCC) data and a study of height in the British 1958 birth cohort (1958BC). Gaussian imputation from summary statistics recovers 95{\%} (105{\%}) of the effective sample size (as quantified by the ratio of [Formula: see text] association statistics) compared with HMM-based imputation from individual-level genotypes at the 227 (176) published single nucleotide polymorphisms (SNPs) in the WTCCC (1958BC height) data. In addition, for publicly available summary statistics from large meta-analyses of four lipid traits, we publicly release imputed summary statistics at 1000G SNPs, which could not have been obtained using previously published methods, and demonstrate their accuracy by masking subsets of the data. We show that 1000G imputation using our approach increases the magnitude and statistical evidence of enrichment at genic versus non-genic loci for these traits, as compared with an analysis without 1000G imputation. Thus, imputation of summary statistics will be a valuable tool in future functional enrichment analyses. AVAILABILITY AND IMPLEMENTATION Publicly available software package available at http://bogdan.bioinformatics.ucla.edu/software/. CONTACT bpasaniuc@mednet.ucla.edu or aprice@hsph.harvard.edu SUPPLEMENTARY INFORMATION Supplementary materials are available at Bioinformatics online.},
archivePrefix = {arXiv},
arxivId = {arXiv:1309.3258v1},
author = {Pasaniuc, Bogdan and Zaitlen, Noah and Shi, Huwenbo and Bhatia, Gaurav and Gusev, Alexander and Pickrell, Joseph and Hirschhorn, Joel and Strachan, David P. and Patterson, Nick and Price, Alkes L.},
doi = {10.1093/bioinformatics/btu416},
eprint = {arXiv:1309.3258v1},
issn = {13674811},
journal = {Bioinformatics (Oxford, England)},
number = {20},
pages = {2906--2914},
pmid = {24990607},
title = {{Fast and accurate imputation of summary statistics enhances evidence of functional enrichment}},
volume = {30},
year = {2014}
}
@article{Berisa2015,
abstract = {We present a method to identify approximately independent blocks of linkage disequilibrium (LD) in the human genome. These blocks enable automated analysis of multiple genome-wide association studies.},
author = {Berisa, Tomaz and Pickrell, Joseph K.},
doi = {10.1093/bioinformatics/btv546},
isbn = {1367-4811 (Electronic) 1367-4803 (Linking)},
issn = {14602059},
journal = {Bioinformatics},
number = {2},
pages = {283--285},
pmid = {26395773},
title = {{Approximately independent linkage disequilibrium blocks in human populations}},
volume = {32},
year = {2015}
}
"""
.. autosummary::
   :toctree: _autosummary

   imputation_launcher
   ld_matrix
   stat_models
   windows
"""
import impute_jass.ld_matrix as LD
import impute_jass.stat_models as model
import impute_jass.windows
from impute_jass.imputation_launcher import ImputationLauncher
import argparse
import pandas as pd
from impute_jass.imputation_launcher import ImputationLauncher
def launch_chromosome_imputation(args):
    """
    Call the ImputationLauncher.chromosome_imputation method from the
    command line entry point.

    Args:
        args (argparse.Namespace): arguments parsed from the command line; see
            the __main__.add_chromosome_imputation_argument() function.
    """
    print("Imputation of {0} gwas for chromosome {1}".format(args.gwas, args.chrom))

    # Imputer settings
    imputer = ImputationLauncher(window_size=int(args.window_size), buf=int(args.buffer_size),
                                 lamb=float(args.l2_regularization), pinv_rcond=float(args.eigen_treshold))

    # Read inputs
    z_file = "{0}/z_{1}_{2}.txt".format(args.zscore_folder, args.gwas, args.chrom)
    zscore = pd.read_csv(z_file, index_col=0, sep="\t")
    ref_panel_file = args.ref_folder + "/" + args.chrom + ".eur.1pct.bim"
    ref_panel = pd.read_csv(ref_panel_file, sep="\t",
                            names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col=1)

    # Imputation
    imputed_zscore = imputer.chromosome_imputation(args.chrom, zscore, ref_panel, args.ld_folder)
    print("Imputation DONE")

    # Save results
    z_fo = "{0}/z_{1}_{2}.txt".format(args.output_folder, args.gwas, args.chrom)
    imputed_zscore.to_csv(z_fo, sep='\t')
    print("Imputation saved at {0}".format(z_fo))
def add_chromosome_imputation_argument():
    parser = argparse.ArgumentParser()
    parser.add_argument('--chrom', required=True, help="chromosome to impute, in the chr\d+ format")
    parser.add_argument('--gwas', required=True, help="GWAS to impute, in the consortia_trait format")
    parser.add_argument('--ref-folder', required=True,
                        help="reference panel location (used to determine which SNPs to impute)")
    parser.add_argument('--ld-folder', required=True, help="location of the LD correlation matrices")
    parser.add_argument('--zscore-folder', required=True, help="location of the zscore files of the GWASs to impute")
    parser.add_argument('--output-folder', required=True, help="location of the imputed zscore files")
    parser.add_argument('--window-size', default=500000, help="size of the non-overlapping imputation window")
    parser.add_argument('--buffer-size', default=125000, help="size of the buffer around the imputation window")
    parser.add_argument('--l2-regularization', default=0.1,
                        help="increment added to the SNP correlation matrix diagonal to make it less singular")
    parser.add_argument('--eigen-treshold', default=0.1,
                        help="threshold under which eigenvectors are removed for the computation of the pseudo-inverse")
    parser.set_defaults(func=launch_chromosome_imputation)
    return parser
def main():
    # prog='impute_jass'
    parser = add_chromosome_imputation_argument()
    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
"""Imputation launcher
Function set to launch SNP imputation on a complete chromosome or
on the genome
"""
import glob
import pandas as pd
from .windows import prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel
class ImputationLauncher(object):
    """
    Class to perform imputation of SNPs from summary statistics
    """

    def __init__(self, window_size=10000, buf=2500,
                 lamb=0.01, pinv_rcond=0.01):
        """
        Initialise the imputation object. Fix the window size, the buffer size
        and the kind of imputation employed.

        Args:
            window_size (int): size of the imputation window in bp
            buf (int): size of the padding around the imputation windows
                (relevant only for batch imputation)
            lamb (float): size of the increment added to the SNP correlation
                matrices to make them less singular
            pinv_rcond (float): the rcond argument of scipy.linalg.pinv, which
                is used to invert the correlation matrices
        """
        self.window_size = window_size
        self.buffer = buf
        self.lamb = lamb
        self.rcond = pinv_rcond
    def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
        """
        Impute the z-scores on the reference panel for one chromosome, with
        the specified parameters.

        Args:
            chrom (str): chromosome, in the "chr*" format
            zscore (pandas dataframe): known z-scores
            ref_panel (pandas dataframe): reference panel (content of the plink .bim file)
            ld_folder (str): path of the folder containing the linkage
                disequilibrium matrices

        Returns:
            pandas dataframe: imputed z-score dataframe
        """
        pattern = "{0}/{1}_*.ld".format(ld_folder, chrom)
        zscore = prepare_zscore_for_imputation(ref_panel, zscore)
        zscore_results = zscore.copy(deep=True)

        def imputer(ld_file):
            return impg_like_imputation(ld_file, ref_panel, zscore,
                                        self.window_size, self.buffer,
                                        self.lamb, self.rcond)

        for ld_file in glob.glob(pattern):