Commit 8932d738 authored by Kenzo-Hugo Hillion's avatar Kenzo-Hugo Hillion
Browse files

Merge branch '16-diamond-through-eggnog-mapper' into 'master'

add rule for diamond through eggnogmapper2

Closes #19, #18, #17, and #16

See merge request !4
parents 2d13178c 2fda3282
# ==== eggNOG-mapper v2: annotate ====
# Every setting is optional in the workflow config and falls back to the
# default given here when its key is missing.
__eggnogmapper2_config = config.get('eggnogmapper2', {})
__eggnogmapper2_annotate_config = __eggnogmapper2_config.get('annotate', {})

__eggnogmapper2_exec_command = __eggnogmapper2_config.get('exec_command', 'emapper.py')
__eggnogmapper2_annotate_options = __eggnogmapper2_annotate_config.get('options', '')
__eggnogmapper2_annotate_threads = __eggnogmapper2_annotate_config.get('threads', 4)


rule eggnogmapper2_annotate:
    """
    Run eggnogmapper v2 in annotate mode.

    The including workflow has to define, before the include,
    `__eggnogmapper2_annotate_input`, `__eggnogmapper2_annotate_output` and
    `__eggnogmapper2_annotate_output_prefix`.
    """
    input:
        __eggnogmapper2_annotate_input
    output:
        __eggnogmapper2_annotate_output
    params:
        # The input file is passed as the value of --annotate_hits_table,
        # so the flag must stay the last token of exec_command.
        exec_command = __eggnogmapper2_exec_command + ' --annotate_hits_table',
        output_prefix = __eggnogmapper2_annotate_output_prefix,
        options = __eggnogmapper2_annotate_options
    threads:
        __eggnogmapper2_annotate_threads
    shell:
        """
        {params.exec_command} {input} {params.options} --cpu {threads} -o {params.output_prefix}
        """
configfile: "config.yaml"

# ==== Snakefile path ====
__eggnogmapper2_annotate_rules = config.get("snakefiles", {}).get("eggnogmapper2_annotate")

# ==== Main config ====
# Trailing slashes are stripped from the configured directories: every path
# below is built as "<dir>/<name>", so a directory ending in "/" would yield
# "//" in the file patterns, which Snakemake warns can make file matching
# inconsistent.
__main_output_dir = config.get('output_dir', 'output').rstrip('/')
SAMPLES = config.get('samples')
__input_dir = config.get('input_dir', 'data').rstrip('/')

# ==== EggNOGmapper2 Annotate ====
# These names are read by the included rule file, so they must be defined
# before the include directive.
__eggnogmapper2_output_dir = __main_output_dir + "/eggnogmapper2"
__eggnogmapper2_annotate_output_dir = __eggnogmapper2_output_dir + "/annotate"
__eggnogmapper2_annotate_input = "{dir}/{{sample}}.emapper.seed_orthologs".format(dir=__input_dir)
__eggnogmapper2_annotate_output_prefix = "{dir}/{{sample}}".format(dir=__eggnogmapper2_annotate_output_dir)
__eggnogmapper2_annotate_output = "{dir}/{{sample}}.emapper.annotations".format(dir=__eggnogmapper2_annotate_output_dir)

include: __eggnogmapper2_annotate_rules
rule all:
    """
    Default target: one `.emapper.annotations` file per configured sample.
    """
    input:
        # Same pattern as the annotate rule's output, expanded over SAMPLES.
        expand(__eggnogmapper2_annotate_output, sample=SAMPLES)
# Test configuration for the eggnogmapper2 "annotate" workflow.

# Rule file pulled in by the workflow's include directive.
snakefiles:
  eggnogmapper2_annotate: /pasteur/projets/policy01/Atm/snakemake/tools/eggnogmapper2/annotate/Snakefile

# Directory holding the <sample>.emapper.seed_orthologs inputs.
# No trailing slash: the workflow joins paths as "<input_dir>/<sample>...",
# so a trailing "/" would produce "//" in every input path.
input_dir: /pasteur/projets/policy01/Atm/kenzo/sandbox/20200210_test_snakemake/test_output/eggnogmapper2/diamond
# Results are written to <output_dir>/eggnogmapper2/annotate.
output_dir: /pasteur/projets/policy01/Atm/kenzo/sandbox/20200210_test_snakemake/test_output

# Sample names substituted for the {sample} wildcard.
samples:
  - all

eggnogmapper2:
  # Command used to launch emapper.py (here: a virtualenv python + script path).
  exec_command: "/pasteur/homes/kehillio/venv/eggnog-mapper-v2/bin/python /pasteur/homes/kehillio/tools/eggnog-mapper/emapper.py"
  annotate:
    # Extra command-line options appended to the annotate call.
    options: "--no_file_comments"
# ==== eggNOG-mapper v2: diamond ====
# Every setting is optional in the workflow config and falls back to the
# default given here when its key is missing.
__eggnogmapper2_config = config.get('eggnogmapper2', {})
__eggnogmapper2_diamond_config = __eggnogmapper2_config.get('diamond', {})

__eggnogmapper2_exec_command = __eggnogmapper2_config.get('exec_command', 'emapper.py')
__eggnogmapper2_diamond_options = __eggnogmapper2_diamond_config.get('options', '')
__eggnogmapper2_diamond_threads = __eggnogmapper2_diamond_config.get('threads', 4)


rule eggnogmapper2_diamond:
    """
    Run eggnogmapper v2 in diamond mode, without the annotation step.

    The including workflow has to define, before the include,
    `__eggnogmapper2_diamond_input`, `__eggnogmapper2_diamond_output` and
    `__eggnogmapper2_diamond_output_prefix`.
    """
    input:
        __eggnogmapper2_diamond_input
    output:
        __eggnogmapper2_diamond_output
    params:
        exec_command = __eggnogmapper2_exec_command + ' -m diamond --no_annot',
        output_prefix = __eggnogmapper2_diamond_output_prefix,
        options = __eggnogmapper2_diamond_options
    threads:
        __eggnogmapper2_diamond_threads
    shell:
        """
        {params.exec_command} {params.options} --cpu {threads} -i {input} -o {params.output_prefix}
        """
configfile: "config.yaml"

# ---- Workflow-wide settings ----
SAMPLES = config.get('samples')
__input_dir = config.get('input_dir', 'data')
__main_output_dir = config.get('output_dir', 'output')

# ---- Rule file to include ----
__eggnogmapper2_diamond_rules = config.get("snakefiles", {}).get("eggnogmapper2_diamond")

# ---- EggNOGmapper2 Diamond: names read by the included rule file ----
__eggnogmapper2_output_dir = __main_output_dir + "/eggnogmapper2"
__eggnogmapper2_diamond_output_dir = __eggnogmapper2_output_dir + "/diamond"
# Doubled braces keep {sample} as a Snakemake wildcard in the final pattern.
__eggnogmapper2_diamond_input = f"{__input_dir}/{{sample}}.fa"
__eggnogmapper2_diamond_output_prefix = f"{__eggnogmapper2_diamond_output_dir}/{{sample}}"
__eggnogmapper2_diamond_output = f"{__eggnogmapper2_diamond_output_dir}/{{sample}}.emapper.seed_orthologs"

include: __eggnogmapper2_diamond_rules
rule all:
    """
    Default target: one `.emapper.seed_orthologs` file per configured sample.
    """
    input:
        # Same pattern as the diamond rule's output, expanded over SAMPLES.
        expand(__eggnogmapper2_diamond_output, sample=SAMPLES)
# Test configuration for the eggnogmapper2 "diamond" workflow.

# Rule file pulled in by the workflow's include directive.
snakefiles:
  eggnogmapper2_diamond: /pasteur/projets/policy01/Atm/snakemake/tools/eggnogmapper2/diamond/Snakefile

# Directory holding the <sample>.fa inputs.
input_dir: /pasteur/projets/policy01/Atm/kenzo/sandbox/20200210_test_snakemake/test_output/split_fasta
# Results are written to <output_dir>/eggnogmapper2/diamond.
output_dir: /pasteur/projets/policy01/Atm/kenzo/sandbox/20200210_test_snakemake/test_output

# Sample names substituted for the {sample} wildcard.
samples:
  - test_00000
  - test_00001
  - test_00002

eggnogmapper2:
  # Command used to launch emapper.py (here: a virtualenv python + script path).
  exec_command: "/pasteur/homes/kehillio/venv/eggnog-mapper-v2/bin/python /pasteur/homes/kehillio/tools/eggnog-mapper/emapper.py"
  diamond:
    # Extra command-line options inserted into the diamond call.
    options: "--no_file_comments"
rule cat:
    """
    Concatenate all input files, in the order given, into a single output file.

    The including workflow has to define `__cat_input` and `__cat_output`
    before the include.
    """
    input:
        __cat_input
    output:
        __cat_output
    shell:
        """
        cat {input} > {output}
        """
configfile: "config.yaml"

# ==== Snakefile path ====
__cat_rules = config.get("snakefiles", {}).get("cat")

# ==== Main config ====
# Trailing slashes are stripped from the configured directories: every path
# below is built as "<dir>/<name>", so a directory ending in "/" would yield
# "//" in the file patterns, which Snakemake warns can make file matching
# inconsistent.
__main_output_dir = config.get('output_dir', 'output').rstrip('/')
SAMPLES = config.get('samples')
__input_dir = config.get('input_dir', 'data').rstrip('/')

# ==== Cat ====
# The merged file is written next to the files being merged.
__cat_merged_name = config.get('cat', {}).get('name_merge', 'cat_file')
__cat_output_dir = __input_dir
__cat_input = expand("{dir}/{{sample}}.emapper.seed_orthologs".format(dir=__input_dir), sample=SAMPLES)
__cat_output = "{dir}/{file_name}".format(dir=__cat_output_dir, file_name=__cat_merged_name)

include: __cat_rules
rule all:
    """
    Default target: the merged file produced by the cat rule.
    """
    input:
        # Identical to the path built for the cat rule's output.
        __cat_output
# Test configuration for the "cat" workflow.

# Rule file pulled in by the workflow's include directive.
snakefiles:
  cat: /pasteur/projets/policy01/Atm/snakemake/tools/utils/cat/Snakefile

# Directory holding the <sample>.emapper.seed_orthologs files to merge; the
# merged file is written here as well.
# No trailing slash: the workflow joins paths as "<input_dir>/<name>", so a
# trailing "/" would produce "//" in every path.
input_dir: /pasteur/projets/policy01/Atm/kenzo/sandbox/20200210_test_snakemake/test_output/eggnogmapper2/diamond
output_dir: /pasteur/projets/policy01/Atm/kenzo/sandbox/20200210_test_snakemake/test_output

# Sample names substituted for the {sample} wildcard.
samples:
  - test_00000
  - test_00001
  - test_00002

cat:
  # File name of the merged output.
  name_merge: all.emapper.seed_orthologs
# EggNOG mapper v2
This describes a workflow to run [eggNOG-mapper on a large dataset](https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2#setting-up-large-annotation-jobs).
In brief, it consists of four steps:
* Split the input FASTA file into chunks
* Run diamond through eggNOG-mapper on each chunk
* Merge all results into a single `emapper.seed_orthologs` file
* Perform the annotation from this single file
# Load the workflow settings from config.yaml; they are read below via `config`.
configfile: "config.yaml"
def count_sequences(fasta_file):
    """
    Count the sequences in a FASTA file.

    A sequence is counted for every line that starts with '>', i.e. every
    header line. A '>' appearing elsewhere in a line does not count.

    :param fasta_file: path to the FASTA file to read
    :return: number of header lines found (0 for an empty file)
    """
    with open(fasta_file, 'r') as handle:
        return sum(1 for line in handle if line.startswith('>'))
import math

# ==== Snakefile path ====
__split_fasta_rules = config.get("snakefiles", {}).get("split_fasta")
__eggnogmapper2_diamond_rules = config.get("snakefiles", {}).get("eggnogmapper2_diamond")
__cat_rules = config.get("snakefiles", {}).get("cat")
__eggnogmapper2_annotate_rules = config.get("snakefiles", {}).get("eggnogmapper2_annotate")

# ---- Main config ----
__main_output_dir = config.get('output_dir', 'output')
__prefix = config['split_fasta']['prefix']

# ==== Split FASTA ====
__split_fasta_output_dir = __main_output_dir + "/split_fasta"
__split_fasta_input = config['input_fasta']
__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)

# The input is read once at parse time to know how many chunks to expect.
total_number_sequences = count_sequences(__split_fasta_input)
# Ceiling division: a partial last chunk still needs its own file, but an
# exact multiple of the chunk size must not add an extra, empty chunk
# (200 sequences at 100 per chunk is 2 chunks, not 3). At least one chunk
# is always declared, as before, even when the input has no sequences.
__split_fasta_number_chunks = max(1, math.ceil(total_number_sequences / __split_fasta_number_sequences))
# Zero-padded, five-digit chunk indices: "00000", "00001", ...
SPLIT_NB = [f"{i:05d}" for i in range(__split_fasta_number_chunks)]
CHUNKS = [f"{__prefix}{i}" for i in SPLIT_NB]
__split_fasta_prefix = "/".join([__split_fasta_output_dir, __prefix])
__split_fasta_output = expand(__split_fasta_prefix + "{split_nb}.fa", split_nb=SPLIT_NB)

include: __split_fasta_rules
# ==== EggNOGmapper2 Diamond ====
# Runs once per chunk written by the split step; {chunk} stays a wildcard.
__eggnogmapper2_output_dir = __main_output_dir + "/eggnogmapper2"
__eggnogmapper2_diamond_input_dir = __split_fasta_output_dir
__eggnogmapper2_diamond_output_dir = __eggnogmapper2_output_dir + "/diamond"
__eggnogmapper2_diamond_input = f"{__eggnogmapper2_diamond_input_dir}/{{chunk}}.fa"
__eggnogmapper2_diamond_output_prefix = f"{__eggnogmapper2_diamond_output_dir}/{{chunk}}"
__eggnogmapper2_diamond_output = f"{__eggnogmapper2_diamond_output_dir}/{{chunk}}.emapper.seed_orthologs"

include: __eggnogmapper2_diamond_rules

# ==== Cat ====
# Merges the per-chunk diamond results into one file in the same directory.
__cat_merged_name = config.get('cat', {}).get('name_merge', 'cat_file')
__cat_dir = __eggnogmapper2_diamond_output_dir
__cat_input = expand(f"{__cat_dir}/{{chunk}}.emapper.seed_orthologs", chunk=CHUNKS)
__cat_output = f"{__cat_dir}/{__cat_merged_name}"

include: __cat_rules

# ==== EggNOGmapper2 Annotate ====
# Runs once on the merged file, so these paths carry no wildcard.
__eggnogmapper2_annotate_output_dir = __eggnogmapper2_output_dir + "/annotate"
__eggnogmapper2_annotate_input = __cat_output
__eggnogmapper2_annotate_outname_prefix = config.get('eggnogmapper2', {}).get('annotate', {}).get('outname_prefix', 'all')
__eggnogmapper2_annotate_output_prefix = f"{__eggnogmapper2_annotate_output_dir}/{__eggnogmapper2_annotate_outname_prefix}"
__eggnogmapper2_annotate_output = f"{__eggnogmapper2_annotate_output_prefix}.emapper.annotations"

include: __eggnogmapper2_annotate_rules
rule all:
    """
    Default target: the final `.emapper.annotations` file.
    """
    input:
        # Identical to the path built for the annotate rule's output.
        __eggnogmapper2_annotate_output
# Test configuration for the complete eggnogmapper2 workflow
# (split FASTA -> diamond -> cat -> annotate).

# Rule files pulled in by the workflow's include directives.
snakefiles:
  eggnogmapper2_diamond: /pasteur/projets/policy01/Atm/snakemake/tools/eggnogmapper2/diamond/Snakefile
  split_fasta: /pasteur/projets/policy01/Atm/snakemake/tools/utils/split_fasta/Snakefile
  cat: /pasteur/projets/policy01/Atm/snakemake/tools/utils/cat/Snakefile
  eggnogmapper2_annotate: /pasteur/projets/policy01/Atm/snakemake/tools/eggnogmapper2/annotate/Snakefile

# FASTA file to split and annotate.
input_fasta: /pasteur/homes/kehillio/Atm/kenzo/sandbox/20200210_test_snakemake/test.fa
# Results are written to <output_dir>/split_fasta and <output_dir>/eggnogmapper2.
output_dir: /pasteur/homes/kehillio/Atm/kenzo/sandbox/20200210_test_snakemake/test_output

split_fasta:
  # Chunk files are named <prefix><5-digit index>.fa.
  prefix: test_
  # Sequences per chunk; the workflow derives the chunk count from it.
  number_sequences: 100

eggnogmapper2:
  # Command used to launch emapper.py (here: a virtualenv python + script path).
  exec_command: "/pasteur/homes/kehillio/venv/eggnog-mapper-v2/bin/python /pasteur/homes/kehillio/tools/eggnog-mapper/emapper.py"
  diamond:
    options: "--no_file_comments --translate"
    # Passed to emapper.py as --cpu.
    threads: 16
  annotate:
    options: "--no_file_comments"
    # Passed to emapper.py as --cpu.
    threads: 10
    # Base name of the final <outname_prefix>.emapper.annotations file.
    outname_prefix: "test"

cat:
  # File name of the merged seed-orthologs table.
  name_merge: all_test.emapper.seed_orthologs
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment