# The including Snakefile must define __eggnogmapper2_annotate_input,
# __eggnogmapper2_annotate_output and __eggnogmapper2_annotate_output_prefix
# before this file is included; they are only referenced here.
rule eggnogmapper2_annotate:
    """
    Run eggnog-mapper v2 with --annotate_hits_table on the input table.

    The configured executable is called with the input file, the
    user-supplied extra options, the thread count as --cpu and the
    output prefix as -o.
    """
    input:
        __eggnogmapper2_annotate_input
    output:
        __eggnogmapper2_annotate_output
    threads:
        __eggnogmapper2_annotate_threads
    params:
        # Base command followed by the flag that selects annotate mode.
        exec_command = ' '.join([__eggnogmapper2_exec_command, '--annotate_hits_table']),
        options = __eggnogmapper2_annotate_options,
        output_prefix = __eggnogmapper2_annotate_output_prefix
    shell:
        "{params.exec_command} {input} {params.options} "
        "--cpu {threads} -o {params.output_prefix}"
# The including Snakefile must define __eggnogmapper2_diamond_input,
# __eggnogmapper2_diamond_output and __eggnogmapper2_diamond_output_prefix
# before this file is included; they are only referenced here.
rule eggnogmapper2_diamond:
    """
    Run eggnog-mapper v2 with "-m diamond --no_annot" on the input file.

    The configured executable is called with the user-supplied extra
    options, the thread count as --cpu, the input as -i and the output
    prefix as -o.
    """
    input:
        __eggnogmapper2_diamond_input
    output:
        __eggnogmapper2_diamond_output
    threads:
        __eggnogmapper2_diamond_threads
    params:
        # Base command followed by the flags that select the diamond mode.
        exec_command = ' '.join([__eggnogmapper2_exec_command, '-m diamond --no_annot']),
        options = __eggnogmapper2_diamond_options,
        output_prefix = __eggnogmapper2_diamond_output_prefix
    shell:
        "{params.exec_command} {params.options} --cpu {threads} "
        "-i {input} -o {params.output_prefix}"
rule cat:
    """
    Concatenate all input files, in the order given, into one output file.

    __cat_input and __cat_output are not defined here; the including
    Snakefile must set them before including this file.
    """
    input:
        __cat_input
    output:
        __cat_output
    shell:
        """
        cat {input} > {output}
        """
def count_sequences(fasta_file):
    """Return the number of sequence records in a FASTA file.

    A record is counted for every line whose first character is '>'.
    Lines that merely contain '>' somewhere else (for example a comment
    line with an arrow) are not counted.

    Args:
        fasta_file: Path of the FASTA file; it is opened in text mode.

    Returns:
        int: Number of header lines found, 0 for an empty file.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(fasta_file, 'r') as handle:
        # Only the first character decides whether a line is a header, so
        # long sequence lines are never scanned in full.
        return sum(1 for line in handle if line.startswith('>'))
# ==== Split FASTA ====
__split_fasta_output_dir = __main_output_dir + "/split_fasta"

__split_fasta_input = config['input_fasta']
__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)
total_number_sequences = count_sequences(__split_fasta_input)
# Number of chunks = ceil(total / chunk size), computed with integers only.
# The previous int(total / size) + 1 declared one chunk too many whenever the
# total was an exact multiple of the chunk size (e.g. 200 sequences in chunks
# of 100 gave 3 chunk names). max(1, ...) keeps the previous behaviour of
# declaring a single chunk when the input holds no sequence.
# NOTE(review): assumes the split_fasta rule writes exactly
# ceil(total / size) files named <prefix>00000.fa, <prefix>00001.fa, ... -
# confirm against the split_fasta Snakefile.
SPLIT_NB = [
    f"{i:05d}"
    for i in range(
        max(1, (total_number_sequences + __split_fasta_number_sequences - 1) // __split_fasta_number_sequences)
    )
]
# Chunk names (prefix + zero-padded index), used as the {chunk} wildcard values.
CHUNKS = [f"{__prefix}{i}" for i in SPLIT_NB]
__split_fasta_prefix = "/".join([__split_fasta_output_dir, __prefix])
__split_fasta_output = expand(__split_fasta_prefix + "{split_nb}.fa", split_nb=SPLIT_NB)

include: __split_fasta_rules
# Final target of the workflow: the annotation table written by the
# eggnogmapper2_annotate rule. __eggnogmapper2_annotate_output is built
# above from the same directory and name prefix, so it is reused here.
rule all:
    input:
        __eggnogmapper2_annotate_output
name_merge: all_test.emapper.seed_orthologs +