Add workflow for eggNOG-mapper v2 on large files

2fda3282 · Kenzo-Hugo Hillion · 0462c446 · 2fda3282 · 2fda3282
Commit 2fda3282 authored 5 years ago by Kenzo-Hugo Hillion
--- a/workflows/eggnogmapperv2/Snakefile
+++ b/workflows/eggnogmapperv2/Snakefile
+configfile: "config.yaml"
+
+def count_sequences(fasta_file):
+    """Count the sequences stored in a FASTA file.
+
+    A record is recognised by its header line, i.e. a line whose first
+    character is '>'. Only the first character is tested (instead of looking
+    for '>' anywhere in the line) so that a line merely containing '>' is
+    not counted as an additional record.
+
+    Args:
+        fasta_file: Path of the FASTA file to read.
+
+    Returns:
+        Number of header lines found; 0 for an empty file.
+    """
+    with open(fasta_file, 'r') as handle:
+        # Stream line by line so a large input is never held in memory.
+        return sum(1 for line in handle if line.startswith('>'))
+
+# ==== Snakefile path ====
+# Paths of the rule files to include, looked up under the optional
+# `snakefiles` section of the config; an entry that is not configured
+# resolves to None.
+(
+    __split_fasta_rules,
+    __eggnogmapper2_diamond_rules,
+    __cat_rules,
+    __eggnogmapper2_annotate_rules,
+) = (
+    config.get("snakefiles", {}).get(rules_key)
+    for rules_key in (
+        "split_fasta",
+        "eggnogmapper2_diamond",
+        "cat",
+        "eggnogmapper2_annotate",
+    )
+)
+
+# ---- Main config ----
+# Root directory of every output; falls back to 'output' when unset.
+__main_output_dir = config.get('output_dir', 'output')
+# Prefix of the chunk file names; mandatory, a missing key raises KeyError.
+__prefix = config['split_fasta']['prefix']
+
+
+# ==== Split FASTA ====
+# Expected chunk files: <output_dir>/split_fasta/<prefix><NNNNN>.fa, numbered
+# from 00000.
+__split_fasta_output_dir = __main_output_dir + "/split_fasta"
+
+__split_fasta_input = config['input_fasta']
+__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)
+total_number_sequences = count_sequences(__split_fasta_input)
+# Number of chunks = ceil(total / sequences per chunk), computed with integer
+# floor division on negated operands. The previous `int(total / n) + 1`
+# expected one chunk too many whenever the total was an exact multiple of the
+# chunk size (e.g. 200 sequences at 100 per chunk gave 3 instead of 2).
+# max(1, ...) keeps the former result of a single chunk for an empty input.
+# NOTE(review): assumes the included split_fasta rule writes exactly
+# ceil(total / number_sequences) files - confirm against that rule.
+SPLIT_NB = [
+    f"{i:05d}"
+    for i in range(max(1, int(-(-total_number_sequences // __split_fasta_number_sequences))))
+]
+CHUNKS = [f"{__prefix}{i}" for i in SPLIT_NB]
+__split_fasta_prefix = "/".join([__split_fasta_output_dir, __prefix])
+__split_fasta_output = expand(__split_fasta_prefix + "{split_nb}.fa", split_nb=SPLIT_NB)
+
+include: __split_fasta_rules
+
+
+# ==== EggNOGmapper2 Diamond ====
+# The search reads each chunk written under the split_fasta directory and
+# places its results under <output_dir>/eggnogmapper2/diamond.
+__eggnogmapper2_output_dir = f"{__main_output_dir}/eggnogmapper2"
+__eggnogmapper2_diamond_input_dir = __split_fasta_output_dir
+__eggnogmapper2_diamond_output_dir = f"{__eggnogmapper2_output_dir}/diamond"
+
+# `{{chunk}}` is escaped so the literal `{chunk}` wildcard survives in the
+# resulting patterns.
+__eggnogmapper2_diamond_input = f"{__eggnogmapper2_diamond_input_dir}/{{chunk}}.fa"
+__eggnogmapper2_diamond_output_prefix = f"{__eggnogmapper2_diamond_output_dir}/{{chunk}}"
+__eggnogmapper2_diamond_output = f"{__eggnogmapper2_diamond_output_dir}/{{chunk}}.emapper.seed_orthologs"
+
+include: __eggnogmapper2_diamond_rules
+
+
+# ==== Cat ====
+# Gathers the per-chunk seed ortholog files into one file that sits next to
+# them in the diamond output directory.
+__cat_merged_name = config.get('cat', {}).get('name_merge', 'cat_file')
+__cat_dir = __eggnogmapper2_diamond_output_dir
+
+# One input per chunk name listed in CHUNKS.
+__cat_input = expand(f"{__cat_dir}/{{chunk}}.emapper.seed_orthologs", chunk=CHUNKS)
+__cat_output = f"{__cat_dir}/{__cat_merged_name}"
+include: __cat_rules
+
+
+# ==== EggNOGmapper2 Annotate ====
+# Annotation runs once, on the merged seed ortholog file, and writes under
+# <output_dir>/eggnogmapper2/annotate.
+__eggnogmapper2_annotate_output_dir = f"{__eggnogmapper2_output_dir}/annotate"
+
+__eggnogmapper2_annotate_input = __cat_output
+# Base name of the annotation outputs; 'all' when not configured.
+__eggnogmapper2_annotate_outname_prefix = config.get('eggnogmapper2', {}).get('annotate', {}).get('outname_prefix', 'all')
+__eggnogmapper2_annotate_output_prefix = f"{__eggnogmapper2_annotate_output_dir}/{__eggnogmapper2_annotate_outname_prefix}"
+__eggnogmapper2_annotate_output = f"{__eggnogmapper2_annotate_output_prefix}.emapper.annotations"
+include: __eggnogmapper2_annotate_rules
+
+# Final target: the annotation table of the whole input.
+# NOTE(review): this rule is declared after the includes; Snakemake takes the
+# first rule it meets as default target, so `all` may have to be requested
+# explicitly on the command line - confirm.
+rule all:
+    input:
+        "{}/{}.emapper.annotations".format(
+            __eggnogmapper2_annotate_output_dir,
+            __eggnogmapper2_annotate_outname_prefix,
+        )
+
--- a/workflows/eggnogmapperv2/config.yaml
+++ b/workflows/eggnogmapperv2/config.yaml
+# Paths of the rule files that the workflow Snakefile pulls in with `include:`.
+snakefiles:
+    eggnogmapper2_diamond: /pasteur/projets/policy01/Atm/snakemake/tools/eggnogmapper2/diamond/Snakefile
+    split_fasta: /pasteur/projets/policy01/Atm/snakemake/tools/utils/split_fasta/Snakefile
+    cat: /pasteur/projets/policy01/Atm/snakemake/tools/utils/cat/Snakefile
+    eggnogmapper2_annotate: /pasteur/projets/policy01/Atm/snakemake/tools/eggnogmapper2/annotate/Snakefile
+
+# FASTA file to process (mandatory); its sequences are counted by the workflow
+# to work out how many chunks to expect.
+input_fasta: /pasteur/homes/kehillio/Atm/kenzo/sandbox/20200210_test_snakemake/test.fa
+# Root directory of all outputs; the workflow falls back to 'output' when unset.
+output_dir: /pasteur/homes/kehillio/Atm/kenzo/sandbox/20200210_test_snakemake/test_output
+
+split_fasta:
+    # Prefix of every chunk name (mandatory): chunks are named <prefix><NNNNN>.
+    prefix: test_
+    # Divisor used to derive the number of chunks from the sequence count;
+    # the workflow uses 1000000 when unset.
+    number_sequences: 100
+
+# NOTE(review): exec_command, options and threads are not read by the workflow
+# Snakefile itself; presumably they are consumed by the included rule files -
+# confirm.
+eggnogmapper2:
+    exec_command: "/pasteur/homes/kehillio/venv/eggnog-mapper-v2/bin/python /pasteur/homes/kehillio/tools/eggnog-mapper/emapper.py"
+    diamond:
+        options: "--no_file_comments --translate"
+        threads: 16
+    annotate:
+        options: "--no_file_comments"
+        threads: 10
+        # Base name of the <outname_prefix>.emapper.annotations target;
+        # the workflow uses 'all' when unset.
+        outname_prefix: "test"
+
+cat:
+    # Name of the merged seed ortholog file, placed in the diamond output
+    # directory; the workflow uses 'cat_file' when unset.
+    name_merge: all_test.emapper.seed_orthologs
+