diff --git a/workflows/eggnogmapperv2/Snakefile b/workflows/eggnogmapperv2/Snakefile
new file mode 100644
index 0000000000000000000000000000000000000000..83a483c1037717f420fdfadfdb0a1ce21cbeff2a
--- /dev/null
+++ b/workflows/eggnogmapperv2/Snakefile
@@ -0,0 +1,77 @@
+configfile: "config.yaml"
+
+def count_sequences(fasta_file):
+    """Return the number of FASTA records in ``fasta_file``.
+
+    A record is counted for each line that starts with ``>``. A ``>`` found
+    anywhere else on a line does not mark a record, so it is not counted
+    (the header marker is only meaningful at the start of a line).
+    """
+    with open(fasta_file, 'r') as file:
+        return sum(1 for line in file if line.startswith('>'))
+
+# ==== Snakefile path ====
+__split_fasta_rules = config.get("snakefiles", {}).get("split_fasta")
+__eggnogmapper2_diamond_rules = config.get("snakefiles", {}).get("eggnogmapper2_diamond")
+__cat_rules = config.get("snakefiles", {}).get("cat")
+__eggnogmapper2_annotate_rules = config.get("snakefiles", {}).get("eggnogmapper2_annotate")
+
+# ---- Main config ----
+__main_output_dir = config.get('output_dir', 'output')
+__prefix = config['split_fasta']['prefix']
+
+
+# ==== Split FASTA ====
+__split_fasta_output_dir = __main_output_dir + "/split_fasta"
+
+__split_fasta_input = config['input_fasta']
+__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)
+total_number_sequences = count_sequences(__split_fasta_input)
+# One zero-padded index per chunk: ceil(total / chunk size) chunks, and never
+# fewer than one. Plain ``int(total / size) + 1`` lists an extra chunk when
+# the total is an exact multiple of the chunk size.
+# NOTE(review): assumes split_fasta numbers its output files from 00000 - confirm.
+SPLIT_NB = [
+    f"{i:05d}"
+    for i in range(max(1, -(-total_number_sequences // __split_fasta_number_sequences)))
+]
+CHUNKS = [f"{__prefix}{i}" for i in SPLIT_NB]
+__split_fasta_prefix = "/".join([__split_fasta_output_dir, __prefix])
+__split_fasta_output = expand(__split_fasta_prefix + "{split_nb}.fa", split_nb=SPLIT_NB)
+
+include: __split_fasta_rules
+
+
+# ==== EggNOGmapper2 Diamond ====
+__eggnogmapper2_output_dir = __main_output_dir + "/eggnogmapper2"
+__eggnogmapper2_diamond_input_dir = __split_fasta_output_dir
+__eggnogmapper2_diamond_output_dir = __eggnogmapper2_output_dir + "/diamond"
+
+__eggnogmapper2_diamond_input = "{dir}/{{chunk}}.fa".format(dir=__eggnogmapper2_diamond_input_dir)
+__eggnogmapper2_diamond_output_prefix = "{dir}/{{chunk}}".format(dir=__eggnogmapper2_diamond_output_dir)
+__eggnogmapper2_diamond_output = "{dir}/{{chunk}}.emapper.seed_orthologs".format(dir=__eggnogmapper2_diamond_output_dir)
+
+include: __eggnogmapper2_diamond_rules
+
+
+# ==== Cat ====
+__cat_merged_name = config.get('cat', {}).get('name_merge', 'cat_file')
+__cat_dir = __eggnogmapper2_diamond_output_dir
+
+__cat_input = expand("{dir}/{{chunk}}.emapper.seed_orthologs".format(dir=__cat_dir), chunk=CHUNKS)
+__cat_output = "{dir}/{file_name}".format(dir=__cat_dir, file_name=__cat_merged_name)
+include: __cat_rules
+
+
+# ==== EggNOGmapper2 Annotate ====
+__eggnogmapper2_annotate_output_dir = __eggnogmapper2_output_dir + "/annotate"
+
+__eggnogmapper2_annotate_input = __cat_output
+__eggnogmapper2_annotate_outname_prefix = config.get('eggnogmapper2', {}).get('annotate', {}).get('outname_prefix', 'all')
+__eggnogmapper2_annotate_output_prefix = "{dir}/{name}".format(dir=__eggnogmapper2_annotate_output_dir, name=__eggnogmapper2_annotate_outname_prefix)
+__eggnogmapper2_annotate_output = "{dir}/{name}.emapper.annotations".format(dir=__eggnogmapper2_annotate_output_dir, name=__eggnogmapper2_annotate_outname_prefix)
+include: __eggnogmapper2_annotate_rules
+
+rule all:
+    input: __eggnogmapper2_annotate_output
+
diff --git a/workflows/eggnogmapperv2/config.yaml b/workflows/eggnogmapperv2/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..10bd4b1ef1e47f342d64ab38b703e4c292be6d56
--- /dev/null
+++ b/workflows/eggnogmapperv2/config.yaml
@@ -0,0 +1,26 @@
+snakefiles:
+    eggnogmapper2_diamond: /pasteur/projets/policy01/Atm/snakemake/tools/eggnogmapper2/diamond/Snakefile
+    split_fasta: /pasteur/projets/policy01/Atm/snakemake/tools/utils/split_fasta/Snakefile
+    cat: /pasteur/projets/policy01/Atm/snakemake/tools/utils/cat/Snakefile
+    eggnogmapper2_annotate: /pasteur/projets/policy01/Atm/snakemake/tools/eggnogmapper2/annotate/Snakefile
+
+input_fasta: /pasteur/homes/kehillio/Atm/kenzo/sandbox/20200210_test_snakemake/test.fa
+output_dir: /pasteur/homes/kehillio/Atm/kenzo/sandbox/20200210_test_snakemake/test_output
+
+split_fasta:
+    prefix: test_
+    number_sequences: 100
+
+eggnogmapper2:
+    exec_command: "/pasteur/homes/kehillio/venv/eggnog-mapper-v2/bin/python /pasteur/homes/kehillio/tools/eggnog-mapper/emapper.py"
+    diamond:
+        options: "--no_file_comments --translate"
+        threads: 16
+    annotate:
+        options: "--no_file_comments"
+        threads: 10
+        outname_prefix: "test"
+
+cat:
+    name_merge: all_test.emapper.seed_orthologs
+