From 8c7edff76fc7763873a45dc578add7baa0fcc4dd Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Tue, 11 Feb 2020 12:37:33 +0100
Subject: [PATCH] finish Snakefile for splitting Fasta

---
Review notes (this area is ignored by git am):
 - example_usage/Snakefile: the expected chunk list now uses ceiling
   division, so an input whose record count is an exact multiple of
   number_sequences no longer expects one chunk too many.
 - example_usage/Snakefile: count_sequences() matches header lines the same
   way the rule's awk program does (/^>/).
 - example_usage/Snakefile: split_fasta.prefix is optional again and falls
   back to 'seq_chunk_', the default removed from the rule file.
 - split_fasta/Snakefile: the rename loop globs only the 5-digit suffixes
   written by `split -a 5 -d` instead of parsing `ls` output.
 - Context-line indentation in the first file was reconstructed, and the
   blob hashes on the `index` lines predate these edits; regenerate the
   patch with git format-patch before relying on them.

 tools/utils/split_fasta/Snakefile             |  6 +--
 .../utils/split_fasta/example_usage/Snakefile | 46 +++++++++++++++++++
 .../split_fasta/example_usage/config.yaml     |  9 ++++
 3 files changed, 56 insertions(+), 5 deletions(-)
 create mode 100644 tools/utils/split_fasta/example_usage/Snakefile
 create mode 100644 tools/utils/split_fasta/example_usage/config.yaml

diff --git a/tools/utils/split_fasta/Snakefile b/tools/utils/split_fasta/Snakefile
index d4bd950..1c4401a 100644
--- a/tools/utils/split_fasta/Snakefile
+++ b/tools/utils/split_fasta/Snakefile
@@ -1,8 +1,3 @@
-__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)
-__split_fasta_prefix = config.get('split_fasta', {}).get('prefix', 'seq_chunk_')
-
-EXPECTED_EXT = [f"{i:05d}" for i in range(0, int(9898412/__split_fasta_number_sequences) + 1)]
-
 rule split_fasta:
     """
     Split a FASTA file with the desired number of sequences per chunk
@@ -17,4 +12,5 @@ rule split_fasta:
     shell:
         """
         cat {input} | awk '/^>/ {{if(N>0) printf("\\n"); printf("%s\\n",$0);++N;next;}} {{ printf("%s",$0);}} END {{printf("\\n");}}' | split -l {params.n_lines} -a 5 -d - {params.prefix}
+        for i in {params.prefix}[0-9][0-9][0-9][0-9][0-9]; do mv "$i" "$i.fa"; done
         """
diff --git a/tools/utils/split_fasta/example_usage/Snakefile b/tools/utils/split_fasta/example_usage/Snakefile
new file mode 100644
index 0000000..4e6be41
--- /dev/null
+++ b/tools/utils/split_fasta/example_usage/Snakefile
@@ -0,0 +1,46 @@
+configfile: "config.yaml"
+
+
+def count_sequences(fasta_file):
+    """Return the number of FASTA records in ``fasta_file``.
+
+    A record is a line starting with '>', the same test the split_fasta
+    rule applies with awk (/^>/), so the expected chunks match the ones
+    actually written.
+    """
+    with open(fasta_file, 'r') as handle:
+        return sum(1 for line in handle if line.startswith('>'))
+
+
+def chunk_extensions(total_sequences, sequences_per_chunk):
+    """Return the zero-padded suffixes of the chunks written by split.
+
+    Assumes the rule starts a new chunk every ``sequences_per_chunk``
+    records; its n_lines parameter is defined outside this patch.
+    """
+    # Ceiling division: an exact multiple must not yield an extra, never
+    # written chunk. split still writes one file for an empty input because
+    # the awk END block always prints a newline, hence the floor of 1.
+    n_chunks = max(1, -(-total_sequences // sequences_per_chunk))
+    return [f"{i:05d}" for i in range(n_chunks)]
+
+
+# ==== Snakefile path ====
+__split_fasta_rules = config.get("snakefiles", {}).get("split_fasta")
+
+__main_output_dir = config.get('output_dir', 'output')
+
+# ==== Split FASTA ====
+__split_fasta_output_dir = __main_output_dir + "/split_fasta"
+
+__split_fasta_input = config['input_fasta']
+__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)
+total_number_sequences = count_sequences(__split_fasta_input)
+EXTENSIONS = chunk_extensions(total_number_sequences, __split_fasta_number_sequences)
+# Same default prefix the rule file used before this patch.
+__split_fasta_prefix = "/".join([__split_fasta_output_dir, config.get('split_fasta', {}).get('prefix', 'seq_chunk_')])
+__split_fasta_output = expand(__split_fasta_prefix + "{ext}.fa", ext=EXTENSIONS)
+include: __split_fasta_rules
+
+rule all:
+    input: __split_fasta_output
diff --git a/tools/utils/split_fasta/example_usage/config.yaml b/tools/utils/split_fasta/example_usage/config.yaml
new file mode 100644
index 0000000..559a789
--- /dev/null
+++ b/tools/utils/split_fasta/example_usage/config.yaml
@@ -0,0 +1,9 @@
+snakefiles:
+  split_fasta: /pasteur/projets/policy01/Atm/snakemake/tools/utils/split_fasta/Snakefile
+
+input_fasta: /pasteur/projets/policy01/DBs/IGC/2014-9.9M/IGC.fa
+output_dir: /pasteur/projets/policy01/sandbox/20200210_test_snakemake/output
+
+split_fasta:
+  prefix: IGC_
+  number_sequences: 1000000
-- 
GitLab