From 2c621522b58a72e95b3953d7633211c7e14b5cfe Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Mon, 10 Feb 2020 16:15:58 +0100
Subject: [PATCH] start Snakefile to split fasta

---
 tools/utils/split_fasta/Snakefile           | 38 ++++++++++++++++++++++++++++++++++++++
 tools/utils/split_fasta/config_example.yaml |  0
 2 files changed, 38 insertions(+)
 create mode 100644 tools/utils/split_fasta/Snakefile
 create mode 100644 tools/utils/split_fasta/config_example.yaml

diff --git a/tools/utils/split_fasta/Snakefile b/tools/utils/split_fasta/Snakefile
new file mode 100644
index 0000000..d4bd950
--- /dev/null
+++ b/tools/utils/split_fasta/Snakefile
@@ -0,0 +1,38 @@
+# Settings come from the optional `split_fasta` section of the workflow config.
+__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)
+__split_fasta_prefix = config.get('split_fasta', {}).get('prefix', 'seq_chunk_')
+# Total number of sequences in the input FASTA. The default is the value that
+# was previously hard-coded in EXPECTED_EXT.
+__split_fasta_total_sequences = config.get('split_fasta', {}).get('total_sequences', 9898412)
+
+# Number of chunks `split` writes: ceil(total / chunk size). The previous
+# int(total / size) + 1 announced one chunk too many whenever the total was an
+# exact multiple of the chunk size. At least one chunk is kept because the awk
+# END block always emits a newline, so `split` writes a file even for empty input.
+__split_fasta_number_chunks = max(1, int(-(-__split_fasta_total_sequences // __split_fasta_number_sequences)))
+
+# Zero-padded 5-digit suffixes, matching `split -a 5 -d`.
+EXPECTED_EXT = [f"{i:05d}" for i in range(__split_fasta_number_chunks)]
+
+rule split_fasta:
+    """
+    Split a FASTA file with the desired number of sequences per chunk.
+
+    The awk step joins every record onto two lines (header, then the whole
+    sequence), so splitting every 2 * number_sequences lines never cuts a
+    record. Chunks are written as <prefix>00000, <prefix>00001, ...
+
+    __split_fasta_input and __split_fasta_output are not defined in this file;
+    presumably the including workflow sets them - TODO confirm.
+    """
+    input:
+        __split_fasta_input
+    output:
+        __split_fasta_output
+    params:
+        n_lines = __split_fasta_number_sequences * 2,
+        prefix = __split_fasta_prefix
+    shell:
+        """
+        cat {input} | awk '/^>/ {{if(N>0) printf("\\n"); printf("%s\\n",$0);++N;next;}} {{ printf("%s",$0);}} END {{printf("\\n");}}' | split -l {params.n_lines} -a 5 -d - {params.prefix}
+        """
diff --git a/tools/utils/split_fasta/config_example.yaml b/tools/utils/split_fasta/config_example.yaml
new file mode 100644
index 0000000..e69de29
-- 
GitLab