From 2c621522b58a72e95b3953d7633211c7e14b5cfe Mon Sep 17 00:00:00 2001
From: Kenzo-Hugo Hillion <kenzo-hugo.hillion1@pasteur.fr>
Date: Mon, 10 Feb 2020 16:15:58 +0100
Subject: [PATCH] start Snakefile to split fasta

---
 tools/utils/split_fasta/Snakefile           | 20 ++++++++++++++++++++
 tools/utils/split_fasta/config_example.yaml |  0
 2 files changed, 20 insertions(+)
 create mode 100644 tools/utils/split_fasta/Snakefile
 create mode 100644 tools/utils/split_fasta/config_example.yaml

diff --git a/tools/utils/split_fasta/Snakefile b/tools/utils/split_fasta/Snakefile
new file mode 100644
index 0000000..d4bd950
--- /dev/null
+++ b/tools/utils/split_fasta/Snakefile
@@ -0,0 +1,20 @@
+__split_fasta_number_sequences = int(config.get('split_fasta', {}).get('number_sequences', 1000000))  # sequences per chunk; int() tolerates float/quoted YAML values
+__split_fasta_prefix = config.get('split_fasta', {}).get('prefix', 'seq_chunk_')
+# One 5-digit suffix (as produced by `split -a 5 -d`) per chunk: ceil(9898412 / chunk size); 9898412 is presumably the input's sequence count (TODO confirm).
+EXPECTED_EXT = [f"{i:05d}" for i in range(-(-9898412 // __split_fasta_number_sequences))]
+
+rule split_fasta:
+    """Split a FASTA file into chunks holding a fixed number of sequences.
+    awk first rewrites every record as two lines (header, joined sequence) so
+    `split -l` cuts on record boundaries; chunks are named <prefix>00000, ..."""
+    input:
+        __split_fasta_input  # not defined in this file; presumably set by the including workflow
+    output:
+        __split_fasta_output  # likewise expected from the including workflow (TODO confirm)
+    params:
+        n_lines = __split_fasta_number_sequences * 2,  # two lines per record after the awk step
+        prefix = __split_fasta_prefix
+    shell:
+        """
+        cat {input:q} | awk '/^>/ {{if(N>0) printf("\\n"); printf("%s\\n",$0);++N;next;}} {{ printf("%s",$0);}} END {{printf("\\n");}}' | split -l {params.n_lines} -a 5 -d - {params.prefix:q}
+        """
diff --git a/tools/utils/split_fasta/config_example.yaml b/tools/utils/split_fasta/config_example.yaml
new file mode 100644
index 0000000..e69de29
-- 
GitLab