diff --git a/tools/utils/split_fasta/Snakefile b/tools/utils/split_fasta/Snakefile
new file mode 100644
index 0000000000000000000000000000000000000000..d4bd95099a378d108e841f418707016ddad00aa7
--- /dev/null
+++ b/tools/utils/split_fasta/Snakefile
@@ -0,0 +1,43 @@
+# Optional settings for this tool live under the `split_fasta` config key.
+# `or {}` also covers a key that is present but empty (parsed as None).
+__split_fasta_config = config.get('split_fasta') or {}
+
+# Number of FASTA records written to each chunk.
+__split_fasta_number_sequences = int(__split_fasta_config.get('number_sequences', 1000000))
+# Filename prefix passed to `split`; a 5-digit numeric suffix is appended.
+__split_fasta_prefix = __split_fasta_config.get('prefix', 'seq_chunk_')
+# Number of records expected in the input. The default is the value that was
+# previously hard-coded here (presumably the size of the original dataset).
+__split_fasta_total_sequences = int(__split_fasta_config.get('total_sequences', 9898412))
+
+if __split_fasta_number_sequences < 1:
+    raise ValueError("split_fasta: number_sequences must be a positive integer")
+
+# Suffixes of the chunk files `split -a 5 -d` is expected to write.
+# Ceiling division, so a total that is an exact multiple of the chunk size does
+# not add a chunk that is never written; at least one chunk is always expected
+# because the awk END block emits a newline even for empty input.
+__split_fasta_n_chunks = max(1, -(-__split_fasta_total_sequences // __split_fasta_number_sequences))
+EXPECTED_EXT = [f"{i:05d}" for i in range(__split_fasta_n_chunks)]
+
+rule split_fasta:
+    """
+    Split a FASTA file with the desired number of sequences per chunk.
+
+    Every record is first rewritten as two lines (the header, then its sequence
+    joined onto a single line), so cutting the stream every
+    2 * number_sequences lines never breaks a record apart.
+    __split_fasta_input and __split_fasta_output are not defined in this file
+    and must be provided by the workflow that includes it.
+    """
+    input:
+        __split_fasta_input
+    output:
+        __split_fasta_output
+    params:
+        n_lines = __split_fasta_number_sequences * 2,
+        prefix = __split_fasta_prefix
+    shell:
+        """
+        cat {input} | awk '/^>/ {{if(N>0) printf("\\n"); printf("%s\\n",$0);++N;next;}} {{ printf("%s",$0);}} END {{printf("\\n");}}' | split -l {params.n_lines} -a 5 -d - {params.prefix}
+        """
diff --git a/tools/utils/split_fasta/config_example.yaml b/tools/utils/split_fasta/config_example.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391