diff --git a/tools/utils/split_fasta/Snakefile b/tools/utils/split_fasta/Snakefile
index d4bd95099a378d108e841f418707016ddad00aa7..1c4401acdf436d8e8becd335d9943980d96a93d8 100644
--- a/tools/utils/split_fasta/Snakefile
+++ b/tools/utils/split_fasta/Snakefile
@@ -1,8 +1,3 @@
-__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)
-__split_fasta_prefix = config.get('split_fasta', {}).get('prefix', 'seq_chunk_')
-
-EXPECTED_EXT = [f"{i:05d}" for i in range(0, int(9898412/__split_fasta_number_sequences) + 1)]
-
 rule split_fasta:
     """
     Split a FASTA file with the desired number of sequences per chunk
@@ -17,4 +12,6 @@ rule split_fasta:
     shell:
         """
         cat {input} | awk '/^>/ {{if(N>0) printf("\\n"); printf("%s\\n",$0);++N;next;}} {{ printf("%s",$0);}} END {{printf("\\n");}}' | split -l {params.n_lines} -a 5 -d - {params.prefix}
+        # Append .fa only to the chunks written by split (5-digit numeric suffix), so files already renamed are left alone
+        for f in {params.prefix}[0-9][0-9][0-9][0-9][0-9]; do mv -- "$f" "$f.fa"; done
         """
diff --git a/tools/utils/split_fasta/example_usage/Snakefile b/tools/utils/split_fasta/example_usage/Snakefile
new file mode 100644
index 0000000000000000000000000000000000000000..4e6be41eb2c899a927f473f3d503bff872b4ec3b
--- /dev/null
+++ b/tools/utils/split_fasta/example_usage/Snakefile
@@ -0,0 +1,35 @@
+configfile: "config.yaml"
+
+def count_sequences(fasta_file):
+    """Return the number of FASTA records in ``fasta_file``.
+
+    A record is counted for each line that starts with '>' (a header line).
+    """
+    with open(fasta_file, 'r') as handle:
+        # Match only at line start: a '>' later in a line is not a new record.
+        return sum(1 for line in handle if line.startswith('>'))
+
+# ==== Snakefile path ====
+__split_fasta_rules = config.get("snakefiles", {}).get("split_fasta")
+
+__main_output_dir = config.get('output_dir', 'output')
+
+# ==== Split FASTA ====
+__split_fasta_output_dir = __main_output_dir + "/split_fasta"
+
+__split_fasta_input = config['input_fasta']
+__split_fasta_number_sequences = config.get('split_fasta', {}).get('number_sequences', 1000000)
+total_number_sequences = count_sequences(__split_fasta_input)
+# Ceiling division: int(total / n) + 1 names one chunk too many when the total is
+# an exact multiple of the chunk size. Keep at least one chunk, as before, for an
+# input without any sequence.
+__split_fasta_number_chunks = max(1, -(-total_number_sequences // __split_fasta_number_sequences))
+EXTENSIONS = [f"{i:05d}" for i in range(__split_fasta_number_chunks)]
+# Fall back to 'seq_chunk_' when no prefix is configured.
+__split_fasta_prefix = "/".join([__split_fasta_output_dir, config.get('split_fasta', {}).get('prefix', 'seq_chunk_')])
+__split_fasta_output = expand(__split_fasta_prefix + "{ext}.fa", ext=EXTENSIONS)
+include: __split_fasta_rules
+
+rule all:
+    input: __split_fasta_output
diff --git a/tools/utils/split_fasta/example_usage/config.yaml b/tools/utils/split_fasta/example_usage/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..559a78914eed47dc84b3382b4e651ad793500dcb
--- /dev/null
+++ b/tools/utils/split_fasta/example_usage/config.yaml
@@ -0,0 +1,9 @@
+snakefiles:
+  split_fasta: /pasteur/projets/policy01/Atm/snakemake/tools/utils/split_fasta/Snakefile
+
+input_fasta: /pasteur/projets/policy01/DBs/IGC/2014-9.9M/IGC.fa
+output_dir: /pasteur/projets/policy01/sandbox/20200210_test_snakemake/output
+
+split_fasta:
+  prefix: IGC_
+  number_sequences: 1000000