diff --git a/tools/cd-hit-est/Snakefile b/tools/cd-hit-est/Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..5b08bdd95da031c56c69c1cd487305251e2278d5 --- /dev/null +++ b/tools/cd-hit-est/Snakefile @@ -0,0 +1,129 @@ +""" +CD-HIT-EST manual + + ====== CD-HIT version 4.8.1 (built on May 23 2020) ====== + +Usage: cd-hit-est [Options] + +Options + + -i input filename in fasta format, required, can be in .gz format + -j input filename in fasta/fastq format for R2 reads if input are paired end (PE) files + -i R1.fq -j R2.fq -o output_R1 -op output_R2 or + -i R1.fa -j R2.fa -o output_R1 -op output_R2 + -o output filename, required + -op output filename for R2 reads if input are paired end (PE) files + -c sequence identity threshold, default 0.9 + this is the default cd-hit's "global sequence identity" calculated as: + number of identical amino acids or bases in alignment + divided by the full length of the shorter sequence + -G use global sequence identity, default 1 + if set to 0, then use local sequence identity, calculated as : + number of identical amino acids or bases in alignment + divided by the length of the alignment + NOTE!!! 
don't use -G 0 unless you use alignment coverage controls + see options -aL, -AL, -aS, -AS + -b band_width of alignment, default 20 + -M memory limit (in MB) for the program, default 800; 0 for unlimitted; + -T number of threads, default 1; with 0, all CPUs will be used + -n word_length, default 10, see user's guide for choosing it + -l length of throw_away_sequences, default 10 + -d length of description in .clstr file, default 20 + if set to 0, it takes the fasta defline and stops at first space + -s length difference cutoff, default 0.0 + if set to 0.9, the shorter sequences need to be + at least 90% length of the representative of the cluster + -S length difference cutoff in amino acid, default 999999 + if set to 60, the length difference between the shorter sequences + and the representative of the cluster can not be bigger than 60 + -aL alignment coverage for the longer sequence, default 0.0 + if set to 0.9, the alignment must covers 90% of the sequence + -AL alignment coverage control for the longer sequence, default 99999999 + if set to 60, and the length of the sequence is 400, + then the alignment must be >= 340 (400-60) residues + -aS alignment coverage for the shorter sequence, default 0.0 + if set to 0.9, the alignment must covers 90% of the sequence + -AS alignment coverage control for the shorter sequence, default 99999999 + if set to 60, and the length of the sequence is 400, + then the alignment must be >= 340 (400-60) residues + -A minimal alignment coverage control for the both sequences, default 0 + alignment must cover >= this value for both sequences + -uL maximum unmatched percentage for the longer sequence, default 1.0 + if set to 0.1, the unmatched region (excluding leading and tailing gaps) + must not be more than 10% of the sequence + -uS maximum unmatched percentage for the shorter sequence, default 1.0 + if set to 0.1, the unmatched region (excluding leading and tailing gaps) + must not be more than 10% of the sequence + -U maximum 
unmatched length, default 99999999 + if set to 10, the unmatched region (excluding leading and tailing gaps) + must not be more than 10 bases + -B 1 or 0, default 0, by default, sequences are stored in RAM + if set to 1, sequence are stored on hard drive + !! No longer supported !! + -P input paired end (PE) reads, default 0, single file + if set to 1, please use -i R1 -j R2 to input both PE files + -cx length to keep after trimming the tail of sequence, default 0, not trimming + if set to 50, the program only uses the first 50 letters of input sequence + -cy length to keep after trimming the tail of R2 sequence, default 0, not trimming + if set to 50, the program only uses the first 50 letters of input R2 sequence + e.g. -cx 100 -cy 80 for paired end reads + -ap alignment position constrains, default 0, no constrain + if set to 1, the program will force sequences to align at beginings + when set to 1, the program only does +/+ alignment + -p 1 or 0, default 0 + if set to 1, print alignment overlap in .clstr file + -g 1 or 0, default 0 + by cd-hit's default algorithm, a sequence is clustered to the first + cluster that meet the threshold (fast cluster). If set to 1, the program + will cluster it into the most similar cluster that meet the threshold + (accurate but slow mode) + but either 1 or 0 won't change the representatives of final clusters + -r 1 or 0, default 1, by default do both +/+ & +/- alignments + if set to 0, only +/+ strand alignment + -mask masking letters (e.g. 
-mask NX, to mask out both 'N' and 'X') + -match matching score, default 2 (1 for T-U and N-N) + -mismatch mismatching score, default -2 + -gap gap opening score, default -6 + -gap-ext gap extension score, default -1 + -bak write backup cluster file (1 or 0, default 0) + -sc sort clusters by size (number of sequences), default 0, output clusters by decreasing length + if set to 1, output clusters by decreasing size + -sf sort fasta/fastq by cluster size (number of sequences), default 0, no sorting + if set to 1, output sequences by decreasing cluster size + this can be very slow if the input is in .gz format + -h print this help + + Questions, bugs, contact Weizhong Li at liwz@sdsc.edu + For updated versions and information, please visit: http://cd-hit.org + or https://github.com/weizhongli/cdhit + + cd-hit web server is also available from http://cd-hit.org + + If you find cd-hit useful, please kindly cite: + + "CD-HIT: a fast program for clustering and comparing large sets of protein or nucleotide sequences", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659 + "CD-HIT: accelerated for clustering the next generation sequencing data", Limin Fu, Beifang Niu, Zhengwei Zhu, Sitao Wu & Weizhong Li. 
Bioinformatics, (2012) 28:3150-3152
+"""
+# Settings come from the optional "cd_hit_est" section of the workflow config.
+__cd_hit_est_exec_command = config.get('cd_hit_est', {}).get('exec_command', 'cd-hit-est')  # binary is "cd-hit-est" (see Usage above)
+__cd_hit_est_modules = config.get('cd_hit_est', {}).get('modules')  # optional; when set, "module load" runs first
+__cd_hit_est_options = config.get('cd_hit_est', {}).get('options', '')  # extra command-line flags, passed through verbatim
+__cd_hit_est_threads = config.get('cd_hit_est', {}).get('threads', 1)  # forwarded to -T
+# The including Snakefile must define __cd_hit_est_input and __cd_hit_est_output before the include.
+rule cd_hit_est:
+    input:
+        __cd_hit_est_input
+    output:
+        __cd_hit_est_output
+    params:
+        exec_command = __cd_hit_est_exec_command,
+        modules = __cd_hit_est_modules,
+        options = __cd_hit_est_options
+    threads:
+        __cd_hit_est_threads
+    run:
+        command = []  # shell steps, chained with "&&" below
+        if params.modules:
+            command.append("module load {params.modules}")
+        command.append("{params.exec_command} {params.options} -i {input} -T {threads} -M 0 -o {output}")  # -M 0: no memory limit (see manual)
+        shell(" && ".join(command))
diff --git a/tools/cd-hit-est/example_usage/Snakefile b/tools/cd-hit-est/example_usage/Snakefile
new file mode 100644
index 0000000000000000000000000000000000000000..b037c44c16567054a7ce10cefa1c944bfcd1594a
--- /dev/null
+++ b/tools/cd-hit-est/example_usage/Snakefile
@@ -0,0 +1,20 @@
+configfile: "config.yaml"
+
+# ==== Snakefile path (config key: snakefiles.cd_hit_est) ====
+__cd_hit_est_rules = config.get("snakefiles", {}).get("cd_hit_est")
+
+__main_output_dir = config.get('output_dir', 'output')
+
+# ==== Main config ====
+SAMPLES = config.get('samples')
+__input_dir = config.get('input_dir', 'data')
+
+# ==== Run cd-hit-est: the included rule reads __cd_hit_est_input / __cd_hit_est_output ====
+__cd_hit_est_output_dir = f"{__main_output_dir}/cd-hit"
+__cd_hit_est_input = f"{__input_dir}/{{sample}}.fa"
+__cd_hit_est_output = f"{__cd_hit_est_output_dir}/{{sample}}.fa"
+include: __cd_hit_est_rules
+
+rule all:
+    input:
+        expand(__cd_hit_est_output, sample=SAMPLES)
diff --git a/tools/cd-hit-est/example_usage/config.yaml b/tools/cd-hit-est/example_usage/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..765e44d0eac0c009dd42b323b78bc91b0e44eb36
--- /dev/null
+++ b/tools/cd-hit-est/example_usage/config.yaml
@@ -0,0 +1,15 @@
+snakefiles:
+  cd_hit_est: /pasteur/zeus/projets/p02/metasig/gitlab/snakemake/tools/cd-hit-est/Snakefile
+
+input_dir: /some/input/directory
+output_dir: /some/output/directory
+
+samples:
+- test_00000
+- test_00001
+- test_00002
+
+cd_hit_est:
+  exec_command: cd-hit-est
+  modules: blast+/2.10.0 cd-hit
+  threads: 16
diff --git a/tools/prodigal/Snakefile b/tools/prodigal/Snakefile
new file mode 100644
index 0000000000000000000000000000000000000000..b304adea1f7348077ed365d3741a143d33f6e2e9
--- /dev/null
+++ b/tools/prodigal/Snakefile
@@ -0,0 +1,45 @@
+"""
+Prodigal manual
+
+Usage: prodigal [-a trans_file] [-c] [-d nuc_file] [-f output_type]
+                [-g tr_table] [-h] [-i input_file] [-m] [-n] [-o output_file]
+                [-p mode] [-q] [-s start_file] [-t training_file] [-v]
+
+    -a: Write protein translations to the selected file.
+    -c: Closed ends. Do not allow genes to run off edges.
+    -d: Write nucleotide sequences of genes to the selected file.
+    -f: Select output format (gbk, gff, or sco). Default is gbk.
+    -g: Specify a translation table to use (default 11).
+    -h: Print help menu and exit.
+    -i: Specify FASTA/Genbank input file (default reads from stdin).
+    -m: Treat runs of N as masked sequence; don't build genes across them.
+    -n: Bypass Shine-Dalgarno trainer and force a full motif scan.
+    -o: Specify output file (default writes to stdout).
+    -p: Select procedure (single or meta). Default is single.
+    -q: Run quietly (suppress normal stderr output).
+    -s: Write all potential genes (with scores) to the selected file.
+    -t: Write a training file (if none exists); otherwise, read and use
+        the specified training file.
+    -v: Print version number and exit.
+"""
+# Settings come from the optional "prodigal" section of the workflow config.
+__prodigal_exec_command = config.get("prodigal", {}).get("exec_command", "prodigal")
+__prodigal_modules = config.get("prodigal", {}).get("modules")
+__prodigal_options = config.get("prodigal", {}).get("options", "")
+# The including Snakefile must define __prodigal_input, __prodigal_fasta_genes and __prodigal_genes.
+rule prodigal:
+    input:
+        __prodigal_input
+    output:
+        fasta_genes = __prodigal_fasta_genes,
+        genes = __prodigal_genes
+    params:
+        exec_command = __prodigal_exec_command,
+        modules = __prodigal_modules,
+        options = __prodigal_options
+    run:
+        prodigal_call = "{params.exec_command} {params.options} -i {input} -d {output.fasta_genes} -o {output.genes}"
+        steps = [prodigal_call]  # shell steps, chained with "&&" below
+        if params.modules:
+            steps.insert(0, "module load {params.modules}")
+        shell(" && ".join(steps))
diff --git a/tools/prodigal/example_usage/Snakefile b/tools/prodigal/example_usage/Snakefile
new file mode 100644
index 0000000000000000000000000000000000000000..4b32331d84ec33fa11ea1b3299fcdbd9cfe12f3c
--- /dev/null
+++ b/tools/prodigal/example_usage/Snakefile
@@ -0,0 +1,22 @@
+configfile: "config.yaml"
+
+# ==== Location of the prodigal rules file (config key: snakefiles.prodigal) ====
+__prodigal_rules = config.get("snakefiles", {}).get("prodigal")
+
+__main_output_dir = config.get("output_dir", "output")
+
+# ==== Samples and input location ====
+SAMPLES = config.get("samples")
+__input_dir = config.get("input_dir", "data")
+
+# ==== Names read by the included prodigal rule ====
+__prodigal_output_dir = f"{__main_output_dir}/prodigal"
+__prodigal_input = f"{__input_dir}/{{sample}}.fa"
+__prodigal_fasta_genes = f"{__prodigal_output_dir}/{{sample}}.fa"
+__prodigal_genes = f"{__prodigal_output_dir}/{{sample}}.gbk"
+include: __prodigal_rules
+
+rule all:
+    input:
+        fasta_genes = expand(__prodigal_fasta_genes, sample=SAMPLES),
+        genes = expand(__prodigal_genes, sample=SAMPLES)
diff --git a/tools/prodigal/example_usage/config.yaml b/tools/prodigal/example_usage/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3314b2d9914f22ea2a1f2ee845374147bdf236ac
--- /dev/null
+++ b/tools/prodigal/example_usage/config.yaml
@@ -0,0 +1,14 @@
+snakefiles:
+  prodigal: /pasteur/zeus/projets/p02/metasig/gitlab/snakemake/tools/prodigal/Snakefile
+
+input_dir: /some/input/directory
+output_dir: /some/output/directory
+
+samples:
+- test_00000
+- test_00001
+- test_00002
+
+prodigal:
+  exec_command: prodigal
+  modules: prodigal