start cd-hit rule

8eb87e66 · Kenzo-Hugo Hillion · 6af6b328 · 8eb87e66 · 8eb87e66 · 8eb87e66
Commit 8eb87e66 authored 3 years ago by Kenzo-Hugo Hillion
--- a/tools/cd-hit/Snakefile
+++ b/tools/cd-hit/Snakefile
+"""
+CD-HIT manual
+
+		====== CD-HIT version 4.8.1 (built on May 23 2020) ======
+
+Usage: cd-hit [Options]
+
+Options
+
+   -i	input filename in fasta format, required, can be in .gz format
+   -o	output filename, required
+   -c	sequence identity threshold, default 0.9
+ 	this is the default cd-hit's "global sequence identity" calculated as:
+ 	number of identical amino acids or bases in alignment
+ 	divided by the full length of the shorter sequence
+   -G	use global sequence identity, default 1
+ 	if set to 0, then use local sequence identity, calculated as :
+ 	number of identical amino acids or bases in alignment
+ 	divided by the length of the alignment
+ 	NOTE!!! don't use -G 0 unless you use alignment coverage controls
+ 	see options -aL, -AL, -aS, -AS
+   -b	band_width of alignment, default 20
+   -M	memory limit (in MB) for the program, default 800; 0 for unlimited;
+   -T	number of threads, default 1; with 0, all CPUs will be used
+   -n	word_length, default 5, see user's guide for choosing it
+   -l	length of throw_away_sequences, default 10
+   -t	tolerance for redundance, default 2
+   -d	length of description in .clstr file, default 20
+ 	if set to 0, it takes the fasta defline and stops at first space
+   -s	length difference cutoff, default 0.0
+ 	if set to 0.9, the shorter sequences need to be
+ 	at least 90% length of the representative of the cluster
+   -S	length difference cutoff in amino acid, default 999999
+ 	if set to 60, the length difference between the shorter sequences
+ 	and the representative of the cluster can not be bigger than 60
+   -aL	alignment coverage for the longer sequence, default 0.0
+ 	if set to 0.9, the alignment must cover 90% of the sequence
+   -AL	alignment coverage control for the longer sequence, default 99999999
+ 	if set to 60, and the length of the sequence is 400,
+ 	then the alignment must be >= 340 (400-60) residues
+   -aS	alignment coverage for the shorter sequence, default 0.0
+ 	if set to 0.9, the alignment must cover 90% of the sequence
+   -AS	alignment coverage control for the shorter sequence, default 99999999
+ 	if set to 60, and the length of the sequence is 400,
+ 	then the alignment must be >= 340 (400-60) residues
+   -A	minimal alignment coverage control for the both sequences, default 0
+ 	alignment must cover >= this value for both sequences
+   -uL	maximum unmatched percentage for the longer sequence, default 1.0
+ 	if set to 0.1, the unmatched region (excluding leading and tailing gaps)
+ 	must not be more than 10% of the sequence
+   -uS	maximum unmatched percentage for the shorter sequence, default 1.0
+ 	if set to 0.1, the unmatched region (excluding leading and tailing gaps)
+ 	must not be more than 10% of the sequence
+   -U	maximum unmatched length, default 99999999
+ 	if set to 10, the unmatched region (excluding leading and tailing gaps)
+ 	must not be more than 10 bases
+   -B	1 or 0, default 0, by default, sequences are stored in RAM
+ 	if set to 1, sequence are stored on hard drive
+ 	!! No longer supported !!
+   -p	1 or 0, default 0
+ 	if set to 1, print alignment overlap in .clstr file
+   -g	1 or 0, default 0
+ 	by cd-hit's default algorithm, a sequence is clustered to the first
+ 	cluster that meet the threshold (fast cluster). If set to 1, the program
+ 	will cluster it into the most similar cluster that meet the threshold
+ 	(accurate but slow mode)
+ 	but either 1 or 0 won't change the representatives of final clusters
+   -sc	sort clusters by size (number of sequences), default 0, output clusters by decreasing length
+ 	if set to 1, output clusters by decreasing size
+   -sf	sort fasta/fastq by cluster size (number of sequences), default 0, no sorting
+ 	if set to 1, output sequences by decreasing cluster size
+ 	this can be very slow if the input is in .gz format
+   -bak	write backup cluster file (1 or 0, default 0)
+   -h	print this help
+
+   Questions, bugs, contact Weizhong Li at liwz@sdsc.edu
+   For updated versions and information, please visit: http://cd-hit.org
+                                                    or https://github.com/weizhongli/cdhit
+
+   cd-hit web server is also available from http://cd-hit.org
+
+   If you find cd-hit useful, please kindly cite:
+
+   "CD-HIT: a fast program for clustering and comparing large sets of protein or nucleotide sequences", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659
+   "CD-HIT: accelerated for clustering the next generation sequencing data", Limin Fu, Beifang Niu, Zhengwei Zhu, Sitao Wu & Weizhong Li. Bioinformatics, (2012) 28:3150-3152
+"""
+
+__cd-hit_exec_command = config.get('cd-hit', {}).get('exec_command', 'cd-hit')
+__cd-hit_modules = config.get('cd-hit', {}).get('modules')
+__cd-hit_options = config.get('cd-hit', {}).get('options', '')
+
+rule cd-hit:
+    input:
+        __cd-hit_input
+    output:
+        __cd-hit_output
+    params:
+        exec_command = __cd-hit_exec_command,
+        modules = __cd-hit_modules,
+        options = __cd-hit_options
+    run:
+        command = []
+        if params.modules:
+        	command.append("module load {params.modules}")
+        command.append("{params.exec_command} {params.options} -i {input} -o {output}")
+        shell(" && ".join(command))
--- a/tools/cd-hit/example_usage/Snakefile
+++ b/tools/cd-hit/example_usage/Snakefile
+configfile: "config.yaml"
+
+# ==== Snakefile path ====
+__cd-hit_rules = config.get("snakefiles", {}).get("cd-hit")
+
+__main_output_dir = config.get('output_dir', 'output')
+
+# ==== Main config ====
+SAMPLES = config.get('samples')
+__input_dir = config.get('input_dir', 'data')
+
+# ==== Run cd-hit ====
+__cd-hit_output_dir = f"{__main_output_dir}/cd-hit"
+__cd-hit_input = "{dir}/{{sample}}.fa".format(dir=__input_dir, sample="{sample}")
+__cd-hit_output = "{dir}/{{sample}}.fa".format(dir=__cd-hit_output_dir, sample="{sample}")
+include: __cd-hit_rules
+
+rule all:
+    input:
+        expand("{dir}/{{sample}}.fa".format(dir=__cd-hit_output_dir), sample=SAMPLES)
--- a/tools/cd-hit/example_usage/config.yaml
+++ b/tools/cd-hit/example_usage/config.yaml
+snakefiles:
+    # Looked up by the example Snakefile as config["snakefiles"]["cd-hit"].
+    cd-hit: /pasteur/zeus/projets/p02/metasig/gitlab/snakemake/tools/cd-hit/Snakefile
+
+input_dir: /some/input/directory
+output_dir: /some/output/directory
+
+samples:
+- test_00000
+- test_00001
+- test_00002
+
+# Section read by the cd-hit rule file via config.get('cd-hit', {}).
+# Supported keys: exec_command, modules, options.
+cd-hit:
+  exec_command: cd-hit
+  modules: blast+/2.10.0 cd-hit