// nextflow.config
/*
#########################################################################
##                                                                     ##
##     gmillot A. Millot                                               ##
##     Bioinformatics and Biostatistics Hub                            ##
##     Computational Biology Department                                ##
##     Institut Pasteur Paris                                          ##
##                                                                     ##
#########################################################################
*/

/*
#########################################################################
##                                                                     ##
##     Parameters that must be set by the user                         ##
##                                                                     ##
#########################################################################
*/
/*
########################
##                    ##
##     Data           ##
##                    ##
########################
*/

// Input data parameters. They are declared in the env scope like the other user parameter sections of this file.
env {
    sample_path = "https://zenodo.org/records/14509916/files/ig_clustering_test_1_VH.zip" // single character string of the path of the fasta files directory. The last / can be added or not, as it is removed by nextflow file(). Can also be a .zip file that contains only fasta files. Warning: the fasta names must not start with a digit, until the alakazam::readChangeoDb() function is fixed by the maintainer. Example : sample_path="/mnt/c/Users/gmillot/Documents/Git_projects/repertoire_profiler/dataset/A1_IgG_H_fw" or sample_path="/pasteur/appa/homes/gmillot/dataset/20210707_AV07016_HAD-III-89_plate3_IgK_sanger_seq". Example with spaces in the path: sample_path="/mnt/x/ROCURONIUM PROJECT/01 Primary data/04.Repertoire analysis/SORT1/SORT1 Seq-original/xlsx_to_fasta_1669018924/All/VL". Example: sample_path = "/mnt/c/Users/gmillot/Documents/Git_projects/repertoire_profiler/dataset/ig_clustering_test_1_VH". Example: sample_path = "https://zenodo.org/records/14500292/files/ig_clustering_test_1_VH.zip"
    meta_path = "https://zenodo.org/records/14500245/files/metadata.tsv" // single character string of a valid path of a metadata file for adding info to the results. Write "NULL" if no metadata to add. WARNING: the metadata .tsv table must include a first column named "Label" containing sequence names, i.e., the header of some of the fasta files from sample_path, without the ">" of the header. Additional columns (quanti or quali) can then be added after the first column to modify the leafs of the tree. For instance: "KD", or Antibody name. Example: meta_path = "/mnt/c/Users/gmillot/Documents/Git_projects/repertoire_profiler/dataset/metadata.tsv". Example: meta_path = "NULL". Example: meta_path = "https://zenodo.org/records/14500245/files/metadata.tsv"
    meta_name_replacement = "Name" // single character string of the name of a character column of the metadata table indicated in the meta_path parameter. This column will be used to replace the sequence names/IDs (header of the fasta files) by more appropriate names in returned .tsv and .pdf files (but the initial sequence name/ID remains indicated in all .tsv files, in another column). This is convenient to easily identify some Ig of interest in a huge set of Ig. Write "NULL" if not required and if meta_path is not "NULL". Ignored if meta_path = "NULL". Example: meta_name_replacement = "KD". Of note, germ_tree_leaf_size parameter is ignored if meta_name_replacement is a numeric column, and germ_tree_leaf_shape is ignored if the column is another mode
}

/*
#########################################
##                                     ##
##     Ig annotation and clustering    ##
##                                     ##
#########################################
*/

// Parameters of the igblast annotation and of the clonal group assignment.
env {
    igblast_organism = "mouse" // single character string indicating the organism analyzed. Either "mouse", "human", "rabbit", "rat" or "rhesus_monkey" (value of the --organism option of AssignGenes.py igblast). Example: igblast_organism="human". Example: igblast_organism="mouse"
    igblast_database_path = "germlines/imgt/mouse/vdj" // single character string of the path of the database provided by igblast indicating a folder of fasta files, WITHOUT the last /. Normally, only the organism name should be changed in the path, to be the same as in the igblast_organism parameter. Example: igblast_database_path="germlines/imgt/human/vdj". Example: igblast_database_path="germlines/imgt/mouse/vdj". Warnings (for developers only): (1) see \\wsl$\Ubuntu-20.04\home\gmillot\share for the different possibilities of paths and (2) change this code in the .nf file " MakeDb.py igblast -i \${FILE}_igblast.fmt7 -s ${fs} -r \${REPO_PATH}/imgt_human_IGHV.fasta \${REPO_PATH}/imgt_human_IGHD.fasta \${REPO_PATH}/imgt_human_IGHJ.fasta --extended" if the present path is modified
    igblast_loci = "ig" // single character string of the value of the --loci option of AssignGenes.py igblast. Example: igblast_loci="ig"
    igblast_aa = "false" // single character string either of the igblast protein (true) or nucleic (false) alignment. Either "true" or "false" in lower case. Warning: "true" means that the fasta sequences in sample_path must be aa sequences. Example: igblast_aa = "false". igblast_aa = "true" does not work for the moment because no j data in the imgt database (compare AssignGenes.py igblast --help and AssignGenes.py igblast-aa --help). In addition, no junction data are returned by igblast-aa, which block the clone_assignment process
    igblast_files = "imgt_mouse_IGHV.fasta imgt_mouse_IGHD.fasta imgt_mouse_IGHJ.fasta" // single character string of the files of igblast database (igblast_database_path parameter) to use for annotations. Each file must be separated by a single space. Example for human IGH: igblast_files="imgt_human_IGHV.fasta imgt_human_IGHD.fasta imgt_human_IGHJ.fasta". Example for human IGK: igblast_files="imgt_human_IGKV.fasta imgt_human_IGKJ.fasta". Example for mouse IGK: igblast_files="imgt_mouse_IGKV.fasta imgt_mouse_IGKJ.fasta". Example for human IGK + IGL: igblast_files = "imgt_human_IGLV.fasta imgt_human_IGLJ.fasta imgt_human_IGKV.fasta imgt_human_IGKJ.fasta". Of note, correspond to the -r option of MakeDb.py igblast
    clone_nb_seq = "3" // single character string of a positive integer value greater or equal to 3. Minimum number of non-identical sequences per clonal group for tree plotting. Example: clone_nb_seq = "5". Cannot be below 3 as dowser::getTrees(clones, build="igphyml") needs at least three different sequences
    clone_model = "ham" // single character string of the model used to compute the distance between sequences, before clustering them to clonal groups. Either "ham", "aa", "hh_s1f", "hh_s5f", "mk_rs1nf", "mk_rs5nf", "m1n_compat", "hs1f_compat". Warning. Must be "aa" if aa are used. See https://shazam.readthedocs.io/en/stable/topics/distToNearest/ and https://shazam.readthedocs.io/en/stable/vignettes/DistToNearest-Vignette/
    clone_normalize = "len" // single character string of the normalization used to compute the distance between sequences, before clustering them to clonal groups. Either "len" or "none". See the links in clone_model
    clone_distance = "0.15" // single character string of a positive proportion value, setting the distance threshold that defines whether 2 sequences belong to a same clonal group or not. See https://shazam.readthedocs.io/en/stable/vignettes/DistToNearest-Vignette/. Example: clone_distance = "0.15"
}

/*
########################
##                    ##
##     Graphics       ##
##                    ##
########################
*/

// Graphical parameters of the germline trees, donut plots and phylogenetic trees.
env {
    meta_legend = "KD" // single character string of the name of a column of the table indicated in the meta_path parameter. This column will be used to add a legend in trees (and only in trees), in order to visualize an additional parameter like KD, names, etc. If a numeric column is indicated, it will be used for leaf size. If a non numeric column is indicated, it will be used for coloring the leafs according to the classes inside the column. Ignored if meta_path = "NULL". Example: meta_legend = "KD". Of note, germ_tree_leaf_size parameter is ignored if meta_legend is a numeric column, and germ_tree_leaf_shape is ignored if the column is another mode
    germ_tree_kind = "rectangular" // single character string of the kind of tree. Can be "rectangular", "roundrect", "slanted", "ellipse", "circular", "fan", "equal_angle", "daylight". See https://yulab-smu.top/treedata-book/chapter4.html#tree-layouts
    germ_tree_duplicate_seq = "FALSE" // single character string indicating if identical sequences (with different cell or sequence names) must be removed from trees or not. Either "TRUE" for keeping or "FALSE" for removing
    germ_tree_leaf_color = "NULL" // single character string of the color of leaf tip. Ignored if meta_legend parameter is a name of a non numeric column of the meta_path parameter
    germ_tree_leaf_shape = "21" // single character string of the shape of leaf tip. See https://stat.ethz.ch/R-manual/R-devel/library/graphics/html/points.html
    germ_tree_leaf_size = "3" // single character string of the size of leafs (positive numeric value in mm, the size of the plot being 120 x 120 mm). Ignored if meta_legend parameter is a name of a numeric column of the meta_path parameter
    germ_tree_label_size = "2" // single character string of the size of leaf labeling (positive numeric value in mm, the size of the plot being 120 x 120 mm)
    germ_tree_label_hjust = "-0.2" // single character string of the adjustment of leaf labeling (numeric value)
    germ_tree_label_rigth = "FALSE" // single character string of the position of the labeling. Either "TRUE" or "FALSE". Only works for germ_tree_kind = "rectangular" and users need to use theme() to adjust tip labels in this case
    germ_tree_label_outside = "TRUE" // single character string of the display of the labeling outside of the plot region (if labels are truncated). Either "TRUE" or "FALSE"
    germ_tree_right_margin = "1.5" // single character string of the positive numeric value for the right margin in inches, considering 5 inches the width of the graphic device
    germ_tree_legend = "TRUE" // single character string of the display of the legend. Either "TRUE" or "FALSE"
    donut_palette = "Accent" // single character string of the color palette of the donut. Write "NULL" for default. Example: donut_palette = "Accent". See https://ggplot2.tidyverse.org/reference/scale_brewer.html#palettes
    donut_hole_size = "0.5" // single character string of the positive proportion of donut central hole, 0 meaning no hole (pie chart) and 1 no plot (donut with a null thickness)
    donut_hole_text = "TRUE" // single character string of the display of the sum of frequencies in the middle of the donut. Either "TRUE" or "FALSE"
    donut_hole_text_size = "8" // single character string of the positive numeric value of the font size in mm of the text in the middle of the donut. Ignored if donut_hole_text is "FALSE"
    donut_border_color = "gray50" // single character string of the color of the donut borders. Either "black" or "white". Example: donut_border_color = "black"
    donut_border_size = "0.2" // single character string of the size of the donut borders (positive numeric value in mm, the size of the plot being 120 x 120 mm)
    donut_annotation_distance = "0.8" // single character string of the distance from the center of the slice. 0 means center of the slice, 0.5 means at the edge (positive numeric value). Above 0.5, the donut will be reduced to make place for the annotation. Ignored if meta_path is NULL
    donut_annotation_size = "4" // single character string of the annotation font size in mm (numeric value). Ignored if meta_path is NULL
    donut_annotation_force = "100" // single character string of the force of repulsion between overlapping text labels (numeric value). See ?ggrepel::geom_text_repel() in R. Ignored if meta_path is NULL
    donut_annotation_force_pull = "1" // single character string of the force of attraction between a text label and its corresponding data point (numeric value). See ?ggrepel::geom_text_repel() in R. Ignored if meta_path is NULL
    donut_legend_width = "0.55" // single character string indicating the relative width of the legend sector (on the right of the plot) relative to the width of the plot (single proportion between 0 and 1). Value 1 means that the window device width is split in 2, half for the plot and half for the legend. Value 0 means no room for the legend, which will overlay the plot region. Write NULL to inactivate the legend sector. In such case, ggplot2 will manage the room required for the legend display, meaning that the width of the plotting region can vary between graphs, depending on the text in the legend
    donut_legend_text_size = "6" // single character string of the font size in mm of the legend labels (numeric value)
    donut_legend_box_size = "3" // single character string of the size of the legend squares in mm (numeric value)
    donut_legend_box_space = "2" // single character string of the space between the legend boxes in mm (numeric value)
    donut_legend_limit = "0.05" // single character string of the classes displayed in the legend for which the corresponding proportion is over the mentioned proportion threshold (positive proportion). Example: donut_legend_limit = 0.4 means that only the sectors over 40% of the donut will be in the legend. Write "NULL" for all the sectors in the legend (no limit required).
    phylo_tree_heavy = "true" // single character string indicating whether the analyzed sequences correspond to heavy chain or light chain (as it impacts alignment options). Either "true" or "false"
    phylo_tree_model_path = "https://gitlab.pasteur.fr/gmillot/repertoire_profiler/-/tree/master/bin/AB_model" // single character string indicating the path of the evolutionary model/matrix file, dedicated to antibodies (10.1093/molbev/msu340). Change this parameter value only if you want another model. Example: phylo_tree_model_path = "/mnt/c/Users/gmillot/Documents/Git_projects/repertoire_profiler/bin/AB_model". Example: phylo_tree_model_path="$baseDir/bin/AB_model". Example: phylo_tree_model_path="https://gitlab.pasteur.fr/gmillot/repertoire_profiler/-/tree/master/bin/AB_model"
    // NOTE(review): the value below looks like a personal API key stored in clear text in a versioned file — consider providing it at run time instead, and confirm whether this key should be revoked
    phylo_tree_itolkey = "eOIzrxSbR2pyDVxMwEGY2g" // single character string indicating the iTOL user api key, to upload the trees and download the images. Example: phylo_tree_itolkey = "eOIzrxSbR2pyDVxMwEGY2g"
}
/*
######################################
##                                  ##
##     Local / Cluster execution    ##
##                                  ##
######################################
*/

// Top-level variables (not in an env scope): they are read further down in this file by the env, executor, apptainer and process scopes.
apptainer_path = "NULL" // single character string of the path of the apptainer folder (where all the apptainer images are pulled and stored for proper nextflow execution). You can indicate an empty folder. In that case, docker images will be pulled from dockerhub, converted into apptainer images and stored into this indicated folder for next executions. Warning: Writing "NULL" for default path is possible but will work only if you update cacheDir = '/pasteur/helix/projects/BioIT/gmillot/apptainer' and cacheDir = '/mnt/c/Users/gmillot/apptainer' in the apptainer{} section below. Example: apptainer_path='/pasteur/helix/projects/BioIT/gmillot/apptainer'. Example: apptainer_path='/mnt/c/Users/gmillot/apptainer'. Example: apptainer_path="$projectDir/apptainer" # do not forget double quotes
// see https://confluence.pasteur.fr/pages/viewpage.action?pageId=69304504
system_exec = 'local' // single character string of the system that runs the workflow. Either 'local' to run on our own computer or 'slurm' to run on the pasteur cluster. Example: system_exec = 'local'
simult_jobs = 3000 // number of max simultaneous jobs. This is to avoid saturating a cluster if millions of jobs run in parallel. Write 0 for all the threads
queue = 'common,dedicated' // single character string of the -p option of slurm. Example: queue = 'common,dedicated'. Example: queue = 'hubbioit'
qos = '--qos=ultrafast' // single character string of the --qos option of slurm. Example: qos= '--qos=fast'. Example: qos = '--qos=ultrafast'. Example: qos = '--qos=hubbioit'
add_options = ' ' // single character string of the additional option of slurm. Example: add_options = '--exclude=maestro-1101,maestro-1034' or add_options = ' ', add_options = '--time=70:00:00' (acceptable time formats include "minutes", "minutes:seconds", "hours:minutes:seconds", "days-hours", "days-hours:minutes" and "days-hours:minutes:seconds"). See https://slurm.schedmd.com/sbatch.html#OPT_time for other options

/*
#############################
##                         ##
##     Other (optional)    ##
##                         ##
#############################
*/

// Paths of external tools, declared in the env scope like the other indented user parameters of this file.
env {
    cute_path = "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v12.8/cute_little_R_functions.R" // single character string indicating the file (and absolute pathway) of the required cute_little_R_functions toolbox. With ethernet connection available, this can also be used: "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v5.1.0/cute_little_R_functions.R" or local "C:\\Users\\gmillot\\Documents\\Git_projects\\cute_little_R_functions\\cute_little_R_functions.R"
    igphylm_exe_path = "/usr/local/share/igphyml/src/igphyml" // single character string indicating the path of the igphyml exec file. No need to change that path when using the containers defined below. Example: igphylm_exe_path = "/usr/local/share/igphyml/src/igphyml". Example: igphylm_exe_path = "\\\\wsl$\\Ubuntu-20.04\\home\\gmillot\\bin\\igphyml\\src\\igphyml"
}

// Top-level variables: both are used below to build out_path.
out_path_ini = "$projectDir/results" // single character string of where the output files will be saved. Example out_path_ini = '.' for where the main.nf run is executed or out_path_ini = "$projectDir/results" to put the results in a result folder (created if required), $projectDir indicating where the main.nf run is executed. Example: out_path_ini = '/mnt/c/Users/gmillot/Desktop'. Example : out_path_ini="/pasteur/helix/projects/BioIT/gmillot/08002_bourgeron/results". Warning: this does not work: out_path_ini = "/mnt/share/Users/gmillot/Desktop"
result_folder_name = "repertoire_profiler" // single character string of the name of the folder where the results files are dropped
/*
#########################################################################
##                                                                     ##
##     End Parameters that must be set by the user                     ##
##                                                                     ##
#########################################################################
*/








//////// Pre processing

// epoch time in seconds at the moment this config file is evaluated (Date.getTime() is in milliseconds), stored as an int
int secs = (new Date().getTime())/1000
// output folder of the run: <out_path_ini>/<result_folder_name>_<secs>. The time suffix gives a different folder name for runs started at different seconds
out_path="${out_path_ini}/${result_folder_name}_${secs}"

//////// end Pre processing



//////// variables used here and also in the main.nf file

// Copies the top-level variables defined above (and the computed out_path) into the env scope, under the same names
env {
    system_exec = "${system_exec}" // 'local' or 'slurm', see the Local / Cluster execution section
    out_path_ini = "${out_path_ini}" // base output folder set by the user
    out_path = "${out_path}" // output folder of this run, computed in the Pre processing section
    queue = "${queue}" // value of the -p option of slurm
    qos = "${qos}" // value of the --qos option of slurm
    add_options = "${add_options}" // additional options of slurm
}

//////// variables used here and also in the main.nf file





//////// Scopes

// those are closures. See https://www.nextflow.io/docs/latest/script.html#closures
// Executor settings: sets queueSize from the simult_jobs user parameter
executor { 
    // name = "slurm" // means "if name of the executor is slurm, then ${simult_jobs}, i.e., max ${simult_jobs} simultaneous jobs". Inactivated because applied to all kinds of executors
    queueSize = "${simult_jobs}" // can only be written here and not below because queueSize is a method of executor{}. Note that simult_jobs is passed as an interpolated string, not as a number
}

// create a report folder and print a html report file . If no absolute path, will be where the run is executed
// see https://www.nextflow.io/docs/latest/config.html#config-report
report {
    // html execution report, written in the reports folder of the run output directory
    file = "${out_path}/reports/report.html" // double quotes are mandatory here so that ${out_path} is interpolated
    enabled = true
}

// txt file with all the processes and info
trace {
    // text trace file, written in the reports folder of the run output directory
    file = "${out_path}/reports/trace.txt"
    enabled = true
}

// html file with all the processes
timeline {
    // html timeline file, written in the reports folder of the run output directory
    file = "${out_path}/reports/timeline.html"
    enabled = true
}

// .dot picture of the workflow
dag {
    // picture of the workflow graph, written as a .png file in the reports folder of the run output directory
    // NOTE(review): rendering the graph as .png presumably requires Graphviz on the machine running nextflow — confirm
    enabled = true
    file = "${out_path}/reports/nf_dag.png"
}

// define apptainer parameters
apptainer {
    enabled = true
    autoMounts = true // automatically mounts host paths in the executed container
    // Run options depend on where the workflow is executed.
    // NOTE(review): the else branch is reconstructed. Without it, the second runOptions assignment replaces the first one in the slurm branch and nothing is set for a local run — confirm against the intended behavior
    if(system_exec == 'slurm' || system_exec == 'slurm_local'){
        runOptions = '--no-home --bind /pasteur' //-B /run/shm:/run/shm has been removed because block the pipeline. Warning: clone_assignment process use python. Thus, -B /run/shm:/run/shm should be required normally
    }else{
        runOptions = '--no-home -B /run/shm:/run/shm' // --no-home prevent apptainer to mount the $HOME path and thus forces apptainer to work with only what is inside the container
    }
    //runOptions = '--home $HOME:/home/$USER --bind /pasteur' // provide any extra command line options supported by the apptainer exec. Here, binds the whole /pasteur into /pasteur of the container. Otherwise no access
    // Folder where the apptainer images are stored: default folders when apptainer_path is "NULL", the user folder otherwise.
    // NOTE(review): the outer else branch is reconstructed. Without it, cacheDir ends up being the literal "NULL" when apptainer_path is "NULL" and is never set from apptainer_path otherwise
    if(apptainer_path == "NULL"){
        if(system_exec == 'slurm'){
            cacheDir = '/pasteur/helix/projects/BioIT/gmillot/apptainer' // name of the directory where remote Singularity images are stored. When rerun, the exec directly uses these without redownloading them. When using a computing cluster it must be a shared folder accessible to all computing nodes
        }else{
            cacheDir = '/mnt/c/Users/gmillot/apptainer' // "$projectDir/apptainer" can be used but do not forget double quotes.
        }
    }else{
        cacheDir = "${apptainer_path}"
    }
}

//////// end Scopes



//////// directives

// provide the default directives for all the processes in the main.nf pipeline calling this config file
// Default directives of the processes of main.nf, then container and resource settings per process label
process {
// directives for all the processes
    executor = "${system_exec}"
// end directives for all the processes
    if(system_exec == 'slurm'){
        queue = "$queue"
        clusterOptions = "$qos $add_options"
        scratch=false
        maxRetries=3
        // retry only when the exit status of the task is between 137 and 143, otherwise stop the run
        errorStrategy = {task.exitStatus in 137..143 ? 'retry' : 'terminate' }
    }else{
        maxRetries=0
        errorStrategy='terminate'
    }

    withLabel: bash {
        container = 'gmillot/bash-extended_v4.0:gitlab_v8.0'
        cpus = 1
        memory = '500M'
    }

    withLabel: unzip {
        container='gmillot/ubuntu_v22.04_extended_v1.0:gitlab_v10.2'
        cpus=1 // only used when name = "local" in the executor part above
        memory='1G' // only used when name = "local" in the executor part above
    }

    withLabel: immcantation {
        container = 'gmillot/immcantation_v1.2:gitlab_v10.1'
        cpus = 1
        memory = '3G'
        // NOTE(review): the three settings below repeat the three above with larger resources. They probably belonged to
        // a separate selector whose header and braces are missing here — confirm against the labels used in main.nf.
        // As written, these later assignments presumably override the ones above for this label
        container = 'gmillot/immcantation_v1.2:gitlab_v10.1'
        cpus = 10
        memory = '30G'
    }

    withLabel: r_ext {
        container='gmillot/r_v4.1.2_ig_clustering_v1.3:gitlab_v9.9'
        cpus=1 // only used when name = "local" in the executor part above
        memory='3G' // only used when name = "local" in the executor part above
    }

    withLabel: seqkit {
        container='pegi3s/seqkit:2.3.0'
        cpus=1 // only used when name = "local" in the executor part above
        memory='3G' // only used when name = "local" in the executor part above
    }

    //////// Fred's part
    withLabel: msa {
        container= 'evolbioinfo/mafft:v7.520'
    }

    withLabel: abalign {
        container= 'gmillot/abalign2_v1.0:gitlab_v10.2'
    }

    withLabel: tabletoitol {
        container= 'evolbioinfo/table2itol:fa4b43c'
    }
    withLabel: gotree {
        container= 'evolbioinfo/gotree:v0.4.4'
    }
    withLabel: snag {
        container= 'evolbioinfo/snag:v0.2'
    }
    withLabel: raxml {
        container= 'evolbioinfo/raxml-ng:v1.2.0'
    }
    withLabel: goalign {
        container = 'evolbioinfo/goalign:dev103ea5b'
    }
    withLabel: iqtree {
        container = 'evolbioinfo/iqtree:v2.2.5'
        cpus=5
        memory='5G'
    }
    withLabel: python {
        container = 'evolbioinfo/python-evol:v3.7.3condor'
    }
    withLabel: pastml {
        container = 'evolbioinfo/pastml:v1.9.33'
        cpus = 5
        memory='5G'
    }
    withLabel: iqtree2 {
        container = 'evolbioinfo/iqtree:v2.1.3'
        cpus=5
        memory='5G'
    }
    withName: ITOL {
        executor='local' // must be local to have internet
    }
    //////// end Fred's part
}

//////// end directives