Commit 85eea35b authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

tempo pour fred

parent a12b9b1b
work/
singularity/
results/
.nextflow/
.nextflow*
\ No newline at end of file
......@@ -286,9 +286,8 @@ if(length(ter_coord) != 2 & any(grepl(ter_coord, pattern = "\\D"))){# normally n
}else{
ter_coord <- as.integer(ter_coord)
}
genome_size <- strsplit(genome_size, split = " ")[[1]]
if(length(genome_size) != 1 & any(grepl(genome_size, pattern = "\\D"))){# normally no NA with is.null()
tempo.cat <- paste0("ERROR IN plot_insertion.R:\nTHE genome_size PARAMETER MUST BE TWO INTEGERS SEPARATED BY A SINGLE SPACE\nHERE IT IS: \n", paste0(genome_size, collapse = " "))
tempo.cat <- paste0("ERROR IN plot_insertion.R:\nTHE genome_size PARAMETER MUST BE A SINGLE INTEGER\nHERE IT IS: \n", paste0(genome_size, collapse = " "))
stop(paste0("\n\n================\n\n", tempo.cat, "\n\n================\n\n"), call. = FALSE) # == in stop() to be able to add several messages between ==
}else{
genome_size <- as.integer(genome_size)
......
This diff is collapsed.
......@@ -29,6 +29,12 @@ ref_name = file("${ref_path}/${ref_file}").baseName
primer = file(primer_fasta)
ref = file("${ref_path}/${ref_file}")
sum_of_2_nb = fivep_seq_nb.toInteger()+added_nb.toInteger()
categ = Channel.from("leading0", "leading16", "lagging0", "lagging16")
grep_pattern = Channel.from("LEADING_0", "LEADING_16", "LAGGING_0", "LAGGING_16")
modules = params.modules // remove the dot -> can be used in bash scripts
//////// end Variables
......@@ -682,6 +688,114 @@ process plot_insertion { // sections 24.7.2, 44.1 and 45.1 of the labbook 202005
}
process seq_around_insertion { // sections 24.9.1 of the labbook 20200707
label 'r_ext' // see the withLabel: bash in the nextflow config file
publishDir "${out_path}/files", mode: 'copy', pattern: "{*.bed}", overwrite: false
publishDir "${out_path}/reports", mode: 'copy', pattern: "seq_around_insertion_report.txt", overwrite: false
cache 'true'
input:
val file_name
file pos from orient_ch2
val ori_coord
val ter_coord
val insertion_dist
output:
file "${file_name}_around_insertion.bed" into around_insertion_bed_ch1
file "seq_around_insertion_report.txt"
file "report.rmd" into log_ch18
script:
"""
echo -e "<br /><br />\\n\\n### Sequences around insertion sites\\n\\n" > report.rmd
seq_around_insertion.R "${pos}" "${ori_coord}" "${ter_coord}" "${insertion_dist}" "${file_name}" "${cute_path}" "seq_around_insertion_report.txt"
echo -e "<br /><br />\\n\\nIn each sequence of length \$((${insertion_dist} * 2)) <br />position \$((${insertion_dist} + 1)) corresponds to the first nucleotide of the coli part of the read" >> report.rmd
"""
// single quotes required because of the !
}
process extract_seq { // section 24.9.2 of the labbook 20200707
// Warning: the fasta is always taken on the watson (top) strand. The -s option need a special bed file. See https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html
label 'bedtools'
publishDir "${out_path}/files", mode: 'copy', pattern: "${bed.baseName}.fasta", overwrite: false
publishDir "${out_path}/reports", mode: 'copy', pattern: "extract_seq_report.txt"
cache 'true'
input:
file bed from around_insertion_bed_ch1
file ref
output:
file "${bed.baseName}.fasta" into extract_seq_ch
file "extract_seq_report.txt"
script:
"""
# make a bed file from Ecoli genome
echo ">ref" > tempo.ref.fasta
awk '{lineKind=(NR-1)%2}lineKind==1{print \$0}' ${ref} >> tempo.ref.fasta |& tee extract_seq_report.txt
bedtools getfasta -fi tempo.ref.fasta -bed ${bed} -fo "${bed.baseName}.fasta" -name |& tee extract_seq_report.txt
rm tempo.ref.fasta
rm tempo.ref.fasta.fai
"""
}
process base_freq { // section 24.9.2 of the labbook 20200707
// parallelization, i.e., computed for each forward, reverse, leadding or lagging seq from the fasta file
label 'bash'
publishDir "${out_path}/files", mode: 'copy', pattern: "{*.seq,*.stat}", overwrite: false
publishDir "${out_path}/reports", mode: 'copy', pattern: "base_freq_report.txt"
cache 'true'
input:
val file_name
tuple val("categ"), val("grep_pattern") from categ.spread(grep_pattern)
file fasta from extract_seq_ch.first()
file ref
output:
file "${file_name}_${categ}.seq" into base_freq_seq_ch
file "${file_name}_${categ}.stat" into base_freq_stat_ch
file "base_freq_report.txt"
script:
"""
# file splitting into seq
awk -v var1=${grep_pattern} '
{lineKind=(NR-1)%2}
lineKind==0{toGet=(\$0 ~ ">" var1) ; next}
lineKind==1{if(toGet){print \$0}}
' ${fasta} > ${file_name}_${categ}.seq |& tee base_freq_report.txt
# ATGC contingency
gawk '{
L=length(\$0);
for(i=1;i<=L;i++) {
B=substr(\$0,i,1);
T[i][B]++;
}
}
END{
for(BI=0;BI<5;BI++) {
B=(BI==0?"A":(BI==1?"C":(BI==2?"G":(BI==3?"T":"Other"))));
printf("%s",B);
for(i in T) {
tot=0.0;
for(B2 in T[i]){
tot+=T[i][B2];
}
printf("\t%0.5f",(T[i][B])); # T[i][B]/tot for proportion
}
printf("\\n");
}
}' ${file_name}_${categ}.seq > ${file_name}_${categ}.stat |& tee base_freq_report.txt
"""
}
process backup {
label 'bash' // see the withLabel: bash in the nextflow config file
......@@ -695,7 +809,7 @@ process backup {
output:
file "${config_file}" // warning message if we use file config_file
file "${log_file}" // warning message if we use file log_file
file "report.rmd" into log_ch18
file "report.rmd" into log_ch19
script:
"""
......@@ -711,7 +825,7 @@ process workflowVersion { // create a file with the workflow version in out_path
cache 'false'
output:
file "report.rmd" into log_ch19
file "report.rmd" into log_ch20
script:
"""
......
......@@ -55,6 +55,7 @@ env {
color_coverage="5" // three integers for the color of the three coverage plots[1, 2, 5]
xlab="Ecoli Genome (bp)" // name of the reference genome for graphics
genome_size="4641652" // in bp
insertion_dist="20" // nb of bases upstream and downstream of insertions sites on the ref genome to define a consensus sequence
cute_path="https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v10.9.0/cute_little_R_functions.R" // single character string indicating the file (and absolute pathway) of the required cute_little_R_functions toolbox. With ethernet connection available, this can also be used: "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v5.1.0/cute_little_R_functions.R" or local "C:\\Users\\Gael\\Documents\\Git_projects\\cute_little_R_functions\\cute_little_R_functions.R"
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment