bioinfo_utils / Commits / f241ee02

Commit f241ee02, authored 6 years ago by Blaise Li
Deduplication of GRO-seq data is now optional.
Parent: cff1d2d4
Changes: 2 changed files, with 190 additions and 2 deletions

PRO-seq/PRO-seq.snakefile (+53, −2)
PRO-seq/PRO-seq_trim_only.sh (+137, −0)
PRO-seq/PRO-seq.snakefile +53 −2
...
...
@@ -352,11 +352,62 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer}
        shell(shell_commands)


rule trim_only:
    """The adaptor is trimmed, then reads are treated in two groups depending
    on whether the adapter was found or not. For each group, the extra k-mers
    are removed at both ends."""
    input:
        rules.link_raw_data.output,
    output:
        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt.fastq.gz"),
        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt.fastq.gz"),
        nb_raw = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
        nb_adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
        nb_noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
    params:
        adapter = lambda wildcards: lib2adapt[wildcards.lib],
        process_type = "PRO-seq",
        trim5 = lambda wildcards: lib2UMI[wildcards.lib][0],
        trim3 = lambda wildcards: lib2UMI[wildcards.lib][1],
    threads: 4  # Actually, to avoid too much IO
    message:
        "Trimming adaptor from raw data using {wildcards.trimmer} and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
    benchmark:
        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
    log:
        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
    run:
        shell_commands = """
THREADS="{threads}" {params.process_type}_trim_only.sh {wildcards.trimmer} {input} \\
    {params.adapter} {params.trim5} {params.trim3} \\
    {output.adapt} {output.noadapt} {log.trim} \\
    {output.nb_raw} {output.nb_adapt} {output.nb_noadapt} \\
    1> {log.log} 2> {log.err}
"""
        shell(shell_commands)
def source_fastq(wildcards):
    """
    Determine the correct pre-processed fastq file depending on the pipeline
    configuration and the current wildcards.
    """
    if config.get("deduplicate", False):
        return OPJ(
            data_dir, f"trimmed_{wildcards.trimmer}",
            f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}_deduped.fastq.gz")
    else:
        return OPJ(
            data_dir, f"trimmed_{wildcards.trimmer}",
            f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}.fastq.gz")
# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
rule map_on_genome:
    input:
        #rules.trim_and_dedup.output,
        fastq = source_fastq,
    output:
        # sam files take a lot of space
        sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")),
...
...
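The source_fastq input function above is what makes deduplication optional: map_on_genome now asks it for its fastq input, and it picks the _deduped file only when the "deduplicate" key is set in Snakemake's config dictionary. A minimal sketch of how a run could toggle this, assuming the snakefile is invoked directly; the invocation, the config file name, and the value parsing are illustrative assumptions, not taken from the repository:

# Hypothetical run with PCR-duplicate removal enabled:
snakemake -s PRO-seq/PRO-seq.snakefile --config deduplicate=True

# Or keep the choice in a config file (YAML true maps to Python True):
cat > config.yaml <<'EOF'
deduplicate: true  # omit or set to false to skip deduplication
EOF
snakemake -s PRO-seq/PRO-seq.snakefile --configfile config.yaml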
PRO-seq/PRO-seq_trim_only.sh (new file, mode 100755) +137 −0
#!/usr/bin/env bash
# Usage: PRO-seq_trim_only.sh <trimmer> <raw fastq> <ADAPTER> <nb 5'> <nb 3'> <trimmed fastq> <untrimmed fastq> <trimmer log> <nb_raw> <nb_adapt> <nb_noadapt>

# http://linuxcommand.org/wss0150.php
PROGNAME=$(basename $0)

function error_exit
{
    # ----------------------------------------------------------------
    # Function for exit due to fatal program error
    # Accepts 1 argument:
    # string containing descriptive error message
    # ----------------------------------------------------------------
    echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
    exit 1
}

# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in
#PACKAGEDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )

trimmer=${1}
raw_in=${2}
adapt=${3}
fiveprime_random=${4}
threeprime_random=${5}
# For the untrimmed, the adaptor was not found,
# but maybe its first 3 bases were there.
threeprime_random_with_margin=$((${threeprime_random}+3))
trimmed_out=${6}
untrimmed_out=${7}
log=${8}
nb_raw=${9}
nb_adapt=${10}
nb_noadapt=${11}

count_fastq_reads()
{
    # $1: file in which to write the number of fastq records
    wc -l | { read nblines; echo ${nblines} / 4 | bc > ${1}; }
}

trim_random_nt()
{
    # $1: nb of bases to trim at 5' end
    # $2: nb of bases to trim at 3' end
    cutadapt -u ${1} -u -${2} - 2> /dev/null || error_exit "trim_random_nt failed"
}

# This named pipe is used to avoid writing the intermediate file to disk
# It will transmit reads that did not seem to contain the adapter to the
# UMI trimming step.
mkfifo ${untrimmed_out}.fifo

minsize_trimmed=$(echo "${fiveprime_random} + 16 + ${threeprime_random}" | bc)

case "${trimmer}" in
    "cutadapt")
        if [ ${MAX_ERROR_RATE} ]
        then
            #>&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
            error_opt="-e ${MAX_ERROR_RATE}"
        else
            error_opt=""
        fi
        if [ ${THREADS} ]
        then
            #thread_opt="-j ${THREADS}"
            >&2 echo "Cutadapt multithreading not working fully yet. Ignoring THREADS."
            thread_opt=""
        else
            thread_opt=""
        fi
        # -m ${minsize_random} is to discard reads that are shorter than this after trimming
        trim_cmd="${trimmer} -a ${adapt} -m ${minsize_trimmed} ${error_opt} ${thread_opt} --untrimmed-output=${untrimmed_out}.fifo - 2> ${log}"
        ;;
    "fastx_clipper")
        # -n is to keep reads with N
        # -l is equivalent to -m in cutadapt (not sure it has effect with -C)
        # -c is to keep only the trimmed reads
        # -C is to keep only the non-trimmed reads
        # -v is to have verbose things to put in the log
        trim_cmd="tee >(${trimmer} -a ${adapt} -l ${minsize_trimmed} -C -M 3 -n > ${untrimmed_out}.fifo) | ${trimmer} -a ${adapt} -l ${minsize_trimmed} -c -M 3 -n -v 2>> ${log}"
        ;;
    *)
        error_exit "Trimming adapter with ${trimmer} not implemented."
        ;;
esac

# a second cutadapt step removes the random nucleotides that helped identify PCR duplicates.
trim_trimmed()
{
    # $1: file in which to write the number of fastq records after adapter trimming
    cmd="${trim_cmd} | tee >(count_fastq_reads ${1}) | trim_random_nt ${fiveprime_random} ${threeprime_random} | gzip"
    echo ${cmd}
    eval ${cmd} > ${trimmed_out} || error_exit "${cmd} failed"
}

trim_untrimmed()
{
    cmd="trim_random_nt ${fiveprime_random} ${threeprime_random_with_margin} | gzip"
    echo ${cmd}
    eval ${cmd} > ${untrimmed_out} || error_exit "${cmd} failed"
}

filetype="$(file -L ${raw_in} | cut -d " " -f2)"
case "${filetype}" in
    "gzip")
        cat_cmd="zcat"
        ;;
    "ASCII")
        cat_cmd="cat"
        ;;
    *)
        error_exit "Unexpected file type for ${raw_in}"
        ;;
esac

${cat_cmd} ${raw_in} \
    | tee >(count_fastq_reads ${nb_raw}) \
    | trim_trimmed ${nb_adapt} ${nb_adapt_deduped} &
pid_to_wait=$!
cat ${untrimmed_out}.fifo \
    | tee >(count_fastq_reads ${nb_noadapt}) \
    | trim_untrimmed ${nb_noadapt_deduped} || rm -f ${untrimmed_out}.fifo
wait ${pid_to_wait}
rm -f ${untrimmed_out}.fifo

exit 0
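For reference, here is a hypothetical invocation matching the usage line at the top of the script. All paths and the adapter sequence are placeholders; THREADS and MAX_ERROR_RATE are the optional environment variables the script checks:

THREADS=4 MAX_ERROR_RATE=0.1 PRO-seq/PRO-seq_trim_only.sh \
    cutadapt \
    raw/lib1_rep1.fastq.gz \
    TGGAATTCTCGGGTGCCAAGG \
    4 4 \
    trimmed/lib1_rep1_adapt.fastq.gz \
    trimmed/lib1_rep1_noadapt.fastq.gz \
    trimmed/lib1_rep1_trim.log \
    trimmed/lib1_rep1_nb_raw.txt \
    trimmed/lib1_rep1_nb_adapt.txt \
    trimmed/lib1_rep1_nb_noadapt.txt

Each *_nb_*.txt output holds a single integer: count_fastq_reads divides the stream's line count by 4 to get the number of FASTQ records. Note that ${nb_adapt_deduped} and ${nb_noadapt_deduped} in the final pipelines are never assigned in this script (apparent leftovers from the trim_and_dedup variant), so they expand to nothing.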
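The trickiest part of the script is its plumbing: tee with a process substitution forks the stream into a read counter while the main copy flows on, and the named pipe hands the untrimmed reads to a second trimming pass without an intermediate file. A self-contained sketch of the same pattern, with purely illustrative file names:

#!/usr/bin/env bash
# Split one input stream two ways without writing an intermediate file:
# a process substitution counts lines while the main copy goes through
# a named pipe to a background consumer.
set -euo pipefail
fifo=$(mktemp -u).fifo
mkfifo "${fifo}"
# Background consumer reading from the fifo (stands in for the second
# trimming pass in the real script).
tr 'a-z' 'A-Z' < "${fifo}" > upper.txt &
pid=$!
# Producer: tee duplicates the stream; >(...) receives one copy.
printf 'one\ntwo\nthree\n' | tee >(wc -l > count.txt) > "${fifo}"
wait "${pid}"
rm -f "${fifo}"

In PRO-seq_trim_only.sh the roles are flipped (the adapter-trimming producer runs in the background while the fifo's consumer runs in the foreground), but the structure is the same: create the fifo, connect both ends, wait, then remove it.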