Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
bioinfo_utils
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
bioinfo_utils
Commits
cff1d2d4
Commit
cff1d2d4
authored
6 years ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Adding more biotypes, fixing various bug.
parent
01da51a2
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
PRO-seq/PRO-seq.snakefile
+55
-28
55 additions, 28 deletions
PRO-seq/PRO-seq.snakefile
with
55 additions
and
28 deletions
PRO-seq/PRO-seq.snakefile
+
55
−
28
View file @
cff1d2d4
...
@@ -59,11 +59,11 @@ import matplotlib.pyplot as plt
...
@@ -59,11 +59,11 @@ import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
import husl
import husl
from libdeseq import do_deseq2
,
from libdeseq import do_deseq2
from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import ensure_relative, cleanup_and_backup
, texscape
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, last_lines, save_plot, test_na_file
from libworkflows import strip_split,
file_len,
last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import feature_orientation2stranded
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_htseq_counts, sum_htseq_counts
...
@@ -140,8 +140,15 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
...
@@ -140,8 +140,15 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
COUNT_BIOTYPES = config["count_biotypes"]
COUNT_BIOTYPES = config["count_biotypes"]
ANNOT_BIOTYPES = config["annot_biotypes"]
ANNOT_BIOTYPES = config["annot_biotypes"]
#METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
#METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
METAGENE_BIOTYPES=["protein_coding"]
#METAGENE_BIOTYPES=["protein_coding"]
ID_LISTS = ["lfc_statuses", "germline_specific", "histone", "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"]
METAGENE_BIOTYPES=["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]
ID_LISTS = [
"lfc_statuses",
"germline_specific",
"histone",
"spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014",
"piRNA_dependent_prot_si_down4_WR_RT_top200", "piRNA_dependent_prot_si_down4",
"csr1_prot_si_supertargets_common"]
annot_dir = config["annot_dir"]
annot_dir = config["annot_dir"]
gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists"
gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists"
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
...
@@ -336,7 +343,7 @@ rule trim_and_dedup:
...
@@ -336,7 +343,7 @@ rule trim_and_dedup:
err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
run:
run:
shell_commands = """
shell_commands = """
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {
wildcards.
trimmer} {input} \\
{params.adapter} {params.trim5} {params.trim3} \\
{params.adapter} {params.trim5} {params.trim3} \\
{output.adapt} {output.noadapt} {log.trim} \\
{output.adapt} {output.noadapt} {log.trim} \\
{output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
{output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
...
@@ -345,6 +352,7 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
...
@@ -345,6 +352,7 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
shell(shell_commands)
shell(shell_commands)
# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
rule map_on_genome:
rule map_on_genome:
input:
input:
fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"),
fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"),
...
@@ -465,6 +473,7 @@ rule check_last_base:
...
@@ -465,6 +473,7 @@ rule check_last_base:
print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file)
print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file)
# TODO: use Python to make the plot
rule plot_last_base:
rule plot_last_base:
input:
input:
OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
...
@@ -629,6 +638,7 @@ rule do_PCA:
...
@@ -629,6 +638,7 @@ rule do_PCA:
OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"),
OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"),
message:
message:
"Summarizing counts for {wildcards.biotype}_{wildcards.orientation}"
"Summarizing counts for {wildcards.biotype}_{wildcards.orientation}"
threads: 12 # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]"
log:
log:
OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log")
OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log")
run:
run:
...
@@ -1032,7 +1042,12 @@ rule join_all_counts:
...
@@ -1032,7 +1042,12 @@ rule join_all_counts:
"""concat counts for all biotypes into all"""
"""concat counts for all biotypes into all"""
input:
input:
#counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
#counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES),
counts_tables = expand(
OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans",
"{{counter}}", "all_on_C_elegans",
"{biotype}_{{orientation}}_counts.txt"),
# We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}"
biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]),
output:
output:
counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"),
counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"),
run:
run:
...
@@ -1196,8 +1211,11 @@ rule make_MA_plot:
...
@@ -1196,8 +1211,11 @@ rule make_MA_plot:
group2colour=group2colour,
group2colour=group2colour,
lfc_range = params.lfc_range,
lfc_range = params.lfc_range,
fold_type = "log2FoldChange",
fold_type = "log2FoldChange",
title="MA-plot for %s, %s_%s" % (
title=texscape(
wildcards.contrast, wildcards.orientation, wildcards.biotype))
"MA-plot for %s, %s_%s" % (
wildcards.contrast,
wildcards.orientation,
wildcards.biotype)))
##################
##################
...
@@ -1522,6 +1540,13 @@ def meta_params(wildcards):
...
@@ -1522,6 +1540,13 @@ def meta_params(wildcards):
"-m %d" % META_SCALE,
"-m %d" % META_SCALE,
"--unscaled3prime %d" % UNSCALED_INSIDE,
"--unscaled3prime %d" % UNSCALED_INSIDE,
"-a %d" % META_MARGIN])
"-a %d" % META_MARGIN])
if biotype.startswith("protein_coding_"):
return " ".join([
"-b %d" % 0,
"--unscaled5prime %d" % UNSCALED_INSIDE,
"-m %d" % META_SCALE,
"--unscaled3prime %d" % UNSCALED_INSIDE,
"-a %d" % 0])
else:
else:
raise NotImplementedError("Metagene analyses for %s not implemented." % biotype)
raise NotImplementedError("Metagene analyses for %s not implemented." % biotype)
...
@@ -1543,25 +1568,27 @@ rule plot_meta_profile_mean:
...
@@ -1543,25 +1568,27 @@ rule plot_meta_profile_mean:
plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)),
plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)),
plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)),
plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)),
threads: 12 # to limit memory usage, actually
threads: 12 # to limit memory usage, actually
shell:
run:
"""
if file_len(input.bed):
tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX")
shell("""tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX")
computeMatrix scale-regions -S {input.bigwig} \\
computeMatrix scale-regions -S {input.bigwig} \\
-R {input.bed} \\
-R {input.bed} \\
{params.meta_params} \\
{params.meta_params} \\
-p 1 \\
-p 1 \\
-out ${{tmpdir}}/meta_profile.gz \\
-out ${{tmpdir}}/meta_profile.gz \\
--skipZeros \\
--skipZeros \\
1> {log.plot_TSS_log} \\
1> {log.plot_TSS_log} \\
2> {log.plot_TSS_err} \\
2> {log.plot_TSS_err} \\
|| error_exit "computeMatrix failed"
|| error_exit "computeMatrix failed"
plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\
plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\
1>> {log.plot_TSS_log} \\
1>> {log.plot_TSS_log} \\
2>> {log.plot_TSS_err} \\
2>> {log.plot_TSS_err} \\
|| error_exit "plotProfile failed"
|| error_exit "plotProfile failed"
rm -rf ${{tmpdir}}
rm -rf ${{tmpdir}}
""" % META_MIN_LEN
""" % META_MIN_LEN)
else:
warnings.warn("No regions selected in {input.bed}. Generating empty figure.\n")
shell("""> {output}""")
onsuccess:
onsuccess:
print("PRO-seq analysis finished.")
print("PRO-seq analysis finished.")
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment