Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
bioinfo_utils
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
bioinfo_utils
Commits
485dd97f
Commit
485dd97f
authored
6 years ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Folds of RPM for counts of remapped reads.
parent
e69418a4
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
small_RNA-seq/small_RNA-seq.snakefile
+105
-15
105 additions, 15 deletions
small_RNA-seq/small_RNA-seq.snakefile
with
105 additions
and
15 deletions
small_RNA-seq/small_RNA-seq.snakefile
+
105
−
15
View file @
485dd97f
...
@@ -359,7 +359,7 @@ mapping_dir = OPJ(output_dir, aligner, f"mapped_{genome}")
...
@@ -359,7 +359,7 @@ mapping_dir = OPJ(output_dir, aligner, f"mapped_{genome}")
reads_dir = OPJ(mapping_dir, "reads")
reads_dir = OPJ(mapping_dir, "reads")
annot_counts_dir = OPJ(mapping_dir, "annotation")
annot_counts_dir = OPJ(mapping_dir, "annotation")
feature_counts_dir = OPJ(mapping_dir, "feature_count")
feature_counts_dir = OPJ(mapping_dir, "feature_count")
READ_TYPES_FOR_COUNTING = [
size_selected,
"all_sisiuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"]))
READ_TYPES_FOR_COUNTING = ["all_sisiuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"]))
# Actually, we don't need to enforce this: snakemake will map whatever read type is needed for counting
# Actually, we don't need to enforce this: snakemake will map whatever read type is needed for counting
assert set(READ_TYPES_FOR_COUNTING) < set(READ_TYPES_FOR_MAPPING), "Cannot count reads that were not mapped."
assert set(READ_TYPES_FOR_COUNTING) < set(READ_TYPES_FOR_MAPPING), "Cannot count reads that were not mapped."
# Reads remapped and counted using featureCounts
# Reads remapped and counted using featureCounts
...
@@ -480,7 +480,8 @@ wildcard_constraints:
...
@@ -480,7 +480,8 @@ wildcard_constraints:
biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES)),
biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES)),
id_list="|".join(GENE_LISTS),
id_list="|".join(GENE_LISTS),
type_set="|".join(["all", "protein_coding", "protein_coding_TE"]),
type_set="|".join(["all", "protein_coding", "protein_coding_TE"]),
small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu|" + "|".join(REMAPPED_COUNTED),
small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu",
# read_type = "|".join(REMAPPED_COUNTED),
# read_type="|".join([
# read_type="|".join([
# "raw", "trimmed", "deduped", f"{size_selected}",
# "raw", "trimmed", "deduped", f"{size_selected}",
# "nomap", "nomap_siRNA", "mapped",
# "nomap", "nomap_siRNA", "mapped",
...
@@ -607,7 +608,22 @@ read_graphs = [
...
@@ -607,7 +608,22 @@ read_graphs = [
ip_fold_heatmaps = []
ip_fold_heatmaps = []
de_fold_heatmaps = []
de_fold_heatmaps = []
ip_fold_boxplots = []
ip_fold_boxplots = []
# Temporary, until used for boxplots:
remapped_folds = []
if contrasts_dict["ip"]:
if contrasts_dict["ip"]:
remapped_folds = expand(
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "{counted_type}_mean_log2_RPM_fold.txt"),
counted_type=REMAPPED_COUNTED),
# snakemake -n OK
# expand(
# OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all",
# "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome),
# read_type=READ_TYPES_FOR_COUNTING, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
# snakemake -n OK
# expand(
# OPJ(mapping_dir, f"RPM_folds_{size_selected}", "{contrast}",
# "{contrast}_{small_type}_on_%s_{biotype}_{orientation}_transcript_RPM_folds.txt" % genome),
# contrast=IP_CONTRASTS, small_type=READ_TYPES_FOR_COUNTING, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
ip_fold_heatmaps = expand(
ip_fold_heatmaps = expand(
OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"),
OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"),
small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold"], fig_format=FIG_FORMATS)
small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold"], fig_format=FIG_FORMATS)
...
@@ -714,12 +730,9 @@ rule all:
...
@@ -714,12 +730,9 @@ rule all:
expand(OPJ(mapping_dir, f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
expand(OPJ(mapping_dir, f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
#expand(OPJ(mapping_dir, "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES),
#expand(OPJ(mapping_dir, "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES),
exploratory_graphs,
exploratory_graphs,
remapped_folds,
fold_boxplots,
fold_boxplots,
#expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS),
#expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS),
# snakemake -n OK
expand(OPJ(
feature_counts_dir,
"all_{small_type}_counts.txt"), small_type=REMAPPED_COUNTED),
#expand(OPJ(
#expand(OPJ(
# feature_counts_dir,
# feature_counts_dir,
# "all_{read_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), read_type=READ_TYPES_FOR_COUNTING, biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS),
# "all_{read_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), read_type=READ_TYPES_FOR_COUNTING, biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS),
...
@@ -727,11 +740,6 @@ rule all:
...
@@ -727,11 +740,6 @@ rule all:
# feature_counts_dir,
# feature_counts_dir,
# "all_{small_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome),
# "all_{small_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome),
# small_type=READ_TYPES_FOR_MAPPING, biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
# small_type=READ_TYPES_FOR_MAPPING, biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
# snakemake -n not OK (Missing input files for rule gather_small_RNA_counts)
#expand(
# OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
# small_type=REMAPPED_COUNTED),
##
expand(
expand(
OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]),
small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]),
...
@@ -1075,8 +1083,8 @@ rule gather_feature_counts:
...
@@ -1075,8 +1083,8 @@ rule gather_feature_counts:
input:
input:
counts_tables = expand(
counts_tables = expand(
OPJ(feature_counts_dir,
OPJ(feature_counts_dir,
"{lib}_{rep
}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt" % genome),
"{cond_name
}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt" % genome),
filtered_product, lib=LIBS, rep=REP
S),
cond_name=COND_NAME
S),
output:
output:
counts_table = OPJ(
counts_table = OPJ(
feature_counts_dir,
feature_counts_dir,
...
@@ -1334,8 +1342,6 @@ rule join_si_reads:
...
@@ -1334,8 +1342,6 @@ rule join_si_reads:
# df = pd.pd.DataFrame(speeds).set_index(0)
# df = pd.pd.DataFrame(speeds).set_index(0)
# Reminder:
# annot_counts_dir = OPJ(mapping_dir, "annotation")
rule gather_small_RNA_counts:
rule gather_small_RNA_counts:
"""For a given small_type, gather counts from all libraries in one table."""
"""For a given small_type, gather counts from all libraries in one table."""
input:
input:
...
@@ -1803,6 +1809,8 @@ rule gather_RPM_folds:
...
@@ -1803,6 +1809,8 @@ rule gather_RPM_folds:
output:
output:
all_folds = OPJ(
all_folds = OPJ(
mapping_dir, f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"),
mapping_dir, f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"),
wildcard_constraints:
small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu"
log:
log:
log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"),
log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"),
run:
run:
...
@@ -1814,6 +1822,88 @@ rule gather_RPM_folds:
...
@@ -1814,6 +1822,88 @@ rule gather_RPM_folds:
output.all_folds, sep="\t")
output.all_folds, sep="\t")
# TODO: replace counted_type with all subwildcards. Why does it work with cond_name?
# Wildcards in input files cannot be determined from output files:
# 'read_type'
rule compute_remapped_RPM_folds:
input:
counts_table = rules.gather_feature_counts.output.counts_table,
#exon_lengths = rules.compute_genes_exon_lengths.output.exon_lengths,
summary_table = rules.gather_read_counts_summaries.output.summary_table,
#tags_table = rules.associate_small_type.output.tags_table,
output:
fold_results = OPJ(
mapping_dir, f"RPM_folds_{size_selected}",
"{contrast}", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_RPM_folds.txt" % genome),
log:
log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome),
run:
with open(log.log, "w") as logfile:
(cond, ref) = CONTRAST2PAIR[wildcards.contrast]
logfile.write(f"cond: {cond}, ref: {ref}\n")
column_list = expand("{lib}_{rep}", lib=[ref, cond], rep=REPS)
logfile.write(f"Reading columns {column_list} from {input.counts_table}\n")
counts_data = pd.read_table(
input.counts_table,
usecols=["gene", *column_list],
index_col="gene")
logfile.write(f"Found {len(counts_data.index)} genes\n")
#logfile.write(f"Reading exon lengths from {exon_lengths_file}\n")
#exon_lengths = pd.read_table(
# input.exon_lengths_file,
# index_col="gene")
#logfile.write(f"Found {len(exon_lengths.index)} genes\n")
#logfile.write("Determining set of common genes\n")
#common = counts_data.index.intersection(exon_lengths.index)
#logfile.write(f"Found {len(common)} common genes\n")
#logfile.write("Computing counts per kilobase\n")
#RPK = 1000 * counts_data.loc[common].div(exon_lengths.loc[common]["union_exon_len"], axis="index")
logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n")
norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"][column_list]
logfile.write(str(norm))
logfile.write("Computing counts per million non-structural mappers\n")
RPM = 1000000 * counts_data / norm
logfile.write("Computing RPM log2 fold changes\n")
# Add 0.25 as RPM "pseudo-count" (edgeR does this ? https://www.biostars.org/p/102568/#102604)
lfc = np.log2(pd.DataFrame(
{f"log2({cond}_{rep}/{ref}_{rep})" : (RPM[f"{cond}_{rep}"] + 0.25) / (RPM[f"{ref}_{rep}"] + 0.25) for rep in REPS}))
# Compute the mean
lfc = lfc.assign(mean_log2_RPM_fold=lfc.mean(axis=1))
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
#logfile.write("Adding small read type info\n")
##pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
#pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
pd.concat((RPM, lfc), axis=1).to_csv(output.fold_results, sep="\t")
rule gather_remapped_RPM_folds:
"""Gathers RPM folds across contrasts."""
input:
fold_results = expand(OPJ(
mapping_dir, f"RPM_folds_{size_selected}",
"{contrast}", "{contrast}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_transcript_RPM_folds.txt" % genome), contrast=IP_CONTRASTS),
output:
all_folds = OPJ(
mapping_dir, f"RPM_folds_{size_selected}", "all", "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome),
log:
log = OPJ(log_dir, "gather_RPM_folds", "{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome),
run:
pd.DataFrame({contrast : pd.read_table(
# OPJ(mapping_dir, f"RPM_folds_{size_selected}",
# f"{contrast}", f"{contrast}_{wildcards.counted_type}_RPM_folds.txt"),
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.read_type}_on_{genome}_{wildcards.biotype}_{wildcards.orientation}_transcript_RPM_folds.txt"),
index_col=["gene", "cosmid", "name"],
usecols=["gene", "cosmid", "name", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
# TODO add remapped_RPM_folds
def source_gathered_folds(wildcards):
def source_gathered_folds(wildcards):
if wildcards.fold_type == "mean_log2_RPM_fold":
if wildcards.fold_type == "mean_log2_RPM_fold":
return rules.gather_RPM_folds.output.all_folds
return rules.gather_RPM_folds.output.all_folds
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment