Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
bioinfo_utils
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
bioinfo_utils
Commits
bd7d9415
Commit
bd7d9415
authored
5 years ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Fixing all_si_22G crashes.
parent
313d0ce7
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
small_RNA-seq/small_RNA-seq.snakefile
+102
-59
102 additions, 59 deletions
small_RNA-seq/small_RNA-seq.snakefile
with
102 additions
and
59 deletions
small_RNA-seq/small_RNA-seq.snakefile
+
102
−
59
View file @
bd7d9415
...
...
@@ -1705,57 +1705,66 @@ rule gather_small_RNA_counts:
counts_table = OPJ(
annot_counts_dir,
"all_{mapped_type}_on_%s" % genome, "{small_type}_counts.txt"),
log:
log = OPJ(log_dir, "gather_small_RNA_counts", "{mapped_type}_{small_type}.log"),
run:
# Gathering the counts data
############################
counts_files = (OPJ(
annot_counts_dir,
f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
if wildcards.small_type == "pi":
drop = ["piRNA"]
elif wildcards.small_type == "mi":
drop = ["miRNA"]
elif wildcards.small_type in {
*{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
elif wildcards.small_type in {
*{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
elif wildcards.small_type in {
*{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["pseudogene"]
elif wildcards.small_type in {
*{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["satellites_rmsk"]
elif wildcards.small_type in {
*{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["simple_repeats_rmsk"]
elif wildcards.small_type in {"all_si", "all_siu"}:
drop = ["all_siRNA"]
else:
raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
counts_data = pd.concat(
(pd.read_table(
counts_file,
index_col=0,
na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.small_type in RMSK_SISIU_TYPES:
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
with open(log.log, "w") as logfile:
# Gathering the counts data
############################
counts_files = (OPJ(
annot_counts_dir,
f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
if wildcards.small_type == "pi":
drop = ["piRNA"]
elif wildcards.small_type == "mi":
drop = ["miRNA"]
elif wildcards.small_type in {
*{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
elif wildcards.small_type in {
*{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
elif wildcards.small_type in {
*{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["pseudogene"]
elif wildcards.small_type in {
*{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["satellites_rmsk"]
elif wildcards.small_type in {
*{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["simple_repeats_rmsk"]
elif wildcards.small_type in {
*{f"all_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"all_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = [f"all_si_{suffix}RNA" for suffix in SI_SUFFIXES]
elif wildcards.small_type in {"all_si", "all_siu"}:
drop = ["all_siRNA"]
else:
# Unknown read type: all_si_22G
raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
logfile.write(f"Will drop {drop} for {wildcards.small_type}.\n")
counts_data = pd.concat(
(pd.read_table(
counts_file,
index_col=0,
na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.small_type in RMSK_SISIU_TYPES:
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
def counts_to_join(wildcards):
...
...
@@ -1960,11 +1969,15 @@ rule associate_small_type:
tags_table.to_csv(output.tags_table, sep="\t")
def add_tags_column(data, tags_table, tag_name):
# TODO: check what this does with index column name (can it lose "gene"?)
def add_tags_column(data, tags_table, tag_name, logfile=None):
    """Append a *tag_name* column to *data*, read from *tags_table*.

    *tags_table* is a path (or buffer) to a tab-separated table whose
    first column holds row names matching *data*'s index and whose next
    column holds the tag to associate with each row.  Rows of *data*
    with no matching tag are dropped (inner join).  If *logfile* is
    given, index-name diagnostics are written to it.
    """
    tags = pd.read_table(tags_table, index_col=0)
    # NOTE(review): "inner" silently drops rows absent from either
    # table — confirm this is the intended behaviour.
    df = pd.concat((data, tags), join="inner", axis=1)
    if logfile is not None:
        logfile.write(f"Index in data is: {data.index.name}\n")
        logfile.write(f"Index in df is: {df.index.name}\n")
    df.columns = (*data.columns, tag_name)
    return df
...
...
@@ -2180,6 +2193,9 @@ rule compute_RPM_folds:
input.counts_table,
usecols=["gene", *column_list],
index_col="gene")
assert counts_data.index.name == "gene", f"Wrong index: {counts_data.index.name}"
# logfile.write(f"Columns in counts_data are: {counts_data.columns}")
# logfile.write(f"Index in counts_data is: {counts_data.index.name}")
logfile.write(f"Found {len(counts_data.index)} genes\n")
#logfile.write(f"Reading exon lengths from {exon_lengths_file}\n")
#exon_lengths = pd.read_table(
...
...
@@ -2196,6 +2212,9 @@ rule compute_RPM_folds:
logfile.write(str(norm))
logfile.write("Computing counts per million non-structural mappers\n")
RPM = 1000000 * counts_data / norm
# logfile.write(f"Columns in RPM are: {RPM.columns}\n")
# logfile.write(f"Index in RPM is: {RPM.index.name}\n")
assert RPM.index.name == "gene", f"Wrong index: {RPM.index.name}"
logfile.write("Computing RPM log2 fold changes\n")
# Maybe not a very sensible way to compute the mean folds: it pairs replicates, but the pairing is arbitrary
# TODO: Should we compute the fold of the means instead?
...
...
@@ -2212,9 +2231,28 @@ rule compute_RPM_folds:
lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
logfile.write("Adding small read type info\n")
# logfile.write(f"Columns in lfc are: {lfc.columns}\n")
# logfile.write(f"Index in lfc is: {lfc.index.name}\n")
assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
logfile.write(f"Adding small read type info from {input.tags_table}\n")
#pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
lfc_idx = lfc.index
RPM_idx = RPM.index
tags_table_idx = tags_table.index
lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
# with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
with_tags = pd.concat((RPM, tags_table), axis=1)
logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
# pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
logfile.write(f"Then writing to {output.fold_results}\n")
with_tags.to_csv(output.fold_results, sep="\t")
rule gather_RPM_folds:
...
...
@@ -2233,12 +2271,17 @@ rule gather_RPM_folds:
benchmark:
OPJ(log_dir, "gather_RPM_folds", "{small_type}_benchmark.txt"),
run:
pd.DataFrame({contrast : pd.read_table(
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
with open(log.log, "w") as logfile:
actual_input = [
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt") for contrast in IP_CONTRASTS]
logfile.write(f"Gathering RPM folds from:\n{actual_input}\nShould be from:\n{input.fold_results}\n")
pd.DataFrame({contrast : pd.read_table(
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
# TODO: update consumers with mapping_type
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment