Commit 5ff10a86 authored by Blaise Li's avatar Blaise Li
Browse files

More checks before DESeq2, mean TPM.

parent ffe9186d
......@@ -1067,66 +1067,73 @@ rule differential_expression:
#################
formula = Formula("~ lib")
(cond, ref) = CONTRAST2PAIR[wildcards.contrast]
contrast = StrVector(["lib", cond, ref])
try:
res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
except RRuntimeError as e:
if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS):
warnings.warn(
"Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
"Reference data is all zero.\nSkipping %s_%s_%s" % (
wildcards.contrast, wildcards.orientation, wildcards.biotype))
for outfile in output:
shell(f"echo 'NA' > {outfile}")
else:
# Determining fold-change category
###################################
set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
res = res.assign(status=res.apply(set_de_status, axis=1))
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
# Just to see if column_converter works also with named column, and not just index:
# with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
# res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
##########################################
# res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
# Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
counts_and_res = counts_data
for normalizer in SIZE_FACTORS:
if normalizer == "median_ratio_to_pseudo_ref":
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
## add pseudo-count to compute the geometric mean, then remove it
#pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
#def median_ratio_to_pseudo_ref(col):
# return (col / pseudo_ref).median()
#size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
#raise NotImplementedError(f"{normalizer} normalization not implemented")
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
counts_and_res = pd.concat((counts_and_res, res), axis=1)
counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
# Saving lists of genes gaining or loosing siRNAs
up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
with open(output.up_genes, "w") as up_file:
if up_genes:
up_file.write("%s\n" % "\n".join(up_genes))
else:
up_file.truncate(0)
with open(output.down_genes, "w") as down_file:
if down_genes:
down_file.write("%s\n" % "\n".join(down_genes))
else:
down_file.truncate(0)
contrast = StrVector(["lib", cond, ref])
try:
res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
except RRuntimeError as e:
warnings.warn(
"Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
for outfile in output:
shell(f"echo 'NA' > {outfile}")
else:
# Determining fold-change category
###################################
set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
res = res.assign(status=res.apply(set_de_status, axis=1))
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
# Just to see if column_converter works also with named column, and not just index:
# with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
# res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
##########################################
# res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
# Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
counts_and_res = counts_data
for normalizer in SIZE_FACTORS:
if normalizer == "median_ratio_to_pseudo_ref":
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
## add pseudo-count to compute the geometric mean, then remove it
#pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
#def median_ratio_to_pseudo_ref(col):
# return (col / pseudo_ref).median()
#size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
#raise NotImplementedError(f"{normalizer} normalization not implemented")
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
counts_and_res = pd.concat((counts_and_res, res), axis=1)
counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
# Saving lists of genes gaining or loosing siRNAs
up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
with open(output.up_genes, "w") as up_file:
if up_genes:
up_file.write("%s\n" % "\n".join(up_genes))
else:
up_file.truncate(0)
with open(output.down_genes, "w") as down_file:
if down_genes:
down_file.write("%s\n" % "\n".join(down_genes))
else:
down_file.truncate(0)
rule make_lfc_distrib_plot:
......
......@@ -270,6 +270,10 @@ def specific_input(aligner):
return files
counts_files = [
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
"{lib}_mean_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
lib=LIBS, counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"),
......@@ -964,6 +968,23 @@ rule compute_TPM:
tpm.to_csv(output.tpm_file, sep="\t")
# Useful to compute translation efficiency in the Ribo-seq pipeline
rule compute_mean_TPM:
input:
all_tmp_file = rules.compute_TPM.output.tpm_file
output:
tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
"{lib}_mean_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
run:
# TODO
tpm = pd.read_table(
input.all_tmp_file, index_col="gene",
usecols=["gene", *[f"{wildcards.lib}_{rep}" for rep in REPS]])
tpm_mean = tpm.mean(axis=1)
tpm_mean.columns = [wildcards.lib]
tpm_mean.to_csv(output.tpm_file, sep="\t")
rule compute_RPM:
input:
counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
......@@ -1525,71 +1546,78 @@ rule differential_expression:
#################
formula = Formula("~ lib")
(cond, ref) = CONTRAST2PAIR[wildcards.contrast]
contrast = StrVector(["lib", cond, ref])
try:
res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
except RRuntimeError as e:
if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS):
warnings.warn(
"Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
"Reference data is all zero.\nSkipping %s_%s_%s" % (
wildcards.contrast, wildcards.orientation, wildcards.biotype))
for outfile in output:
shell(f"echo 'NA' > {outfile}")
else:
# Determining fold-change category
###################################
set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
res = res.assign(status=res.apply(set_de_status, axis=1))
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
# Just to see if column_converter works also with named column, and not just index:
# with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
# res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
##########################################
# res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
# Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
counts_and_res = counts_data
for normalizer in SIZE_FACTORS:
if normalizer == "median_ratio_to_pseudo_ref":
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
## add pseudo-count to compute the geometric mean, then remove it
#pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
#def median_ratio_to_pseudo_ref(col):
# return (col / pseudo_ref).median()
#size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
#raise NotImplementedError(f"{normalizer} normalization not implemented")
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
counts_and_res = pd.concat((counts_and_res, res), axis=1)
counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
# Saving lists of genes gaining or loosing siRNAs
# diff_expressed = res.query("padj < 0.05")
# up_genes = list(diff_expressed.query("log2FoldChange >= 1").index)
# down_genes = list(diff_expressed.query("log2FoldChange <= -1").index)
up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
#up_genes = list(counts_and_res.query("status == 'up'").index)
#down_genes = list(counts_and_res.query("status == 'down'").index)
with open(output.up_genes, "w") as up_file:
if up_genes:
up_file.write("%s\n" % "\n".join(up_genes))
else:
up_file.truncate(0)
with open(output.down_genes, "w") as down_file:
if down_genes:
down_file.write("%s\n" % "\n".join(down_genes))
else:
down_file.truncate(0)
contrast = StrVector(["lib", cond, ref])
try:
res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
except RRuntimeError as e:
warnings.warn(
"Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
for outfile in output:
shell(f"echo 'NA' > {outfile}")
else:
# Determining fold-change category
###################################
set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
res = res.assign(status=res.apply(set_de_status, axis=1))
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
# Just to see if column_converter works also with named column, and not just index:
# with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
# res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
##########################################
# res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
# Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
counts_and_res = counts_data
for normalizer in SIZE_FACTORS:
if normalizer == "median_ratio_to_pseudo_ref":
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
## add pseudo-count to compute the geometric mean, then remove it
#pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
#def median_ratio_to_pseudo_ref(col):
# return (col / pseudo_ref).median()
#size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
#raise NotImplementedError(f"{normalizer} normalization not implemented")
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
#counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
counts_and_res = pd.concat((counts_and_res, res), axis=1)
counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
# Saving lists of genes gaining or loosing siRNAs
# diff_expressed = res.query("padj < 0.05")
# up_genes = list(diff_expressed.query("log2FoldChange >= 1").index)
# down_genes = list(diff_expressed.query("log2FoldChange <= -1").index)
up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
#up_genes = list(counts_and_res.query("status == 'up'").index)
#down_genes = list(counts_and_res.query("status == 'down'").index)
with open(output.up_genes, "w") as up_file:
if up_genes:
up_file.write("%s\n" % "\n".join(up_genes))
else:
up_file.truncate(0)
with open(output.down_genes, "w") as down_file:
if down_genes:
down_file.write("%s\n" % "\n".join(down_genes))
else:
down_file.truncate(0)
rule make_lfc_distrib_plot:
......
......@@ -3078,74 +3078,69 @@ rule small_RNA_differential_expression:
#################
formula = Formula("~ lib")
(cond, ref) = CONTRAST2PAIR[wildcards.contrast]
contrast = StrVector(["lib", cond, ref])
#if wildcards.contrast == "%s_vs_%s" % (MUT, REF):
# contrast = StrVector(["lib", MUT, REF])
#elif wildcards.contrast in ["%s_vs_%s" % cond_pair for cond_pair in combinations(reversed(TREATS), 2)]:
# (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
# contrast = StrVector(["treat", cond, ref])
#elif wildcards.contrast in ["%s_%s_vs_%s_%s" % (lib2, treat2, lib1, treat1) for (
# (lib1, treat1), (lib2, treat2)) in combinations(product(LIBS, TREATS), 2)]:
# formula = Formula("~ lib_treat")
# (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
# contrast = StrVector(["lib_treat", cond, ref])
#else:
# raise NotImplementedError("Unknown contrast: %s" % wildcards.contrast)
try:
res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
except RRuntimeError as e:
warn(
"Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
str(e), wildcards.contrast, wildcards.orientation, wildcards.small_type))
if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS):
warnings.warn(
"Reference data is all zero.\nSkipping %s_%s_%s" % (
wildcards.contrast, wildcards.orientation, wildcards.small_type))
for outfile in output:
shell(f"echo 'NA' > {outfile}")
else:
# Determining fold-change category
###################################
set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
res = res.assign(status=res.apply(set_de_status, axis=1))
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
##########################################
# res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
# Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
counts_and_res = counts_data
for normalizer in DE_SIZE_FACTORS:
if normalizer == "median_ratio_to_pseudo_ref":
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
## add pseudo-count to compute the geometric mean, then remove it
#pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
#def median_ratio_to_pseudo_ref(col):
# return (col / pseudo_ref).median()
#size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
counts_and_res = add_tags_column(
pd.concat((counts_and_res, res), axis=1),
input.tags_table, "small_type")
counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
# Saving lists of genes gaining or loosing siRNAs
up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
with open(output.up_genes, "w") as up_file:
if up_genes:
up_file.write("%s\n" % "\n".join(up_genes))
else:
up_file.truncate(0)
with open(output.down_genes, "w") as down_file:
if down_genes:
down_file.write("%s\n" % "\n".join(down_genes))
else:
down_file.truncate(0)
contrast = StrVector(["lib", cond, ref])
try:
res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
except RRuntimeError as e:
warn(
"Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
str(e), wildcards.contrast, wildcards.orientation, wildcards.small_type))
for outfile in output:
shell(f"echo 'NA' > {outfile}")
else:
# Determining fold-change category
###################################
set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
res = res.assign(status=res.apply(set_de_status, axis=1))
# Converting gene IDs
######################
with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
##########################################
# res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
# Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
counts_and_res = counts_data
for normalizer in DE_SIZE_FACTORS:
if normalizer == "median_ratio_to_pseudo_ref":
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
## add pseudo-count to compute the geometric mean, then remove it
#pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
#def median_ratio_to_pseudo_ref(col):
# return (col / pseudo_ref).median()
#size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
counts_and_res = add_tags_column(
pd.concat((counts_and_res, res), axis=1),
input.tags_table, "small_type")
counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
# Saving lists of genes gaining or loosing siRNAs
up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
with open(output.up_genes, "w") as up_file:
if up_genes:
up_file.write("%s\n" % "\n".join(up_genes))
else:
up_file.truncate(0)
with open(output.down_genes, "w") as down_file:
if down_genes:
down_file.write("%s\n" % "\n".join(down_genes))
else:
down_file.truncate(0)
rule gather_DE_folds:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment