diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile
index 9491fbb925a8974746e29165420d5d07230568dc..8b33ab64e3f61894773f58d089ce0b817136ec21 100644
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -70,6 +70,7 @@ from libworkflows import read_intersect_counts, sum_intersect_counts
from libworkflows import read_feature_counts, sum_feature_counts
from smincludes import rules as irules
+alignment_settings = {"bowtie2": "", "hisat2": "", "crac": "-k 20 --stranded --use-x-in-cigar"}
# Possible feature ID conversions
ID_TYPES = ["name", "cosmid"]
@@ -86,7 +87,6 @@ LFC_RANGE = {
"RNA_transposons_rmsk_families" : (-10, 10),
"simple_repeats_rmsk_families" : (-10, 10),
"alltypes" : (-10, 10)}
-DE_BIOTYPES = list(LFC_RANGE.keys())
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
@@ -100,12 +100,22 @@ DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
# "RNA-seq.config.json"
-
-spikein_microliters = config["spikein_microliters"]
-spikein_dilution_factor = config["spikein_dilution_factor"]
+spikein_dict = config["spikein_dict"]
+spikein_microliters = spikein_dict["microliters"]
+spikein_dilution_factor = spikein_dict["dilution_factor"]
# To get the expected number of attomoles of a given spike-in,
# multiply this by the value found in column 4 of Genomes/spike-ins/ERCC92_concentrations.txt
spikein_microliter_equivalent = spikein_microliters / spikein_dilution_factor
+if spikein_microliter_equivalent:
+ spikein_concentrations = pd.read_table(
+ spikein_dict["concentrations"],
+ index_col="ERCC ID",
+ usecols=["ERCC ID", "concentration in Mix %s (attomoles/ul)" % spikein_dict["mix"]])
+ from scipy.constants import Avogadro
+ # attomole: 10^{-18} mole
+ molecules_per_attomole = Avogadro / 10**18
+ spikein_expected_counts = spikein_concentrations * spikein_microliter_equivalent * molecules_per_attomole
+ spikein_expected_counts.columns = ["expected_counts"]
# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
@@ -132,6 +142,7 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
BIOTYPES = config["biotypes"]
+DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES]
if "spike_ins" in BIOTYPES:
PAIR_SCATTER_BIOTYPES = ["spike_ins"]
else:
@@ -164,10 +175,15 @@ transcriptomes_dir = OPJ(output_dir, "merged_transcriptomes")
log_dir = OPJ(output_dir, f"logs_{genome}")
data_dir = OPJ(output_dir, "data")
-SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref", "spike_ins"]
-assert set(SIZE_FACTORS).issubset(set(BIOTYPES) | {"median_ratio_to_pseudo_ref"})
#NORM_TYPES = config["norm_types"]
-NORM_TYPES = ["spike_ins", "protein_coding", "median_ratio_to_pseudo_ref"]
+if spikein_microliter_equivalent:
+ SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref", "spike_ins"]
+ NORM_TYPES = ["spike_ins", "protein_coding", "median_ratio_to_pseudo_ref"]
+else:
+ SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref"]
+ NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"]
+ assert "spike_ins" not in BIOTYPES
+assert set(SIZE_FACTORS).issubset(set(BIOTYPES) | {"median_ratio_to_pseudo_ref"})
assert set(NORM_TYPES).issubset(set(SIZE_FACTORS))
@@ -193,15 +209,21 @@ def specific_input(aligner):
OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),]
elif aligner == "bowtie2":
files = []
+ elif aligner == "crac":
+ files = []
else:
raise NotImplementedError("%s is not yet handeled." % aligner)
return files
counts_files = [
+ expand(
+ OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
+ f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"),
+ counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
f"all_on_{genome}", "{biotype}_{orientation}_RPK.txt"),
- counter=COUNTERS, biotype=BIOTYPES, orientation=ORIENTATIONS),
+ counter=COUNTERS, biotype=BIOTYPES + ["alltypes"], orientation=ORIENTATIONS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
@@ -244,17 +266,31 @@ bigwig_files = [
OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean",
f"{{lib}}_mean_on_{genome}_by_{{norm_type}}_{{orientation}}.bw"),
lib=LIBS, rep=REPS, norm_type=NORM_TYPES, orientation=ORIENTATIONS),]
+if spikein_microliter_equivalent:
+ spikein_files = [
+ expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
+ f"all_on_{genome}", "{lib}_{rep}_spike_ins_{orientation}_TPM_response.pdf"),
+ counter=COUNTERS, orientation=ORIENTATIONS, lib=LIBS, rep=REPS),]
+else:
+ spikein_files = []
rule all:
input:
specific_input(aligner),
+ expand(
+ OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
+ lib=LIBS, rep=REPS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"),
lib=LIBS, rep=REPS),
bigwig_files,
+ spikein_files,
+ expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
+ f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
+ counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS, quantif_type=["TPM"]),
expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
- f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_scatter_pairs.pdf"),
- counter=COUNTERS, biotype=PAIR_SCATTER_BIOTYPES, orientation=ORIENTATIONS),
+ f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
+ counter=COUNTERS, biotype=PAIR_SCATTER_BIOTYPES, orientation=ORIENTATIONS, quantif_type=["counts", "RPK"]),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
"summaries", f"all_on_{genome}_{{orientation}}_{{biotype}}_norm_correlations.pdf"),
@@ -271,23 +307,23 @@ include: ensure_relative(irules["link_raw_data"], workflow.basedir)
#include: "../snakemake_wrappers/includes/link_raw_data.rules"
-def mapping_command(aligner):
- """This function returns the shell commands to run given the *aligner*."""
- if aligner == "hisat2":
- shell_commands = """
-cmd="hisat2 -p {threads} --dta --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
-echo ${{cmd}} 1> {log.log}
-eval ${{cmd}} 1>> {log.log} 2> {log.err}
-""" % genome_db
- elif aligner == "bowtie2":
- shell_commands = """
-cmd="bowtie2 -p {threads} --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
-echo ${{cmd}} 1> {log.log}
-eval ${{cmd}} 1>> {log.log} 2> {log.err}
-""" % genome_db
- else:
- raise NotImplementedError("%s is not yet handled." % aligner)
- return shell_commands
+# def mapping_command(aligner):
+# """This function returns the shell commands to run given the *aligner*."""
+# if aligner == "hisat2":
+# shell_commands = """
+# cmd="hisat2 -p {threads} --dta --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
+# echo ${{cmd}} 1> {log.log}
+# eval ${{cmd}} 1>> {log.log} 2> {log.err}
+# """ % genome_db
+# elif aligner == "bowtie2":
+# shell_commands = """
+# cmd="bowtie2 -p {threads} --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
+# echo ${{cmd}} 1> {log.log}
+# eval ${{cmd}} 1>> {log.log} 2> {log.err}
+# """ % genome_db
+# else:
+# raise NotImplementedError("%s is not yet handled." % aligner)
+# return shell_commands
rule map_on_genome:
@@ -297,6 +333,10 @@ rule map_on_genome:
# temp because it uses a lot of space
sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}.sam")),
nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", f"{{lib}}_{{rep}}_unmapped_on_{genome}.fastq.gz"),
+ params:
+ aligner = aligner,
+ index = genome_db,
+ settings = alignment_settings[aligner],
message:
f"Mapping {{wildcards.lib}}_{{wildcards.rep}} on {genome}."
log:
@@ -306,8 +346,10 @@ rule map_on_genome:
io=5
threads:
4
- shell:
- mapping_command(aligner)
+ #shell:
+ # mapping_command(aligner)
+ wrapper:
+ "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"
rule sam2indexedbam:
@@ -329,9 +371,31 @@ rule sam2indexedbam:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"
+rule compute_mapping_stats:
+ input:
+ sorted_bam = rules.sam2indexedbam.output.sorted_bam,
+ #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
+ output:
+ stats = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
+ shell:
+ """samtools stats {input.sorted_bam} > {output.stats}"""
+
+
+rule get_nb_mapped:
+ """Extracts the number of mapped reads from samtools stats results."""
+ input:
+ stats = rules.compute_mapping_stats.output.stats,
+ #stats = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
+ output:
+ nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_nb_mapped.txt"),
+ shell:
+ """samstats2mapped {input.stats} {output.nb_mapped}"""
+
+
rule compute_coverage:
input:
- sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
+ sorted_bam = rules.sam2indexedbam.output.sorted_bam,
+ #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
output:
coverage = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"),
params:
@@ -715,19 +779,145 @@ rule compute_RPK:
rpk.to_csv(output.rpk_file, sep="\t")
+# Compute TPM using total number of mappers divided by genome length (not sum of RPK across biotypes: some mappers may not be counted)
+# No, doesn't work: mappers / genome length not really comparable
+# Needs to be done on all_types
+rule compute_TPM:
+ """For a given biotype, compute the corresponding TPM value (reads per kilobase per million mappers)."""
+ input:
+ rpk_file = rules.compute_RPK.output.rpk_file
+ output:
+ tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
+ f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"),
+ # The sum must be done over all counted features
+ wildcard_constraints:
+ biotype = "alltypes"
+ run:
+ rpk = pd.read_table(input.rpk_file, index_col="gene")
+ tpm = 1000000 * rpk / rpk.sum()
+ tpm.to_csv(output.tpm_file, sep="\t")
+
+
+def source_quantif(wildcards):
+ """Determines from which rule the gathered counts should be sourced."""
+ if wildcards.quantif_type == "counts":
+ return rules.gather_counts.output.counts_table
+ elif wildcards.quantif_type == "RPK":
+ return rules.compute_RPK.output.rpk_file
+ elif wildcards.quantif_type == "TPM":
+ return rules.compute_TPM.output.tpm_file
+ else:
+ raise NotImplementedError("%s is not yet handeled." % wildcards.quantif_type)
+
+
rule plot_biotype_scatter_pairs:
input:
- counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
- f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
+ data_file = source_quantif,
output:
plot_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
- f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_scatter_pairs.pdf"),
+ f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
run:
- counts_data = pd.read_table(input.counts_data, index_col="gene")
+ data = pd.read_table(input.data_file, index_col="gene")
save_plot(
output.plot_file, plot_paired_scatters,
- counts_data,
- title="{wildcards.biotype} {wildcards.orientation} counts")
+ data, log_log=False,
+ title="{wildcards.biotype} {wildcards.orientation} {wildcards.quantif_type}")
+
+
+def plot_spikein_response(data, lib):
+ fig, axis = plt.subplots()
+ #axis.set(xscale="log", yscale="log")
+ #sns.regplot("expected_counts", lib, data, ax=axis)
+ try:
+ sns.regplot(
+ "expected_counts", lib,
+ np.log10(data[["expected_counts", lib]]).replace(
+ [np.inf, -np.inf], np.nan).dropna(), ax=axis)
+ axis.set_xlabel("log10(expected count)")
+ axis.set_ylabel("log10(TPM)")
+ except ValueError:
+ pass
+ #sns.regplot("expected_counts", lib, data, ax=axis, robust=True, ci=None)
+ #axis = sns.lmplot("expected_counts", lib, data)
+ #plt.xscale("log")
+ #plt.yscale("log")
+ # axis.set_xscale('log')
+ # axis.set_yscale('log')
+
+
+def do_linear_regression(data, x_col, y_col, y_min=0, fit_intercept=True, transform=None):
+ """Performs the linear regression for the dependance of y_col on x_col.
+ Returns the filtered and transformed data, the slope and intercept, and the
+ relative squared y-distance distances of the points with respect to the values
+ predicted by the regression line.
+ """
+ if transform is not None:
+ data = getattr(np, transform)(
+ data[[x_col, y_col]].dropna().query(f"{y_col} >= {y_min}")).replace(
+ [np.inf, -np.inf], np.nan).dropna()
+ else:
+ data = data[[x_col, y_col]].dropna().query(f"{y_col} >= {y_min}")
+ lm = LinearRegression(fit_intercept=fit_intercept)
+ X = data.drop(y_col, axis=1)
+ lm.fit(X, data[y_col])
+ [slope] = lm.coef_
+ intercept = lm.intercept_
+ predicted = lm.predict(X)
+ # squared_diffs_lm = np.power(data[y_col] - predicted_lm, 2)
+ rel_squared_diffs = np.power((data[y_col] - predicted), 2) / predicted
+ # I confirmed in one example this gives the same values:
+ # predicted_manual = intercept + (slope * data[x_col])
+ # squared_diffs_manual = np.power(data[y_col] - predicted_manual, 2)
+ #return data, slope, intercept, squared_diffs_lm
+ return data, slope, intercept, rel_squared_diffs_lm
+
+
+# TODO: plot log2(TPM) vs log2(spikein_expected_counts), for each library
+rule plot_spikein_responses:
+ input:
+ tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
+ f"all_on_{genome}", "alltypes_{orientation}_TPM.txt"),
+ output:
+ response_plots = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}",
+ f"all_on_{genome}", "{lib}_{rep}_spike_ins_{{orientation}}_TPM_response.pdf"), lib=LIBS, rep=REPS),
+ spikein_slope_files = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "spike_ins_{orientation}_TPM_response_slope.txt"),
+ run:
+ tpm_table = pd.read_table(input.tpm_file, index_col="gene")
+ common = tpm_table.index.intersection(spikein_expected_counts.index)
+ data = pd.concat(
+ [tpm_table.loc[common], spikein_expected_counts],
+ axis=1).replace([np.nan], 0)
+ # pp = PdfPages(output.response_plot)
+ # for lib in data.columns[:-1]:
+ # save_plot(
+ # pp,
+ # plot_spikein_response, data, lib,
+ # title="spike-ins TPM response")
+ # pp.close()
+ rel_sq_diffs_list = []
+ for libname in data.columns[:-1]:
+ filename = OPJ(
+ output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}",
+ f"all_on_{genome}",
+ f"{libname}_spike_ins_{wildcards.orientation}_TPM_response.pdf")
+ save_plot(
+ filename,
+ plot_spikein_response, data, libname,
+ title=f"{libname} spike-ins TPM response")
+ # TODO: gather squared_diffs across libraries and find the most stable spike-ins
+ # Then use those to compute slope again and use it for normalization
+ (
+ transformed_data,
+ regline_slope,
+ regline_intercept,
+ rel_squared_diffs) = do_linear_regression(
+ # intercept supposed to be 0 for spikein response
+ data, "expected_counts", libname, y_min=1, fit_intercept=False, transform="log10")
+ rel_sq_diffs_list.append(rel_squared_diffs)
+ all_rel_sq_diffs = pd.concat(rel_sq_diffs_list, axis=1).dropna()
+ #all_rel_sq_diffs.assign(p_normal=normaltest(all_rel_sq_diffs, axis=1, nan_policy="omit").pvalue)
+ chi_squared = all_rel_sq_diffs.sum(axis=1)
+ #plt.scatter(x=chi2.cdf(chi_squared.as_matrix(), len(data.columns) - 2), y=chi_squared.as_matrix())
rule compute_median_ratio_to_pseudo_ref_size_factors:
@@ -753,7 +943,7 @@ rule make_bigwig:
input:
bam = rules.sam2indexedbam.output.sorted_bam,
# TODO: use norm_type
- median_ratios_file = OPJ(output_dir, aligner, f"mapped_{genome}", COUNTERS[0], f"all_on_{genome}", "{norm_type}_fwd_median_ratios_to_pseudo_ref.txt"),
+ median_ratios_file = OPJ(output_dir, aligner, f"mapped_{genome}", COUNTERS[0], f"all_on_{genome}", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
output:
bigwig_norm = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{{lib}}_{{rep}}_on_{genome}_by_{{norm_type}}_{{orientation}}.bw"),
params:
@@ -848,7 +1038,7 @@ rule join_all_counts:
def source_counts(wildcards):
- """Determines from which rule the gathered small counts should be sourced."""
+ """Determines from which rule the gathered counts should be sourced."""
if wildcards.biotype == "alltypes":
return rules.join_all_counts.output.counts_table
else:
@@ -893,15 +1083,6 @@ rule test_size_factor:
pp = PdfPages(output.norm_counts_distrib_plot)
for normalizer in params.size_factor_types:
if normalizer == "median_ratio_to_pseudo_ref":
- ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
- ## add pseudo-count to compute the geometric mean, then remove it
- ##pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
- ## Ignore lines with zeroes instead:
- #pseudo_ref = (counts_data[counts_data.prod(axis=1) > 0]).apply(gmean, axis=1)
- #def median_ratio_to_pseudo_ref(col):
- # return (col / pseudo_ref).median()
- ##size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
- #size_factors = counts_data[counts_data.prod(axis=1) > 0].apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
size_factors = summaries.loc[normalizer]
diff --git a/libworkflows/libworkflows/libworkflows.py b/libworkflows/libworkflows/libworkflows.py
index 8e71b12a1ffa3504f955cea5ca1db14a6c2ce553..1b1328e04341577ab9d3b08240bfdc9b771f5d57 100644
--- a/libworkflows/libworkflows/libworkflows.py
+++ b/libworkflows/libworkflows/libworkflows.py
@@ -72,7 +72,7 @@ samstats2mapped()
{{
# $1: file containing the output of samtools stats
# $2: file in which to put the number of mappers
- grep "^SN" ${{1}} | cut -f 2- | grep "^reads mapped and paired:" | cut -f 2 > ${{2}}
+ grep "^SN" ${{1}} | cut -f 2- | grep "^reads mapped:" | cut -f 2 > ${{2}}
}}
"""
diff --git a/snakemake_wrappers/map_on_genome/wrapper.py b/snakemake_wrappers/map_on_genome/wrapper.py
index 86e8bb2497374bbc334d96af1b27cfa8cdd923ba..786a29fcaa5c6e444cbf2c28679e9a73fd565b5c 100644
--- a/snakemake_wrappers/map_on_genome/wrapper.py
+++ b/snakemake_wrappers/map_on_genome/wrapper.py
@@ -5,8 +5,8 @@ def mapping_command(aligner):
if aligner == "hisat2":
shell_commands = """
cmd="hisat2 -p {snakemake.threads} --dta --seed 123 -t {snakemake.params.settings} --mm -x {snakemake.params.index} -U {snakemake.input.fastq} --no-unal --un-gz {snakemake.output.nomap_fastq} -S {snakemake.output.sam}"
-echo ${{cmd}} 1> {log.log}
-eval ${{cmd}} 1>> {log.log} 2> {log.err}
+echo ${{cmd}} 1> {snakemake.log.log}
+eval ${{cmd}} 1>> {snakemake.log.log} 2> {snakemake.log.err}
"""
elif aligner == "bowtie2":
shell_commands = """