diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 9491fbb925a8974746e29165420d5d07230568dc..8b33ab64e3f61894773f58d089ce0b817136ec21 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -70,6 +70,7 @@ from libworkflows import read_intersect_counts, sum_intersect_counts from libworkflows import read_feature_counts, sum_feature_counts from smincludes import rules as irules +alignment_settings = {"bowtie2": "", "hisat2": "", "crac": "-k 20 --stranded --use-x-in-cigar"} # Possible feature ID conversions ID_TYPES = ["name", "cosmid"] @@ -86,7 +87,6 @@ LFC_RANGE = { "RNA_transposons_rmsk_families" : (-10, 10), "simple_repeats_rmsk_families" : (-10, 10), "alltypes" : (-10, 10)} -DE_BIOTYPES = list(LFC_RANGE.keys()) # Cutoffs in log fold change LFC_CUTOFFS = [0.5, 1, 2] UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS] @@ -100,12 +100,22 @@ DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS] # "RNA-seq.config.json" - -spikein_microliters = config["spikein_microliters"] -spikein_dilution_factor = config["spikein_dilution_factor"] +spikein_dict = config["spikein_dict"] +spikein_microliters = spikein_dict["microliters"] +spikein_dilution_factor = spikein_dict["dilution_factor"] # To get the expected number of attomoles of a given spike-in, # multiply this by the value found in column 4 of Genomes/spike-ins/ERCC92_concentrations.txt spikein_microliter_equivalent = spikein_microliters / spikein_dilution_factor +if spikein_microliter_equivalent: + spikein_concentrations = pd.read_table( + spikein_dict["concentrations"], + index_col="ERCC ID", + usecols=["ERCC ID", "concentration in Mix %s (attomoles/ul)" % spikein_dict["mix"]]) + from scipy.constants import Avogadro + # attomole: 10^{-18} mole + molecules_per_attomole = Avogadro / 10**18 + spikein_expected_counts = spikein_concentrations * spikein_microliter_equivalent * molecules_per_attomole + spikein_expected_counts.columns = ["expected_counts"] # http://sailfish.readthedocs.io/en/master/library_type.html LIB_TYPE = config["lib_type"] @@ -132,6 +142,7 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign( BIOTYPES = config["biotypes"] +DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES] if "spike_ins" in BIOTYPES: PAIR_SCATTER_BIOTYPES = ["spike_ins"] else: @@ -164,10 +175,15 @@ transcriptomes_dir = OPJ(output_dir, "merged_transcriptomes") log_dir = OPJ(output_dir, f"logs_{genome}") data_dir = OPJ(output_dir, "data") -SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref", "spike_ins"] -assert set(SIZE_FACTORS).issubset(set(BIOTYPES) | {"median_ratio_to_pseudo_ref"}) #NORM_TYPES = config["norm_types"] -NORM_TYPES = ["spike_ins", "protein_coding", "median_ratio_to_pseudo_ref"] +if spikein_microliter_equivalent: + SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref", "spike_ins"] + NORM_TYPES = ["spike_ins", "protein_coding", "median_ratio_to_pseudo_ref"] +else: + SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref"] + NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"] + assert "spike_ins" not in BIOTYPES +assert set(SIZE_FACTORS).issubset(set(BIOTYPES) | {"median_ratio_to_pseudo_ref"}) assert set(NORM_TYPES).issubset(set(SIZE_FACTORS)) @@ -193,15 +209,21 @@ def specific_input(aligner): OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),] elif aligner == "bowtie2": files = [] + elif aligner == "crac": + files = [] else: raise NotImplementedError("%s is not yet handeled." % aligner) return files counts_files = [ + expand( + OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"), + counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS), expand( OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "{biotype}_{orientation}_RPK.txt"), - counter=COUNTERS, biotype=BIOTYPES, orientation=ORIENTATIONS), + counter=COUNTERS, biotype=BIOTYPES + ["alltypes"], orientation=ORIENTATIONS), expand( OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"), @@ -244,17 +266,31 @@ bigwig_files = [ OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", f"{{lib}}_mean_on_{genome}_by_{{norm_type}}_{{orientation}}.bw"), lib=LIBS, rep=REPS, norm_type=NORM_TYPES, orientation=ORIENTATIONS),] +if spikein_microliter_equivalent: + spikein_files = [ + expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + f"all_on_{genome}", "{lib}_{rep}_spike_ins_{orientation}_TPM_response.pdf"), + counter=COUNTERS, orientation=ORIENTATIONS, lib=LIBS, rep=REPS),] +else: + spikein_files = [] rule all: input: specific_input(aligner), + expand( + OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"), + lib=LIBS, rep=REPS), expand( OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"), lib=LIBS, rep=REPS), bigwig_files, + spikein_files, + expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"), + counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS, quantif_type=["TPM"]), expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", - f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_scatter_pairs.pdf"), - counter=COUNTERS, biotype=PAIR_SCATTER_BIOTYPES, orientation=ORIENTATIONS), + f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"), + counter=COUNTERS, biotype=PAIR_SCATTER_BIOTYPES, orientation=ORIENTATIONS, quantif_type=["counts", "RPK"]), expand( OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries", f"all_on_{genome}_{{orientation}}_{{biotype}}_norm_correlations.pdf"), @@ -271,23 +307,23 @@ include: ensure_relative(irules["link_raw_data"], workflow.basedir) #include: "../snakemake_wrappers/includes/link_raw_data.rules" -def mapping_command(aligner): - """This function returns the shell commands to run given the *aligner*.""" - if aligner == "hisat2": - shell_commands = """ -cmd="hisat2 -p {threads} --dta --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}" -echo ${{cmd}} 1> {log.log} -eval ${{cmd}} 1>> {log.log} 2> {log.err} -""" % genome_db - elif aligner == "bowtie2": - shell_commands = """ -cmd="bowtie2 -p {threads} --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}" -echo ${{cmd}} 1> {log.log} -eval ${{cmd}} 1>> {log.log} 2> {log.err} -""" % genome_db - else: - raise NotImplementedError("%s is not yet handled." % aligner) - return shell_commands +# def mapping_command(aligner): +# """This function returns the shell commands to run given the *aligner*.""" +# if aligner == "hisat2": +# shell_commands = """ +# cmd="hisat2 -p {threads} --dta --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}" +# echo ${{cmd}} 1> {log.log} +# eval ${{cmd}} 1>> {log.log} 2> {log.err} +# """ % genome_db +# elif aligner == "bowtie2": +# shell_commands = """ +# cmd="bowtie2 -p {threads} --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}" +# echo ${{cmd}} 1> {log.log} +# eval ${{cmd}} 1>> {log.log} 2> {log.err} +# """ % genome_db +# else: +# raise NotImplementedError("%s is not yet handled." % aligner) +# return shell_commands rule map_on_genome: @@ -297,6 +333,10 @@ rule map_on_genome: # temp because it uses a lot of space sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}.sam")), nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", f"{{lib}}_{{rep}}_unmapped_on_{genome}.fastq.gz"), + params: + aligner = aligner, + index = genome_db, + settings = alignment_settings[aligner], message: f"Mapping {{wildcards.lib}}_{{wildcards.rep}} on {genome}." log: @@ -306,8 +346,10 @@ rule map_on_genome: io=5 threads: 4 - shell: - mapping_command(aligner) + #shell: + # mapping_command(aligner) + wrapper: + "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome" rule sam2indexedbam: @@ -329,9 +371,31 @@ rule sam2indexedbam: "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam" +rule compute_mapping_stats: + input: + sorted_bam = rules.sam2indexedbam.output.sorted_bam, + #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), + output: + stats = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"), + shell: + """samtools stats {input.sorted_bam} > {output.stats}""" + + +rule get_nb_mapped: + """Extracts the number of mapped reads from samtools stats results.""" + input: + stats = rules.compute_mapping_stats.output.stats, + #stats = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"), + output: + nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_nb_mapped.txt"), + shell: + """samstats2mapped {input.stats} {output.nb_mapped}""" + + rule compute_coverage: input: - sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), + sorted_bam = rules.sam2indexedbam.output.sorted_bam, + #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), output: coverage = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"), params: @@ -715,19 +779,145 @@ rule compute_RPK: rpk.to_csv(output.rpk_file, sep="\t") +# Compute TPM using total number of mappers divided by genome length (not sum of RPK across biotypes: some mappers may not be counted) +# No, doesn't work: mappers / genome length not really comparable +# Needs to be done on all_types +rule compute_TPM: + """For a given biotype, compute the corresponding TPM value (reads per kilobase per million mappers).""" + input: + rpk_file = rules.compute_RPK.output.rpk_file + output: + tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"), + # The sum must be done over all counted features + wildcard_constraints: + biotype = "alltypes" + run: + rpk = pd.read_table(input.rpk_file, index_col="gene") + tpm = 1000000 * rpk / rpk.sum() + tpm.to_csv(output.tpm_file, sep="\t") + + +def source_quantif(wildcards): + """Determines from which rule the gathered counts should be sourced.""" + if wildcards.quantif_type == "counts": + return rules.gather_counts.output.counts_table + elif wildcards.quantif_type == "RPK": + return rules.compute_RPK.output.rpk_file + elif wildcards.quantif_type == "TPM": + return rules.compute_TPM.output.tpm_file + else: + raise NotImplementedError("%s is not yet handeled." % wildcards.quantif_type) + + rule plot_biotype_scatter_pairs: input: - counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", - f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"), + data_file = source_quantif, output: plot_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", - f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_scatter_pairs.pdf"), + f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"), run: - counts_data = pd.read_table(input.counts_data, index_col="gene") + data = pd.read_table(input.data_file, index_col="gene") save_plot( output.plot_file, plot_paired_scatters, - counts_data, - title="{wildcards.biotype} {wildcards.orientation} counts") + data, log_log=False, + title="{wildcards.biotype} {wildcards.orientation} {wildcards.quantif_type}") + + +def plot_spikein_response(data, lib): + fig, axis = plt.subplots() + #axis.set(xscale="log", yscale="log") + #sns.regplot("expected_counts", lib, data, ax=axis) + try: + sns.regplot( + "expected_counts", lib, + np.log10(data[["expected_counts", lib]]).replace( + [np.inf, -np.inf], np.nan).dropna(), ax=axis) + axis.set_xlabel("log10(expected count)") + axis.set_ylabel("log10(TPM)") + except ValueError: + pass + #sns.regplot("expected_counts", lib, data, ax=axis, robust=True, ci=None) + #axis = sns.lmplot("expected_counts", lib, data) + #plt.xscale("log") + #plt.yscale("log") + # axis.set_xscale('log') + # axis.set_yscale('log') + + +def do_linear_regression(data, x_col, y_col, y_min=0, fit_intercept=True, transform=None): + """Performs the linear regression for the dependance of y_col on x_col. + Returns the filtered and transformed data, the slope and intercept, and the + relative squared y-distance distances of the points with respect to the values + predicted by the regression line. + """ + if transform is not None: + data = getattr(np, transform)( + data[[x_col, y_col]].dropna().query(f"{y_col} >= {y_min}")).replace( + [np.inf, -np.inf], np.nan).dropna() + else: + data = data[[x_col, y_col]].dropna().query(f"{y_col} >= {y_min}") + lm = LinearRegression(fit_intercept=fit_intercept) + X = data.drop(y_col, axis=1) + lm.fit(X, data[y_col]) + [slope] = lm.coef_ + intercept = lm.intercept_ + predicted = lm.predict(X) + # squared_diffs_lm = np.power(data[y_col] - predicted_lm, 2) + rel_squared_diffs = np.power((data[y_col] - predicted), 2) / predicted + # I confirmed in one example this gives the same values: + # predicted_manual = intercept + (slope * data[x_col]) + # squared_diffs_manual = np.power(data[y_col] - predicted_manual, 2) + #return data, slope, intercept, squared_diffs_lm + return data, slope, intercept, rel_squared_diffs_lm + + +# TODO: plot log2(TPM) vs log2(spikein_expected_counts), for each library +rule plot_spikein_responses: + input: + tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + f"all_on_{genome}", "alltypes_{orientation}_TPM.txt"), + output: + response_plots = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", + f"all_on_{genome}", "{lib}_{rep}_spike_ins_{{orientation}}_TPM_response.pdf"), lib=LIBS, rep=REPS), + spikein_slope_files = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "spike_ins_{orientation}_TPM_response_slope.txt"), + run: + tpm_table = pd.read_table(input.tpm_file, index_col="gene") + common = tpm_table.index.intersection(spikein_expected_counts.index) + data = pd.concat( + [tpm_table.loc[common], spikein_expected_counts], + axis=1).replace([np.nan], 0) + # pp = PdfPages(output.response_plot) + # for lib in data.columns[:-1]: + # save_plot( + # pp, + # plot_spikein_response, data, lib, + # title="spike-ins TPM response") + # pp.close() + rel_sq_diffs_list = [] + for libname in data.columns[:-1]: + filename = OPJ( + output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + f"all_on_{genome}", + f"{libname}_spike_ins_{wildcards.orientation}_TPM_response.pdf") + save_plot( + filename, + plot_spikein_response, data, libname, + title=f"{libname} spike-ins TPM response") + # TODO: gather squared_diffs across libraries and find the most stable spike-ins + # Then use those to compute slope again and use it for normalization + ( + transformed_data, + regline_slope, + regline_intercept, + rel_squared_diffs) = do_linear_regression( + # intercept supposed to be 0 for spikein response + data, "expected_counts", libname, y_min=1, fit_intercept=False, transform="log10") + rel_sq_diffs_list.append(rel_squared_diffs) + all_rel_sq_diffs = pd.concat(rel_sq_diffs_list, axis=1).dropna() + #all_rel_sq_diffs.assign(p_normal=normaltest(all_rel_sq_diffs, axis=1, nan_policy="omit").pvalue) + chi_squared = all_rel_sq_diffs.sum(axis=1) + #plt.scatter(x=chi2.cdf(chi_squared.as_matrix(), len(data.columns) - 2), y=chi_squared.as_matrix()) rule compute_median_ratio_to_pseudo_ref_size_factors: @@ -753,7 +943,7 @@ rule make_bigwig: input: bam = rules.sam2indexedbam.output.sorted_bam, # TODO: use norm_type - median_ratios_file = OPJ(output_dir, aligner, f"mapped_{genome}", COUNTERS[0], f"all_on_{genome}", "{norm_type}_fwd_median_ratios_to_pseudo_ref.txt"), + median_ratios_file = OPJ(output_dir, aligner, f"mapped_{genome}", COUNTERS[0], f"all_on_{genome}", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), output: bigwig_norm = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{{lib}}_{{rep}}_on_{genome}_by_{{norm_type}}_{{orientation}}.bw"), params: @@ -848,7 +1038,7 @@ rule join_all_counts: def source_counts(wildcards): - """Determines from which rule the gathered small counts should be sourced.""" + """Determines from which rule the gathered counts should be sourced.""" if wildcards.biotype == "alltypes": return rules.join_all_counts.output.counts_table else: @@ -893,15 +1083,6 @@ rule test_size_factor: pp = PdfPages(output.norm_counts_distrib_plot) for normalizer in params.size_factor_types: if normalizer == "median_ratio_to_pseudo_ref": - ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but - ## add pseudo-count to compute the geometric mean, then remove it - ##pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 - ## Ignore lines with zeroes instead: - #pseudo_ref = (counts_data[counts_data.prod(axis=1) > 0]).apply(gmean, axis=1) - #def median_ratio_to_pseudo_ref(col): - # return (col / pseudo_ref).median() - ##size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) - #size_factors = counts_data[counts_data.prod(axis=1) > 0].apply(median_ratio_to_pseudo_ref, axis=0) size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) else: size_factors = summaries.loc[normalizer] diff --git a/libworkflows/libworkflows/libworkflows.py b/libworkflows/libworkflows/libworkflows.py index 8e71b12a1ffa3504f955cea5ca1db14a6c2ce553..1b1328e04341577ab9d3b08240bfdc9b771f5d57 100644 --- a/libworkflows/libworkflows/libworkflows.py +++ b/libworkflows/libworkflows/libworkflows.py @@ -72,7 +72,7 @@ samstats2mapped() {{ # $1: file containing the output of samtools stats # $2: file in which to put the number of mappers - grep "^SN" ${{1}} | cut -f 2- | grep "^reads mapped and paired:" | cut -f 2 > ${{2}} + grep "^SN" ${{1}} | cut -f 2- | grep "^reads mapped:" | cut -f 2 > ${{2}} }} """ diff --git a/snakemake_wrappers/map_on_genome/wrapper.py b/snakemake_wrappers/map_on_genome/wrapper.py index 86e8bb2497374bbc334d96af1b27cfa8cdd923ba..786a29fcaa5c6e444cbf2c28679e9a73fd565b5c 100644 --- a/snakemake_wrappers/map_on_genome/wrapper.py +++ b/snakemake_wrappers/map_on_genome/wrapper.py @@ -5,8 +5,8 @@ def mapping_command(aligner): if aligner == "hisat2": shell_commands = """ cmd="hisat2 -p {snakemake.threads} --dta --seed 123 -t {snakemake.params.settings} --mm -x {snakemake.params.index} -U {snakemake.input.fastq} --no-unal --un-gz {snakemake.output.nomap_fastq} -S {snakemake.output.sam}" -echo ${{cmd}} 1> {log.log} -eval ${{cmd}} 1>> {log.log} 2> {log.err} +echo ${{cmd}} 1> {snakemake.log.log} +eval ${{cmd}} 1>> {snakemake.log.log} 2> {snakemake.log.err} """ elif aligner == "bowtie2": shell_commands = """