From bd07159a000ba46c4a1de888f2c608547253414f Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Mon, 24 Oct 2022 12:06:00 +0200
Subject: [PATCH] TODO notes for spike-ins.

---
 RNA_Seq_Cecere/RNA-seq.snakefile | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile
index 13deca1..429e66a 100644
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -20,6 +20,15 @@ major, minor = sys.version_info[:2]
 if major < 3 or (major == 3 and minor < 6):
     sys.exit("Need at least python 3.6\n")
 
+
+# TODO (04/10/2022):
+# * normalize spike-in counts by their length (RPKM)
+# * use scikit-learn to compute a correction factor for transcript RPKM
+# TODO first (04/10/2022):
+# * output normalizations by total spike-ins (normalization is currently hard-coded to use protein_coding): raw counts from featureCounts / total spike-ins, and RPKM (where M would mean "per million spike-in reads")
+# * output slope and intercept of spike-in response in a file (and on the plot?)
+# * find an example config file that activates the spike-in processing
+
 # TODO: plot spike-in vs spike-in between libraries to detect anormal spike-ins: should be a straight line
 
 # TODO: Add rules to take into account spike-ins.
@@ -1408,7 +1417,10 @@ rule plot_spikein_responses:
                 title=f"{libname} spike-ins TPM response")
             # TODO: gather squared_diffs across libraries and find the most stable spike-ins
             # Then use those to compute slope again and use it for normalization
+            # TODO (04/10/2022): save regline_slope and regline_intercept somewhere.
             (
+                # NOTE: transformed_data is not the output of LinearRegression,
+                # it is only the pre-processed data.
                 transformed_data,
                 regline_slope,
                 regline_intercept,
-- 
GitLab