git pat

8bceb897 · Bernd JAGLA · 5cedc52a · 8bceb897 · 8bceb897
Commit 8bceb897 authored 2 years ago by Bernd JAGLA
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -55,8 +55,11 @@ image: pf2dock/schnapps
 test2:
  stage: test2
  script:
+    - set
+    - echo $GITHUB_PAT
    - R -e "sessionInfo()"
-    - R -e "devtools::update_packages(dependencies = T, upgrade='always')"    
+    - R -e "installed.packages()"
+    - export GITHUB_PAT=${R_PAT_TOKEN};R -e "devtools::update_packages(dependencies = T, upgrade='always')"    
    - R -e "devtools::install_gitlab(host='https://gitlab.pasteur.fr', repo = 'bernd/UTechSCB-SCHNAPPs', dependencies = TRUE)"
  only:
    - master
--- a/inst/develo/Untitled.R
+++ b/inst/develo/Untitled.R
-library(rhdf5)
-h5ls("~/Downloads/GSE115189/suppl/GSM3169075_filtered_gene_bc_matrices_h5.h5")
-mydata <- h5read("~/Downloads/GSE115189/suppl/GSM3169075_filtered_gene_bc_matrices_h5.h5", "/Homo_sapiens.GRCh38.v90.cellranger/data")
+# normal distribution

+data = rnorm(100000,sd = 1000,mean = 0)

-load(file = "GSE3611.RData")
+hist(data)

-as(gse[[2]], "SingleCellExperiment")
-assayData(gse[[1]])$exprs
+adata = asinh(data)

-colnames(gse[[1]])
-pData(gse[[1]])
-gse[[1]]
-SingleCellExperiment(assays = assayData(gse[[1]])$exprs)
+hist(adata)

-# need to set rownames before

-scEx <- SingleCellExperiment(
-  counts = assayData(gse[[1]])$exprs,
-  colData = pData(phenoData(gse[[1]]))
-)
+minb = 1
+maxb = 500
+stepb = 5
+
+library(dplyr)
+for (b in seq(minb, maxb, stepb)){
+  adata = asinh(data/b)
+  hist(adata, main = paste("b:",b),breaks = 1000,plot = T)
+}
+
+# standard deviation is the driving factor
+
+for( sd in c(1,10,50,100,200,500,1000,10000,1000000)) {
+  data = rnorm(100000,sd = sd,mean = 0)
+  for (b in c(1,5,10,100,500,1000,5000)){
+    adata = asinh(data/b)
+    hist(adata,breaks = 1000, main=paste("sd:",sd, "b=",b))
+  }
+}
+
+# 
+# sd:1, b=5
+# sd:10, b=100
+# sd:50, b=200
+# sd:100, b=500
+# sd:200, b=800
+# sd:500, b=5000
+# sd:1000, b=5000
+# sd:10,000, b>5000
+
+# if the mean is slightly off, what are the effects
+# shift to the negative, negative peak is much higher.
+# the effecto of b seems to be independant of the mean
+
+for( sd in c(1,100,500)) {
+  for(mean in c(-100, -10,0,10, 100)){
+    data = rnorm(100000,sd = sd,mean = mean)
+    for (b in c(1,5,500,5000)){
+      adata = asinh(data/b)
+      hist(adata,breaks = 1000, main=paste("sd:",sd, "b=",b, "mean:", mean))
+    }
+  }
+}
+
+# what about the number of cells?
+# seems to have no effect...
+sd=500
+b=500
+mean=100
+for (nz in c(10000,20000,30000,40000,50000,100000,1000000,1e10)){
+  data = rnorm(100000,sd = sd,mean = mean)
+  adata = asinh(data/b)
+  hist(adata,breaks = 1000, main=paste("sd:",sd, "b=",b, "mean:", mean,"n:",nz))
+}

-example_sce <- SingleCellExperiment(
-  assays = list(counts = sc_example_counts),
-  colData = sc_example_cell_info
+
+
+`
+That means that only the standdeiviation has an effect with regards to b. 
+this might have some biological reason,
+could be related to the age of the fluorchromes or others.
+`
+
+# Does the mean / median of the negative values says something about the std?
+mean = 0
+sd = 500
+outDF = data.frame(mean = numeric(),
+                   sd = character(),
+                   b= numeric(),
+                   meanVal = numeric(),
+                   medianVal = numeric()
 )

+minb = 1
+maxb = 5000
+stepb = 5
+
+for( sd in c(1,10,50,100,200,500,1000,10000,1000000)) {
+  data = rnorm(100000,sd = sd,mean = 0)
+  for (b in seq(minb, maxb, stepb)){
+    as = asinh(data/b)
+    as = as[as<0]
+    m1 = mean(as)
+    m2 = median(as)
+    outDF[nrow(outDF) + 1,] = c(mean = mean, sd=sd, b=b, meanVal = m1, medianVal = m2)
+  }
+}
+
+outDF$sd = as.factor(outDF$sd)
+outDF$b = as.numeric(outDF$b)
+outDF$medianVal = as.numeric(outDF$medianVal)
+outDF$meanVal = as.numeric(outDF$meanVal)
+
+outDF = outDF[!is.null(outDF$medianVal),]
+library(ggplot2)
+# this is IT!!! this shows that the value of b=1 can be used to determine the 
+# optimal value...
+ggplot(outDF, aes(x=b, y=meanVal, color=sd)) + geom_line() 
+
+# But what is the optimal value???
+library(parallel)
+cl <- makeCluster(detectCores()-1) #not to overload your computer
+doParallel::registerDoParallel(cl)
+library(BiocParallel)
+library(foreach)
+b=5000
+sd = 500
+bVals = data.frame(sd = numeric(), b= numeric(), shPval=numeric())
+# for(sd in seq(1, 5000, 10)){
+out = foreach (sd = seq(1, 5000, 10), .combine = rbind) %dopar% {
+  for (b in 1:10000){
+    data = rnorm(5000,sd = sd,mean = 0)
+    adata = asinh(data/b)
+    hist(adata)
+    sh = shapiro.test(adata) 
+    if( sh$p.value >0.1) {
+      bVals[nrow(bVals)+1,] = c(sd,b,sh$p.value)
+      return(c(sd,b,sh$p.value))
+    }
+  }
+}
+out = as.data.frame(out)
+colnames(out) = c("sd", "b", "p-value")
+plot(out$sd,out$b)
+lm(b~sd, out)
+
+data.frame(sd= numeric(), b1 = numeric(), b10=numeric())
+b=5000
+sd = 500
+bVals = data.frame(sd = numeric(), b= numeric(), shPval=numeric())
+# for(sd in seq(1, 5000, 10)){
+out2 = foreach (sd = seq(1, 5000, 1), .combine = rbind) %dopar% {
+  data = rnorm(5000000,sd = sd,mean = 0)
+  b=1
+  as = asinh(data/b)
+  as = as[as<0]
+  m1 = mean(as)
+  m2 = median(as)
+  return(c(sd, b=1, m1))
+}

-# gsm holds data
-# gse just describes the objects
-# still don't know if individual cells are also individual gsm and how to link annoation
-# also unclear on how to tranform to scEx
+out = as.data.frame(out)
+colnames(out) = c("sd", "b", "p-value")
+plot(out$sd,out$b)
+lm(b~sd, out)


-gpl <- getGEO("GPL17021")

-Meta(gse[[1]])