Data_Management.R 23.5 KB
Newer Older
svolant's avatar
svolant committed
1
#@ This file contains all the functions needed to
svolant's avatar
svolant committed
2
#@ to load, check, filter and transform the data 
svolant's avatar
svolant committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

## Add news to the home page
addNews <- function(date ="",title="",text="")
{
  res=list()
  res$r1 = paste("<b><font size='+1'>",date,"</font></b>", " - ", "<b><font size='+1'>",title,"</font></b><br/>")
  res$r2 = paste("<p><font color='grey'>",text,"</font></p><hr/>")
  
  return(HTML(unlist(res)))
}



## Function for the rdp format
getval <- function(annotation_rdp, interest, threshold_annot){
  annotation_rdp = unlist(strsplit(annotation_rdp,"\t"))
  annotation = c(annotation_rdp[1])
  for(level in interest){
    idlevel=which(annotation_rdp == level)
    if(length(idlevel)>0){
      if(as.numeric(annotation_rdp[idlevel+1]) >= threshold_annot){
        annotation = c(annotation, gsub("\"", "", annotation_rdp[idlevel-1]))
      }
      else annotation = c(annotation, "NA")
    }
    else annotation = c(annotation, "NA")  
  }
  return(annotation)
}

## Read rdp file
read_rdp <- function(filename, threshold_annot)
{
  interest=c("phylum", "class", "order", "family", "genus")
  conn <- file(filename,open="r")
  linn <-readLines(conn)
  tab=t(sapply(1:length(linn), function(i) getval(linn[i], interest, threshold_annot)))
  close(conn)
  
  if(!TRUE%in%duplicated(tab[,1])) rownames(tab)=tab[,1];tab=tab[,-1]
  colnames(tab) = c("Phylum","Class","Order","Family","Genus")
  
  return(tab)
}



## Check the format of the counts table
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
51
CheckCountsTable <- function(counts, MGSTable=FALSE)
svolant's avatar
svolant committed
52
53
54
{
  Error = NULL
  Warning = NULL
svolant's avatar
svolant committed
55
56
57
58
59
60
61
62
  
  if(is.null(counts) && is.null(Error)){Error = "There is no counts table" }
  

  if(ncol(counts)<=1 && is.null(Error)){Error = "The number of columns of the counts table must be at least 2" }
  if(nrow(counts)<=1 && is.null(Error)){Error = "The number of rows of the counts table must be at least 2" }
  
  if(is.null(Error)) 
svolant's avatar
svolant committed
63
  {
svolant's avatar
svolant committed
64
65
66
67
68
69
    numTest = FALSE%in%sapply(counts,is.numeric)
    if(numTest) Error = "The counts table must contain only numeric values" 
    if(!numTest)
    {
      if(0%in%colSums(counts)){Error = "At least one of the column of the counts table is 0" }
      if(min(counts)<0){Error = "The counts table must contain only positive values" }
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
70
      if(MGSTable && length(which(toupper(colnames(counts))%in%"SIZE")) != 1){Error="The counts table must contain a column named SIZE providing the length of each gene"}
svolant's avatar
svolant committed
71
    }
svolant's avatar
svolant committed
72
  }
svolant's avatar
svolant committed
73
  if(TRUE%in%sapply(counts,is.na) && is.null(Error)){Warning = "NA values are considered as 0 is the counts table"; counts[sapply(counts,is.na)]=0}
svolant's avatar
svolant committed
74
75
76
77
78
  
  return(list(Error=Error,Warning=Warning,counts=counts))
}

## Check the format of the taxonomy table
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
79
CheckTaxoTable <- function(taxo,counts, MGSTable=FALSE, taxoCreated=FALSE)
svolant's avatar
svolant committed
80
81
82
{
  Error = NULL
  Warning = NULL
svolant's avatar
svolant committed
83
84
85
86
  if(taxoCreated){Warning = "No taxonomy table has been uploaded, the analysis can only be done at the OTU/gene level"}
  if(ncol(taxo)<1 && is.null(Error)){Error = "The number of columns of the taxonomy table must be at least 1" }
  if(nrow(taxo)<=1 && is.null(Error)){Error = "The number of rows if the taxonomy table must be at least 2" }
  if(TRUE%in%is.numeric(taxo) && is.null(Error) ){Error = "The taxonomy table must contain only character" }
svolant's avatar
svolant committed
87
  
svolant's avatar
svolant committed
88
  if(is.null(Error))
svolant's avatar
svolant committed
89
  {
svolant's avatar
svolant committed
90
91
92
93
94
95
    for(i in 1:ncol(taxo))
    {
      level = levels(taxo[,i])
      nb = length(level)
      if(nb==1 && level=="NA"){ Error = "At least one column contains only NA"}
    }
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
96
    if(MGSTable && length(which(toupper(colnames(taxo))%in%"MGS")) != 1){Error="The taxonomy table must contain a column named MGS providing the MGS association of each gene"}
svolant's avatar
svolant committed
97
98
99
  }
  
  ## Annotated features without counts
svolant's avatar
svolant committed
100
  if(!any(rownames(taxo)%in%rownames(counts)) && is.null(Error)){ Error = "Some annotated features are not in the count table"}
svolant's avatar
svolant committed
101
102
103
104
  
  return(list(Error=Error,Warning=Warning))
}

105
106
107
108
109
110
111
112
113
114


CheckTargetModel <- function(input,target,labeled,CT)
{
  Error = NULL
  HowTo = NULL
  InterVar = input$InterestVar
  
  labels = rownames(target)
  ind = which(colnames(CT)%in%labels)
svolant's avatar
svolant committed
115
116
117
118

#   InterVar%in%
#   uniq_column = (length(which(sapply(target[InterVar], function(x) length(unique(x))) == 1)) > 0)
#   uniq_column_names = names(which(sapply(target[InterVar], function(x) length(unique(x))) == 1))
119
  
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
  ## At least one variable selected
  if(is.null(Error) && length(ind)<=1){
    Error = "Less than two samples names fit with the counts table" 
    HowTo = "Check the samples names in the target file. They must be in the first column and must correspond EXACTLY to the names in the count table."
  }
  ## At least one variable selected
  if(is.null(Error) && length(InterVar)==0){
    Error = "At least one variable must be selected for the model" 
    HowTo = "Add at least one variable in the 'Select the variables' widget"
  }
  
  ## Names of samples correct ?
  if(is.null(Error) && labeled==0){
    Error = "The names of the samples in the target file do not fit the counts table" 
    HowTo = "Check the samples names in the target file. They must be in the first column and must correspond EXACTLY to the names in the count table."
    }
  
  ## Number of columns
  if(is.null(Error) && ncol(target)<2){
    Error = "The number of columns of the target file must be at least 2"
    HowTo = "Add at least one additional variable to describe your samples"
svolant's avatar
svolant committed
141
142
143
144
145
146
147
  }
  
  if(is.null(Error) && min(sapply(apply(target,2,unique),length)) <=1){
    Error = "One of the variable has the same value for all the samples" 
    HowTo = "Remove the variable from the target file"
  }
  
148
149
150
151
152
153
154
155
156
157
158
159
160
  
  ## Number of rows
  if(is.null(Error) && nrow(target)<=1){
    Error = "The number of rows if the target file must be at least 2"
    HowTo = "Add information about more than 2 samples"
    }
  
  ## NA values
  if(is.null(Error) && (any(is.na(target)) || any(target ==""))){
    Error = "NA's or missing values are not allowed in the target file" 
    HowTo = "Remove all the samples for which one or more variables are NA or missing"
  }
  
161
162
  ## contrasts can be applied only to factors with 2 or more levels
  
svolant's avatar
svolant committed
163
164
165
166
167
#   if(is.null(Error) && (uniq_column)){
#     Error = "Contrasts can be applied only to factors with 2 or more levels."
#     HowTo = paste("Remove all variables with only one factor:", uniq_column_names, sep=" ")
#   }
#   
168
169
170
171
  
  ## Full rank matrix
  if(is.null(Error) && length(InterVar)>0)
  {
svolant's avatar
svolant committed
172
    design = GetDesign(input,target)
173
174
175
176
177
178
179
180
181
182
183
184
185
    testRank = CheckMatrixRank(design,target)
    if(!testRank){
        Error = "The model matrix is not full rank. One or more variables or interaction terms 
        are linear combinations of the others and must be removed." 
        HowTo = "Remove variable(s) that provide the same information, i.e, if the value of a variable is totaly determine by an other variable remove one of them."
        }
  }
    
  return(list(Error=Error,HowTo=HowTo))
}



svolant's avatar
svolant committed
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204

CheckContrast <- function(contrastFile,dds)
{
  Error = NULL
  Warning = NULL
  parameterNames = resultsNames(dds)
  if(is.null(contrastFile) && is.null(Error)){Error = "The format of the contrast file is not supported by SHAMAN" }
  
  
  if(ncol(contrastFile)<1 && is.null(Error)){Error = "The contrast file seems to be empty" }
  if(nrow(contrastFile)!=length(parameterNames) && is.null(Error)){Error = "The contrast file does not fit with the model parameters" }

  if(TRUE%in%sapply(contrastFile,is.na) && is.null(Error)){Error = "NA values are considered as 0 is the counts table"; contrastFile[sapply(contrastFile,is.na)]=0}
  
  
  return(list(Error=Error,Warning=Warning,contrastFile=contrastFile))
}


svolant's avatar
svolant committed
205
206
207
208
209
210
211
212
213
214
215
216
217
## Check the format of the tree file (for Unifrac distance)
CheckTreeFile <- function(tree)
{
  Error = NULL
  Warning = NULL
  if(!is.phylo(tree) && is.null(Error)){Error = "The loaded file is not a phylogenetic tree"; tree = NULL}
  if(!is.rooted(tree) && is.null(Error) ){Warning = "The tree has been rooted using midpoint method"; tree = midpoint.root(tree)}
  return(list(Error=Error,Warning=Warning,tree=tree))
}




svolant's avatar
svolant committed
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
## Get the percentage of annotated OTU
PercentAnnot <- function(counts,taxo)
{
  Error = NULL  
  tmp = table(rownames(counts)%in%rownames(taxo))
  Percent = tmp["TRUE"]/sum(tmp)
  if(is.na(Percent)) Percent = 0
  if(Percent==0){Error = "Counts table and annotation do not matched" }
  
  return(list(Error=Error,Percent=Percent))
}


## Get the counts and the taxo tables from the BIOM format file.
GetDataFromBIOM <-function(dataBIOM)
{
svolant's avatar
svolant committed
234
235
236
  taxo = NULL
  counts = NULL
  taxoCreated = FALSE
svolant's avatar
svolant committed
237
238
  ## Counts table
  counts = biom_data(dataBIOM)
svolant's avatar
svolant committed
239
240
241
242
243
244
245
  if(!is.null(counts))
  {
    counts = as.matrix(counts)
    ## Change of - to . is risky
    colnames(counts) = gsub("-",".",colnames(counts))
    counts = as.data.frame(counts)
  }
svolant's avatar
svolant committed
246
247
248
249
  CheckCounts = CheckCountsTable(counts)
  counts = CheckCounts$counts
  
  ## Taxonomy table
svolant's avatar
svolant committed
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
  tmp = observation_metadata(dataBIOM)
  if(!is.null(tmp))
  {
    if(is.data.frame(tmp)) taxo = as.data.frame(tmp)
    if(!is.data.frame(tmp)) taxo = as.data.frame(t(sapply(observation_metadata(dataBIOM),FUN=function(x){x[1:7]})))
    
    OTUnames = rownames(taxo)
    ## Modif taxo table (remove p__,... and change the colnames)
    taxo = as.data.frame(sapply(taxo,gsub,pattern="^.*__",replacement=""))
    colnames(taxo) = c("Kingdom", "Phylum","Class","Order","Family","Genus","Species")
    rownames(taxo) = OTUnames
    ## Remove empty row
    taxo[taxo==""] = NA
    taxo[taxo=="Unassigned"] = NA
    taxo=taxo[rowSums(is.na(taxo))!=dim(taxo)[2], ]
  }
  if(is.null(tmp) && !is.null(counts)) {taxo = data.frame(rownames(counts),row.names = rownames(counts));names(taxo)=NA; taxoCreated = TRUE}
svolant's avatar
svolant committed
267
  
svolant's avatar
svolant committed
268
  CheckTaxo = CheckTaxoTable(taxo,counts,taxoCreated)
svolant's avatar
svolant committed
269
270
271
272
273
274
275
276
  
  ## Pourcentage of annotation
  tmp = PercentAnnot(counts,taxo)
  
  return(list(counts=counts,taxo=taxo,CheckCounts=CheckCounts,CheckTaxo=CheckTaxo,Percent=tmp$Percent,CheckPercent=tmp$Error))
}

## Check the data
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
277
GetDataFromCT <-function(dataC,dataT, MGSTable)
svolant's avatar
svolant committed
278
279
280
281
{
  
  ## Counts table
  counts = dataC
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
282
  CheckCounts = CheckCountsTable(counts, MGSTable)
svolant's avatar
svolant committed
283
  counts = CheckCounts$counts
svolant's avatar
svolant committed
284
285
  
  
svolant's avatar
svolant committed
286
287
  ## Taxonomy table
  taxo = as.data.frame(dataT)
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
288
  CheckTaxo = CheckTaxoTable(taxo,counts, MGSTable)
svolant's avatar
svolant committed
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
  
  ## Pourcentage of annotation
  tmp = PercentAnnot(counts,taxo)
  
  return(list(counts=counts,taxo=taxo,CheckCounts=CheckCounts,CheckTaxo=CheckTaxo,Percent=tmp$Percent,CheckPercent=tmp$Error))
}



## Get the counts for the selected taxonomy
GetCountsMerge <- function(input,dataInput,taxoSelect,target,design)
{
  ## Init
  counts= NULL
  CheckTarget = FALSE
  CT_noNorm = NULL
  normFactors = NULL
  FeatureSize = NULL
svolant's avatar
svolant committed
307
  CT_Norm = NULL
svolant's avatar
svolant committed
308
309
310
311
  
  ## Counts and taxo tables
  CT = dataInput$counts
  taxo = dataInput$taxo
svolant's avatar
svolant committed
312
  namesTaxo = colnames(taxo)
svolant's avatar
svolant committed
313
  # save(CT,target,taxo,file="testMerge.RData")
svolant's avatar
svolant committed
314
315
  
  ## Select cols in the target
svolant's avatar
svolant committed
316
  labels = rownames(target)
svolant's avatar
svolant committed
317
318
319
320
321
  ind = which(colnames(CT)%in%labels)
  
  ## Get the normalization variable (normalization can be done according to this variable)
  VarNorm = input$VarNorm
  
svolant's avatar
svolant committed
322
  
svolant's avatar
svolant committed
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
  if(length(ind)==length(labels))
  { 
    if(input$TypeTable == "MGS"){
      ## Get the feature size for the normalisation
      Size_indcol = which(toupper(colnames(CT))%in%"SIZE")
      if(length(Size_indcol)==1) FeatureSize = CT[,Size_indcol]
      else print("Size parameter is missing in the count matrix")
      # Consider only counts
      CT = CT[,ind]
      # Divide by gene length
      CT = CT / FeatureSize * 1000
      # Convert matrix as integer
      CT_int=t(apply(CT,1,as.integer))
      rownames(CT_int)=rownames(CT)
      colnames(CT_int)=colnames(CT)
      CT=CT_int
    } else CT = CT[,ind]
    
341
    
svolant's avatar
svolant committed
342
343
344
345
    ## Order CT according to the target
    CT = OrderCounts(counts=CT,labels=labels)$CountsOrder
    CT_noNorm = CT
    RowProd = sum(apply(CT_noNorm,1,prod))
svolant's avatar
svolant committed
346
  
svolant's avatar
svolant committed
347
348
349
350
351
352
353
354
355
356
    merged_table = merge(CT, taxo, by="row.names")
    CT = as.data.frame(merged_table[,2: (dim(CT)[2]+1)])
    taxo = as.data.frame(merged_table[,(dim(CT)[2]+2):dim(merged_table)[2]])
    
    rownames(CT) = merged_table[,1]
    rownames(taxo) = merged_table[,1]
    colnames(taxo) = namesTaxo
    #ordOTU = order(rownames(taxo))
    counts_annot = CT
    
svolant's avatar
svolant committed
357
    ## Create the dds object
svolant's avatar
svolant committed
358
    dds <- DESeqDataSetFromMatrix(countData=CT, colData=target, design=design,ignoreRank=TRUE)
svolant's avatar
svolant committed
359
  
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
360
    #save(dds,file="testdds.RData")
svolant's avatar
svolant committed
361
362
363
    if(is.null(VarNorm)){
      ## Counts normalisation
      ## Normalisation with or without 0
364
365
366
      if(input$AccountForNA=="NonNull" || RowProd==0) dds = estimateSizeFactors(dds,locfunc=eval(as.name(input$locfunc)),geoMeans=GeoMeansCT(CT))
      if(input$AccountForNA=="All" && RowProd!=0) dds = estimateSizeFactors(dds,locfunc=eval(as.name(input$locfunc)))
      if(input$AccountForNA=="Weighted" && input$AccountForNA!="NonNull" ) {dds = estimateSizeFactors(dds,locfunc=eval(as.name(input$locfunc)),geoMeans=GeoMeansCT(CT)); sizeFactors(dds) = w.sizefactor(CT)}
svolant's avatar
svolant committed
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
      normFactors = sizeFactors(dds)
      
    } else{
      group = as.data.frame(target[,VarNorm])
      group = apply(group,1,paste, collapse = "-")
      normFactors = c()
      mod = unique(group)
      ## At least 2 samples are needed for the normalization
      if(min(table(group))>1){
        for(i in unique(group))
        {
          indgrp = which(group==i) 
          CT_tmp = CT[,indgrp]
          CT_tmp = removeNulCounts(CT_tmp) 
          target_tmp = data.frame(labels = rownames(target)[indgrp])
svolant's avatar
svolant committed
382
          dds_tmp <- DESeqDataSetFromMatrix(countData=CT_tmp, colData=target_tmp, design=~labels,ignoreRank=TRUE)
383
384
385
386
          if(input$AccountForNA=="NonNull") {dds_tmp = estimateSizeFactors(dds_tmp,locfunc=eval(as.name(input$locfunc)),geoMeans=GeoMeansCT(CT_tmp)); normFactors[indgrp] = sizeFactors(dds_tmp)}
          if(input$AccountForNA=="All") {dds_tmp = estimateSizeFactors(dds_tmp,locfunc=eval(as.name(input$locfunc))); normFactors[indgrp] = sizeFactors(dds_tmp)}
          if(input$AccountForNA=="Weighted" && input$AccountForNA!="NonNull" ) {dds_tmp = estimateSizeFactors(dds_tmp,locfunc=eval(as.name(input$locfunc)),geoMeans=GeoMeansCT(CT_tmp)); normFactors[indgrp] = w.sizefactor(CT_tmp)}
          
svolant's avatar
svolant committed
387
388
        }
      } else{
389
390
391
392
        if(input$AccountForNA=="NonNull" || RowProd==0) dds = estimateSizeFactors(dds,locfunc=eval(as.name(input$locfunc)),geoMeans=GeoMeansCT(CT))
        if(input$AccountForNA=="All" && RowProd!=0) dds = estimateSizeFactors(dds,locfunc=eval(as.name(input$locfunc)))
        if(input$AccountForNA=="Weighted" && input$AccountForNA!="NonNull" ) {dds = estimateSizeFactors(dds,locfunc=eval(as.name(input$locfunc)),geoMeans=GeoMeansCT(CT)); sizeFactors(dds) = w.sizefactor(CT)}
        
svolant's avatar
svolant committed
393
394
395
396
397
398
399
400
        normFactors = sizeFactors(dds)
      }
      
      sizeFactors(dds) = normFactors
    }
    
    ## Keep normalized OTU table
    CT_Norm = counts(dds, normalized=TRUE)
401
    
svolant's avatar
svolant committed
402
    # Only interesting OTU
403
    # merged_table = merge(CT, taxo[order(rownames(CT)),], by="row.names")
404

svolant's avatar
svolant committed
405
406
407
408
409
410
411
412
#     merged_table = merge(CT, taxo, by="row.names")
#     CT = as.data.frame(merged_table[,2: (dim(CT)[2]+1)])
#     taxo = as.data.frame(merged_table[,(dim(CT)[2]+2):dim(merged_table)[2]])
#     
#     rownames(CT) = merged_table[,1]
#     rownames(taxo) = merged_table[,1]
#     #ordOTU = order(rownames(taxo))
#     counts_annot = CT
svolant's avatar
svolant committed
413
414
415
416
    #       ordOTU = order(rownames(taxo))
    #       indOTU_annot = which(rownames(CT)%in%rownames(taxo))
    #       counts_annot = CT[indOTU_annot[ordOTU],]
    ## Aggregate matrix
svolant's avatar
svolant committed
417
    if(taxoSelect=="OTU/Gene") counts = counts_annot
svolant's avatar
svolant committed
418
    else{
svolant's avatar
svolant committed
419
      if(input$TypeTable == "MGS" && input$FileFormat!="fileBiom"){
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
420
421
        MGS_taxocol = which(toupper(colnames(taxo))%in%"MGS")
        taxoS = taxo[,MGS_taxocol]
svolant's avatar
svolant committed
422
423
424
425
426
427
428
429
        counts = aggregate(counts_annot,by=list(Taxonomy = taxoS),mean)
        rownames(counts)=counts[,1]
        counts=counts[,-1]
        counts_int=t(apply(counts,1,as.integer))
        rownames(counts_int)=rownames(counts)
        colnames(counts_int)=colnames(counts)
        counts=counts_int
      }
svolant's avatar
svolant committed
430
      if(taxoSelect != "MGS" || input$FileFormat=="fileBiom"){
svolant's avatar
svolant committed
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
        #taxoS = taxo[ordOTU,taxoSelect]
        taxoS = taxo[,taxoSelect]
        counts = aggregate(counts_annot,by=list(Taxonomy = taxoS),sum)
        rownames(counts)=counts[,1];counts=counts[,-1]
      }
    }
    
    ## Ordering the counts table according to the target labels 
    tmpOrder = OrderCounts(counts,normFactors,labels)
    counts = tmpOrder$CountsOrder
    normFactors = tmpOrder$normFactorsOrder
    CheckTarget = TRUE
  }
  return(list(counts=counts,CheckTarget=CheckTarget,normFactors=normFactors, CT_noNorm=CT_noNorm, CT_Norm =CT_Norm))
  #return(list(counts=counts,target=target[ind,],labeled=labeled,normFactors=normFactors, CT_noNorm=CT_noNorm))
}


## Get the geometric mean of the counts (0 are replaced by NA values)
GeoMeansCT <- function(CT)
{
  CT=as.matrix(CT)
  CT[which(CT<1)]=NA
  gm = apply(CT,1,geometric.mean,na.rm=TRUE)
  return(gm)
}

458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
## Get weighted size factors
w.sizefactor <- function(CT)
{
  sf = c()
  CT = as.matrix(CT)
  nbsamp = ncol(CT)
  CT[which(CT<1)]=NA
  gm = apply(CT,1,geometric.mean,na.rm=TRUE)
  weights = nbsamp - apply(CT,1,FUN=function(x){tmp =length(which(is.na(x))) ;return(tmp)})
  weights_tmp = weights
  
  for(i in 1:ncol(CT))
  {
    ind = which(is.na(CT[,i]))
    gm_tmp = gm
    tmp = CT[,i]
    if(length(ind)>0) {tmp = CT[-ind,i]; gm_tmp = gm[-ind]; weights_tmp = weights[-ind]}
    sf[i] = w.median(tmp/gm_tmp,weights_tmp, na.rm = TRUE)
  }
  names(sf) = colnames(CT)
  return(sf)
}

## Calculated the weighted median
w.median <- function (x, w, na.rm = TRUE) 
{
  if (missing(w)) 
    w <- rep.int(1, length(x))
  else {
    if (length(w) != length(x)) 
      stop("'x' and 'w' must have the same length")
    if (any(is.na(w))) 
      stop("NA weights not allowed")
    if (any(w < 0)) 
      stop("Negative weights not allowed")
  }
  if (is.integer(w)) 
    w <- as.numeric(w)
  if (na.rm) {
    w <- w[i <- !is.na(x)]
    x <- x[i]
  }
  if (all(w == 0)) {
    warning("All weights are zero")
    return(NA)
  }
  o <- order(x)
  x <- x[o]
  w <- w[o]
  p <- cumsum(w)/sum(w)
  n <- sum(p < 0.5)
  if (p[n + 1] > 0.5) 
    x[n + 1]
  else (x[n + 1] + x[n + 2])/2
}

svolant's avatar
svolant committed
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530

## Order the counts 
OrderCounts <- function(counts,normFactors=NULL,labels)
{
  n = length(labels)
  CountsOrder = counts
  normFactorsOrder = normFactors
  for(i in 1:n)
  {
    ind = which(labels[i]==colnames(counts))
    CountsOrder[,i] = counts[,ind]
    if(!is.null(normFactors)) normFactorsOrder[i] = normFactors[ind]
  }
  colnames(CountsOrder) = labels
  return(list(CountsOrder=CountsOrder,normFactorsOrder = normFactorsOrder))
}

531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554

## Order the counts 
Filtered_feature <- function(counts,th.samp,th.abund)
{
  ind = NULL
  
  ## Total abundance over samples
  Tot_abundance = log(rowSums(counts)+1)
  ind.ab = which(Tot_abundance<=th.abund)
  
  ## Get the numbre of non zero sample
  counts.bin = as.matrix(counts)
  counts.bin[which(counts>0)] = 1
  nbSampByFeat = rowSums(counts.bin)
  
  ind.samp = which(nbSampByFeat<=th.samp)
  
  ind = unique(c(ind.samp,ind.ab))
  
  return(list(ind=ind,Tot_abundance=Tot_abundance,ind.ab=ind.ab,counts.bin=counts.bin,ind.samp=ind.samp,nbSampByFeat=nbSampByFeat))
}



svolant's avatar
svolant committed
555
556
557



558
559
560
561
562
563
564
565
## Order the counts 
plot_filter <- function(counts,th.samp,th.abund,type="Scatter")
{
  res = NULL
  
  ## Initial plot for plotly
  if(type == 'Abundance' || type == 'Samples'){ 
    dataNull = data.frame(x=c(0,0),y=c(1,2),col=c("white","white"))
566
    res = ggplot(dataNull,aes(x=x,y=y))+geom_point(aes(colour = col))+theme_bw()+ scale_color_manual(values = "white")
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
  }
  
  res_filter = Filtered_feature(counts,th.samp,th.abund)
  if(type == 'Abundance' && !is.null(th.samp) && !is.null(th.abund) )
  {
    state = rep("Kept",nrow(counts))
    ## Total abundance over samples
    Tot_abundance = res_filter$Tot_abundance
    
    ind = res_filter$ind.ab
    ord = order(Tot_abundance,decreasing = FALSE)
    
    ## Modify the state
    state[ind] = "Removed"
    
    ## Create the data.frame for ggplot
    df = data.frame(lab = rownames(counts)[ord],y = Tot_abundance[ord],State=state[ord])
    df$lab = factor(df$lab,levels = rownames(counts)[ord])
    df$State = factor(df$State,levels = c("Kept","Removed"))
    
    ## plot the results
    gg = ggplot(df,aes(lab,y,fill=State)) + geom_bar(stat='identity') + theme_bw() +theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5))
589
    gg = gg + geom_hline( yintercept = th.abund,linetype = "longdash") + xlab("")
590
591
592
593
    gg = gg + scale_fill_manual(values = c('springgreen3','firebrick'))
    if(!"Kept"%in%df$State ) gg = gg + scale_fill_manual(values = 'firebrick')
    if(!"Removed"%in%df$State ) gg = gg + scale_fill_manual(values = 'springgreen3')
    
594
    res = gg
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
  }
  
  if(type == 'Samples' && !is.null(th.samp) && !is.null(th.abund))
  {
    state = rep("Kept",nrow(counts))
    
    ## Get the number of non zero sample
    nbSampByFeat = res_filter$nbSampByFeat
    ind = res_filter$ind.samp
    ord = order(nbSampByFeat,decreasing = FALSE)
    
    state[ind] = "Removed"
    
    df = data.frame(lab = rownames(counts)[ord],y = nbSampByFeat[ord],State=state[ord])
    df$lab = factor(df$lab,levels = rownames(counts)[ord])
    df$State = factor(df$State,levels = c("Kept","Removed"))
    ## plot the results
    
    gg = ggplot(df,aes(lab,y,fill=State)) + geom_bar(stat='identity') + theme_bw() +theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5))
614
    gg = gg + geom_hline( yintercept = th.samp,linetype = "longdash") + xlab("")
615
616
617
618
    gg = gg + scale_fill_manual(values = c('springgreen3','firebrick'))  
    if(!"Kept"%in%df$State ) gg = gg + scale_fill_manual(values = 'firebrick')
    if(!"Removed"%in%df$State ) gg = gg + scale_fill_manual(values = 'springgreen3')
    
619
    res = gg
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
    
  }
  
  if(type == 'Scatter')
  {
    state = rep("Kept",nrow(counts))
    
    ## Get the number of non zero sample
    nbSampByFeat = res_filter$nbSampByFeat
    Tot_abundance = res_filter$Tot_abundance
    
    ## Get the selected features (under the thresholds)
    ind = res_filter$ind
    
    ## Modify the state
    state[ind] = "Removed"
    
    ## Create the data.frame for ggplot
    df = data.frame(lab =rownames(counts), y = nbSampByFeat,x = Tot_abundance,State=state)
    df$lab = factor(df$lab,levels = rownames(counts))
    df$State = factor(df$State,levels = c("Kept","Removed"))
    
    x_var = df$x
    y_var = df$y
    State = df$State
    PointSize = 2
    colors_scat = list(Kept="#00CD66",Removed="#b22222")
647

648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
    res = scatterD3(x = x_var,
                     y = y_var,
                     lab = rownames(df),
                     xlab = "Abundance in log",
                     ylab = "Number of samples",
                     col_var = State,
                     colors = colors_scat,
                     size_lab = PointSize,
                     key_var = rownames(df),
                     point_opacity = 0.7,
                     transitions = TRUE)
    
    
#     gg = ggplot(df,aes(x,y,color=State,group = lab)) + geom_point() + theme_bw()
#     gg = gg + geom_vline( xintercept = th.samp,linetype = "longdash")
#     gg = gg + geom_hline( yintercept = th.abund,linetype = "longdash")
#     gg = gg + scale_color_manual(values = c('springgreen3','firebrick'))
#     ggplotly(gg)
#     return(gg)
  }
  
  return(res)
}



######################################################
## NAME: SelectThreshAb 
##
## INPUT:
##    infile : data matrix (counts, rows: taxo)
##    lambda : Tuning parameter (default is 500)
##
## OUTPUT:
##    Cut-off value 
##
######################################################

SelectThreshAb <- function(infile,lambda=500,graph=TRUE){
  
  rs <- rowSums(infile)
  test_Filtre <- sapply(c(min(rs):lambda),FUN=function(x) table(rs>x))
690
691
692
693
694
695
696
697
698
699
700
  if(!is.list(test_Filtre))
  {
    x <- c(min(rs):lambda)
    reslm <- lm(test_Filtre[1,]~x)$coefficients
    val <- which(test_Filtre[1,]>reslm[1])[1]
    if (graph==TRUE){
      plot(test_Filtre[1,],ylab="Nb especes avec moins de x comptages sur tous les echantillons",xlab="x")
      abline(v=val,col="red")
    } 
  } else val = min(rs)
  return(max(val, min(rs)+1))
701
702
}