# potential identical columns between the 2 datasets
# WARNINGS
# For data frames: contents are compared after conversion into character strings. This means that the comparison of data frame contents, either row to row or column to column, does not take into account the mode of the different columns. This concerns the results in $any.id.row, $same.row.pos1, $same.row.pos2, $same.row.match1, $same.row.match2, $any.id.col, $same.col.pos1, $same.col.pos2, $same.col.match1, $same.col.match2 and $identical.content
# "TOO BIG FOR EVALUATION" is returned in $same.row.pos1, $same.row.pos2, $same.row.match1 and $same.row.match2 when nrow(data1) * nrow(data2) > 1e6, in which case $any.id.row remains NULL
# "TOO BIG FOR EVALUATION" is returned in $same.col.pos1, $same.col.pos2, $same.col.match1 and $same.col.match2 when ncol(data1) * ncol(data2) > 1e6, in which case $any.id.col remains NULL
data1 <- as.data.frame(matrix(data1, ncol = ncol(data1)), stringsAsFactors = FALSE) # conversion of table into data frame to facilitate inter class comparison
data2 <- as.data.frame(matrix(data2, ncol = ncol(data2)), stringsAsFactors = FALSE) # conversion of table into data frame to facilitate inter class comparison
# data2 <- data.frame(lapply(data2, as.character), stringsAsFactors = FALSE) # conversion of columns into characters
}
row.names(data1) <- paste0("A", 1:nrow(data1))
row.names(data2) <- paste0("A", 1:nrow(data2))
if(same.col.nb == TRUE){ # because if not the same col nb, the row cannot be identical
if(all(sapply(data1, FUN = typeof) == "integer") & all(sapply(data2, FUN = typeof) == "integer") & as.double(nrow(data1)) * nrow(data2) <= 1e10){ # as.double(nrow(data1)) to prevent integer overflow because R is 32 bits for integers
tempo1 <- c(as.data.frame(t(data1), stringsAsFactors = FALSE)) # this work fast with only integers (because 32 bits)
if(all(sapply(data1, FUN = typeof) == "integer") & all(sapply(data2, FUN = typeof) == "integer") & as.double(nrow(data1)) * nrow(data2) <= 1e10){ # fast method for integers (thus not data frames). as.double(nrow(data1)) to prevent integer overflow because R is 32 bits for integers
tempo1 <- c(as.data.frame(t(data1), stringsAsFactors = FALSE)) # conversion into list. This work fast with only integers (because 32 bits)
tempo2 <- c(as.data.frame(t(data2), stringsAsFactors = FALSE)) # conversion into list. This work fast with only integers (because 32 bits)
same.row.pos1 <- which(tempo1 %in% tempo2)
same.row.pos2 <- which(tempo2 %in% tempo1)
same.row.match1 <- match(tempo1, tempo2)
same.row.match2 <- match(tempo2, tempo1)
}else if(as.double(nrow(data1)) * nrow(data2) <= 1e6){ # as.double(nrow(data1)) to prevent integer overflow because R is 32 bits for integers
if(col.nb <= 10){ # if ncol is not to big, the t() should not be that long
tempo1 <- c(as.data.frame(t(data1), stringsAsFactors = FALSE)) # this work fast with only integers (because 32 bits)
rownames(tempo1) <- NULL # to have same row and column names
colnames(tempo1) <- NULL # to have same row and column names
rownames(tempo2) <- NULL # to have same row and column names
colnames(tempo2) <- NULL # to have same row and column names
if(identical(tempo1, tempo2)){
same.row.pos1[i3] <- TRUE
same.row.pos2[i4] <- TRUE
same.row.match1[i3] <- i4
...
...
@@ -1691,7 +1710,7 @@ same.row.match2[i4] <- i3
}
same.row.pos1 <- which(same.row.pos1)
same.row.pos2 <- which(same.row.pos2)
}
# }
}else{
same.row.pos1 <- "TOO BIG FOR EVALUATION"
same.row.pos2 <- "TOO BIG FOR EVALUATION"
...
...
@@ -1725,32 +1744,33 @@ any.id.row <- FALSE
# same.row.pos1 and 2 remain NULL
}
if(same.row.nb == TRUE){ # because if not the same row nb, the col cannot be identical
if(all(sapply(data1, FUN = typeof) == "integer") & all(sapply(data2, FUN = typeof) == "integer") & as.double(ncol(data1)) * ncol(data2) <= 1e10){ # as.double(ncol(data1)) to prevent integer overflow because R is 32 bits for integers
if(as.double(ncol(data1)) * ncol(data2) <= 1e10){ # comparison of data frame columns is much easier than rows because no need to use t() before converting to list for fast comparison. as.double(ncol(data1)) to prevent integer overflow because R is 32 bits for integers
# if(all(sapply(data1, FUN = typeof) == "integer") & all(sapply(data2, FUN = typeof) == "integer") & as.double(ncol(data1)) * ncol(data2) <= 1e10){ # fast method for integers (thus not data frames). as.double(ncol(data1)) to prevent integer overflow because R is 32 bits for integers
tempo1 <- c(data1)
tempo2 <- c(data2)
same.col.pos1 <- which(tempo1 %in% tempo2)
same.col.pos2 <- which(tempo2 %in% tempo1)
same.col.match1 <- match(tempo1, tempo2)
same.col.match2 <- match(tempo2, tempo1)
}else if(as.double(ncol(data1)) * ncol(data2) <= 1e6){ # as.double(ncol(data1)) to prevent integer overflow because R is 32 bits for integers
same.col.pos1 <- logical(length = ncol(data1)) # FALSE by default
same.col.pos1[] <- FALSE # security
same.col.pos2 <- logical(length = ncol(data2)) # FALSE by default
same.col.pos2[] <- FALSE # security
same.col.match1 <- rep(NA, ncol(data1))
same.col.match2 <- rep(NA, ncol(data2))
for(i3 in 1:ncol(data1)){
for(i4 in 1:ncol(data2)){
if(identical(data1[ , i3], data2[ , i4])){
same.col.pos1[i3] <- TRUE
same.col.pos2[i4] <- TRUE
same.col.match1[i3] <- i4
same.col.match2[i4] <- i3
}
}
}
same.col.pos1 <- which(same.col.pos1)
same.col.pos2 <- which(same.col.pos2)
# }else if(as.double(ncol(data1)) * ncol(data2) <= 1e6){ # as.double(ncol(data1)) to prevent integer overflow because R is 32 bits for integers
# same.col.pos1 <- logical(length = ncol(data1)) # FALSE by default
# same.col.pos1[] <- FALSE # security
# same.col.pos2 <- logical(length = ncol(data2)) # FALSE by default