diff --git a/R/extraction_1KG.R b/R/extraction_1KG.R index 4a13e6e1a76a68dd06e48eb2d4875c0854d24dbb..afcaae47454d920485dd59902600877a3a6ff4d2 100644 --- a/R/extraction_1KG.R +++ b/R/extraction_1KG.R @@ -10,7 +10,7 @@ globalVariables("ind") #' Currently, this is hard-coded to access 1000 Genomes phase3 data hosted by #' Brian Browning (author of BEAGLE): #' -#' \url{http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/} +#' \url{http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/} #' #' This implementation discards multi-allelic markers that have a "," in the #' ALT column. @@ -111,8 +111,8 @@ get_vcf <- function(chrom, start, end, pop = NA, path = "", web = TRUE) { # These are variants filtered by Brian Browning, the developer of BEAGLE. data_url = paste(sep = "", "http://tabix.iobio.io/?cmd=-h%20%27", - "http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/", - "individual_chromosomes/chr", chrom, ".1kg.phase3.v5a.vcf.gz", + "http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/", + "chr", chrom, ".1kg.phase3.v5a.vcf.gz", "%27%20", chrom, ":", start, "-", end ) # Download the data from the server. @@ -161,151 +161,11 @@ get_vcf <- function(chrom, start, end, pop = NA, path = "", web = TRUE) { # Convert the genotypes to a numeric matrix. retval$geno <- t(apply(retval$geno, 1, function(row) { - as.numeric(do.call(cbind, strsplit(row, "|", fixed = TRUE))) + as.numeric(do.call(cbind, strsplit(unlist(row), "[/|]"))) })) rownames(retval$geno) <- retval$meta$ID colnames(retval$geno) <- rep(retval$ind$Individual.ID, each = 2) return(retval) -} - -################################################################################ -# OLD VERSION - KEEP FOR SAFETY - -# get_vcf <- function(chrom, start, end, pop = NA, path = "", web = TRUE) { -# -# # Hard-coded superpopulations for each individual. -# superpops <- rep( -# c("EUR","EAS","AMR","EAS","AMR","EAS","AMR","EAS","AMR","EUR","AMR", -# "EUR","AMR","SAS","EAS","EUR","EAS","AFR","AMR","AFR","AMR","AFR", -# "AMR","AFR","AMR","AFR","EAS","AFR","EAS","AMR","AFR","AMR","AFR", -# "EAS","AFR","AMR","EAS","EUR","EAS","AMR","AFR","AMR","AFR","AMR", -# "AFR","AMR","AFR","AMR","EAS","AFR","AMR","AFR","SAS","AFR","EAS", -# "AFR","SAS","AFR","SAS","AFR","SAS","AFR","SAS","AFR","SAS","AFR", -# "SAS","AFR","SAS","AFR","SAS","AFR","SAS","EUR","AFR","EAS","AFR", -# "EAS","AFR","EAS","AFR","AMR","AFR","AMR","AFR","EUR","SAS"), -# c(185,42,3,35,4,28,9,15,111,1,68,24,9,4,6,73,56,8,2,5,24,2,13,5,7,5, -# 17,4,21,2,1,3,3,17,2,3,20,14,1,2,2,11,4,8,3,1,13,2,35,2,1,22,4,7, -# 4,27,5,16,10,6,13,6,9,9,12,58,10,84,6,74,3,25,310,99,19,103,29,70, -# 18,34,142,19,8,45,52,107,103) -# ) -# -# # Hard-coded populations for each individual. -# pops <- rep( -# c("GBR","FIN","GBR","FIN","CHS","PUR","CHS","PUR","CHS","PUR","CDX","PUR", -# "CLM","PUR","CLM","PUR","GBR","CLM","PUR","CLM","IBS","CLM","PEL","PJL", -# "KHV","IBS","GBR","CDX","KHV","ACB","PEL","ACB","PEL","ACB","PEL","ACB", -# "PEL","ACB","KHV","ACB","KHV","PEL","ACB","PEL","ACB","KHV","ACB","PEL", -# "CDX","GBR","IBS","CDX","PEL","ACB","PEL","ACB","PEL","ACB","PEL","ACB", -# "PEL","CDX","ACB","PEL","ACB","GWD","ACB","PJL","ACB","KHV","ACB","GWD", -# "ACB","GWD","PJL","GWD","PJL","GWD","PJL","GWD","PJL","GWD","PJL","GWD", -# "ESN","GWD","BEB","PJL","GWD","MSL","ESN","MSL","PJL","GWD","ESN","MSL", -# "PJL","ESN","GWD","MSL","BEB","PJL","STU","PJL","STU","PJL","STU","ITU", -# "STU","ITU","STU","PJL","ITU","BEB","STU","ITU","STU","BEB","STU","ITU", -# "STU","ITU","STU","ITU","STU","ITU","STU","ITU","STU","ITU","BEB","ITU", -# "STU","ITU","STU","ITU","CEU","YRI","CHB","YRI","JPT","LWK","JPT","YRI", -# "LWK","ASW","MXL","ASW","MXL","ASW","TSI","GIH"), -# c(55,17,31,82,42,3,35,4,28,9,15,41,18,26,16,10,1,27,11,30,24,3,6,4,6,70, -# 3,22,34,8,2,5,24,2,13,5,7,5,17,4,21,2,1,3,3,17,2,3,20,1,13,1,2,2,11,4, -# 8,3,1,13,2,35,2,1,9,4,9,4,7,4,10,7,2,8,5,16,10,6,13,6,9,9,12,37,19,2,4, -# 6,10,27,43,4,6,6,29,39,3,8,2,15,13,8,5,7,19,6,1,11,5,1,12,3,20,20,13,15, -# 13,21,9,12,8,2,2,10,7,8,1,7,4,1,28,5,1,7,2,3,99,19,103,29,70,18,34,60, -# 81,1,19,8,45,52,107,103) -# ) -# -# if (!is.na(pop) && !pop %in% pops && !pop %in% superpops && pop != "ALL") { -# stop( -# "Invalid pop=", pop, -# "\nMust be a pop: ", paste(unique(pops), collapse = " "), -# "\nOr a superpop: ", paste(unique(superpops), collapse = " ") -# ) -# } -# -# if (!is.numeric(start) || start < 1) { -# stop("Invalid start=", start, "\nMust be a positive integer.") -# } -# -# if (!is.numeric(end) || end < start) { -# stop("Invalid end=", end, "\nMust be an integer >= start.") -# } -# -# if (!chrom %in% c(1:22, "X")) { -# stop( -# "Invalid chrom=", chrom, -# "\nMust be one of: ", paste(c(1:22, "X"), collapse = " ") -# ) -# } -# -# txt <- NULL -# if (web == TRUE) { -# # These are variants filtered by Brian Browning, the developer of BEAGLE. -# data_url = paste(sep = "", -# "http://tabix.iobio.io/?cmd=-h%20%27", -# "http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/", -# "individual_chromosomes/chr", chrom, ".1kg.phase3.v5a.vcf.gz", -# "%27%20", chrom, ":", start, "-", end -# ) -# # Download the data from the server. -# txt <- RCurl::getURL(data_url) -# } -# -# if (web == FALSE) { -# command <- paste("tabix -h ", path, " ", chrom, ":", start, "-", end, sep = "") -# txt <- paste(unlist(system(command, intern = TRUE)), collapse = "\n") -# } -# -# # Extract the sample identifiers from the VCF header. -# sample_ids <- strsplit( -# # FIXME: Should detect number of comment lines. -# # Get the 5th line and split it. -# strsplit(txt, "\n", fixed = TRUE)[[1]][5], -# "\t", fixed = TRUE -# )[[1]] -# -# # Discard the "CHROM,POS,...,INFO,V9" columns. -# sample_ids <- sample_ids[10:length(sample_ids)] -# -# # Read the body of the data into a dataframe. -# -# vcf <- utils::read.delim( -# text = txt, header = FALSE, comment.char = "#", stringsAsFactors = FALSE) -# -# # Assign the standard column names and sample identifiers. -# colnames(vcf) <- c( -# "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "V9", -# sample_ids -# ) -# -# # Discard multi-allelic markers. -# vcf <- vcf[grep(",", vcf$ALT, invert = TRUE), ] -# -# # Select the genotype columns that belong to a particular population. -# if (pop %in% pops) { -# vcf <- vcf[,c(rep(TRUE, 9), pops == pop)] -# } else if (pop %in% superpops) { -# vcf <- vcf[,c(rep(TRUE, 9), superpops == pop)] -# } -# -# retval <- list() -# -# utils::data("ind", package = "VarExp", envir = environment()) -# retval$ind <- ind[colnames(vcf)[10:ncol(vcf)],] -# -# # Separate the metadata from the genotypes. -# retval$meta <- vcf[,1:8] -# retval$geno <- vcf[,10:ncol(vcf)] -# -# # Convert the genotypes to a numeric matrix. -# retval$geno <- t(apply(retval$geno, 1, function(row) { -# as.numeric(do.call(cbind, strsplit(row, "|", fixed = TRUE))) -# })) -# -# print(retval$meta) -# print(retval$meta$ID) -# rownames(retval$geno) <- retval$meta$ID -# print("aqui") -# colnames(retval$geno) <- rep(retval$ind$Individual.ID, each = 2) -# -# return(retval) -# } \ No newline at end of file +} \ No newline at end of file