Skip to content
Snippets Groups Projects
Commit cf510282 authored by Vincent  LAVILLE's avatar Vincent LAVILLE
Browse files

Merge branch 'testing' into 'master'

Fixing bugs and updating URL

See merge request statistical-genetics/VarExp!1
parents 6e10f23c e3e7237f
No related branches found
No related tags found
No related merge requests found
......@@ -10,7 +10,7 @@ globalVariables("ind")
#' Currently, this is hard-coded to access 1000 Genomes phase3 data hosted by
#' Brian Browning (author of BEAGLE):
#'
#' \url{http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/}
#' \url{http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/}
#'
#' This implementation discards multi-allelic markers that have a "," in the
#' ALT column.
......@@ -111,8 +111,8 @@ get_vcf <- function(chrom, start, end, pop = NA, path = "", web = TRUE) {
# These are variants filtered by Brian Browning, the developer of BEAGLE.
data_url = paste(sep = "",
"http://tabix.iobio.io/?cmd=-h%20%27",
"http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/",
"individual_chromosomes/chr", chrom, ".1kg.phase3.v5a.vcf.gz",
"http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/",
"chr", chrom, ".1kg.phase3.v5a.vcf.gz",
"%27%20", chrom, ":", start, "-", end
)
# Download the data from the server.
......@@ -161,151 +161,11 @@ get_vcf <- function(chrom, start, end, pop = NA, path = "", web = TRUE) {
# Convert the genotypes to a numeric matrix.
retval$geno <- t(apply(retval$geno, 1, function(row) {
as.numeric(do.call(cbind, strsplit(row, "|", fixed = TRUE)))
as.numeric(do.call(cbind, strsplit(unlist(row), "[/|]")))
}))
rownames(retval$geno) <- retval$meta$ID
colnames(retval$geno) <- rep(retval$ind$Individual.ID, each = 2)
return(retval)
}
################################################################################
# OLD VERSION - KEEP FOR SAFETY
# get_vcf <- function(chrom, start, end, pop = NA, path = "", web = TRUE) {
#
# # Hard-coded superpopulations for each individual.
# superpops <- rep(
# c("EUR","EAS","AMR","EAS","AMR","EAS","AMR","EAS","AMR","EUR","AMR",
# "EUR","AMR","SAS","EAS","EUR","EAS","AFR","AMR","AFR","AMR","AFR",
# "AMR","AFR","AMR","AFR","EAS","AFR","EAS","AMR","AFR","AMR","AFR",
# "EAS","AFR","AMR","EAS","EUR","EAS","AMR","AFR","AMR","AFR","AMR",
# "AFR","AMR","AFR","AMR","EAS","AFR","AMR","AFR","SAS","AFR","EAS",
# "AFR","SAS","AFR","SAS","AFR","SAS","AFR","SAS","AFR","SAS","AFR",
# "SAS","AFR","SAS","AFR","SAS","AFR","SAS","EUR","AFR","EAS","AFR",
# "EAS","AFR","EAS","AFR","AMR","AFR","AMR","AFR","EUR","SAS"),
# c(185,42,3,35,4,28,9,15,111,1,68,24,9,4,6,73,56,8,2,5,24,2,13,5,7,5,
# 17,4,21,2,1,3,3,17,2,3,20,14,1,2,2,11,4,8,3,1,13,2,35,2,1,22,4,7,
# 4,27,5,16,10,6,13,6,9,9,12,58,10,84,6,74,3,25,310,99,19,103,29,70,
# 18,34,142,19,8,45,52,107,103)
# )
#
# # Hard-coded populations for each individual.
# pops <- rep(
# c("GBR","FIN","GBR","FIN","CHS","PUR","CHS","PUR","CHS","PUR","CDX","PUR",
# "CLM","PUR","CLM","PUR","GBR","CLM","PUR","CLM","IBS","CLM","PEL","PJL",
# "KHV","IBS","GBR","CDX","KHV","ACB","PEL","ACB","PEL","ACB","PEL","ACB",
# "PEL","ACB","KHV","ACB","KHV","PEL","ACB","PEL","ACB","KHV","ACB","PEL",
# "CDX","GBR","IBS","CDX","PEL","ACB","PEL","ACB","PEL","ACB","PEL","ACB",
# "PEL","CDX","ACB","PEL","ACB","GWD","ACB","PJL","ACB","KHV","ACB","GWD",
# "ACB","GWD","PJL","GWD","PJL","GWD","PJL","GWD","PJL","GWD","PJL","GWD",
# "ESN","GWD","BEB","PJL","GWD","MSL","ESN","MSL","PJL","GWD","ESN","MSL",
# "PJL","ESN","GWD","MSL","BEB","PJL","STU","PJL","STU","PJL","STU","ITU",
# "STU","ITU","STU","PJL","ITU","BEB","STU","ITU","STU","BEB","STU","ITU",
# "STU","ITU","STU","ITU","STU","ITU","STU","ITU","STU","ITU","BEB","ITU",
# "STU","ITU","STU","ITU","CEU","YRI","CHB","YRI","JPT","LWK","JPT","YRI",
# "LWK","ASW","MXL","ASW","MXL","ASW","TSI","GIH"),
# c(55,17,31,82,42,3,35,4,28,9,15,41,18,26,16,10,1,27,11,30,24,3,6,4,6,70,
# 3,22,34,8,2,5,24,2,13,5,7,5,17,4,21,2,1,3,3,17,2,3,20,1,13,1,2,2,11,4,
# 8,3,1,13,2,35,2,1,9,4,9,4,7,4,10,7,2,8,5,16,10,6,13,6,9,9,12,37,19,2,4,
# 6,10,27,43,4,6,6,29,39,3,8,2,15,13,8,5,7,19,6,1,11,5,1,12,3,20,20,13,15,
# 13,21,9,12,8,2,2,10,7,8,1,7,4,1,28,5,1,7,2,3,99,19,103,29,70,18,34,60,
# 81,1,19,8,45,52,107,103)
# )
#
# if (!is.na(pop) && !pop %in% pops && !pop %in% superpops && pop != "ALL") {
# stop(
# "Invalid pop=", pop,
# "\nMust be a pop: ", paste(unique(pops), collapse = " "),
# "\nOr a superpop: ", paste(unique(superpops), collapse = " ")
# )
# }
#
# if (!is.numeric(start) || start < 1) {
# stop("Invalid start=", start, "\nMust be a positive integer.")
# }
#
# if (!is.numeric(end) || end < start) {
# stop("Invalid end=", end, "\nMust be an integer >= start.")
# }
#
# if (!chrom %in% c(1:22, "X")) {
# stop(
# "Invalid chrom=", chrom,
# "\nMust be one of: ", paste(c(1:22, "X"), collapse = " ")
# )
# }
#
# txt <- NULL
# if (web == TRUE) {
# # These are variants filtered by Brian Browning, the developer of BEAGLE.
# data_url = paste(sep = "",
# "http://tabix.iobio.io/?cmd=-h%20%27",
# "http://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/",
# "individual_chromosomes/chr", chrom, ".1kg.phase3.v5a.vcf.gz",
# "%27%20", chrom, ":", start, "-", end
# )
# # Download the data from the server.
# txt <- RCurl::getURL(data_url)
# }
#
# if (web == FALSE) {
# command <- paste("tabix -h ", path, " ", chrom, ":", start, "-", end, sep = "")
# txt <- paste(unlist(system(command, intern = TRUE)), collapse = "\n")
# }
#
# # Extract the sample identifiers from the VCF header.
# sample_ids <- strsplit(
# # FIXME: Should detect number of comment lines.
# # Get the 5th line and split it.
# strsplit(txt, "\n", fixed = TRUE)[[1]][5],
# "\t", fixed = TRUE
# )[[1]]
#
# # Discard the "CHROM,POS,...,INFO,V9" columns.
# sample_ids <- sample_ids[10:length(sample_ids)]
#
# # Read the body of the data into a dataframe.
#
# vcf <- utils::read.delim(
# text = txt, header = FALSE, comment.char = "#", stringsAsFactors = FALSE)
#
# # Assign the standard column names and sample identifiers.
# colnames(vcf) <- c(
# "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "V9",
# sample_ids
# )
#
# # Discard multi-allelic markers.
# vcf <- vcf[grep(",", vcf$ALT, invert = TRUE), ]
#
# # Select the genotype columns that belong to a particular population.
# if (pop %in% pops) {
# vcf <- vcf[,c(rep(TRUE, 9), pops == pop)]
# } else if (pop %in% superpops) {
# vcf <- vcf[,c(rep(TRUE, 9), superpops == pop)]
# }
#
# retval <- list()
#
# utils::data("ind", package = "VarExp", envir = environment())
# retval$ind <- ind[colnames(vcf)[10:ncol(vcf)],]
#
# # Separate the metadata from the genotypes.
# retval$meta <- vcf[,1:8]
# retval$geno <- vcf[,10:ncol(vcf)]
#
# # Convert the genotypes to a numeric matrix.
# retval$geno <- t(apply(retval$geno, 1, function(row) {
# as.numeric(do.call(cbind, strsplit(row, "|", fixed = TRUE)))
# }))
#
# print(retval$meta)
# print(retval$meta$ID)
# rownames(retval$geno) <- retval$meta$ID
# print("aqui")
# colnames(retval$geno) <- rep(retval$ind$Individual.ID, each = 2)
#
# return(retval)
# }
\ No newline at end of file
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment