Commit 070023a0 authored by mariefbourdon's avatar mariefbourdon
Browse files

modif tab_mark to add chr and pos

parent 1367ad7d
This diff is collapsed.
{
"path" : "~/Documents/PhD/stuart_R/stuart",
"path" : "~/Documents/PhD/stuart_package/stuart/R",
"sortOrder" : [
{
"ascending" : true,
......
{
"left" : {
"panelheight" : 783,
"splitterpos" : 327,
"panelheight" : 689,
"splitterpos" : 180,
"topwindowstate" : "NORMAL",
"windowheight" : 821
"windowheight" : 727
},
"right" : {
"panelheight" : 783,
"splitterpos" : 494,
"panelheight" : 689,
"splitterpos" : 434,
"topwindowstate" : "NORMAL",
"windowheight" : 821
"windowheight" : 727
}
}
\ No newline at end of file
{
"TabSet1" : 0,
"TabSet2" : 4,
"TabSet2" : 0,
"TabZoom" : {
}
}
\ No newline at end of file
/private/var/folders/dn/j71yz2tn5_gdffs8fqxhddrr0000gn/T/RtmpZZZ4WE/preview-31873a42c16d.dir/stuaRt.html
/private/var/folders/dn/j71yz2tn5_gdffs8fqxhddrr0000gn/T/Rtmp3VMULh/preview-1ced48fe920c.dir/stuaRt.html
/private/var/folders/dn/j71yz2tn5_gdffs8fqxhddrr0000gn/T/RtmpNme5vw/preview-4c4321234d03.dir/stuaRt.html
/private/var/folders/dn/j71yz2tn5_gdffs8fqxhddrr0000gn/T/RtmpZZZ4WE/preview-3187564b41ab.dir/stuaRt.html
......@@ -8,6 +8,16 @@
~%2FDocuments%2FPhD%2Fstuart_R%2Fstuart%2FR%2Ftab_mark.R="C4894908"
~%2FDocuments%2FPhD%2Fstuart_R%2Fstuart%2FR%2Fwrite_rqtl.R="B873B132"
~%2FDocuments%2FPhD%2Fstuart_R%2Fstuart%2Fvignettes%2FstuaRt.Rmd="B92E179B"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FDESCRIPTION="DB43CCAB"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FR%2Fgeno_strains.R="746D5550"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FR%2Fmark_allele.R="94A0A47C"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FR%2Fmark_match.R="C03D9873"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FR%2Fmark_poly.R="E392A021"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FR%2Fmark_prop.R="65449E3B"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FR%2Fstuart_tab-data.R="5D74CC67"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FR%2Ftab_mark.R="38BAAAF9"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2FR%2Fwrite_rqtl.R="9A1DD653"
~%2FDocuments%2FPhD%2Fstuart_package%2Fstuart%2Fvignettes%2FstuaRt.Rmd="5BDF5DBA"
~%2Fstuart_package%2Fstuart%2FDESCRIPTION="BEB7232"
~%2Fstuart_package%2Fstuart%2FNAMESPACE="AE613167"
~%2Fstuart_package%2Fstuart%2FR%2Fgeno_strains.R="8F7B714A"
......
{
"collab_server" : "",
"contents" : "",
"created" : 1622641774018.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "0",
"id" : "1656F55",
"lastKnownWriteTime" : 1623060553,
"last_content_update" : 1623060553,
"path" : "~/Documents/PhD/stuart_R/stuart/R/geno_strains.R",
"project_path" : "R/geno_strains.R",
"properties" : {
"cursorPosition" : "22,39",
"scrollLine" : "0"
},
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 4,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
\ No newline at end of file
#' @title Create haplotype for a new mouse strain into a reference dataframe
#'
#' @description This functions adds columns for parental strains used in the cross in the annotation data frame, from the genotype data frame in which one or several animal of the parental strains were genotyped.
#' If several animals of one strain were genotyped, a consensus is created from these animals.
#' The consensus is created as follow : if the indivuals carry the same allele, this allele is kept, otherwise, the allele is noted as "N". If individuals show residual heterozygosity, it is encoded as "H".
#' @param ref data frame with the reference genotypes of mouse lines
#' @param geno data frame with the genotyping results for your cross from miniMUGA array
#' @param par1 first parental strain used in the cross, the name must be written as in the geno data frame
#' @param par2 second parental strain used in the cross, the name must be written as in the geno data frame
#' @param name1 name of the first parental strain to use as the column name in the ref data frame
#' @param name2 name of the second parental strain to use as the column name in the ref data frame
#'
#' @import dplyr
#' @import tidyr
#'
#' @export
#'
geno_strains <- function(ref,geno,par1,par2,name1,name2){
#rename df columns
geno <- geno %>% rename("marker"=1,
"id"=2,
"allele_1"=3,
"allele_2"=4)
#recode genotypes from 2 alleles to 1
geno <- geno %>% mutate_all(as.character)
geno <- geno %>% filter(id %in% c(par1,par2))
geno <- geno %>% mutate(Geno=case_when(allele_1 == "-" | allele_2 == "-" ~ "N",
allele_1 == allele_2 ~ allele_1,
allele_1 %in% c("A","T","G","C") & allele_2 %in% c("A","T","G","C") ~ "H"))
geno <- geno %>% select(marker,id,Geno) %>% pivot_wider(names_from = id, values_from = Geno)
#create consensus
if(length(par1)!=1){
geno <- geno %>% mutate(parent1 = ifelse(!!sym(par1[1])==!!sym(par1[2]),!!sym(par1[1]),"N"))
} else {
geno <- geno %>% rename(parent1=!!sym(par1[1]))
}
if(length(par2)!=1){
geno <- geno %>% mutate(parent2 = ifelse(!!sym(par2[1])==!!sym(par2[2]),!!sym(par2[1]),"N"))
} else {
geno <- geno %>% rename(parent2=!!sym(par2[1]))
}
geno <- geno %>% select(marker,parent1,parent2)
colnames(geno) <- c("marker",name1,name2)
#merge with ref file
ref <- full_join(ref,geno,by=c("marker"="marker"))
return(ref)
}
{
"collab_server" : "",
"contents" : "",
"created" : 1622636142238.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "0",
"id" : "42D37312",
"lastKnownWriteTime" : 1623087154,
"last_content_update" : 1623087154974,
"path" : "~/Documents/PhD/stuart_R/stuart/R/mark_prop.R",
"project_path" : "R/mark_prop.R",
"properties" : {
"cursorPosition" : "48,24",
"scrollLine" : "0"
},
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 2,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
\ No newline at end of file
#' @title Exclude markers depending on proportions of homo/hetorozygous
#'
#' @description uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that present odd proportions of each genotype. You can define these proportions thanks to the arguments of the function.
#' @param tab data frame obtained with tab_mark function.
#' @param cross F2 or N2.
#' @param homo proportion of homozygous individuals under which the marker is excluded. Will apply on both homozygous genotypes for a F2, but only on one for N2.
#' @param hetero proportion of heterozygous individuals under which the marker is excluded.
#' @param na proportion of non-genotyped individuals above which the marker is excluded.
#'
#' @import dplyr
#'
#' @export
#'
#### mark_prop ####
## excludes markers depending on proportions of homo/hetorozygous
mark_prop <- function(tab,cross,homo=NA,hetero=NA,pval=NA,na=0.5){
#calculate total number of individuals genotyped for each marker
tab <- tab %>% mutate(n_geno = (n_HM1 + n_HM2 + n_HT))
#calculate proportion of each genotype
tab <- tab %>% mutate(p_HM1 = n_HM1/n_geno)
tab <- tab %>% mutate(p_HM2 = n_HM2/n_geno)
tab <- tab %>% mutate(p_HT = n_HT/n_geno)
tab <- tab %>% mutate(p_NA = n_NA/(n_geno+n_NA))
#stock colnames to join
names <- colnames(tab)
#stop if cross != "F2" or "N2"
if(!cross %in% c("F2","N2")){
stop("Cross must be F2 or N2")
}
#stop of homo&hetero or pval not specified
if((is.na(homo)==TRUE | is.na(hetero)==TRUE) & is.na(pval)==TRUE){
stop("Arguments homo and hetero or argument pval must be specified")
}
#stop with prop of na
tab <- tab %>%
mutate(exclude_prop=case_when(p_NA > na ~ 1,
T ~ 0))
#stop with prop of homo/hetero
if(is.na(pval)==TRUE){
tab <- tab %>%
mutate(exclude_prop=case_when(p_NA > na ~ 1,
cross=="F2" & (p_HM1 < homo | p_HM2 < homo | p_HT < hetero) ~ 1,
cross=="N2" & (p_HM1 < homo | p_HT < hetero) ~ 1,
T ~ exclude_prop
))
}
#stop with pval chisq.test
## NEED TO ADD THIS FILTER IF CROSS = N2
if(is.na(pval)==FALSE){
tab <- tab %>% filter(p_NA != 1) %>% rowwise() %>%
mutate(.,chi_pval = tibble(n_HM1,n_HM2,n_HT) %>%
chisq.test(p=c(0.25,0.25,0.5)) %>% .$p.value) %>%
full_join(.,tab,by=names)
tab <- tab %>% mutate(exclude_prop=case_when(chi_pval < pval ~ 1,
T ~ exclude_prop))
}
tab <- tab %>% select(names)
return(tab)
}
{
"collab_server" : "",
"contents" : "",
"created" : 1622535818738.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "0",
"id" : "45D91D58",
"lastKnownWriteTime" : 1622648301,
"last_content_update" : 1622648301329,
"path" : "~/Documents/PhD/stuart_R/stuart/R/write_rqtl.R",
"project_path" : "R/write_rqtl.R",
"properties" : {
"cursorPosition" : "84,18",
"scrollLine" : "0"
},
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 9,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
\ No newline at end of file
#' @title Create data frame in Rqtl CSV format
#'
#' @description This function uses the table produced by tab_mark function filled by all the mark_* functions in order to create a data frame in the right format for Rqtl read.cross function. Only the non-excluded markers will be kept and genotypeds will be encoded in "0", "1" and "2", "0" being homozygous for the first parental strain, "1" heterozygous and "2" homozygous for the second parental strain. Caution, this file create a data frame and a CSV file in the path of your choice if indicated by the "path" argument. This function does not create a "cross" object in your environment that can be directly used for QTL mapping. You will need to load the CSV file with qtl::read.cross.
#' @param geno data frame with the genotyping results for your cross
#' @param pheno data frame with phenotypes of the individuals (individuals must have the same ID in the geno data frame and in the pheno data frame)
#' @param prefix potential prefix present in the names of the individuals in the geno data frame to be removed in ordere to have the same names as in the pheno file
#' @param tab data frame obtained with tab_mark function
#' @param ref data frame with the reference genotypes of mouse lines
#' @param par1 first parental strain used in the cross, the name must be written as in the "ref" data frame
#' @param par2 second parental strain used in the cross, the name must be written as in the "ref" data frame
#' @param pos column with marker positions
#' @param path if indicated, the data frame will be exported in this path
#'
#' @import dplyr
#' @import tidyr
#' @import utils
#' @import stringr
#'
#' @export
#'
#### write_rqtl ####
## write data frame in rqtl format (csv), if path != NA writes the file in the path indicated
write_rqtl <- function(geno,pheno,tab,ref,par1,par2,prefix,pos,path=NA){
#rename df columns
geno <- geno %>% rename("marker"=1,
"id"=2,
"allele_1"=3,
"allele_2"=4)
#extract snps non excluded
if("exclude_match" %in% colnames(tab)){
tab <- tab %>% filter(exclude_match==0)
}
if("exclude_poly" %in% colnames(tab)){
tab <- tab %>% filter(exclude_poly==0)
}
if("exclude_prop" %in% colnames(tab)){
tab <- tab %>% filter(exclude_prop==0)
}
if("exclude_allele" %in% colnames(tab)){
tab <- tab %>% filter(exclude_allele==0)
}
#filter genotypes for non excluded markers in geno file
geno <- geno %>% select(c(marker,id,allele_1,allele_2)) %>% filter(marker %in% tab$marker)
#recode parents' names to match column names nomenclature
par1 <- make.names(par1)
par2 <- make.names(par2)
#keep parental lines genotypes
colnames(ref) <- make.names(colnames(ref))
ref <- ref %>% select(marker,chr,!!sym(pos),!!sym(par1),!!sym(par2))
#merge genotypes with parents
geno <- left_join(geno,ref,by=c("marker"="marker"))
#recode "-" in "N" in geno file
geno <- geno %>% mutate(allele_1 = recode(allele_1,
"-" = "N"))
geno <- geno %>% mutate(allele_2 = recode(allele_2,
"-" = "N"))
#recode geno in factors with same levels
geno <- geno %>% mutate(allele_1 = factor(allele_1,levels=c("A","C","G","H","N","T")))
geno <- geno %>% mutate(allele_2 = factor(allele_2,levels=c("A","C","G","H","N","T")))
#recode genotypes depending on parents' genotypes
geno <- geno %>% mutate(Geno = case_when(
#if one allele not genotyped:
allele_1=="N" | allele_2=="N" ~ "NA",
#if both alleles genotyped
##homozygous 0
allele_1==allele_2 & allele_1==!!sym(par1) ~ "0",
##homozygous 2
allele_1==allele_2 & allele_1==!!sym(par2) ~ "2",
##heterozygous
allele_1!=allele_2 ~ "1",
#if parental strains are N/H
##homozygous for parent that is N/H
###homozygous 0
(!!sym(par1)%in%c("H","N") | !!sym(par2)%in%c("H","N")) &
!!sym(par1)%in%c("H","N") ~ "0",
###homozygous 2
(!!sym(par1)%in%c("H","N") | !!sym(par2)%in%c("H","N")) &
!!sym(par2)%in%c("H","N") ~ "2",
)
)
#keep positions of markers
markers <- geno %>% select(marker,chr,!!sym(pos)) %>% distinct()
markers <- markers %>% arrange(chr,!!sym(pos))
#keep only interesting columns in geno file
geno <- geno %>% arrange(chr,!!sym(pos))
geno <- geno %>% select(marker,id,Geno)
#remove prefix
geno <- geno %>% mutate(id=str_remove(id,prefix))
#keep only non excluded markers and merge with positions
markers <- markers %>% mutate(marker=as.character(marker))
markers <- markers %>% mutate(chr=as.character(chr))
geno <- markers %>% select(marker,chr,!!sym(pos)) %>% full_join(.,geno,by="marker")
#pivoting
geno <- geno %>% pivot_wider(names_from = c(marker,chr,!!sym(pos)),values_from = Geno,names_sep=",")
geno <- geno %>% mutate(id=as.character(id))
geno <- geno %>% rename("id,,"=id)
#merge with phenotype file
pheno <- pheno %>% mutate_all(as.character)
colnames(pheno) <- str_c(colnames(pheno),",,")
qtl_file <- right_join(pheno,geno,by=c("Ind,,"="id,,"))
#prepare file
qtl_file <- rbind(colnames(qtl_file),qtl_file)
qtl_file <- separate_rows(qtl_file,everything(),sep=",")
colnames(qtl_file) <- qtl_file[1,]
qtl_file <- qtl_file %>% slice(-1)
if(is.na(path)==FALSE){
write.csv(qtl_file,file=path,quote=FALSE,row.names = FALSE)
}
return(qtl_file)
}
{
"collab_server" : "",
"contents" : "",
"created" : 1622637863181.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "0",
"id" : "65C9B72B",
"lastKnownWriteTime" : 1623060553,
"last_content_update" : 1623060553,
"path" : "~/Documents/PhD/stuart_R/stuart/R/tab_mark.R",
"project_path" : "R/tab_mark.R",
"properties" : {
"cursorPosition" : "43,27",
"scrollLine" : "0",
"source_window_id" : ""
},
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 5,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
\ No newline at end of file
#' @title Create of the summary table for all markers from the genotype data frame
#'
#' @description This function creates a table with all the markers that were genotyped in the array, the alleles for these markers, the number of homozygous and heterozygous animals, as well as the number of non genotyped animals.
#' @param geno data frame with the genotyping results for your cross
#'
#' @import dplyr
#' @import tidyr
#'
#' @export
#'
#### tab_mark function ####
## create table with markers and counts
tab_mark <- function(geno){
#rename df columns
geno <- geno %>% rename("marker"=1,
"id"=2,
"allele_1"=3,
"allele_2"=4)
#create geno column in geno df
geno <- geno %>% unite(Geno,c("allele_1","allele_2"),sep="",remove=FALSE)
#recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA")
geno <- geno %>% mutate(Geno=recode(Geno,
"TA" = "AT",
"GA" = "AG",
"CA" = "AC",
"GT" = "TG",
"CT" = "TC",
"GC" = "CG"))
#create df with counts for each genotype
df_count <- tibble(marker = as.character(unique(geno$marker)),
allele_1 = NA,
allele_2 = NA,
n_HM1 = NA,
n_HM2 = NA,
n_HT = NA,
n_NA = NA)
## loop to count genotype
for(i in df_count$marker){
#extract alleles for each marker
Alleles <- geno %>% filter(marker==i) %>%
select(c(marker,id,Geno,allele_1,allele_2)) %>%
pivot_longer(c(allele_1,allele_2),names_to="Allele_name",values_to="Allele") %>%
distinct(Allele) %>% filter(Allele != "-")
Alleles <- as.factor(paste(Alleles$Allele))
#sort alleles
Alleles <- factor(Alleles,levels=c("A","T","C","G"))
Alleles <- sort(Alleles)
#add alleles and counts, only for markers with alleles (not markers with no genotyped ind)
if(all(rapportools::is.empty(Alleles))==FALSE){
#add alleles to df_count
df_count <- df_count %>% mutate(allele_1 = ifelse(marker == i,
paste(Alleles[1]), allele_1))
#count for homozygous for allele 1
n1 <- geno %>% filter(marker==i) %>%
filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HM1 = ifelse(marker == i,
n1$n, n_HM1))
}
#if marker not polymorphic
if(is.na(Alleles[2])==TRUE){
#NA as allele_2
df_count <- df_count %>% mutate(allele_2 = ifelse(marker == i,
NA, allele_2))
#NA as n_HM2
df_count <- df_count %>% mutate(n_HM2 = ifelse(marker == i,
NA, n_HM2))
#NA as n_HT
df_count <- df_count %>% mutate(n_HT = ifelse(marker == i,
NA, n_HT))
} else {
#add alleles to df_count
df_count <- df_count %>% mutate(allele_2 = ifelse(marker == i,
paste(Alleles[2]), allele_2))
#count for homozygous for allele 2
n2 <- geno %>% filter(marker==i) %>%
filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HM2 = ifelse(marker == i,
n2$n, n_HM2))
#count for heterozygous
n3 <- geno %>% filter(marker==i) %>%
filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HT = ifelse(marker == i,
n3$n, n_HT))
}
#count for NA
n4 <- geno %>% filter(marker==i) %>%
filter(Geno == "--" |
Geno == paste(Alleles[1],"-",sep="") | Geno == paste(Alleles[2],"-",sep="") |
Geno == paste("-",Alleles[1],sep="") | Geno == paste("-",Alleles[2],sep="")) %>%
summarise(n=n())
#add count for NA to df_count
df_count <- df_count %>% mutate(n_NA = ifelse(marker == i,
n4$n, n_NA))
}
#change class of counts as numeric :
df_count$n_HM1 <- df_count$n_HM1 %>% as.numeric()
df_count$n_HM2 <- df_count$n_HM2 %>% as.numeric()
df_count$n_HT <- df_count$n_HT %>% as.numeric()
df_count$n_NA <- df_count$n_NA %>% as.numeric()
#add 0 for null counts
df_count <- df_count %>% mutate_at(.vars=vars(n_HM1,n_HM2,n_HT,n_NA),~replace(., is.na(.), 0))
#return
return(df_count)
}
{
"collab_server" : "",
"contents" : "",
"created" : 1623083300385.000,
"dirty" : false,
"encoding" : "",
"folds" : "",
"hash" : "0",
"id" : "6DCC955A",
"lastKnownWriteTime" : 140320379993736,
"last_content_update" : 1623083300385,
"path" : null,
"project_path" : null,
"properties" : {
"cacheKey" : "FFFA47E6",
"caption" : "test_tab",
"contentUrl" : "grid_resource/gridviewer.html?env=&obj=test_tab&cache_key=FFFA47E6",
"displayedObservations" : "20",
"environment" : "",
"expression" : "test_tab",
"object" : "test_tab",
"preview" : "0",
"totalObservations" : "20",