Commit cdf9faed authored by Marie Bourdon's avatar Marie Bourdon
Browse files

geno_strains with > 2 ind + bug fix

parent 8882043d
This diff is collapsed.
/home/marie/Documents/stuart_package/stuart/.Rbuildignore="739A4511"
/home/marie/Documents/stuart_package/stuart/.Rhistory="77548AEE"
/home/marie/Documents/stuart_package/stuart/.gitignore="05FA27A5"
/home/marie/Documents/stuart_package/stuart/DESCRIPTION="E286725B"
/home/marie/Documents/stuart_package/stuart/R/geno_strains.R="DCEFD103"
/home/marie/Documents/stuart_package/stuart/R/genos-data.R="33820B87"
/home/marie/Documents/stuart_package/stuart/R/mark_allele.R="64630E28"
/home/marie/Documents/stuart_package/stuart/R/mark_match.R="CBA3F514"
/home/marie/Documents/stuart_package/stuart/R/mark_poly.R="63E868BF"
/home/marie/Documents/stuart_package/stuart/R/mark_prop.R="19C6446D"
/home/marie/Documents/stuart_package/stuart/R/phenos-data.R="75BCF577"
/home/marie/Documents/stuart_package/stuart/R/stuart_tab-data.R="9C18AF59"
/home/marie/Documents/stuart_package/stuart/R/tab_mark.R="60E85DC0"
/home/marie/Documents/stuart_package/stuart/R/write_rqtl.R="D25FAC55"
/home/marie/Documents/stuart_package/stuart/README.Rmd="C395B1B3"
/home/marie/Documents/stuart_package/stuart/README.md="8BBA9900"
/home/marie/Documents/stuart_package/stuart/man/genos.Rd="383A8DC0"
/home/marie/Documents/stuart_package/stuart/vignettes/stuaRt.Rmd="007031F6"
......@@ -34,16 +34,18 @@ geno_strains <- function(ref,geno,par1,par2,name1,name2){
#create consensus
geno <- geno %>% mutate(parent1 = !!sym(par1[1]))
if(length(par1)!=1){
geno <- geno %>% mutate(parent1 = ifelse(!!sym(par1[1])==!!sym(par1[2]),!!sym(par1[1]),"N"))
} else {
geno <- geno %>% rename(parent1=!!sym(par1[1]))
for(i in 2:length(par1)){
geno <- geno %>% mutate(parent1 = ifelse(parent1==!!sym(par1[i]),parent1,"N"))
}
}
geno <- geno %>% mutate(parent2 = !!sym(par2[1]))
if(length(par2)!=1){
geno <- geno %>% mutate(parent2 = ifelse(!!sym(par2[1])==!!sym(par2[2]),!!sym(par2[1]),"N"))
} else {
geno <- geno %>% rename(parent2=!!sym(par2[1]))
for(i in 2:length(par2)){
geno <- geno %>% mutate(parent2 = ifelse(parent2==!!sym(par2[i]),parent2,"N"))
}
}
geno <- geno %>% select(marker,parent1,parent2)
......
......@@ -24,7 +24,6 @@ mark_allele <- function(tab,ref,par1,par2,parNH=TRUE){
ref <- ref %>% select(marker,!!sym(par1),!!sym(par2))
tab <- full_join(tab,ref,by=c("marker"="marker"))
print(parNH)
#function core
tab <- tab %>% mutate(exclude_allele = case_when(parNH==FALSE &
(!!sym(par1) == "N" | !!sym(par2) == "N" | !!sym(par1) == "H" | !!sym(par2) == "H") ~ 1,
......@@ -43,8 +42,6 @@ mark_allele <- function(tab,ref,par1,par2,parNH=TRUE){
T ~ 0)
)
print(tab)
tab <- tab %>% select(-c(!!sym(par1),!!sym(par2)))
return(tab)
......
......@@ -17,7 +17,6 @@
mark_prop <- function(tab,cross,homo=NA,hetero=NA,pval=NA,na=0.5){
#stock colnames to join
names <- colnames(tab)
print(names)
#calculate total number of individuals genotyped for each marker
tab <- tab %>% mutate(n_geno = (n_HM1 + n_HM2 + n_HT))
......@@ -71,7 +70,5 @@ mark_prop <- function(tab,cross,homo=NA,hetero=NA,pval=NA,na=0.5){
tab <- tab %>% mutate(exclude_prop=case_when(chi_pval < pval ~ 1,
T ~ exclude_prop))
}
print(tab)
tab <- tab %>% select(all_of(names),exclude_prop)
print(tab)
}
Package: stuart
Title: stuaRt
Version: 0.1.0
Authors@R:
person(given = "Marie",
family = "Bourdon",
role = c("aut", "cre"),
email = "mariefbourdon@gmail.com",
comment = c(ORCID = "YOUR-ORCID-ID"))
Description: Sorts markers of miniMUGA genotyping for F2 or N2 individuals, for Rqtl analysis.
License: GPL-3
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
Imports: dplyr, tidyr, utils, stringr, rapportools
Suggests: knitr, rmarkdown
VignetteBuilder: knitr
Depends: R (>= 3.5.0)
NeedsCompilation: no
Packaged: 2021-06-01 08:06:30 UTC; mariebourdon
Author: Marie Bourdon [aut, cre] (YOUR-ORCID-ID)
Maintainer: Marie Bourdon <mariefbourdon@gmail.com>
# Generated by roxygen2: do not edit by hand
export(geno_strains)
export(mark_allele)
export(mark_match)
export(mark_poly)
export(mark_prop)
export(tab_mark)
export(write_rqtl)
import(dplyr)
import(stringr)
import(tidyr)
import(utils)
#' @title Create haplotype for a new mouse strain into a reference dataframe
#'
#' @description This functions adds columns for parental strains used in the cross in the annotation data frame, from the genotype data frame in which one or several animal of the parental strains were genotyped.
#' If several animals of one strain were genotyped, a consensus is created from these animals.
#' The consensus is created as follow : if the indivuals carry the same allele, this allele is kept, otherwise, the allele is noted as "N". If individuals show residual heterozygosity, it is encoded as "H".
#' @param ref data frame with the reference genotypes of mouse lines
#' @param geno data frame with the genotyping results for your cross from miniMUGA array
#' @param par1 first parental strain used in the cross, the name must be written as in the geno data frame
#' @param par2 second parental strain used in the cross, the name must be written as in the geno data frame
#' @param name1 name of the first parental strain to use as the column name in the ref data frame
#' @param name2 name of the second parental strain to use as the column name in the ref data frame
#'
#' @import dplyr
#' @import tidyr
#'
#' @export
#'
geno_strains <- function(ref,geno,par1,par2,name1,name2){
#recode genotypes from 2 alleles to 1
geno <- geno %>% mutate_all(as.character)
geno <- geno %>% filter(Sample.ID %in% c(par1,par2))
geno <- geno %>% mutate(Geno=case_when(Allele1...Forward == "-" | Allele2...Forward == "-" ~ "N",
Allele1...Forward == Allele2...Forward ~ Allele1...Forward,
Allele1...Forward %in% c("A","T","G","C") & Allele2...Forward %in% c("A","T","G","C") ~ "H"))
geno <- geno %>% select(SNP.Name,Sample.ID,Geno) %>% pivot_wider(names_from = Sample.ID, values_from = Geno)
#create consensus
if(length(par1)!=1){
geno <- geno %>% mutate(parent1 = ifelse(!!sym(par1[1])==!!sym(par1[2]),!!sym(par1[1]),"N"))
} else {
geno <- geno %>% rename(parent1=!!sym(par1[1]))
}
if(length(par2)!=1){
geno <- geno %>% mutate(parent2 = ifelse(!!sym(par2[1])==!!sym(par2[2]),!!sym(par2[1]),"N"))
} else {
geno <- geno %>% rename(parent2=!!sym(par2[1]))
}
geno <- geno %>% select(SNP.Name,parent1,parent2)
colnames(geno) <- c("SNP.Name",name1,name2)
#merge with ref file
ref <- full_join(ref,geno,by=c("marker"="SNP.Name"))
return(ref)
}
#' Data frame with miniMUGA genotyping of F2 individuals and parental strains
#'
#' A dataset containing the genotypes of 176 F2 individuals
#'
#' @format A data frame with 2002493 observations of 11 variables
"genos"
#' @title Exclude markers that have different alleles in the individuals of the cross and in parental strains
#'
#' @description This functions uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers which have alleles observed in the individuals of the cross that do not correspond to the alleles observed in the parental strains. For example, a marker which is not polymorphic between the two parental strains but which has two alleles in the cross individuals will be excluded.
#' @param tab data frame obtained with tab_mark function
#' @param ref data frame with the reference genotypes of mouse lines
#' @param par1 first parental strain used in the cross, the name must be written as in the "ref" data frame
#' @param par2 second parental strain used in the cross, the name must be written as in the "ref" data frame
#'
#' @import dplyr
#'
#' @export
#'
mark_allele <- function(tab,ref,par1,par2){
#markers of ref df as characters
ref$marker <- as.character(ref$marker)
colnames(ref) <- make.names(colnames(ref))
#recode parents' names to match column names nomenclature
par1 <- make.names(par1)
par2 <- make.names(par2)
#join tab and ref genotypes
ref <- ref %>% select(marker,!!sym(par1),!!sym(par2))
tab <- full_join(tab,ref,by=c("SNP.Name"="marker"))
#function core
tab <- tab %>% mutate(exclude_allele = case_when(is.na(Allele_2)==FALSE &
!!sym(par1) != "N" & !!sym(par2) != "N" & !!sym(par1) != "H" & !!sym(par2) != "H" &
((Allele_1!=!!sym(par1) & Allele_1!=!!sym(par2)) | (Allele_2!=!!sym(par1) & Allele_2!=!!sym(par2))) ~ 1,
is.na(Allele_2)==FALSE &
(!!sym(par1)=="N" | !!sym(par2)=="N" | !!sym(par1)=="H" | !!sym(par2)=="H") &
((Allele_1!=!!sym(par1) & Allele_1!=!!sym(par2)) & (Allele_2!=!!sym(par1) & Allele_2!=!!sym(par2))) ~ 1,
is.na(Allele_2)==TRUE &
!!sym(par1) != "N" & !!sym(par2) != "N" & !!sym(par1) != "H" & !!sym(par2) != "H" &
(Allele_1!=!!sym(par1) | Allele_1!=!!sym(par2)) ~ 1,
is.na(Allele_2)==TRUE &
(!!sym(par1)=="N" | !!sym(par2)=="N" | !!sym(par1)=="H" | !!sym(par2)=="H") &
Allele_1!=!!sym(par1) & Allele_1!=!!sym(par2) ~ 1,
T ~ 0)
)
tab <- tab %>% select(-c(!!sym(par1),!!sym(par2)))
return(tab)
}
#' @title Exclude markers that were not genotyped in the reference strains
#'
#' @description This functions uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that were genotyped in the individuals of the cross but not in the reference strains. This is useful if the parental strains of the cross were not genotyped with the individuals and a previous genotyping result is used. Indeed, changes in the markers of the array may have happened. We recommend always using this function in order to avoid errors.
#' @param tab data frame obtained with tab_mark function
#' @param ref data frame with the reference genotypes of mouse lines
#'
#' @import dplyr
#'
#' @export
#'
mark_match <- function(tab, #tab_mark df
ref){ #strain ref geno file
#finds SNPs that are in both files:
snp_strains <- as.character(ref$marker) #extracts SNPs in strains ref geno file
snp_genfile <- as.character(tab$SNP.Name) #extracts SNPs in cross geno file
snp <- intersect(snp_strains,snp_genfile) #take intercept
#add results in exclude column
return(tab %>% mutate(exclude_match=ifelse(!SNP.Name %in% snp,
1,
0)))
}
#' @title Exclude markers that are not polymorphic
#'
#' @description This functions uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that are not polymorphic.
#' @param tab data frame obtained with tab_mark function
#'
#' @import dplyr
#'
#' @export
mark_poly <- function(tab){
return(tab %>% mutate(exclude_poly=ifelse(is.na(Allele_2)==TRUE,
1,
0)))
}
#' @title Exclude markers depending on proportions of homo/hetorozygous
#'
#' @description uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that present odd proportions of each genotype. You can define these proportions thanks to the arguments of the function.
#' @param tab data frame obtained with tab_mark function.
#' @param cross F2 or N2.
#' @param homo proportion of homozygous individuals under which the marker is excluded. Will apply on both homozygous genotypes for a F2, but only on one for N2.
#' @param hetero proportion of heterozygous individuals under which the marker is excluded.
#' @param na proportion of non-genotyped individuals above which the marker is excluded.
#'
#' @import dplyr
#'
#' @export
#'
#### mark_prop ####
## excludes markers depending on proportions of homo/hetorozygous
mark_prop <- function(tab,cross,homo=NA,hetero=NA,na=0.5){
#calculate total number of individuals genotyped for each marker
tab <- tab %>% mutate(n_geno = tab %>% select(n_HM1,n_HM2,n_HT) %>% rowSums(na.rm=TRUE))
#calculate proportion of each genotype
tab <- tab %>% mutate(p_HM1 = n_HM1/n_geno)
tab <- tab %>% mutate(p_HM2 = n_HM2/n_geno)
tab <- tab %>% mutate(p_HT = n_HT/n_geno)
tab <- tab %>% mutate(p_NA = n_NA/(n_geno+n_NA))
#stop if cross != "F2" or "N2"
if(!cross %in% c("F2","N2")){
stop("Cross must be F2 or N2")
}
#stop if no value for "homo" for F2 cross
if(cross=="F2" & (is.na(homo)==TRUE | is.na(hetero)==TRUE)){
stop("Arguments homo and hetero must be specified for F2 crosses")
}
#stop if no value for "homo" and "hetero" for N2 cross
if(cross=="N2" & (is.na(homo)==TRUE | is.na(hetero)==TRUE)){
stop("Arguments homo and hetero must be specified for N2 crosses")
}
#exclude markers according to proportion of na
tab <- tab %>% mutate(exclude_prop=case_when(p_NA > na ~ 1, #exclude markers according to proportion of na
cross=="F2" & (p_HM1 < homo | p_HM2 < homo | p_HT < hetero) ~ 1, #exclude markers according to proportion of homozygous individuals for F2 cross
cross=="N2" & (p_HM1 < homo | p_HT < hetero) ~ 1, #exclude markers according to proportion of homozygous and heterozygous individuals for N2 cross
T ~ 0))
tab <- tab %>% select(-c(p_HM1,p_HM2,p_HT,p_NA,n_geno))
return(tab)
}
#' Data frame with phenotype of F2 individuals
#'
#' A dataset containing the phenotype of 176 F2 individuals
#'
#' @format A data frame with 176 observations of 4 variables
"phenos"
#' Data frame with miniMUGA genotyping of classical lab strains.
#'
#' A dataset containing the genotypes of 10 mouse strains of the Institut pasteur. Markers positions and other information are from by Karl Broman (https://kbroman.org/MUGAarrays/mini_revisited.html). Strains genotyped from Institut Pasteur.
#'
#' @format A data frame with 11299 rows and 18 variables
#' \describe{
#' \item{CC001}{CC001 mouse strain}
#' \item{CC005}{CC005 mouse strain}
#' \item{CC042}{CC042 mouse strain}
#' \item{CC071}{CC071 mouse strain}
#' \item{Ifnar.KO.129}{Ifnar KO 129 mouse strain}
#' \item{Ifnar.KO.B6}{Ifnar KO B6 mouse strain}
#' \item{Rvfs2.1}{Rvfs2-1 mouse strain}
#' \item{Rvfs2.2}{Rvfs2-2 mouse strain}
#' \item{Rvfs2.6}{Rvfs2-6 mouse strain}
#' \item{Rvfs2.7}{Rvfs2-7 mouse strain}
#' \item{marker}{name of the marker}
#' \item{chr}{chromosome}
#' \item{bp_mm10}{localisation on chromosome in bp (mm10 assembly)}
#' \item{cM_cox}{localisation on chromosome in cM (from Cox et al.)}
#' \item{cM_g2f1}{localisation on chromosome in cM (from Liu et al.)}
#' \item{snp}{marker alleles}
#' \item{unique}{indicates if the marker maps uniquely on mm10}
#' \item{multi}{indicates if the marker maps more than one time on mm10}
#' \item{unmapped}{indicates if the marker does not map perfectly on mm10}
#' }
"ref_strains_mini"
#' Output of tab_mark function
#'
#' A dataset with the output of tab_mark() function.
#'
#' @format A data frame with 11125 rows and 7 variables
#' \describe{
#' \item{SNP.Name}{name of the marker}
#' \item{Allele_1}{first allele of the marker}
#' \item{Allele_2}{second allele of the marker}
#' \item{n_HM1}{number of homozygous individuals for the first allele}
#' \item{n_HM2}{number of homozygous individuals for the second allele}
#' \item{n_HT}{number of heterozygous individuals}
#' \item{n_NA}{number of non genotyped individuals}
#' }
"stuart_tab"
#' @title Create of the summary table for all markers from the genotype data frame
#'
#' @description This function creates a table with all the markers that were genotyped in the array, the alleles for these markers, the number of homozygous and heterozygous animals, as well as the number of non genotyped animals.
#' @param geno data frame with the genotyping results for your cross
#'
#' @import dplyr
#' @import tidyr
#'
#' @export
#'
#### tab_mark function ####
## create table with markers and counts
tab_mark <- function(geno){
#create geno column in geno df
geno <- geno %>% unite(Geno,c("Allele1...Forward","Allele2...Forward"),sep="",remove=FALSE)
#recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA")
geno <- geno %>% mutate(Geno=recode(Geno,
"TA" = "AT",
"GA" = "AG",
"CA" = "AC",
"GT" = "TG",
"CT" = "TC",
"GC" = "CG"))
#create df with counts for each genotype
df_count <- tibble(SNP.Name = as.character(unique(geno$SNP.Name)),
Allele_1 = NA,
Allele_2 = NA,
n_HM1 = NA,
n_HM2 = NA,
n_HT = NA,
n_NA = NA)
## loop to count genotype
for(i in df_count$SNP.Name){
#extract alleles for each marker
Alleles <- geno %>% filter(SNP.Name==i) %>%
select(c(SNP.Name,Sample.ID,Geno,Allele1...Forward,Allele2...Forward)) %>%
pivot_longer(c(Allele1...Forward,Allele2...Forward),names_to="Allele_name",values_to="Allele") %>%
distinct(Allele) %>% filter(Allele != "-")
Alleles <- as.factor(paste(Alleles$Allele))
#sort alleles
Alleles <- factor(Alleles,levels=c("A","T","C","G"))
Alleles <- sort(Alleles)
#add alleles and counts, only for markers with alleles (not markers with no genotyped ind)
if(all(rapportools::is.empty(Alleles))==FALSE){
#add alleles to df_count
df_count <- df_count %>% mutate(Allele_1 = ifelse(SNP.Name == i,
paste(Alleles[1]), Allele_1))
#count for homozygous for allele 1
n1 <- geno %>% filter(SNP.Name==i) %>%
filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HM1 = ifelse(SNP.Name == i,
n1$n, n_HM1))
}
#if marker not polymorphic
if(is.na(Alleles[2])==TRUE){
#NA as Allele_2
df_count <- df_count %>% mutate(Allele_2 = ifelse(SNP.Name == i,
NA, Allele_2))
#NA as n_HM2
df_count <- df_count %>% mutate(n_HM2 = ifelse(SNP.Name == i,
NA, n_HM2))
#NA as n_HT
df_count <- df_count %>% mutate(n_HT = ifelse(SNP.Name == i,
NA, n_HT))
} else {
#add alleles to df_count
df_count <- df_count %>% mutate(Allele_2 = ifelse(SNP.Name == i,
paste(Alleles[2]), Allele_2))
#count for homozygous for allele 2
n2 <- geno %>% filter(SNP.Name==i) %>%
filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HM2 = ifelse(SNP.Name == i,
n2$n, n_HM2))
#count for heterozygous
n3 <- geno %>% filter(SNP.Name==i) %>%
filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HT = ifelse(SNP.Name == i,
n3$n, n_HT))
}
#count for NA
n4 <- geno %>% filter(SNP.Name==i) %>%
filter(Geno == "--" |
Geno == paste(Alleles[1],"-",sep="") | Geno == paste(Alleles[2],"-",sep="") |
Geno == paste("-",Alleles[1],sep="") | Geno == paste("-",Alleles[2],sep="")) %>%
summarise(n=n())
#add count for NA to df_count
df_count <- df_count %>% mutate(n_NA = ifelse(SNP.Name == i,
n4$n, n_NA))
}
#change class of counts as numeric :
df_count$n_HM1 <- df_count$n_HM1 %>% as.numeric()
df_count$n_HM2 <- df_count$n_HM2 %>% as.numeric()
df_count$n_HT <- df_count$n_HT %>% as.numeric()
df_count$n_NA <- df_count$n_NA %>% as.numeric()
#add 0 for null counts
df_count <- df_count %>% mutate_at(.vars=vars(n_HM1,n_HM2,n_HT,n_NA),~replace(., is.na(.), 0))
#return
return(df_count)
}
#' @title Create data frame in Rqtl CSV format
#'
#' @description This function uses the table produced by tab_mark function filled by all the mark_* functions in order to create a data frame in the right format for Rqtl read.cross function. Only the non-excluded markers will be kept and genotypeds will be encoded in "0", "1" and "2", "0" being homozygous for the first parental strain, "1" heterozygous and "2" homozygous for the second parental strain. Caution, this file create a data frame and a CSV file in the path of your choice if indicated by the "path" argument. This function does not create a "cross" object in your environment that can be directly used for QTL mapping. You will need to load the CSV file with qtl::read.cross.
#' @param geno data frame with the genotyping results for your cross
#' @param pheno data frame with phenotypes of the individuals (individuals must have the same ID in the geno data frame and in the pheno data frame)
#' @param prefix potential prefix present in the names of the individuals in the geno data frame to be removed in ordere to have the same names as in the pheno file
#' @param tab data frame obtained with tab_mark function
#' @param ref data frame with the reference genotypes of mouse lines
#' @param par1 first parental strain used in the cross, the name must be written as in the "ref" data frame
#' @param par2 second parental strain used in the cross, the name must be written as in the "ref" data frame
#' @param method method of calculation of cM position, can be "cM_cox" of "cM_g2f1"
#' @param path if indicated, the data frame will be exported in this path
#'
#' @import dplyr
#' @import tidyr
#' @import utils
#' @import stringr
#'
#' @export
#'
#### write_rqtl ####
## write data frame in rqtl format (csv), if path != NA writes the file in the path indicated
write_rqtl <- function(geno,pheno,tab,ref,par1,par2,prefix,method="cM_cox",path=NA){
#extract snps non excluded
if("exclude_match" %in% colnames(tab)){
tab <- tab %>% filter(exclude_match==0)
}
if("exclude_poly" %in% colnames(tab)){
tab <- tab %>% filter(exclude_poly==0)
}
if("exclude_prop" %in% colnames(tab)){
tab <- tab %>% filter(exclude_prop==0)
}
if("exclude_allele" %in% colnames(tab)){
tab <- tab %>% filter(exclude_allele==0)
}
#filter genotypes for non excluded markers in geno file
geno <- geno %>% select(c(SNP.Name,Sample.ID,Allele1...Forward,Allele2...Forward)) %>% filter(SNP.Name %in% tab$SNP.Name)
#recode parents' names to match column names nomenclature
par1 <- make.names(par1)
par2 <- make.names(par2)
#keep parental lines genotypes
colnames(ref) <- make.names(colnames(ref))
ref <- ref %>% select(marker,chr,bp_mm10,cM_cox,cM_g2f1,!!sym(par1),!!sym(par2))
#merge genotypes with parents
geno <- left_join(geno,ref,by=c("SNP.Name"="marker"))
#recode "-" in "N" in geno file