Commit b207d759 authored by Marie  BOURDON's avatar Marie BOURDON
Browse files

add stuart folder

parent 379daae1
File added
^stuart\.Rproj$
^\.Rproj\.user$
library(stuart)
usethis::use_package("rapportools")
library(stuart)
library(stuart)
library(stuart)
read.csv("ref_strains.csv")
ref_strains <- read.csv("ref_strains.csv")
ref_strains_mini <- read.csv("ref_strains.csv")
rm(ref_strains)
View(ref_strains_mini)
usethis::use_data(ref_strains_mini)
rm(ref_strains_mini)
load(ref_strains_mini)
data(ref_strains_mini)
load("C:/Users/mbourdon/Documents/R/stuaRt_package/stuart/data/ref_strains_mini.rda")
View(ref_strains_mini)
usethis::use_vignette("stuaRt")
library(stuart)
stuart::mark_poly()
stuart::mark_poly()
library(stuart)
stuart::mark_poly()
stuart::mark_poly()
library(stuart)
mark_poly()
geno_strains()
mark_alele()
mark_allele()
mark_prop()
tab_mark()
write_rqtl()
library(stuart)
data(ref_strains_mini)
data(ref_strains_mini)
library(stuart)
library(stuart)
data(ref_strains_mini)
ref_strains_mini
ref_strains_mini
load(ref_strains_mini)
data(ref_strains_mini)
view(ref_strain_mini)
View(ref_strain_mini)
View(ref_strains_mini)
data(genos)
View(genos)
data(phenos)
View(phenos)
View(genos)
data(genos)
data(genos)
data(ref_strains_mini)
library(stuart)
data(ref_strains_mini)
data(genos)
data(phenos)
strains <- geno_strains(ref=ref_strains_mini,geno=genos,par1="ind_201",par2="ind_202",name1="parent1",name2="parent2")
strains <- geno_strains(ref=ref_strains_mini,geno=genos,par1="ind_201",par2="ind_202",name1="ok",name2="okok")
library(stuart)
strains <- geno_strains(ref=ref_strains_mini,geno=genos,par1="ind_201",par2="ind_202",name1="ok",name2="okok")
View(strains)
strains <- geno_strains(ref=ref_strains_mini,geno=genos,par1="ind_201",par2="ind_202",name1="parent1",name2="parent2")
head(strains)
View(strains)
View(ref_strains_mini)
library(stuart)
tab <- tab_mark(geno=genos)
head(tab)
library(stuart)
library(stuart)
strains <- geno_strains(ref=ref_strains_mini,geno=genos,par1="ind_201",par2="ind_202",name1="parent1",name2="parent2")
head(strains)
tab2 <- mark_match(tab,STRAINS)
tab2 <- mark_match(tab,ref=ref_strains_mini)
tail(tab2)
View(tab2)
tab2 %>% filter(exclude="match")
library(tidyverse)
tab2 %>% filter(exclude="match")
tab2 %>% filter(exclude=="match")
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
library(stuart)
library(tidyverse)
data(ref_strains_mini)
data(genos)
data(phenos)
strains <- geno_strains(ref=ref_strains_mini,geno=genos,par1="ind_201",par2="ind_202",name1="parent1",name2="parent2")
head(strains)
tab <- tab_mark(geno=genos)
head(tab)
tab2 <- mark_match(tab,ref=ref_strains_mini)
tail(tab2)
tab2 %>% filter(exclude=="match")
save.image()
tab2 <- mark_poly(tab2)
head(tab2)
tab2 <- mark_prop(tab2,cross="F2",homo=0.1)
head(tab2)
tab2 <- mark_allele(tab=tab2,ref=ref_strains_mini,par1="Ifnar(KO)B6",par2="CC001")
tab2 %>% arrange(exclude) %>% head()
ref_strains_mini %>% filter(marker %in% c("gJAX00038569","gJAX00425031","gUNC12245354","gUNC15530876","gUNC21555204","gUNC21596600")) %>% arrange(marker) %>% select(marker,Ifnar.KO.B6,CC001)
tab2 <- mark_allele(tab=tab2,ref=ref_strains_mini,par1="Ifnar(KO)B6",par2="CC001")
tab2 %>% arrange(exclude) %>% head()
ref_strains_mini %>% filter(marker %in% c("gJAX00038569","gJAX00425031","gUNC12245354","gUNC15530876","gUNC21555204","gUNC21596600")) %>% arrange(marker) %>% select(marker,Ifnar.KO.B6,CC001)
View(genos)
write_rqtl(geno=genos,pheno=phenos,ref=ref_strains_mini,par1="Ifnar(KO)B6",par2="CC001",prefix="ind_")
write_rqtl(geno=genos,pheno=phenos,tab=tab2,ref=ref_strains_mini,par1="Ifnar(KO)B6",par2="CC001",prefix="ind_")
head(write_rqtl(geno=genos,pheno=phenos,tab=tab2,ref=ref_strains_mini,par1="Ifnar(KO)B6",par2="CC001",prefix="ind_"))
rqtl_file <- (write_rqtl(geno=genos,pheno=phenos,tab=tab2,ref=ref_strains_mini,par1="Ifnar(KO)B6",par2="CC001",prefix="ind_"))
head(rqtl_file)
rqtl_file[10,10]
rqtl_file[1:10,1:10]
save.image()
.Rproj.user
inst/doc
Package: stuart
Title: stuaRt
Version: 0.1.0
Authors@R:
person(given = "Marie",
family = "Bourdon",
role = c("aut", "cre"),
email = "mariefbourdon@gmail.com",
comment = c(ORCID = "YOUR-ORCID-ID"))
Description: Sorts markers of miniMUGA genotyping for F2 or N2 individuals, for Rqtl analysis.
License: GPL-3
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
Imports:
dplyr,
tidyr,
utils,
stringr,
rapportools
Suggests:
knitr,
rmarkdown
VignetteBuilder: knitr
Depends:
R (>= 3.5.0)
# Generated by roxygen2: do not edit by hand
export(geno_strains)
export(mark_allele)
export(mark_match)
export(mark_poly)
export(mark_prop)
export(tab_mark)
export(write_rqtl)
import(dplyr)
import(stringr)
import(tidyr)
import(utils)
#' Data frame with gigaMUGA markers annotation
#'
#' A dataset containing gigaMUGA markers positions and other information from by Karl Broman (https://kbroman.org/MUGAarrays/new_annotations.html).
#'
#' @format A data frame with 143259 rows and 8 variables
#' \describe{
#' \item{marker}{name of the marker}
#' \item{chr}{chromosome}
#' \item{bp_mm10}{localisation on chromosome in bp (mm10 assembly)}
#' \item{cM_cox}{localisation on chromosome in cM (from Cox et al.)}
#' \item{cM_g2f1}{localisation on chromosome in cM (from Liu et al.)}
#' \item{snp}{marker alleles}
#' \item{unique}{indicates if the marker maps uniquely on mm10}
#' \item{unmapped}{indicates if the marker does not map perfectly on mm10}
#' }
"annot_giga"
#' Data frame with megaMUGA markers annotation
#'
#' A dataset containing megaMUGA markers positions and other information from by Karl Broman (https://kbroman.org/MUGAarrays/new_annotations.html).
#'
#' @format A data frame with 77808 rows and 8 variables
#' \describe{
#' \item{marker}{name of the marker}
#' \item{chr}{chromosome}
#' \item{bp_mm10}{localisation on chromosome in bp (mm10 assembly)}
#' \item{cM_cox}{localisation on chromosome in cM (from Cox et al.)}
#' \item{cM_g2f1}{localisation on chromosome in cM (from Liu et al.)}
#' \item{snp}{marker alleles}
#' \item{unique}{indicates if the marker maps uniquely on mm10}
#' \item{unmapped}{indicates if the marker does not map perfectly on mm10}
#' }
"annot_mega"
#' Data frame with miniMUGA markers annotation
#'
#' A dataset containing miniMUGA markers positions and other information from by Karl Broman (https://kbroman.org/MUGAarrays/mini_revisited.html).
#'
#' @format A data frame with 11125 rows and 8 variables
#' \describe{
#' \item{marker}{name of the marker}
#' \item{chr}{chromosome}
#' \item{bp_mm10}{localisation on chromosome in bp (mm10 assembly)}
#' \item{cM_cox}{localisation on chromosome in cM (from Cox et al.)}
#' \item{cM_g2f1}{localisation on chromosome in cM (from Liu et al.)}
#' \item{snp}{marker alleles}
#' \item{unique}{indicates if the marker maps uniquely on mm10}
#' \item{unmapped}{indicates if the marker does not map perfectly on mm10}
#' }
"annot_mini"
#' Data frame with MUGA markers annotation
#'
#' A dataset containing MUGA markers positions and other information from by Karl Broman (https://kbroman.org/MUGAarrays/muga_annotations.html).
#'
#' @format A data frame with 7854 rows and 8 variables
#' \describe{
#' \item{marker}{name of the marker}
#' \item{chr}{chromosome}
#' \item{bp_mm10}{localisation on chromosome in bp (mm10 assembly)}
#' \item{cM_cox}{localisation on chromosome in cM (from Cox et al.)}
#' \item{cM_g2f1}{localisation on chromosome in cM (from Liu et al.)}
#' \item{snp}{marker alleles}
#' \item{unique}{indicates if the marker maps uniquely on mm10}
#' \item{unmapped}{indicates if the marker does not map perfectly on mm10}
#' }
"annot_muga"
#' @title Create haplotype for a new mouse strain into a reference dataframe
#'
#' @description This functions adds columns for parental strains used in the cross in the annotation data frame, from the genotype data frame in which one or several animal of the parental strains were genotyped.
#' If several animals of one strain were genotyped, a consensus is created from these animals.
#' The consensus is created as follow : if the indivuals carry the same allele, this allele is kept, otherwise, the allele is noted as "N". If individuals show residual heterozygosity, it is encoded as "H".
#' @param ref data frame with the reference genotypes of mouse lines
#' @param geno data frame with the genotyping results for your cross from miniMUGA array
#' @param par1 first parental strain used in the cross, the name must be written as in the geno data frame
#' @param par2 second parental strain used in the cross, the name must be written as in the geno data frame
#' @param name1 name of the first parental strain to use as the column name in the ref data frame
#' @param name2 name of the second parental strain to use as the column name in the ref data frame
#'
#' @import dplyr
#' @import tidyr
#'
#' @export
#'
geno_strains <- function(ref,geno,par1,par2,name1,name2){
#recode genotypes from 2 alleles to 1
geno <- geno %>% mutate_all(as.character)
geno <- geno %>% filter(Sample.ID %in% c(par1,par2))
geno <- geno %>% mutate(Geno=case_when(Allele1...Forward == "-" | Allele2...Forward == "-" ~ "N",
Allele1...Forward == Allele2...Forward ~ Allele1...Forward,
Allele1...Forward %in% c("A","T","G","C") & Allele2...Forward %in% c("A","T","G","C") ~ "H"))
geno <- geno %>% select(SNP.Name,Sample.ID,Geno) %>% pivot_wider(names_from = Sample.ID, values_from = Geno)
#create consensus
if(length(par1)!=1){
geno <- geno %>% mutate(parent1 = ifelse(!!sym(par1[1])==!!sym(par1[2]),!!sym(par1[1]),"N"))
} else {
geno <- geno %>% rename(parent1=!!sym(par1[1]))
}
if(length(par2)!=1){
geno <- geno %>% mutate(parent2 = ifelse(!!sym(par2[1])==!!sym(par2[2]),!!sym(par2[1]),"N"))
} else {
geno <- geno %>% rename(parent2=!!sym(par2[1]))
}
geno <- geno %>% select(SNP.Name,parent1,parent2)
colnames(geno) <- c("SNP.Name",name1,name2)
#merge with ref file
ref <- full_join(ref,geno,by=c("marker"="SNP.Name"))
return(ref)
}
#' Data frame with miniMUGA genotyping of F2 individuals and parental strains
#'
#' A dataset containing the genotypes of 176 F2 individuals
#'
#' @format A data frame with 2002493 observations of 11 variables
"genos"
#' @title Exclude markers that have different alleles in the individuals of the cross and in parental strains
#'
#' @description This functions uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers which have alleles observed in the individuals of the cross that do not correspond to the alleles observed in the parental strains. For example, a marker which is not polymorphic between the two parental strains but which has two alleles in the cross individuals will be excluded.
#' @param tab data frame obtained with tab_mark function
#' @param ref data frame with the reference genotypes of mouse lines
#' @param par1 first parental strain used in the cross, the name must be written as in the "ref" data frame
#' @param par2 second parental strain used in the cross, the name must be written as in the "ref" data frame
#'
#' @import dplyr
#'
#' @export
#'
mark_allele <- function(tab,ref,par1,par2){
#markers of ref df as characters
ref$marker <- as.character(ref$marker)
colnames(ref) <- make.names(colnames(ref))
#recode parents' names to match column names nomenclature
par1 <- make.names(par1)
par2 <- make.names(par2)
#join tab and ref genotypes
ref <- ref %>% select(marker,!!sym(par1),!!sym(par2))
tab <- full_join(tab,ref,by=c("SNP.Name"="marker"))
#function core
tab <- tab %>% mutate(exclude_allele = case_when(is.na(Allele_2)==FALSE &
!!sym(par1) != "N" & !!sym(par2) != "N" & !!sym(par1) != "H" & !!sym(par2) != "H" &
((Allele_1!=!!sym(par1) & Allele_1!=!!sym(par2)) | (Allele_2!=!!sym(par1) & Allele_2!=!!sym(par2))) ~ 1,
is.na(Allele_2)==FALSE &
(!!sym(par1)=="N" | !!sym(par2)=="N" | !!sym(par1)=="H" | !!sym(par2)=="H") &
((Allele_1!=!!sym(par1) & Allele_1!=!!sym(par2)) & (Allele_2!=!!sym(par1) & Allele_2!=!!sym(par2))) ~ 1,
is.na(Allele_2)==TRUE &
!!sym(par1) != "N" & !!sym(par2) != "N" & !!sym(par1) != "H" & !!sym(par2) != "H" &
(Allele_1!=!!sym(par1) | Allele_1!=!!sym(par2)) ~ 1,
is.na(Allele_2)==TRUE &
(!!sym(par1)=="N" | !!sym(par2)=="N" | !!sym(par1)=="H" | !!sym(par2)=="H") &
Allele_1!=!!sym(par1) & Allele_1!=!!sym(par2) ~ 1,
T ~ 0)
)
tab <- tab %>% select(-c(!!sym(par1),!!sym(par2)))
return(tab)
}
#' @title Exclude markers that were not genotyped in the reference strains
#'
#' @description This functions uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that were genotyped in the individuals of the cross but not in the reference strains. This is useful if the parental strains of the cross were not genotyped with the individuals and a previous genotyping result is used. Indeed, changes in the markers of the array may have happened. We recommend always using this function in order to avoid errors.
#' @param tab data frame obtained with tab_mark function
#' @param ref data frame with the reference genotypes of mouse lines
#'
#' @import dplyr
#'
#' @export
#'
mark_match <- function(tab, #tab_mark df
ref){ #strain ref geno file
#finds SNPs that are in both files:
snp_strains <- as.character(ref$marker) #extracts SNPs in strains ref geno file
snp_genfile <- as.character(tab$SNP.Name) #extracts SNPs in cross geno file
snp <- intersect(snp_strains,snp_genfile) #take intercept
#add results in exclude column
return(tab %>% mutate(exclude_match=ifelse(!SNP.Name %in% snp,
1,
0)))
}
#' @title Exclude markers that are not polymorphic
#'
#' @description This functions uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that are not polymorphic.
#' @param tab data frame obtained with tab_mark function
#'
#' @import dplyr
#'
#' @export
mark_poly <- function(tab){
return(tab %>% mutate(exclude_poly=ifelse(is.na(Allele_2)==TRUE,
1,
0)))
}
#' @title Exclude markers depending on proportions of homo/hetorozygous
#'
#' @description This functions uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that present odd proportions of each genotype. You can define these proportions thanks to the arguments of the function.
#' @param tab data frame obtained with tab_mark function
#' @param cross F2 or N2. If F2, markers are excluded according to the proportion of each homozygous genotype (see "homo" argument). If N2, markers are excluded according to the proportion of heterogygous and homozygous (see "homo" and "hetero" argument)
#' @param homo proportion of homozygous individuals under which the marker is excluded. Will apply on both homozygous genotypes for a F2, but only on one for N2
#' @param hetero F2 or N2. Proportion of heterozygous individuals under which the marker is excluded
#' @param na proportion of non-genotyped individuals under which the marker is excluded
#'
#' @import dplyr
#'
#' @export
#'
#### mark_prop ####
## excludes markers depending on proportions of homo/hetorozygous
mark_prop <- function(tab,cross,homo=NA,hetero=NA,na=0.5){
#calculate total number of individuals genotyped for each marker
tab <- tab %>% mutate(n_geno = tab %>% select(n_HM1,n_HM2,n_HT) %>% rowSums(na.rm=TRUE))
#calculate proportion of each genotype
tab <- tab %>% mutate(p_HM1 = n_HM1/n_geno)
tab <- tab %>% mutate(p_HM2 = n_HM2/n_geno)
tab <- tab %>% mutate(p_HT = n_HT/n_geno)
tab <- tab %>% mutate(p_NA = n_NA/(n_geno+n_NA))
#stop if cross != "F2" or "N2"
if(!cross %in% c("F2","N2")){
stop("Cross must be F2 or N2")
}
#stop if no value for "homo" for F2 cross
if(cross=="F2" & (is.na(homo)==TRUE | is.na(hetero)==TRUE)){
stop("Arguments homo and hetero must be specified for F2 crosses")
}
#stop if no value for "homo" and "hetero" for N2 cross
if(cross=="N2" & (is.na(homo)==TRUE | is.na(hetero)==TRUE)){
stop("Arguments homo and hetero must be specified for N2 crosses")
}
#exclude markers according to proportion of na
tab <- tab %>% mutate(exclude_prop=case_when(p_NA > na ~ 1, #exclude markers according to proportion of na
cross=="F2" & (p_HM1 < homo | p_HM2 < homo | p_HT < hetero) ~ 1, #exclude markers according to proportion of homozygous individuals for F2 cross
cross=="N2" & (p_HM1 < homo | p_HT < hetero) ~ 1, #exclude markers according to proportion of homozygous and heterozygous individuals for N2 cross
T ~ 0))
tab <- tab %>% select(-c(p_HM1,p_HM2,p_HT,p_NA,n_geno))
return(tab)
}
#' Data frame with phenotype of F2 individuals
#'
#' A dataset containing the phenotype of 176 F2 individuals
#'
#' @format A data frame with 176 observations of 4 variables
"phenos"
#' Data frame with miniMUGA genotyping of classical lab strains.
#'
#' A dataset containing the genotypes of 10 mouse strains of the Institut pasteur. Markers positions and other information are from by Karl Broman (https://kbroman.org/MUGAarrays/mini_revisited.html). Strains genotyped from Institut Pasteur.
#'
#' @format A data frame with 11299 rows and 18 variables
#' \describe{
#' \item{CC001}{CC001 mouse strain}
#' \item{CC005}{CC005 mouse strain}
#' \item{CC042}{CC042 mouse strain}
#' \item{CC071}{CC071 mouse strain}
#' \item{Ifnar.KO.129}{Ifnar KO 129 mouse strain}
#' \item{Ifnar.KO.B6}{Ifnar KO B6 mouse strain}
#' \item{Rvfs2.1}{Rvfs2-1 mouse strain}
#' \item{Rvfs2.2}{Rvfs2-2 mouse strain}
#' \item{Rvfs2.6}{Rvfs2-6 mouse strain}
#' \item{Rvfs2.7}{Rvfs2-7 mouse strain}
#' \item{marker}{name of the marker}
#' \item{chr}{chromosome}
#' \item{bp_mm10}{localisation on chromosome in bp (mm10 assembly)}
#' \item{cM_cox}{localisation on chromosome in cM (from Cox et al.)}
#' \item{cM_g2f1}{localisation on chromosome in cM (from Liu et al.)}
#' \item{snp}{marker alleles}
#' \item{unique}{indicates if the marker maps uniquely on mm10}
#' \item{multi}{indicates if the marker maps more than one time on mm10}
#' \item{unmapped}{indicates if the marker does not map perfectly on mm10}
#' }
"ref_strains_mini"
#' @title Create of the summary table for all markers from the genotype data frame
#'
#' @description This function creates a table with all the markers that were genotyped in the array, the alleles for these markers, the number of homozygous and heterozygous animals, as well as the number of non genotyped animals.
#' @param geno data frame with the genotyping results for your cross
#'
#' @import dplyr
#' @import tidyr
#'
#' @export
#'
#### tab_mark function ####
## create table with markers and counts
tab_mark <- function(geno){
#create geno column in geno df
geno <- geno %>% unite(Geno,c("Allele1...Forward","Allele2...Forward"),sep="",remove=FALSE)
#recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA")
geno <- geno %>% mutate(Geno=recode(Geno,
"TA" = "AT",
"GA" = "AG",
"CA" = "AC",
"GT" = "TG",
"CT" = "TC",
"GC" = "CG"))
#create df with counts for each genotype
df_count <- tibble(SNP.Name = as.character(unique(geno$SNP.Name)),
Allele_1 = NA,
Allele_2 = NA,
n_HM1 = NA,
n_HM2 = NA,
n_HT = NA,
n_NA = NA)
## loop to count genotype
for(i in df_count$SNP.Name){
#extract alleles for each marker
Alleles <- geno %>% filter(SNP.Name==i) %>%
select(c(SNP.Name,Sample.ID,Geno,Allele1...Forward,Allele2...Forward)) %>%
pivot_longer(c(Allele1...Forward,Allele2...Forward),names_to="Allele_name",values_to="Allele") %>%
distinct(Allele) %>% filter(Allele != "-")
Alleles <- as.factor(paste(Alleles$Allele))
#sort alleles
Alleles <- factor(Alleles,levels=c("A","T","C","G"))
Alleles <- sort(Alleles)
#add alleles and counts, only for markers with alleles (not markers with no genotyped ind)
if(all(rapportools::is.empty(Alleles))==FALSE){
#add alleles to df_count
df_count <- df_count %>% mutate(Allele_1 = ifelse(SNP.Name == i,
paste(Alleles[1]), Allele_1))
#count for homozygous for allele 1
n1 <- geno %>% filter(SNP.Name==i) %>%
filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HM1 = ifelse(SNP.Name == i,
n1$n, n_HM1))
}
#if marker not polymorphic
if(is.na(Alleles[2])==TRUE){
#NA as Allele_2
df_count <- df_count %>% mutate(Allele_2 = ifelse(SNP.Name == i,
NA, Allele_2))
#NA as n_HM2
df_count <- df_count %>% mutate(n_HM2 = ifelse(SNP.Name == i,
NA, n_HM2))
#NA as n_HT
df_count <- df_count %>% mutate(n_HT = ifelse(SNP.Name == i,
NA, n_HT))
} else {
#add alleles to df_count
df_count <- df_count %>% mutate(Allele_2 = ifelse(SNP.Name == i,
paste(Alleles[2]), Allele_2))
#count for homozygous for allele 2
n2 <- geno %>% filter(SNP.Name==i) %>%
filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HM2 = ifelse(SNP.Name == i,
n2$n, n_HM2))
#count for heterozygous
n3 <- geno %>% filter(SNP.Name==i) %>%
filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>%
summarise(n=n())
#add count for homozygous for allele 1 to df_count
df_count <- df_count %>% mutate(n_HT = ifelse(SNP.Name == i,
n3$n<