Commit 3cd79cbd authored by Marie Bourdon's avatar Marie Bourdon
Browse files

modif geno_strains more than 2 strains

parent 404f913e
/home/marie/Documents/stuart_package/stuart/R/geno_strains.R="AF4CFAD2"
/home/marie/Documents/stuart_package/stuart/R/genos-data.R="1CBE4D7A"
/home/marie/Documents/stuart_package/stuart/R/mark_allele.R="32D90792"
/home/marie/Documents/stuart_package/stuart/R/mark_estmap.R="2E43F0E2"
/home/marie/Documents/stuart_package/stuart/R/mark_match.R="39634C04"
/home/marie/Documents/stuart_package/stuart/R/mark_poly.R="D19CB401"
/home/marie/Documents/stuart_package/stuart/R/phenos-data.R="753628A1"
/home/marie/Documents/stuart_package/stuart/R/stuart_cross-data.R="2A7132AD"
/home/marie/Documents/stuart_package/stuart/R/stuart_newmap-data.R="86A2EB27"
/home/marie/Documents/stuart_package/stuart/R/stuart_tab-data.R="D77DF427"
/home/marie/Documents/stuart_package/stuart/vignettes/stuart.Rmd="568884A2"
/mnt/zeus/zeus_mouselab/marie/map_before_us/map_before_us.R="2F43B141"
/mnt/zeus/zeus_mouselab/marie/stuart_estmap/stuart_estmap.R="000583A9"
#' @title Create haplotype for a new mouse strain into a dataframe
#' @title Create haplotype for inbred strains into a dataframe
#'
#' @description This functions adds columns for parental strains used in the cross in the annotation data frame, from the genotype data frame in which one or several animal of the parental strains were genotyped.
#' If several animals of one strain were genotyped, a consensus is created from these animals.
#' The consensus is created as follow : if the indivuals carry the same allele, this allele is kept, otherwise, the allele is noted as "N". If individuals show residual heterozygosity, it is encoded as "H".
#' @param ref data frame with the annotation of the arrray used
#' @param annot data frame with the annotation of the array used
#' @param geno data frame with the genotyping results for your cross from miniMUGA array
#' @param par1 first parental strain used in the cross, the name must be written as in the geno data frame
#' @param par2 second parental strain used in the cross, the name must be written as in the geno data frame
#' @param name1 name of the first parental strain to use as the column name in the ref data frame
#' @param name2 name of the second parental strain to use as the column name in the ref data frame
#' @param strn tibble or dataframe with the strains to analyse. Columns names are the name of the strains and the values in rows of each column are the IDs of the individuals genotyped for each strain.
#' @param cols name of the columns from the annot data frame to keep in the output of this function.
#'
#' @import dplyr
#' @import tidyr
#' @import tidyselect
#'
#' @export
#'
geno_strains <- function(ref,geno,par1,par2,name1,name2){
geno_strains <- function(annot,geno,strn,cols){
#rename df columns
geno <- geno %>% rename("marker"=1,
"id"=2,
......@@ -25,33 +25,30 @@ geno_strains <- function(ref,geno,par1,par2,name1,name2){
#recode genotypes from 2 alleles to 1
geno <- geno %>% mutate_all(as.character)
geno <- geno %>% filter(id %in% c(par1,par2))
geno <- geno %>% filter(id %in% unlist(unname(strn)))
geno <- geno %>% mutate(Geno=case_when(allele_1 == "-" | allele_2 == "-" ~ "N",
allele_1 == allele_2 ~ allele_1,
allele_1 %in% c("A","T","G","C") & allele_2 %in% c("A","T","G","C") ~ "H"))
geno <- geno %>% select(marker,id,Geno) %>% pivot_wider(names_from = id, values_from = Geno)
#pivoting
geno <- geno %>% select(marker,id,Geno) %>% tidyr::pivot_wider(names_from = id, values_from = Geno)
#create consensus
geno <- geno %>% mutate(parent1 = !!sym(par1[1]))
if(length(par1)!=1){
for(i in 2:length(par1)){
geno <- geno %>% mutate(parent1 = ifelse(parent1==!!sym(par1[i]),parent1,"N"))
}
}
for(i in colnames(strn)){
inds <- na.omit(strn[,i]) %>%pull()
geno <- geno %>% rename(!!sym(i):=!!sym(inds[1]))
geno <- geno %>% mutate(parent2 = !!sym(par2[1]))
if(length(par2)!=1){
for(i in 2:length(par2)){
geno <- geno %>% mutate(parent2 = ifelse(parent2==!!sym(par2[i]),parent2,"N"))
if(length(inds) > 1){
for(j in 2:length(inds)){
geno <- geno %>% mutate(!!sym(i) := ifelse(!!sym(i)==!!sym(inds[j]),!!sym(i),"N"))
}
}
}
geno <- geno %>% select(marker,colnames(strn))
geno <- geno %>% select(marker,parent1,parent2)
colnames(geno) <- c("marker",name1,name2)
#merge with ref file
ref <- full_join(ref,geno,by=c("marker"="marker"))
return(ref)
#merge with annot file
annot <- annot %>% select(marker,tidyselect::all_of(cols))
annot <- full_join(annot,geno,by=c("marker"="marker"))
return(annot)
}
#' @title Exclude markers depending on proportions of homo/hetorozygous
#'
#' @description This function uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that present odd proportions of each genotype. You can define these proportions thanks to the arguments of the function.
#' @description This function uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that present too much missing genotypes or odd proportions of each genotype. You can define these proportions thanks to the arguments of the function. The filter on genotype proportions applies to autosomes only (and not the chromosomes encoded as "X", "Y" and "M")
#' @param tab data frame obtained with tab_mark function.
#' @param cross F2 or N2.
#' @param homo proportion of homozygous individuals under which the marker is excluded. Will apply on both homozygous genotypes for a F2, but only on one for N2.
......@@ -52,9 +52,9 @@ mark_prop <- function(tab,cross,homo=NA,hetero=NA,pval=NA,na=0.5){
tab <- tab %>%
mutate(exclude_prop=case_when(!chr %in% c("X","Y","M") & p_NA > na ~ 1,
!chr %in% c("X","Y","M") & cross=="F2" & (p_HM1 < homo | p_HM2 < homo | p_HT < hetero) ~ 1,
!chr %in% c("X","Y","M") & cross=="N2" & ((p_HM2 == 0 & p_HM1 < homo) |
mutate(exclude_prop=case_when(!chr %in% c("X","Y","M","PAR") & p_NA > na ~ 1,
!chr %in% c("X","Y","M","PAR") & cross=="F2" & (p_HM1 < homo | p_HM2 < homo | p_HT < hetero) ~ 1,
!chr %in% c("X","Y","M","PAR") & cross=="N2" & ((p_HM2 == 0 & p_HM1 < homo) |
(p_HM1 == 0 & p_HM2 < homo) |
(p_HT < hetero) |
(p_HM1 !=0 & p_HM2 != 0)) ~ 1,
......
......@@ -2,7 +2,8 @@
#'
#' @description This function creates a table with all the markers that were genotyped in the array, the alleles for these markers, the number of homozygous and heterozygous animals, as well as the number of non genotyped animals.
#' @param geno data frame with the genotyping results for your cross
#' @param ref data frame with the annotation of the arrray used
#' @param ref data frame with the annotation of the array used. It must contain a "chr" column and a column with positions. X, Y and mitochondrial chromosomes must be coded as: "X", "Y" and "M" in all caps.
#' @param ref data frame with the annotation of the array used
#'
#' @import dplyr
#' @import tidyr
......
File added
......@@ -44,7 +44,9 @@ Data frame from stuart with phenotypes of 176 F2 individuals for a quantitative
data(phenos)
summary(phenos)
```
### annotation file
Annotation file from K.Broman: https://kbroman.org/MUGAarrays/mini_annotations.html
```{r annot}
annot_mini <- read.csv(url("https://raw.githubusercontent.com/kbroman/MUGAarrays/master/UWisc/mini_uwisc_v2.csv"))
```
......
......@@ -2,22 +2,18 @@
% Please edit documentation in R/geno_strains.R
\name{geno_strains}
\alias{geno_strains}
\title{Create haplotype for a new mouse strain into a dataframe}
\title{Create haplotype for inbred strains into a dataframe}
\usage{
geno_strains(ref, geno, par1, par2, name1, name2)
geno_strains(annot, geno, strn, cols)
}
\arguments{
\item{ref}{data frame with the annotation of the arrray used}
\item{annot}{data frame with the annotation of the array used}
\item{geno}{data frame with the genotyping results for your cross from miniMUGA array}
\item{par1}{first parental strain used in the cross, the name must be written as in the geno data frame}
\item{strn}{tibble or dataframe with the strains to analyse. Columns names are the name of the strains and the values in rows of each column are the IDs of the individuals genotyped for each strain.}
\item{par2}{second parental strain used in the cross, the name must be written as in the geno data frame}
\item{name1}{name of the first parental strain to use as the column name in the ref data frame}
\item{name2}{name of the second parental strain to use as the column name in the ref data frame}
\item{cols}{name of the columns from the annot data frame to keep in the output of this function.}
}
\description{
This functions adds columns for parental strains used in the cross in the annotation data frame, from the genotype data frame in which one or several animal of the parental strains were genotyped.
......
......@@ -18,5 +18,5 @@ mark_prop(tab, cross, homo = NA, hetero = NA, pval = NA, na = 0.5)
\item{na}{proportion of non-genotyped individuals above which the marker is excluded.}
}
\description{
This function uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that present odd proportions of each genotype. You can define these proportions thanks to the arguments of the function.
This function uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that present too much missing genotypes or odd proportions of each genotype. You can define these proportions thanks to the arguments of the function. The filter on genotype proportions applies to autosomes only (and not the chromosomes encoded as "X", "Y" and "M")
}
......@@ -9,7 +9,7 @@ tab_mark(geno, annot, pos)
\arguments{
\item{geno}{data frame with the genotyping results for your cross}
\item{ref}{data frame with the annotation of the arrray used}
\item{ref}{data frame with the annotation of the array used}
}
\description{
This function creates a table with all the markers that were genotyped in the array, the alleles for these markers, the number of homozygous and heterozygous animals, as well as the number of non genotyped animals.
......
No preview for this file type
No preview for this file type
......@@ -71,8 +71,12 @@ We recommend to always genotype the parental strains of the cross. Here, their g
This is done with the `geno_strains` function.
```{r strains}
strains <- geno_strains(ref=annot_mini,geno=genos,par1=c("StrainsA_1","StrainsA_2"),
par2=c("StrainsB_1","StrainsB_2"),name1="parent1",name2="parent2")
parental_strains <- tibble::tibble(parent1 = c("StrainsA_1","StrainsA_2"),
parent2 = c("StrainsB_1","StrainsB_2"))
strains <- geno_strains(annot=annot_mini,geno=genos,
strn=parental_strains,cols=c("chr","cM_cox"))
head(strains) %>% print.data.frame()
```
......@@ -87,7 +91,7 @@ genos <- genos %>% filter(!Sample.ID %in% c("StrainsA_1", "StrainsA_2",
### Marker tab
The first step of the markers sorting is to create the marker dataframe with the tab_mark() function. This dataframe contains for each marker its chromosome (`chr`) and position (`cM_cox` was chosen from annot_mini), the two alleles that can be found in the F2/N2 population (`Allele_1` and `Allele_2`), the number of individuals for each genotype (homozygous for each allele (`n_HM1` and `n_HM2`) and heterozygous (`n_HT`)), and the number of non genotyped individuals (`n_NA`) This step can take several minutes, so you can also load the example output of this function.
The first step of the markers sorting is to create the marker dataframe with the tab_mark() function. This dataframe contains for each marker its chromosome (`chr`) and position (`cM_cox` was chosen from annot_mini, but you can choose the column that you want provided that it is in the annotation file), the two alleles that can be found in the F2/N2 population (`Allele_1` and `Allele_2`), the number of individuals for each genotype (homozygous for each allele (`n_HM1` and `n_HM2`) and heterozygous (`n_HT`)), and the number of non genotyped individuals (`n_NA`) This step can take several minutes, so you can also load the example output of this function.
```{r tab_mark}
......@@ -102,7 +106,7 @@ Then we will use the different mark_* functions in order to filter the markers.
### mark_match
First, we can use `mark_match()`` function. Here, the parental strains were genotyped with the F2 individuals, but it can happen that you use previous genotyping results for the parental strains. `mark_match()` function excludes markers that are in your genotype file but not in the reference genotype dataset. We recomend using this function as the chip used for genotyping may change.
First, we can use `mark_match()` function. Here, the parental strains were genotyped with the F2 individuals, but it can happen that you use previous genotyping results for the parental strains. `mark_match()` function excludes markers that are in your genotype file but not in the reference genotype dataset. We recomend using this function as the chip used for genotyping may change.
```{r mark_match}
tab2 <- mark_match(stuart_tab,ref=strains)
......@@ -146,8 +150,7 @@ tab2 %>% arrange(desc(exclude_allele)) %>% head() %>% print.data.frame()
Indeed, we can see that the markers excluded with `mark_allele()` have different alleles in the parental strains.
```{r mark_allele-strains}
strains %>% filter(marker %in% c("gJAX00038569","gJAX00425031","gUNC12245354",
"gUNC15530876","gUNC21555204","gUNC21596600")) %>% arrange(marker) %>%
strains %>% filter(marker %in% c("gJAX00001099","gUNC83675","gUNC66010","mUNC010010509","gUNC30322","mUNC010515443")) %>%
select(marker,parent1,parent2) %>% print.data.frame()
```
......@@ -155,18 +158,20 @@ strains %>% filter(marker %in% c("gJAX00038569","gJAX00425031","gUNC12245354",
After excluding the problematic markers, we can create the R/qtl file. The individuals must have the same ID in the geno and in the pheno file. If there is a prefix in the geno file that must be removed in order to acheive this, you can use the "prefix" argument. The "path" argument can be used in order to create a CSV file that you can laod with `qtl::read.cross`.
```{r write_qtl}
stuart_cross <- write_rqtl(geno=genos,pheno=phenos,tab=tab2,
```{r write_rqtl}
stuart_output <- write_rqtl(geno=genos,pheno=phenos,tab=tab2,
ref=strains,par1="parent1",par2="parent2",prefix="ind_",pos="cM_cox")
stuart_cross[1:10,1:7] %>% print.data.frame()
stuart_output[1:10,1:7] %>% print.data.frame()
```
## Check markers with est.map()
The best way to verify that the markers are correctly sorted is to calculate the estimated map with R/qtl. Indeed, if there is a problematic marker, it will present high recombination fraction with adjacent markers.
Here we can load the cross object that was created with the previous steps of the pipeline as well as the newmap that was produced with the function `est.map(cross=stuart_cross,error.prob=0.01)`.
The CSV file produced with stuart must be loaded in R as a cross with `qtl::read.cross`. The dataframe produced by `write_rqtl` cannot be used with R/qtl as it does not have the "cross" class.
The cross object was saved in stuart. Here we can load it as well as the newmap that was produced with the function `est.map(cross=stuart_cross,error.prob=0.01)`.
```{r load_cross}
library(qtl)
......@@ -182,6 +187,7 @@ Then we can plot the newmap to compare it with the positions found in the annota
```{r plot_map}
plotMap(stuart_cross,stuart_newmap,shift=TRUE)
```
### mark_estmap
It seems to be still 2 problematic markers. We can identify them with the `mark_estmap()`function.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment