Commit c62185f3 authored by Marie Bourdon's avatar Marie Bourdon
Browse files

modif names geno_strains

parent 6145aaab
{
"cursorPosition" : "24,47",
"scrollLine" : "2"
}
\ No newline at end of file
{
}
\ No newline at end of file
{
"cursorPosition" : "38,30",
"scrollLine" : "25",
"source_window_id" : ""
}
\ No newline at end of file
{
"cursorPosition" : "34,0",
"scrollLine" : "21"
}
\ No newline at end of file
{
}
\ No newline at end of file
{
}
\ No newline at end of file
{
"cursorPosition" : "44,0",
"cursorPosition" : "78,3",
"last_setup_crc32" : "39B546A65bfca283",
"scrollLine" : "44"
"scrollLine" : "63"
}
\ No newline at end of file
~%2Fstuart_package%2Fstuart%2FDESCRIPTION="BEB7232"
~%2Fstuart_package%2Fstuart%2FNAMESPACE="AE613167"
~%2Fstuart_package%2Fstuart%2FR%2Fgeno_strains.R="8F7B714A"
~%2Fstuart_package%2Fstuart%2FR%2Fgenos-data.R="C10FF5C8"
~%2Fstuart_package%2Fstuart%2FR%2Fmark_allele.R="1416C0B5"
~%2Fstuart_package%2Fstuart%2FR%2Fmark_match.R="75F49619"
~%2Fstuart_package%2Fstuart%2FR%2Fmark_poly.R="3A328548"
~%2Fstuart_package%2Fstuart%2FR%2Fmark_prop.R="A609F054"
~%2Fstuart_package%2Fstuart%2FR%2Ftab_mark.R="7FA3B215"
~%2Fstuart_package%2Fstuart%2FR%2Fwrite_rqtl.R="5B8691C7"
~%2Fstuart_package%2Fstuart%2Fvignettes%2FstuaRt.R="EBD625D2"
~%2Fstuart_package%2Fstuart%2Fvignettes%2FstuaRt.Rmd="D602FFE4"
{
"collab_server" : "",
"contents" : "",
"created" : 1622622953119.000,
"created" : 1622636142238.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "2767839831",
"id" : "9E3EC0FD",
"lastKnownWriteTime" : 1622539469,
"last_content_update" : 1622539469,
"path" : "~/stuart_package/stuart/NAMESPACE",
"project_path" : "NAMESPACE",
"hash" : "387034705",
"id" : "42D37312",
"lastKnownWriteTime" : 1622465534,
"last_content_update" : 1622465534,
"path" : "~/stuart_package/stuart/R/mark_prop.R",
"project_path" : "R/mark_prop.R",
"properties" : {
},
"read_only" : true,
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 8,
"relative_order" : 7,
"source_on_save" : false,
"source_window" : "",
"type" : "r_namespace"
"type" : "r_source"
}
\ No newline at end of file
#' @title Exclude markers depending on proportions of homo/heterozygous individuals
#'
#' @description Uses the data frame produced by the tab_mark function and fills
#'   the "exclude_prop" column for all the markers that present odd proportions
#'   of each genotype. You can define these proportions thanks to the arguments
#'   of the function.
#' @param tab data frame obtained with tab_mark function. Must contain the
#'   count columns n_HM1, n_HM2, n_HT and n_NA.
#' @param cross F2 or N2.
#' @param homo proportion of homozygous individuals under which the marker is
#'   excluded. Will apply on both homozygous genotypes for a F2, but only on
#'   one for N2.
#' @param hetero proportion of heterozygous individuals under which the marker
#'   is excluded.
#' @param na proportion of non-genotyped individuals above which the marker is
#'   excluded.
#'
#' @return `tab` with an additional numeric column "exclude_prop": 1 when the
#'   marker is excluded, 0 otherwise.
#'
#' @import dplyr
#'
#' @export
#'
#### mark_prop ####
## excludes markers depending on proportions of homo/heterozygous
mark_prop <- function(tab, cross, homo = NA, hetero = NA, na = 0.5) {
  # validate the arguments first, so a bad call fails before any computation
  if (length(cross) != 1 || !cross %in% c("F2", "N2")) {
    stop("Cross must be F2 or N2")
  }
  # both thresholds are mandatory, for F2 as well as for N2 crosses
  if (length(homo) != 1 || length(hetero) != 1 ||
        is.na(homo) || is.na(hetero)) {
    stop("Arguments homo and hetero must be specified for ", cross, " crosses")
  }
  needed <- c("n_HM1", "n_HM2", "n_HT", "n_NA")
  absent <- setdiff(needed, names(tab))
  if (length(absent) > 0) {
    stop("tab is missing required column(s): ", paste(absent, collapse = ", "))
  }

  # work on local vectors: no temporary column is written into `tab`, and the
  # arguments cannot be masked by columns of `tab` that share their name
  n_hm1 <- tab[["n_HM1"]]
  n_hm2 <- tab[["n_HM2"]]
  n_ht <- tab[["n_HT"]]
  n_na <- tab[["n_NA"]]

  # total number of individuals genotyped for each marker
  n_geno <- rowSums(cbind(n_hm1, n_hm2, n_ht), na.rm = TRUE)

  # proportion of each genotype among genotyped individuals, and proportion of
  # non-genotyped individuals among all individuals
  p_hm1 <- n_hm1 / n_geno
  p_hm2 <- n_hm2 / n_geno
  p_ht <- n_ht / n_geno
  p_na <- n_na / (n_geno + n_na)

  # a marker is flagged as soon as one rule is TRUE
  flagged <- p_na > na
  if (cross == "F2") {
    # F2: both homozygous classes and the heterozygous class are checked
    flagged <- flagged | p_hm1 < homo | p_hm2 < homo | p_ht < hetero
  } else {
    # N2: only the first homozygous class (n_HM1) and the heterozygous class
    # NOTE(review): n_HM1 is tied to allele order in tab_mark, not to the
    # recurrent parent -- confirm that checking only p_HM1 is intended.
    flagged <- flagged | p_hm1 < homo | p_ht < hetero
  }

  # comparisons that evaluate to NA (e.g. 0/0 proportions) do not exclude the
  # marker, which is how the previous case_when() implementation treated them
  tab$exclude_prop <- as.numeric(!is.na(flagged) & flagged)
  return(tab)
}
......@@ -18,7 +18,7 @@
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 5,
"relative_order" : 9,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
......
{
"collab_server" : "",
"contents" : "",
"created" : 1622538256440.000,
"dirty" : false,
"encoding" : "",
"folds" : "",
"hash" : "0",
"id" : "4A9D04E",
"lastKnownWriteTime" : 140548509794304,
"last_content_update" : 1622538256440,
"path" : null,
"project_path" : null,
"properties" : {
"cacheKey" : "F7A5FD33",
"caption" : "strains",
"contentUrl" : "grid_resource/gridviewer.html?env=&obj=strains&cache_key=F7A5FD33",
"displayedObservations" : "11125",
"environment" : "",
"expression" : "strains",
"object" : "strains",
"preview" : "0",
"totalObservations" : "11125",
"variables" : "14"
},
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 3,
"source_on_save" : false,
"source_window" : "",
"type" : "r_dataframe"
}
\ No newline at end of file
{
"collab_server" : "",
"contents" : "",
"created" : 1622637863181.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "1306976036",
"id" : "65C9B72B",
"lastKnownWriteTime" : 1622462353,
"last_content_update" : 1622462353,
"path" : "~/stuart_package/stuart/R/tab_mark.R",
"project_path" : "R/tab_mark.R",
"properties" : {
"cursorPosition" : "38,30",
"scrollLine" : "25",
"source_window_id" : ""
},
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 3,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
\ No newline at end of file
#' @title Create the summary table for all markers from the genotype data frame
#'
#' @description This function creates a table with all the markers that were
#'   genotyped in the array, the alleles for these markers, the number of
#'   homozygous and heterozygous animals, as well as the number of non
#'   genotyped animals.
#' @param geno data frame with the genotyping results for your cross. Must
#'   contain the columns "SNP.Name", "Allele1...Forward" and
#'   "Allele2...Forward".
#'
#' @return A tibble with one row per marker (in order of first appearance in
#'   geno) and the columns SNP.Name, Allele_1, Allele_2, n_HM1, n_HM2, n_HT
#'   and n_NA. Counts are numeric and are 0 when a class is absent.
#'
#' @import dplyr
#' @import tidyr
#'
#' @export
#'
#### tab_mark function ####
## create table with markers and counts
tab_mark <- function(geno) {
  required <- c("SNP.Name", "Allele1...Forward", "Allele2...Forward")
  absent <- setdiff(required, names(geno))
  if (length(absent) > 0) {
    stop("geno is missing required column(s): ", paste(absent, collapse = ", "))
  }

  # order used to decide which allele of a marker is "allele 1"
  allele_levels <- c("A", "T", "C", "G")

  snp <- as.character(geno[["SNP.Name"]])
  allele_a <- as.character(geno[["Allele1...Forward"]])
  allele_b <- as.character(geno[["Allele2...Forward"]])

  # one output row per marker; every genotype row is mapped to its marker once,
  # so the table is never re-scanned per marker. match() pairs NA with NA, so
  # rows without a marker name form their own row instead of corrupting others.
  markers <- unique(snp)
  n_markers <- length(markers)
  marker_id <- match(snp, markers)

  # alleles as positions in allele_levels; "-" and anything else become NA
  code_a <- match(allele_a, allele_levels)
  code_b <- match(allele_b, allele_levels)

  # present[m, k] is TRUE when allele k was observed at least once for marker m
  present <- matrix(FALSE, nrow = n_markers, ncol = length(allele_levels))
  seen_a <- !is.na(code_a)
  seen_b <- !is.na(code_b)
  present[cbind(marker_id[seen_a], code_a[seen_a])] <- TRUE
  present[cbind(marker_id[seen_b], code_b[seen_b])] <- TRUE

  # first and second observed allele of each marker, following allele_levels;
  # any further allele is ignored. `second` is filled before `first` so that a
  # single allele can never occupy both slots.
  first <- rep(NA_integer_, n_markers)
  second <- rep(NA_integer_, n_markers)
  for (k in seq_along(allele_levels)) {
    has_k <- present[, k]
    second[has_k & !is.na(first) & is.na(second)] <- k
    first[has_k & is.na(first)] <- k
  }

  # NA-safe equality between a row's allele and its marker's reference allele
  matches <- function(code, target) {
    !is.na(code) & !is.na(target) & code == target
  }
  row_first <- first[marker_id]
  row_second <- second[marker_id]
  a_is_1 <- matches(code_a, row_first)
  b_is_1 <- matches(code_b, row_first)
  a_is_2 <- matches(code_a, row_second)
  b_is_2 <- matches(code_b, row_second)
  a_gap <- allele_a %in% "-"
  b_gap <- allele_b %in% "-"

  # genotype class of every row
  is_hm1 <- a_is_1 & b_is_1
  is_hm2 <- a_is_2 & b_is_2
  # heterozygous in either allele order (e.g. "AT" and "TA" are the same)
  is_ht <- (a_is_1 & b_is_2) | (a_is_2 & b_is_1)
  # non genotyped: "--", or one of the two marker alleles paired with "-"
  is_missing <- (a_gap & b_gap) |
    ((a_is_1 | a_is_2) & b_gap) |
    (a_gap & (b_is_1 | b_is_2))

  # number of flagged rows per marker (0 when there are none)
  count_by_marker <- function(flag) {
    as.numeric(tabulate(marker_id[flag], nbins = n_markers))
  }

  df_count <- tibble(SNP.Name = markers,
                     Allele_1 = allele_levels[first],
                     Allele_2 = allele_levels[second],
                     n_HM1 = count_by_marker(is_hm1),
                     n_HM2 = count_by_marker(is_hm2),
                     n_HT = count_by_marker(is_ht),
                     n_NA = count_by_marker(is_missing))
  return(df_count)
}
{
"collab_server" : "",
"contents" : "",
"created" : 1622636138213.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "897610086",
"id" : "76AC3EC",
"lastKnownWriteTime" : 1622462353,
"last_content_update" : 1622462353,
"path" : "~/stuart_package/stuart/R/mark_match.R",
"project_path" : "R/mark_match.R",
"properties" : {
},
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 5,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
\ No newline at end of file
#' @title Exclude markers that were not genotyped in the reference strains
#'
#' @description Takes the data frame produced by the tab_mark function and adds
#'   an "exclude_match" column flagging every marker that was genotyped in the
#'   individuals of the cross but is absent from the reference strains. This is
#'   useful when the parental strains were not genotyped together with the
#'   cross and an earlier genotyping result is used instead, because the
#'   markers of the array may have changed in between. We recommend always
#'   using this function in order to avoid errors.
#' @param tab data frame obtained with tab_mark function
#' @param ref data frame with the reference genotypes of mouse lines
#'
#' @return `tab` with an additional column "exclude_match": 1 when the marker
#'   is not found in `ref`, 0 when it is.
#'
#' @import dplyr
#'
#' @export
#'
mark_match <- function(tab, ref) {
  # marker names on each side, compared as character strings
  ref_markers <- as.character(ref$marker)
  cross_markers <- as.character(tab$SNP.Name)

  # markers present in both the reference file and the cross table
  snp <- intersect(ref_markers, cross_markers)

  # 0 for markers shared with the reference, 1 for all the others
  flagged <- mutate(tab, exclude_match = ifelse(SNP.Name %in% snp, 0, 1))
  return(flagged)
}
#' Data frame with miniMUGA genotyping of F2 individuals and parental strains
#'
#' A dataset containing the miniMUGA genotypes of 176 F2 individuals and of
#' their parental strains.
#'
#' @format A data frame with 2002493 observations of 11 variables
"genos"
{
"collab_server" : "",
"contents" : "",
"created" : 1622538162413.000,
"dirty" : false,
"encoding" : "",
"folds" : "",
"hash" : "0",
"id" : "806AAC34",
"lastKnownWriteTime" : 5,
"last_content_update" : 1622538162413,
"path" : null,
"project_path" : null,
"properties" : {
"cacheKey" : "634A6953",
"caption" : "stuart_tab",
"contentUrl" : "grid_resource/gridviewer.html?env=&obj=stuart_tab&cache_key=634A6953",
"displayedObservations" : 11125,
"environment" : "",
"expression" : "stuart_tab",
"object" : "stuart_tab",
"preview" : 0,
"totalObservations" : 11125,
"variables" : 7
},
"read_only" : false,
"read_only_alternatives" : [
],
"relative_order" : 4,
"source_on_save" : false,
"source_window" : "",
"type" : "r_dataframe"
}
\ No newline at end of file
#' @title Create the summary table for all markers from the genotype data frame
#'
#' @description This function creates a table with all the markers that were
#'   genotyped in the array, the alleles for these markers, the number of
#'   homozygous and heterozygous animals, as well as the number of non
#'   genotyped animals.
#' @param geno data frame with the genotyping results for your cross. Must
#'   contain the columns "SNP.Name", "Allele1...Forward" and
#'   "Allele2...Forward".
#'
#' @return A tibble with one row per marker (in order of first appearance in
#'   geno) and the columns SNP.Name, Allele_1, Allele_2, n_HM1, n_HM2, n_HT
#'   and n_NA. Counts are numeric and are 0 when a class is absent.
#'
#' @import dplyr
#' @import tidyr
#'
#' @export
#'
#### tab_mark function ####
## create table with markers and counts
tab_mark <- function(geno) {
  required <- c("SNP.Name", "Allele1...Forward", "Allele2...Forward")
  absent <- setdiff(required, names(geno))
  if (length(absent) > 0) {
    stop("geno is missing required column(s): ", paste(absent, collapse = ", "))
  }

  # order used to decide which allele of a marker is "allele 1"
  allele_levels <- c("A", "T", "C", "G")

  snp <- as.character(geno[["SNP.Name"]])
  allele_a <- as.character(geno[["Allele1...Forward"]])
  allele_b <- as.character(geno[["Allele2...Forward"]])

  # one output row per marker; every genotype row is mapped to its marker once,
  # so the table is never re-scanned per marker. match() pairs NA with NA, so
  # rows without a marker name form their own row instead of corrupting others.
  markers <- unique(snp)
  n_markers <- length(markers)
  marker_id <- match(snp, markers)

  # alleles as positions in allele_levels; "-" and anything else become NA
  code_a <- match(allele_a, allele_levels)
  code_b <- match(allele_b, allele_levels)

  # present[m, k] is TRUE when allele k was observed at least once for marker m
  present <- matrix(FALSE, nrow = n_markers, ncol = length(allele_levels))
  seen_a <- !is.na(code_a)
  seen_b <- !is.na(code_b)
  present[cbind(marker_id[seen_a], code_a[seen_a])] <- TRUE
  present[cbind(marker_id[seen_b], code_b[seen_b])] <- TRUE

  # first and second observed allele of each marker, following allele_levels;
  # any further allele is ignored. `second` is filled before `first` so that a
  # single allele can never occupy both slots.
  first <- rep(NA_integer_, n_markers)
  second <- rep(NA_integer_, n_markers)
  for (k in seq_along(allele_levels)) {
    has_k <- present[, k]
    second[has_k & !is.na(first) & is.na(second)] <- k
    first[has_k & is.na(first)] <- k
  }

  # NA-safe equality between a row's allele and its marker's reference allele
  matches <- function(code, target) {
    !is.na(code) & !is.na(target) & code == target
  }
  row_first <- first[marker_id]
  row_second <- second[marker_id]
  a_is_1 <- matches(code_a, row_first)
  b_is_1 <- matches(code_b, row_first)
  a_is_2 <- matches(code_a, row_second)
  b_is_2 <- matches(code_b, row_second)
  a_gap <- allele_a %in% "-"
  b_gap <- allele_b %in% "-"

  # genotype class of every row
  is_hm1 <- a_is_1 & b_is_1
  is_hm2 <- a_is_2 & b_is_2
  # heterozygous in either allele order (e.g. "AT" and "TA" are the same)
  is_ht <- (a_is_1 & b_is_2) | (a_is_2 & b_is_1)
  # non genotyped: "--", or one of the two marker alleles paired with "-"
  is_missing <- (a_gap & b_gap) |
    ((a_is_1 | a_is_2) & b_gap) |
    (a_gap & (b_is_1 | b_is_2))

  # number of flagged rows per marker (0 when there are none)
  count_by_marker <- function(flag) {
    as.numeric(tabulate(marker_id[flag], nbins = n_markers))
  }

  df_count <- tibble(SNP.Name = markers,
                     Allele_1 = allele_levels[first],
                     Allele_2 = allele_levels[second],
                     n_HM1 = count_by_marker(is_hm1),
                     n_HM2 = count_by_marker(is_hm2),
                     n_HT = count_by_marker(is_ht),
                     n_NA = count_by_marker(is_missing))
  return(df_count)
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment