230116 v1.1.0 new tab_mark function

ec4b9742 · mariefbourdon · c792484b · ec4b9742 · ec4b9742 · ec4b9742
Commit ec4b9742 authored 2 years ago by mariefbourdon
--- a/.Rproj.user/shared/notebooks/paths
+++ b/.Rproj.user/shared/notebooks/paths
-/home/marie/Documents/HB_CC011/F2/Geno/Maestro/F2_geno_estmap1.R="42B0AE06"
 /home/marie/Documents/stuart/stuart_package/stuart/DESCRIPTION="55556B0B"
+/home/marie/Documents/stuart/stuart_package/stuart/NAMESPACE="D52CF639"
 /home/marie/Documents/stuart/stuart_package/stuart/R/geno_strains.R="68944065"
 /home/marie/Documents/stuart/stuart_package/stuart/R/mark_estmap.R="F759F000"
+/home/marie/Documents/stuart/stuart_package/stuart/R/mark_poly.R="25EFE570"
+/home/marie/Documents/stuart/stuart_package/stuart/R/mark_prop.R="E4688428"
+/home/marie/Documents/stuart/stuart_package/stuart/R/tab_mark.R="37E604E4"
+/home/marie/Documents/stuart/stuart_package/stuart/R/write_rqtl.R="04457D88"
 /home/marie/Documents/stuart/stuart_package/stuart/README.Rmd="4E06E7DC"
 /home/marie/Documents/stuart/stuart_package/stuart/doc/stuart.R="A03ACF3D"
 /home/marie/Documents/stuart/stuart_package/stuart/man/genos.Rd="75E7412F"

--- a/DESCRIPTION
+++ b/DESCRIPTION
 Package: stuart
 Title: Sort markers of lab strains genotyping results
-Version: 1.0.7
+Version: 1.1.0
 Authors@R: 
    person(given = "Marie",
           family = "Bourdon",
@@ -12,7 +12,7 @@ License: GPL-3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.1
 Depends: 
    R (>= 3.5.0),
    dplyr,

--- a/R/mark_allele.R
+++ b/R/mark_allele.R
@@ -6,6 +6,7 @@
 #' @param cross cross type, either "F2" or "N2".
 #' @param par1 first parental strain used in the cross, the name must be written as in the "ref" data frame. For a backcross the strain used to backcross F1 individuals must be indicated as par1.
 #' @param par2 second parental strain used in the cross, the name must be written as in the "ref" data frame.
+#' @param parNH wether markers for which one of the parental strains has an undetermined or heterozygous genotype (default is TRUE)
 #'
 #' @import dplyr
 #'

--- a/R/mark_prop.R
+++ b/R/mark_prop.R
@@ -16,7 +16,7 @@
 #' @param homo1X a vector of two numbers. The lower and upper limits for the proportion of homozygous individuals for markers on X chromosome. This argument is for homozygous genotype with the highest expected proportion.
 #' @param homo2X a vector of two numbers. The lower and upper limits for the proportion of homozygous individuals for markers on X chromosome. This argument is for homozygous genotype with the lowest expected proportion.
 #' @param heteroX a vector of two numbers. The lower and upper limits for the proportion of heterozygous individuals for markers on X chromosome.
-#' @param na proportion of non-genotyped individuals above which the marker is excluded.
+#' @param pval chi2 p value threshold under which the marker is excluded. Must not be used if the homo and hetero arguments are used.
 #'
 #' @import dplyr
 #' @import tidyselect

--- a/R/tab_mark.R
+++ b/R/tab_mark.R
@@ -23,137 +23,73 @@ tab_mark <- function(geno,annot,pos){
    stop("No position data")
  }
-  #rename df columns
+  geno <- geno %>% rename(marker = 1, id = 2, allele_1 = 3,
-  geno <- geno %>% rename("marker"=1,
+                          allele_2 = 4)
-                          "id"=2,
-                          "allele_1"=3,
+  tab <- geno %>% group_by(marker) %>%
-                          "allele_2"=4)
+    summarise(alleles=toString(unique(c(allele_1,allele_2))))
-  #create geno column in geno df
+  tab <- tab %>% separate(alleles,c("al1","al2","al3"),sep=", ",fill="right") %>%
-  geno <- geno %>% unite(Geno,c("allele_1","allele_2"),sep="",remove=FALSE)
+    mutate(allele_1=case_when(al1 == "A" | al2 == "A" | al3 == "A" ~ "A",
+                              al1 == "T" | al2 == "T" | al3 == "T" ~ "T",
-  #recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA")
+                              al1 == "C" | al2 == "C" | al3 == "C" ~ "C",
-  geno <- geno %>% mutate(Geno=recode(Geno,
+                              al1 == "G" | al2 == "G" | al3 == "G" ~ "G")) %>%
-                                      "TA" = "AT",
+    mutate(allele_2=case_when(#is.na(al2)==TRUE & is.na(al3)==TRUE ~ NA,
-                                      "GA" = "AG",
+      allele_1 != "T" & (al1=="T" | al2=="T" | al3=="T") ~ "T",
-                                      "CA" = "AC",
+      allele_1 != "C" & (al1=="C" | al2=="C" | al3=="C") ~ "C",
-                                      "GT" = "TG",
+      allele_1 != "G" & (al1=="G" | al2=="G" | al3=="G") ~ "G")) %>%
-                                      "CT" = "TC",
+    select(-c(al1,al2,al3))
-                                      "GC" = "CG"))
+  # genotype per ind
+  geno <- geno %>% unite(Geno, c("allele_1", "allele_2"), sep = "",
-  #create df with counts for each genotype
+                         remove = FALSE) %>% select(marker,id,Geno)
-  tab <- tibble(marker = as.character(unique(geno$marker)),
+  geno <- geno %>% mutate(Geno = recode(Geno, TA = "AT", GA = "AG",
-                allele_1 = NA,
+                                        CA = "AC", GT = "TG", CT = "TC", GC = "CG"))
-                allele_2 = NA,
-                n_HM1 = NA,
+  #summarise
-                n_HM2 = NA,
+  geno <- geno %>% group_by(Geno,marker) %>% summarise(n=n())
-                n_HT = NA,
+  geno <- geno %>% pivot_wider(names_from=Geno,values_from=n)
-                n_NA = NA)
+  # add columns if missing
+  genocols <- c(AA = NA_integer_, AT = NA_integer_, AG = NA_integer_, AC = NA_integer_,
-  ## loop to count genotype
+                TT = NA_integer_, TC = NA_integer_, TG = NA_integer_,
-  for(i in tab$marker){
+                CC = NA_integer_, CG = NA_integer_,
-    #extract alleles for each marker
+                GG = NA_integer_)
-    Alleles <- geno %>% filter(marker==i) %>%
-      select(c(marker,id,Geno,allele_1,allele_2)) %>%
+  if(!rapportools::is.empty(setdiff(names(genocols), names(geno)))){
-      pivot_longer(c(allele_1,allele_2),names_to="Allele_name",values_to="Allele") %>%
+    geno <- geno %>% add_column(!!!genocols[setdiff(names(genocols), names(geno))])
-      distinct(Allele) %>% filter(Allele != "-")
-    Alleles <- as.factor(paste(Alleles$Allele))
-    #sort alleles
-    Alleles <- factor(Alleles,levels=c("A","T","C","G"))
-    Alleles <- sort(Alleles)
-    #add alleles and counts, only for markers with alleles (not markers with no genotyped ind)
-    if(all(rapportools::is.empty(Alleles))==FALSE){
-      #add alleles to tab
-      tab <- tab %>% mutate(allele_1 = ifelse(marker == i,
-                                              paste(Alleles[1]), allele_1))
-      #count for homozygous for allele 1
-      n1 <- geno %>% filter(marker==i) %>%
-        filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>%
-        summarise(n=n())
-      #add count for homozygous for allele 1 to tab
-      tab <- tab %>% mutate(n_HM1 = ifelse(marker == i,
-                                           n1$n, n_HM1))
-    }
-    #if marker not polymorphic
-    if(is.na(Alleles[2])==TRUE){
-      #NA as allele_2
-      tab <- tab %>% mutate(allele_2 = ifelse(marker == i,
-                                              NA, allele_2))
-      #NA as n_HM2
-      tab <- tab %>% mutate(n_HM2 = ifelse(marker == i,
-                                           NA, n_HM2))
-      #NA as n_HT
-      tab <- tab %>% mutate(n_HT = ifelse(marker == i,
-                                          NA, n_HT))
-    } else {
-      #add alleles to tab
-      tab <- tab %>% mutate(allele_2 = ifelse(marker == i,
-                                              paste(Alleles[2]), allele_2))
-      #count for homozygous for allele 2
-      n2 <- geno %>% filter(marker==i) %>%
-        filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>%
-        summarise(n=n())
-      #add count for homozygous for allele 1 to tab
-      tab <- tab %>% mutate(n_HM2 = ifelse(marker == i,
-                                           n2$n, n_HM2))
-      #count for heterozygous
-      n3 <- geno %>% filter(marker==i) %>%
-        filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>%
-        summarise(n=n())
-      #add count for homozygous for allele 1 to tab
-      tab <- tab %>% mutate(n_HT = ifelse(marker == i,
-                                          n3$n, n_HT))
-    }
-    #count for NA
-    n4 <- geno %>% filter(marker==i) %>%
-      filter(Geno == "--" |
-               Geno == paste(Alleles[1],"-",sep="") | Geno == paste(Alleles[2],"-",sep="") |
-               Geno == paste("-",Alleles[1],sep="") | Geno == paste("-",Alleles[2],sep="")) %>%
-      summarise(n=n())
-    #add count for NA to tab
-    tab <- tab %>% mutate(n_NA = ifelse(marker == i,
-                                        n4$n, n_NA))
  }
-  #change class of counts as numeric :
-  tab$n_HM1 <- tab$n_HM1 %>% as.numeric()
-  tab$n_HM2 <- tab$n_HM2 %>% as.numeric()
-  tab$n_HT <- tab$n_HT %>% as.numeric()
-  tab$n_NA <- tab$n_NA %>% as.numeric()
-  #add 0 for null counts
+  #join tab and geno
-  tab <- tab %>% mutate_at(.vars=vars(n_HM1,n_HM2,n_HT,n_NA),~replace(., is.na(.), 0))
+  tab <- full_join(tab,geno,by="marker")
-  #save useful columns in annot dataframe
+  #replace NA with 0
+  tab <- tab %>% mutate_at(c(4:14), ~replace(., is.na(.), 0))
+  #create n_HM1, n_HT, n_HM2, n_NA columns and suppress others
+  tab <- tab %>% mutate(n_HM1=case_when(allele_1=="A"~AA,
+                                        allele_1=="T"~TT,
+                                        allele_1=="C"~CC,
+                                        allele_1=="G"~GG,
+                                        T ~ 0)) %>%
+    mutate(n_HM2=case_when(allele_2=="T"~TT,
+                           allele_2=="C"~CC,
+                           allele_2=="G"~GG,
+                           T ~ 0)) %>%
+    mutate(n_HT=case_when(allele_1=="A" & allele_2=="T" ~ AT,
+                          allele_1=="A" & allele_2=="C" ~ AC,
+                          allele_1=="A" & allele_2=="G" ~ AG,
+                          allele_1=="T" & allele_2=="C" ~ TC,
+                          allele_1=="T" & allele_2=="G" ~ TG,
+                          allele_1=="C" & allele_2=="G" ~ CG,
+                          T ~ 0)) %>%
+    rename("n_NA"="--") %>%
+    select(marker:allele_2,n_HM1:n_HT,n_NA)
+  #save useful columns in annot dataframe and merge
  annot <- annot %>% select(marker,chr,!!sym(pos))
  tab <- right_join(annot,tab,by="marker")
  #return
  return(tab)
 }
--- a/R/write_rqtl.R
+++ b/R/write_rqtl.R
@@ -20,7 +20,7 @@
 #'
 #### write_rqtl ####
 ## write data frame in rqtl format (csv), if path != NA writes the file in the path indicated
-write_rqtl <- function(geno,pheno,tab,ref,par1,par2,par_N=TRUE,prefix,pos,path=NA){
+write_rqtl <- function(geno,pheno,tab,ref,par1,par2,prefix,pos,path=NA){
  #rename df columns
  geno <- geno %>% rename("marker"=1,
                          "id"=2,

--- a/man/geno_strains.Rd
+++ b/man/geno_strains.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/geno_strains.R
 \name{geno_strains}
 \alias{geno_strains}
-\title{Create haplotype for inbred strains into a dataframe}
+\title{Summarizes genotypes of multiple individuals}
 \usage{
 geno_strains(annot, geno, strn, cols)
 }
@@ -16,7 +16,9 @@ geno_strains(annot, geno, strn, cols)
 \item{cols}{name of the columns from the annot data frame to keep in the output of this function.}
 }
 \description{
-This functions adds columns for parental strains used in the cross in the annotation data frame, from the genotype data frame in which one or several animal of the parental strains were genotyped.
+This functions formats genotypes of genotyped individuals from a two letters encoding to a one letter encoding.
 If several animals of one strain were genotyped, a consensus is created from these animals.
-The consensus is created as follow : if the indivuals carry the same allele, this allele is kept, otherwise, the allele is noted as "N". If individuals show residual heterozygosity, it is encoded as "H".
+The one letter encoding is made as follow: if the individual is homozygous, the letter of the allele (A, T, G or C) is kept. If the individual show residual heterozygosity, it is encoded as "H". If the genotype is missing, it is encoded as "N".
+The consensus is created as follow : if the indivuals carry the same allele, this allele is kept. If one or several individuals are heterozygous, the genotype is encoded as "H". If one individual has a missing genotypes but another was correctly genotyped, its genotype is kept.
+If the genotypes are completely discordant (i.e. if two individuals are both homozygous but for different alleles), the genotype is encoded as "D".
 }
--- a/man/mark_allele.Rd
+++ b/man/mark_allele.Rd
@@ -16,6 +16,8 @@ mark_allele(tab, ref, cross, par1, par2, parNH = TRUE)
 \item{par1}{first parental strain used in the cross, the name must be written as in the "ref" data frame. For a backcross the strain used to backcross F1 individuals must be indicated as par1.}
 \item{par2}{second parental strain used in the cross, the name must be written as in the "ref" data frame.}
+\item{parNH}{wether markers for which one of the parental strains has an undetermined or heterozygous genotype (default is TRUE)}
 }
 \description{
 This functions uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers which have alleles observed in the individuals of the cross that do not correspond to the alleles observed in the parental strains. For example, a marker which is not polymorphic between the two parental strains but which has two alleles in the cross individuals will be excluded.

--- a/man/mark_prop.Rd
+++ b/man/mark_prop.Rd
@@ -12,8 +12,7 @@ mark_prop(
  pval = NA,
  homo1X = NULL,
  homo2X = NULL,
-  heteroX = NULL,
+  heteroX = NULL
-  na = 0.5
 )
 }
 \arguments{
@@ -25,13 +24,13 @@ mark_prop(
 \item{hetero}{proportion of heterozygous individuals under which the marker is excluded.}
+\item{pval}{chi2 p value threshold under which the marker is excluded. Must not be used if the homo and hetero arguments are used.}
 \item{homo1X}{a vector of two numbers. The lower and upper limits for the proportion of homozygous individuals for markers on X chromosome. This argument is for homozygous genotype with the highest expected proportion.}
 \item{homo2X}{a vector of two numbers. The lower and upper limits for the proportion of homozygous individuals for markers on X chromosome. This argument is for homozygous genotype with the lowest expected proportion.}
 \item{heteroX}{a vector of two numbers. The lower and upper limits for the proportion of heterozygous individuals for markers on X chromosome.}
-\item{na}{proportion of non-genotyped individuals above which the marker is excluded.}
 }
 \description{
 This function uses the dataframe produced by the tab_mark function and fills the "exclude" column for all the markers that present too much missing genotypes or odd proportions of each genotype.

--- a/man/write_rqtl.Rd
+++ b/man/write_rqtl.Rd
@@ -4,18 +4,7 @@
 \alias{write_rqtl}
 \title{Create data frame in Rqtl CSV format}
 \usage{
-write_rqtl(
+write_rqtl(geno, pheno, tab, ref, par1, par2, prefix, pos, path = NA)
-  geno,
-  pheno,
-  tab,
-  ref,
-  par1,
-  par2,
-  par_N = TRUE,
-  prefix,
-  pos,
-  path = NA
-)
 }
 \arguments{
 \item{geno}{data frame with the genotyping results for your cross}

--- a/stuart_1.0.7.pdf
+++ b/stuart_1.0.7.pdf
--- a/stuart_1.1.0.pdf
+++ b/stuart_1.1.0.pdf