diff --git a/.Rhistory b/.Rhistory deleted file mode 100644 index a6361541dd90ae181bc81517277060a6416e03cd..0000000000000000000000000000000000000000 --- a/.Rhistory +++ /dev/null @@ -1,512 +0,0 @@ -#create geno column in geno df -geno <- geno %>% unite(Geno,c("allele_1","allele_2"),sep="",remove=FALSE) -#recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA") -geno <- geno %>% mutate(Geno=recode(Geno, -"TA" = "AT", -"GA" = "AG", -"CA" = "AC", -"GT" = "TG", -"CT" = "TC", -"GC" = "CG")) -#create df with counts for each genotype -df_count <- tibble(marker = as.character(unique(geno$marker)), -allele_1 = NA, -allele_2 = NA, -n_HM1 = NA, -n_HM2 = NA, -n_HT = NA, -n_NA = NA) -## loop to count genotype -for(i in df_count$marker){ -#extract alleles for each marker -Alleles <- geno %>% filter(marker==i) %>% -select(c(marker,id,Geno,allele_1,allele_2)) %>% -pivot_longer(c(allele_1,allele_2),names_to="Allele_name",values_to="Allele") %>% -distinct(Allele) %>% filter(Allele != "-") -Alleles <- as.factor(paste(Alleles$Allele)) -#sort alleles -Alleles <- factor(Alleles,levels=c("A","T","C","G")) -Alleles <- sort(Alleles) -#add alleles and counts, only for markers with alleles (not markers with no genotyped ind) -if(all(rapportools::is.empty(Alleles))==FALSE){ -#add alleles to df_count -df_count <- df_count %>% mutate(allele_1 = ifelse(marker == i, -paste(Alleles[1]), allele_1)) -#count for homozygous for allele 1 -n1 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to df_count -df_count <- df_count %>% mutate(n_HM1 = ifelse(marker == i, -n1$n, n_HM1)) -} -#if marker not polymorphic -if(is.na(Alleles[2])==TRUE){ -#NA as allele_2 -df_count <- df_count %>% mutate(allele_2 = ifelse(marker == i, -NA, allele_2)) -#NA as n_HM2 -df_count <- df_count %>% mutate(n_HM2 = ifelse(marker == i, -NA, n_HM2)) -#NA as n_HT -df_count <- df_count %>% mutate(n_HT = ifelse(marker == i, -NA, n_HT)) -} else { -#add alleles to df_count -df_count <- df_count %>% mutate(allele_2 = ifelse(marker == i, -paste(Alleles[2]), allele_2)) -#count for homozygous for allele 2 -n2 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to df_count -df_count <- df_count %>% mutate(n_HM2 = ifelse(marker == i, -n2$n, n_HM2)) -#count for heterozygous -n3 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to df_count -df_count <- df_count %>% mutate(n_HT = ifelse(marker == i, -n3$n, n_HT)) -} -#count for NA -n4 <- geno %>% filter(marker==i) %>% -filter(Geno == "--" | -Geno == paste(Alleles[1],"-",sep="") | Geno == paste(Alleles[2],"-",sep="") | -Geno == paste("-",Alleles[1],sep="") | Geno == paste("-",Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for NA to df_count -df_count <- df_count %>% mutate(n_NA = ifelse(marker == i, -n4$n, n_NA)) -} -#change class of counts as numeric : -df_count$n_HM1 <- df_count$n_HM1 %>% as.numeric() -df_count$n_HM2 <- df_count$n_HM2 %>% as.numeric() -df_count$n_HT <- df_count$n_HT %>% as.numeric() -df_count$n_NA <- df_count$n_NA %>% as.numeric() -#add 0 for null counts -df_count <- df_count %>% mutate_at(.vars=vars(n_HM1,n_HM2,n_HT,n_NA),~replace(., is.na(.), 0)) -#save useful columns in annot dataframe -annot <- annot %>% select(marker,chr,!!sym(pos)) -print(annot) -#return -return(df_count) -} -tab_mark(genos,annot_mini,"cM_cox") -library(tidyr) -tab_mark(genos,annot_mini,"cM_cox") -tab_mark <- function(geno,annot,pos){ -#rename df columns -geno <- geno %>% rename("marker"=1, -"id"=2, -"allele_1"=3, -"allele_2"=4) -#create geno column in geno df -geno <- geno %>% unite(Geno,c("allele_1","allele_2"),sep="",remove=FALSE) -#recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA") -geno <- geno %>% mutate(Geno=recode(Geno, -"TA" = "AT", -"GA" = "AG", -"CA" = "AC", -"GT" = "TG", -"CT" = "TC", -"GC" = "CG")) -#create df with counts for each genotype -df_count <- tibble(marker = as.character(unique(geno$marker)), -allele_1 = NA, -allele_2 = NA, -n_HM1 = NA, -n_HM2 = NA, -n_HT = NA, -n_NA = NA) -## loop to count genotype -for(i in df_count$marker){ -#extract alleles for each marker -Alleles <- geno %>% filter(marker==i) %>% -select(c(marker,id,Geno,allele_1,allele_2)) %>% -pivot_longer(c(allele_1,allele_2),names_to="Allele_name",values_to="Allele") %>% -distinct(Allele) %>% filter(Allele != "-") -Alleles <- as.factor(paste(Alleles$Allele)) -#sort alleles -Alleles <- factor(Alleles,levels=c("A","T","C","G")) -Alleles <- sort(Alleles) -#add alleles and counts, only for markers with alleles (not markers with no genotyped ind) -if(all(rapportools::is.empty(Alleles))==FALSE){ -#add alleles to df_count -df_count <- df_count %>% mutate(allele_1 = ifelse(marker == i, -paste(Alleles[1]), allele_1)) -#count for homozygous for allele 1 -n1 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to df_count -df_count <- df_count %>% mutate(n_HM1 = ifelse(marker == i, -n1$n, n_HM1)) -} -#if marker not polymorphic -if(is.na(Alleles[2])==TRUE){ -#NA as allele_2 -df_count <- df_count %>% mutate(allele_2 = ifelse(marker == i, -NA, allele_2)) -#NA as n_HM2 -df_count <- df_count %>% mutate(n_HM2 = ifelse(marker == i, -NA, n_HM2)) -#NA as n_HT -df_count <- df_count %>% mutate(n_HT = ifelse(marker == i, -NA, n_HT)) -} else { -#add alleles to df_count -df_count <- df_count %>% mutate(allele_2 = ifelse(marker == i, -paste(Alleles[2]), allele_2)) -#count for homozygous for allele 2 -n2 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to df_count -df_count <- df_count %>% mutate(n_HM2 = ifelse(marker == i, -n2$n, n_HM2)) -#count for heterozygous -n3 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to df_count -df_count <- df_count %>% mutate(n_HT = ifelse(marker == i, -n3$n, n_HT)) -} -#count for NA -n4 <- geno %>% filter(marker==i) %>% -filter(Geno == "--" | -Geno == paste(Alleles[1],"-",sep="") | Geno == paste(Alleles[2],"-",sep="") | -Geno == paste("-",Alleles[1],sep="") | Geno == paste("-",Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for NA to df_count -df_count <- df_count %>% mutate(n_NA = ifelse(marker == i, -n4$n, n_NA)) -} -#change class of counts as numeric : -df_count$n_HM1 <- df_count$n_HM1 %>% as.numeric() -df_count$n_HM2 <- df_count$n_HM2 %>% as.numeric() -df_count$n_HT <- df_count$n_HT %>% as.numeric() -df_count$n_NA <- df_count$n_NA %>% as.numeric() -#add 0 for null counts -df_count <- df_count %>% mutate_at(.vars=vars(n_HM1,n_HM2,n_HT,n_NA),~replace(., is.na(.), 0)) -#save useful columns in annot dataframe -annot <- annot %>% select(marker,chr,!!sym(pos)) -tab <- left_join(tab,annot) -#return -return(df_count) -} -tab_mark(genos,annot_mini,"cM_cox") -tab_mark <- function(geno,annot,pos){ -#rename df columns -geno <- geno %>% rename("marker"=1, -"id"=2, -"allele_1"=3, -"allele_2"=4) -#create geno column in geno df -geno <- geno %>% unite(Geno,c("allele_1","allele_2"),sep="",remove=FALSE) -#recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA") -geno <- geno %>% mutate(Geno=recode(Geno, -"TA" = "AT", -"GA" = "AG", -"CA" = "AC", -"GT" = "TG", -"CT" = "TC", -"GC" = "CG")) -#create df with counts for each genotype -tab <- tibble(marker = as.character(unique(geno$marker)), -allele_1 = NA, -allele_2 = NA, -n_HM1 = NA, -n_HM2 = NA, -n_HT = NA, -n_NA = NA) -## loop to count genotype -for(i in tab$marker){ -#extract alleles for each marker -Alleles <- geno %>% filter(marker==i) %>% -select(c(marker,id,Geno,allele_1,allele_2)) %>% -pivot_longer(c(allele_1,allele_2),names_to="Allele_name",values_to="Allele") %>% -distinct(Allele) %>% filter(Allele != "-") -Alleles <- as.factor(paste(Alleles$Allele)) -#sort alleles -Alleles <- factor(Alleles,levels=c("A","T","C","G")) -Alleles <- sort(Alleles) -#add alleles and counts, only for markers with alleles (not markers with no genotyped ind) -if(all(rapportools::is.empty(Alleles))==FALSE){ -#add alleles to tab -tab <- tab %>% mutate(allele_1 = ifelse(marker == i, -paste(Alleles[1]), allele_1)) -#count for homozygous for allele 1 -n1 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HM1 = ifelse(marker == i, -n1$n, n_HM1)) -} -#if marker not polymorphic -if(is.na(Alleles[2])==TRUE){ -#NA as allele_2 -tab <- tab %>% mutate(allele_2 = ifelse(marker == i, -NA, allele_2)) -#NA as n_HM2 -tab <- tab %>% mutate(n_HM2 = ifelse(marker == i, -NA, n_HM2)) -#NA as n_HT -tab <- tab %>% mutate(n_HT = ifelse(marker == i, -NA, n_HT)) -} else { -#add alleles to tab -tab <- tab %>% mutate(allele_2 = ifelse(marker == i, -paste(Alleles[2]), allele_2)) -#count for homozygous for allele 2 -n2 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HM2 = ifelse(marker == i, -n2$n, n_HM2)) -#count for heterozygous -n3 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HT = ifelse(marker == i, -n3$n, n_HT)) -} -#count for NA -n4 <- geno %>% filter(marker==i) %>% -filter(Geno == "--" | -Geno == paste(Alleles[1],"-",sep="") | Geno == paste(Alleles[2],"-",sep="") | -Geno == paste("-",Alleles[1],sep="") | Geno == paste("-",Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for NA to tab -tab <- tab %>% mutate(n_NA = ifelse(marker == i, -n4$n, n_NA)) -} -#change class of counts as numeric : -tab$n_HM1 <- tab$n_HM1 %>% as.numeric() -tab$n_HM2 <- tab$n_HM2 %>% as.numeric() -tab$n_HT <- tab$n_HT %>% as.numeric() -tab$n_NA <- tab$n_NA %>% as.numeric() -#add 0 for null counts -tab <- tab %>% mutate_at(.vars=vars(n_HM1,n_HM2,n_HT,n_NA),~replace(., is.na(.), 0)) -#save useful columns in annot dataframe -annot <- annot %>% select(marker,chr,!!sym(pos)) -tab <- left_join(tab,annot) -#return -return(tab) -} -tab_mark(genos,annot_mini,"cM_cox") -tab_mark <- function(geno,annot,pos){ -#rename df columns -geno <- geno %>% rename("marker"=1, -"id"=2, -"allele_1"=3, -"allele_2"=4) -#create geno column in geno df -geno <- geno %>% unite(Geno,c("allele_1","allele_2"),sep="",remove=FALSE) -#recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA") -geno <- geno %>% mutate(Geno=recode(Geno, -"TA" = "AT", -"GA" = "AG", -"CA" = "AC", -"GT" = "TG", -"CT" = "TC", -"GC" = "CG")) -#create df with counts for each genotype -tab <- tibble(marker = as.character(unique(geno$marker)), -allele_1 = NA, -allele_2 = NA, -n_HM1 = NA, -n_HM2 = NA, -n_HT = NA, -n_NA = NA) -## loop to count genotype -for(i in tab$marker){ -#extract alleles for each marker -Alleles <- geno %>% filter(marker==i) %>% -select(c(marker,id,Geno,allele_1,allele_2)) %>% -pivot_longer(c(allele_1,allele_2),names_to="Allele_name",values_to="Allele") %>% -distinct(Allele) %>% filter(Allele != "-") -Alleles <- as.factor(paste(Alleles$Allele)) -#sort alleles -Alleles <- factor(Alleles,levels=c("A","T","C","G")) -Alleles <- sort(Alleles) -#add alleles and counts, only for markers with alleles (not markers with no genotyped ind) -if(all(rapportools::is.empty(Alleles))==FALSE){ -#add alleles to tab -tab <- tab %>% mutate(allele_1 = ifelse(marker == i, -paste(Alleles[1]), allele_1)) -#count for homozygous for allele 1 -n1 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HM1 = ifelse(marker == i, -n1$n, n_HM1)) -} -#if marker not polymorphic -if(is.na(Alleles[2])==TRUE){ -#NA as allele_2 -tab <- tab %>% mutate(allele_2 = ifelse(marker == i, -NA, allele_2)) -#NA as n_HM2 -tab <- tab %>% mutate(n_HM2 = ifelse(marker == i, -NA, n_HM2)) -#NA as n_HT -tab <- tab %>% mutate(n_HT = ifelse(marker == i, -NA, n_HT)) -} else { -#add alleles to tab -tab <- tab %>% mutate(allele_2 = ifelse(marker == i, -paste(Alleles[2]), allele_2)) -#count for homozygous for allele 2 -n2 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HM2 = ifelse(marker == i, -n2$n, n_HM2)) -#count for heterozygous -n3 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HT = ifelse(marker == i, -n3$n, n_HT)) -} -#count for NA -n4 <- geno %>% filter(marker==i) %>% -filter(Geno == "--" | -Geno == paste(Alleles[1],"-",sep="") | Geno == paste(Alleles[2],"-",sep="") | -Geno == paste("-",Alleles[1],sep="") | Geno == paste("-",Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for NA to tab -tab <- tab %>% mutate(n_NA = ifelse(marker == i, -n4$n, n_NA)) -} -#change class of counts as numeric : -tab$n_HM1 <- tab$n_HM1 %>% as.numeric() -tab$n_HM2 <- tab$n_HM2 %>% as.numeric() -tab$n_HT <- tab$n_HT %>% as.numeric() -tab$n_NA <- tab$n_NA %>% as.numeric() -#add 0 for null counts -tab <- tab %>% mutate_at(.vars=vars(n_HM1,n_HM2,n_HT,n_NA),~replace(., is.na(.), 0)) -#save useful columns in annot dataframe -annot <- annot %>% select(marker,chr,!!sym(pos)) -tab <- left_join(tab,annot,by="marker") -#return -return(tab) -} -tab_mark(genos,annot_mini,"cM_cox") -tab_mark <- function(geno,annot,pos){ -#rename df columns -geno <- geno %>% rename("marker"=1, -"id"=2, -"allele_1"=3, -"allele_2"=4) -#create geno column in geno df -geno <- geno %>% unite(Geno,c("allele_1","allele_2"),sep="",remove=FALSE) -#recode genotypes to have all heterozygous encoded the same way (ex: only "AT", no "TA") -geno <- geno %>% mutate(Geno=recode(Geno, -"TA" = "AT", -"GA" = "AG", -"CA" = "AC", -"GT" = "TG", -"CT" = "TC", -"GC" = "CG")) -#create df with counts for each genotype -tab <- tibble(marker = as.character(unique(geno$marker)), -allele_1 = NA, -allele_2 = NA, -n_HM1 = NA, -n_HM2 = NA, -n_HT = NA, -n_NA = NA) -## loop to count genotype -for(i in tab$marker){ -#extract alleles for each marker -Alleles <- geno %>% filter(marker==i) %>% -select(c(marker,id,Geno,allele_1,allele_2)) %>% -pivot_longer(c(allele_1,allele_2),names_to="Allele_name",values_to="Allele") %>% -distinct(Allele) %>% filter(Allele != "-") -Alleles <- as.factor(paste(Alleles$Allele)) -#sort alleles -Alleles <- factor(Alleles,levels=c("A","T","C","G")) -Alleles <- sort(Alleles) -#add alleles and counts, only for markers with alleles (not markers with no genotyped ind) -if(all(rapportools::is.empty(Alleles))==FALSE){ -#add alleles to tab -tab <- tab %>% mutate(allele_1 = ifelse(marker == i, -paste(Alleles[1]), allele_1)) -#count for homozygous for allele 1 -n1 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[1],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HM1 = ifelse(marker == i, -n1$n, n_HM1)) -} -#if marker not polymorphic -if(is.na(Alleles[2])==TRUE){ -#NA as allele_2 -tab <- tab %>% mutate(allele_2 = ifelse(marker == i, -NA, allele_2)) -#NA as n_HM2 -tab <- tab %>% mutate(n_HM2 = ifelse(marker == i, -NA, n_HM2)) -#NA as n_HT -tab <- tab %>% mutate(n_HT = ifelse(marker == i, -NA, n_HT)) -} else { -#add alleles to tab -tab <- tab %>% mutate(allele_2 = ifelse(marker == i, -paste(Alleles[2]), allele_2)) -#count for homozygous for allele 2 -n2 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[2],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HM2 = ifelse(marker == i, -n2$n, n_HM2)) -#count for heterozygous -n3 <- geno %>% filter(marker==i) %>% -filter(Geno == paste(Alleles[1],Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for homozygous for allele 1 to tab -tab <- tab %>% mutate(n_HT = ifelse(marker == i, -n3$n, n_HT)) -} -#count for NA -n4 <- geno %>% filter(marker==i) %>% -filter(Geno == "--" | -Geno == paste(Alleles[1],"-",sep="") | Geno == paste(Alleles[2],"-",sep="") | -Geno == paste("-",Alleles[1],sep="") | Geno == paste("-",Alleles[2],sep="")) %>% -summarise(n=n()) -#add count for NA to tab -tab <- tab %>% mutate(n_NA = ifelse(marker == i, -n4$n, n_NA)) -} -#change class of counts as numeric : -tab$n_HM1 <- tab$n_HM1 %>% as.numeric() -tab$n_HM2 <- tab$n_HM2 %>% as.numeric() -tab$n_HT <- tab$n_HT %>% as.numeric() -tab$n_NA <- tab$n_NA %>% as.numeric() -#add 0 for null counts -tab <- tab %>% mutate_at(.vars=vars(n_HM1,n_HM2,n_HT,n_NA),~replace(., is.na(.), 0)) -#save useful columns in annot dataframe -annot <- annot %>% select(marker,chr,!!sym(pos)) -tab <- right_join(annot,tab,by="marker") -#return -return(tab) -} -tab_mark(genos,annot_mini,"cM_cox") -# how to use the function: -# stuart_tab <- tab_mark(genos,annot_mini,"cM_cox") -tab <- tab_mark(genos,annot_mini,"cM_cox") -View(tab) -View(genos) -View(tab) diff --git a/.gitignore b/.gitignore index 42d718175c9ea808fa1d65491df803f76914cfb2..e934e149a702dc6cf7e0ccb24cb264857131d614 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ Meta /doc/ /Meta/ .Rhistory +.Rhistory