diff --git a/article/.RData b/article/.RData index 439a2e1d9b2f72071735afc73410e1969b8bb902..a664efc58c763b9f841bd59d020adcf0a278cc6a 100644 Binary files a/article/.RData and b/article/.RData differ diff --git a/article/article_figures.Rmd b/article/article_figures.Rmd index 2ad0c9169d548c929dc995123fc326a279b0f1a4..0d6d24e6550143d29602c578ec73c9332f1e6b24 100644 --- a/article/article_figures.Rmd +++ b/article/article_figures.Rmd @@ -1,7 +1,7 @@ --- title: "article_figures.Rmd" author: "Marie Bourdon" -date: "July 2021" +date: "June 2022" output: html_document --- @@ -246,6 +246,27 @@ pheno_before_plot <- qtl_plot(pheno_before,lod=data.frame(group = c("alpha=0.05" pheno_before_plot ``` +```{r before_ann} +chrs <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "X") +ann_dat_text<-data.frame( + chr=factor(chrs, + levels=chrs), + lod=c(NA,7.5,NA,NA,15,NA,10.5,10.5,NA,5.5,NA,NA,18.5,NA,NA,NA,NA,NA,NA,NA), + label=c(NA,"1",NA,NA,"2",NA,"3","4",NA,"5",NA,NA,"6",NA,NA,NA,NA,NA,NA,NA), + x=c(NA,17,NA,NA,27,NA,70,10,NA,30,NA,NA,33,NA,NA,NA,NA,NA,NA,NA) +) + +pheno_before_plot + geom_text( + # the new dataframe for annotating text + data = ann_dat_text, + mapping = aes(x = x, + y = lod, + label = label, + color="blue") +) +``` + + ```{r before_plot} pheno_before_zoom <- qtl_plot(pheno_before,lod=data.frame(group = c("alpha=0.05", "alpha=0.1","alpha=0.63"), @@ -447,18 +468,29 @@ functions_plot rm(none,allele,match,poly,prop,barplot_df) ``` -# Pheno data format +## Pheno data format + ```{r pheno} format_pheno <- phenos[1:6,] write_csv(format_pheno,file="sup/tableS1.csv") print(xtable::xtable(format_pheno, type = "latex"), file = "tables/tab_alleles.tex",include.rownames=FALSE) ``` +## est rf + +```{r} +source("files/find_linked_markers.R") +estrf_matrix_after <- pull.rf(cross_after, what=c("lod")) + +find_linked_markers(estrf_matrix_after,mark="S6J011498219",annot=annot_mini) +``` + + ## Narrow peaks Investigation of high lod score peaks -### Chromosome 2 +### Peak 1 1 peak on 1 pseudomarker : c2.loc10, postionned between gUNC2731905 and gUNCHS004244. @@ -471,14 +503,6 @@ tab_before %>% filter(marker %in% c("gUNC2731905","gUNCHS004244")) %>% select(ma For gUNC2731905, all individuals except 1 are homozygous so this marker should be removed. The proportions for gUNCHS004244 seem correct. -```{r parents_geno_chr2} -strns_ref %>% filter(marker %in% c("gUNCHS004244")) -strains %>% filter(marker %in% c("gUNCHS004244")) -``` - -The alleles of parental strains seem correct too and are the same in the reference alleles data and in our genotypes. - - Graph: ```{r geno_plot_chr2} @@ -507,15 +531,16 @@ geno_plot2 <- pgmap %>% filter(pos > 10 & pos < 20) %>% geno_plot2 ``` -However, it seems to be an excess of recombinants for gUNCHS004244. Let's look at the calculated map for this marker : +This region contains many markers with non Mendelian proportions (many have an excess of homozygous). This leads to the creation of a pseudomarker between gUNC2731905 and gUNCHS004244 with incorrect genotypic probabilities which leads to a spurious peak. ```{r} -newmap_before[["2"]][11:16] +newmap_before[["2"]][8:20] ``` -There is indeed an increase recombinants for gUNCHS004244 as the calculated distance with the previous marker is 4428.269-4277.043=151.226 cM and the calculated distance with the following marker is 4569.736-4428.269=141.467 cM. -### Chromosome 5 +Recombination fractions between adjacent markers in this regions are indeed high. + +### Peak 2 1 peak on 1 marker : mUNC050096588 @@ -525,7 +550,7 @@ Here are the infos on genotype counts for these markers: tab_before %>% filter(marker %in% c("mUNC050096588")) %>% select(marker:n_NA) ``` -All individuals heterozygous so this marker should be removed +All individuals heterozygous so this marker should be removed. ```{r} newmap_before[["5"]][18:23] @@ -561,7 +586,7 @@ test_plot <- pgmap %>% filter(pos > 20 & pos < 30) %>% test_plot ``` -### Chromosome 7 +### Peak 3 2 peaks, one on 1 pseudomarker : c7.loc74, between UNC13823755 and S3J075374098; and one on 1 marker and 1 pseudomarker : c7.loc82 and gUNC13998623, c7.loc82 being between gUNC13979374 and gUNC13998623. @@ -607,7 +632,7 @@ test_plot <- pgmap %>% filter(pos > 65 & pos < 90) %>% test_plot ``` -### Chromosome 8 +### Peak 4 1 peaks, one on 1 marker : mbackupJAX00158395 @@ -654,7 +679,7 @@ test_plot ``` -### Chromosome 10 +### Peak 5 1 peaks, one on 1 marker : S6J102311553 @@ -710,7 +735,7 @@ newmap_before[["10"]][80:110] Indeed, there is not an increased distance between S6J102311553 and the adjacent markers; but S6J102311553 is in the middle of a region with enormous calculated with its adjacent markers, between S2T101968115 (showing 19084.57-18083.06=1001.51 cM with the previous marker) and S2T102642258 (showing 20373.93-19372.42=1001.51 cM with the following marker) -### Chromosome 13 +### Peak 6 1 peak, one on 1 marker : SAC132487883 @@ -817,6 +842,8 @@ compar_pos_after <- tibble(marker=compar_pos_after$marker, dif_fol=calc_fol/cox_fol) mean(compar_pos_before$dif_prev,na.rm=TRUE) +sd(compar_pos_before$dif_prev,na.rm=TRUE) mean(compar_pos_after$dif_prev,na.rm=TRUE) +sd(compar_pos_after$dif_prev,na.rm=TRUE) ``` diff --git a/article/data2/.RData b/article/data2/.RData index cc719da40b4c5bbfd84e1b13b3e0c48bdc0be988..a927b648399c261a1f7ac6b98c65a591677efd80 100644 Binary files a/article/data2/.RData and b/article/data2/.RData differ diff --git a/article/data2/data2.Rmd b/article/data2/data2.Rmd index 5befcc1b728413b4f9a2643a3a5ac3d877c659c6..091637ce0bc696a0c58bd5047d58457e51e10383 100644 --- a/article/data2/data2.Rmd +++ b/article/data2/data2.Rmd @@ -83,6 +83,27 @@ pheno_before_plot <- qtl_plot(pheno_before,lod=data.frame(group = c("alpha=0.05" pheno_before_plot ``` +```{r before_ann} +chrs <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "X") +ann_dat_text<-data.frame( + chr=factor(chrs, + levels=chrs), + lod=c(rep(NA,10),8,rep(NA,9)), + label=c(rep(NA,10),7,rep(NA,9)), + x=c(rep(NA,10),22,rep(NA,9)) +) + +pheno_before_plot + geom_text( + # the new dataframe for annotating text + data = ann_dat_text, + mapping = aes(x = x, + y = lod, + label = label, + color="blue") +) +``` + + ## create file for parental strains genotyped ```{r} @@ -187,4 +208,50 @@ functions_plot <- functions_df %>% ggplot(aes(x=markers,y=fct)) + functions_plot rm(none,allele,match,poly,prop,functions_df) -``` \ No newline at end of file +``` + +## Narrow peaks + +Investigation of high lod score peaks + +### Peak 7 + +1 peak on 1 pseudomarker : c11.loc18, postionned between SNT111392585 and mJAX00308021. + +Here are the infos on genotype counts for these markers: + +```{r summary_geno_chr2} +tab_before %>% filter(marker %in% c("SNT111392585","mJAX00308021")) %>% select(marker:n_NA) +``` + +For SNT111392585, all individuals except 1 are homozygous so this marker should be removed. The proportions for mJAX00308021 seem correct. + +Graph: + +```{r geno_plot_chr2} +phenotypes <- cross_before[["pheno"]] +map <- cross_before[["geno"]][["11"]][["map"]] +map <- tibble(marker=names(map),pos=map) +genotypes <- cross_before[["geno"]][["11"]][["data"]] +genotypes <- as_tibble(genotypes) +phenogeno <- cbind(phenotypes,genotypes) +phenogeno %<>% pivot_longer(mbackupUNC110000218:gUNC20538837,names_to="marker",values_to="genotype") +pgmap <- full_join(phenogeno,map,by="marker") + +geno_plot2 <- pgmap %>% filter(pos > 15 & pos < 25) %>% + filter(id %in% sample(phenotypes$id,10)) %>% + ggplot(aes(x=pos,y=as.factor(id))) + + geom_point(aes(color=as.factor(genotype))) + + coord_cartesian(ylim = c(1, 10), expand = TRUE, clip = "off") + + annotate(geom="text",y=-1,size=3, + x = map %>% filter(pos > 15 & pos < 25) %>% pull(pos), + label = map %>% filter(pos > 15 & pos < 25) %>% pull(marker), + angle=90) + + labs(x="Position (cM)",y="Individual",color="Genotype") + + theme_bw() + + theme(plot.margin = unit(c(1, 1, 1, 1), "lines"), + axis.title.x = element_text(margin = margin(t = 50))) +geno_plot2 +``` + +The two markers before the pseudomarker have an excess of homozygous. \ No newline at end of file diff --git a/article/data4/.RData b/article/data4/.RData index bd38fc1fce23514a1b3e0c337f5bca8fca3be212..5fbfda8e7679ec0598f0472bf1461ab10406d80d 100644 Binary files a/article/data4/.RData and b/article/data4/.RData differ diff --git a/article/data4/data4.Rmd b/article/data4/data4.Rmd index 989a3d4378e121b1f3602b7b814d27fa7d952839..f3715c8b837d0b658b463e22e9326c9aa6f19c50 100644 --- a/article/data4/data4.Rmd +++ b/article/data4/data4.Rmd @@ -82,6 +82,26 @@ pheno_before_plot <- qtl_plot(pheno_before,lod=data.frame(group = c("alpha=0.05" pheno_before_plot ``` +```{r before_ann} +chrs <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "X") +ann_dat_text<-data.frame( + chr=factor(chrs, + levels=chrs), + lod=c(9,NA,NA,NA,11,NA,NA,NA,NA,13,NA,NA,8.5,NA,NA,NA,NA,7,NA,NA), + label=c(8,NA,NA,NA,9,NA,NA,NA,NA,10,NA,NA,11,NA,NA,NA,NA,12,NA,NA), + x=c(45,NA,NA,NA,20,NA,NA,NA,NA,27,NA,NA,30,NA,NA,NA,NA,35,NA,NA) +) + +pheno_before_plot + geom_text( + # the new dataframe for annotating text + data = ann_dat_text, + mapping = aes(x = x, + y = lod, + label = label, + color="blue") +) +``` + ```{r} #our genotypes @@ -200,4 +220,215 @@ functions_plot <- functions_df %>% ggplot(aes(x=markers,y=fct)) + functions_plot rm(none,allele,match,poly,prop) -``` \ No newline at end of file +``` + + +## Narrow peaks + +Investigation of high lod score peaks + +### Peak 8 + +1 peak on 1 pseudomarker : c1.loc42, postionned between gUNC1177319 and S6J013976867. + +Here are the infos on genotype counts for these markers: + +```{r summary_geno_peak8} +tab_before %>% filter(marker %in% c("gUNC1177319", "S6J013976867")) %>% select(marker:n_NA) +``` + + +For gUNC1177319, all individuals heterozygous so this marker should be removed. The proportions for S6J013976867 seem correct. + +Graph: + +```{r geno_plot_peak8} +phenotypes <- cross_before[["pheno"]] +map <- cross_before[["geno"]][["1"]][["map"]] +map <- tibble(marker=names(map),pos=map) +genotypes <- cross_before[["geno"]][["1"]][["data"]] +genotypes <- as_tibble(genotypes) +phenogeno <- cbind(phenotypes,genotypes) +phenogeno %<>% pivot_longer(SAH010136363:gUNC2460751,names_to="marker",values_to="genotype") +pgmap <- full_join(phenogeno,map,by="marker") + +geno_plot8 <- pgmap %>% filter(pos > 42 & pos < 52) %>% + filter(id %in% sample(phenotypes$id,10)) %>% + ggplot(aes(x=pos,y=as.factor(id))) + + geom_point(aes(color=as.factor(genotype))) + + coord_cartesian(ylim = c(1, 10), expand = TRUE, clip = "off") + + annotate(geom="text",y=-1,size=3, + x = map %>% filter(pos > 42 & pos < 52) %>% pull(pos), + label = map %>% filter(pos > 42 & pos < 52) %>% pull(marker), + angle=90) + + labs(x="Position (cM)",y="Individual",color="Genotype") + + theme_bw() + + theme(plot.margin = unit(c(1, 1, 1, 1), "lines"), + axis.title.x = element_text(margin = margin(t = 50))) +geno_plot8 +``` + +### Peak 9 + +1 peak on 1 pseudomarker : c5.loc16, in a region with very few markers, postionned between S6J050685107 and mUNC050096588. + +Here are the infos on genotype counts for these markers: + +```{r summary_geno_peak9} +tab_before %>% filter(marker %in% c("S6J050685107","mUNC050096588")) %>% select(marker:n_NA) +``` + + +For mUNC050096588, all individuals heterozygous so this marker should be removed. The proportions for S6J050685107 seem correct. + +Graph: + +```{r geno_plot_peak9} +phenotypes <- cross_before[["pheno"]] +map <- cross_before[["geno"]][["5"]][["map"]] +map <- tibble(marker=names(map),pos=map) +genotypes <- cross_before[["geno"]][["5"]][["data"]] +genotypes <- as_tibble(genotypes) +phenogeno <- cbind(phenotypes,genotypes) +phenogeno %<>% pivot_longer(mUNC050013072:gUNC10448854,names_to="marker",values_to="genotype") +pgmap <- full_join(phenogeno,map,by="marker") + +geno_plot9 <- pgmap %>% filter(pos > 1 & pos < 30) %>% + filter(id %in% sample(phenotypes$id,10)) %>% + ggplot(aes(x=pos,y=as.factor(id))) + + geom_point(aes(color=as.factor(genotype))) + + coord_cartesian(ylim = c(1, 10), expand = TRUE, clip = "off") + + annotate(geom="text",y=-1,size=3, + x = map %>% filter(pos > 1 & pos < 30) %>% pull(pos), + label = map %>% filter(pos > 1 & pos < 30) %>% pull(marker), + angle=90) + + labs(x="Position (cM)",y="Individual",color="Genotype") + + theme_bw() + + theme(plot.margin = unit(c(1, 1, 1, 1), "lines"), + axis.title.x = element_text(margin = margin(t = 50))) +geno_plot9 +``` + +### Peak 10 + +1 peak on 1 pseudomarker : c10.loc30, in a region with very few markers, postionned between SAH102097335 and S2C102505843. + +Here are the infos on genotype counts for these markers: + +```{r summary_geno_peak10} +tab_before %>% filter(marker %in% c("SAH102097335","S2C102505843")) %>% select(marker:n_NA) +``` + + +For S2C102505843, all individuals except 1 are homozygous so this marker should be removed. The proportions for SAH102097335 seem correct. + +Graph: + +```{r geno_plot_peak10} +phenotypes <- cross_before[["pheno"]] +map <- cross_before[["geno"]][["10"]][["map"]] +map <- tibble(marker=names(map),pos=map) +genotypes <- cross_before[["geno"]][["10"]][["data"]] +genotypes <- as_tibble(genotypes) +phenogeno <- cbind(phenotypes,genotypes) +phenogeno %<>% pivot_longer(gICR258:S6J105101847,names_to="marker",values_to="genotype") +pgmap <- full_join(phenogeno,map,by="marker") + +geno_plot10 <- pgmap %>% filter(pos > 25 & pos < 35) %>% + filter(id %in% sample(phenotypes$id,10)) %>% + ggplot(aes(x=pos,y=as.factor(id))) + + geom_point(aes(color=as.factor(genotype))) + + coord_cartesian(ylim = c(1, 10), expand = TRUE, clip = "off") + + annotate(geom="text",y=-1,size=3, + x = map %>% filter(pos > 25 & pos < 35) %>% pull(pos), + label = map %>% filter(pos > 25 & pos < 35) %>% pull(marker), + angle=90) + + labs(x="Position (cM)",y="Individual",color="Genotype") + + theme_bw() + + theme(plot.margin = unit(c(1, 1, 1, 1), "lines"), + axis.title.x = element_text(margin = margin(t = 50))) +geno_plot10 +``` + + +### Peak 11 + +1 peak on 1 pseudomarker : c13.loc28, postionned between S6J132182752 and SAC132487883. + +Here are the infos on genotype counts for these markers: + +```{r summary_geno_peak11} +tab_before %>% filter(marker %in% c("S6J132182752","SAC132487883")) %>% select(marker:n_NA) +``` + + +For SAC132487883, all individuals are heterozygous so this marker should be removed. The proportions for S6J132182752 seem correct. + +Graph: + +```{r geno_plot_peak11} +phenotypes <- cross_before[["pheno"]] +map <- cross_before[["geno"]][["13"]][["map"]] +map <- tibble(marker=names(map),pos=map) +genotypes <- cross_before[["geno"]][["13"]][["data"]] +genotypes <- as_tibble(genotypes) +phenogeno <- cbind(phenotypes,genotypes) +phenogeno %<>% pivot_longer(gB6_rs13481676:SFH134765844,names_to="marker",values_to="genotype") +pgmap <- full_join(phenogeno,map,by="marker") + +geno_plot11 <- pgmap %>% filter(pos > 20 & pos < 35) %>% + filter(id %in% sample(phenotypes$id,10)) %>% + ggplot(aes(x=pos,y=as.factor(id))) + + geom_point(aes(color=as.factor(genotype))) + + coord_cartesian(ylim = c(1, 10), expand = TRUE, clip = "off") + + annotate(geom="text",y=-1,size=3, + x = map %>% filter(pos > 20 & pos < 35) %>% pull(pos), + label = map %>% filter(pos > 20 & pos < 35) %>% pull(marker), + angle=90) + + labs(x="Position (cM)",y="Individual",color="Genotype") + + theme_bw() + + theme(plot.margin = unit(c(1, 1, 1, 1), "lines"), + axis.title.x = element_text(margin = margin(t = 50))) +geno_plot11 +``` + +### Peak 12 + +1 peak on 1 marker : S3C182557441 + +Here are the infos on genotype counts for these markers: + +```{r summary_geno_peak12} +tab_before %>% filter(marker %in% c("S3C182557441")) %>% select(marker:n_NA) +``` + + +For this marker, all individuals except one are homozygous so this marker should be removed. + +Graph: + +```{r geno_plot_peak12} +phenotypes <- cross_before[["pheno"]] +map <- cross_before[["geno"]][["18"]][["map"]] +map <- tibble(marker=names(map),pos=map) +genotypes <- cross_before[["geno"]][["18"]][["data"]] +genotypes <- as_tibble(genotypes) +phenogeno <- cbind(phenotypes,genotypes) +phenogeno %<>% pivot_longer(mJAX00450679:gUNC29817480,names_to="marker",values_to="genotype") +pgmap <- full_join(phenogeno,map,by="marker") + +geno_plot12 <- pgmap %>% filter(pos > 32 & pos < 42) %>% + filter(id %in% sample(phenotypes$id,10)) %>% + ggplot(aes(x=pos,y=as.factor(id))) + + geom_point(aes(color=as.factor(genotype))) + + coord_cartesian(ylim = c(1, 10), expand = TRUE, clip = "off") + + annotate(geom="text",y=-1,size=3, + x = map %>% filter(pos > 32 & pos < 42) %>% pull(pos), + label = map %>% filter(pos > 32 & pos < 42) %>% pull(marker), + angle=90) + + labs(x="Position (cM)",y="Individual",color="Genotype") + + theme_bw() + + theme(plot.margin = unit(c(1, 1, 1, 1), "lines"), + axis.title.x = element_text(margin = margin(t = 50))) +geno_plot12 +``` diff --git a/article/files/find_linked_markers.R b/article/files/find_linked_markers.R new file mode 100644 index 0000000000000000000000000000000000000000..7c8b6b75782648edf53bb0b2baf1c0e4bd1b5016 --- /dev/null +++ b/article/files/find_linked_markers.R @@ -0,0 +1,13 @@ +find_linked_markers <- function(rf,annot,mark,maxit=10000,tol=1e-6,lod=3){ + #calculate matrix with estimated RF + + names <- rownames(rf) + estrf_df <- as_tibble(rf) + estrf_df <- tibble(marker=names, + estrf_df) + + + estrf_df %<>% select(marker,all_of(mark)) + estrf_df <- full_join(estrf_df,annot,by=c("marker"="marker")) + estrf_df %>% filter(!!sym(mark)>3) +} \ No newline at end of file