From 6bf7b037aeddd6994b1cfb5f9970b45d3d4735d7 Mon Sep 17 00:00:00 2001 From: Remi PLANEL <rplanel@pasteur.fr> Date: Thu, 28 Mar 2024 21:12:38 +0100 Subject: [PATCH] WIP: generate system stat and distri --- .../content/ArticleSystemDistributionPlot.vue | 166 ++++++++++++++++++ content/3.defense-systems/abia.md | 4 + content/3.defense-systems/avs.md | 4 + content/3.defense-systems/rm.md | 3 + .../df-wiki-cli/df_wiki_cli/content/main.py | 9 +- 5 files changed, 182 insertions(+), 4 deletions(-) create mode 100644 components/content/ArticleSystemDistributionPlot.vue diff --git a/components/content/ArticleSystemDistributionPlot.vue b/components/content/ArticleSystemDistributionPlot.vue new file mode 100644 index 00000000..9b871639 --- /dev/null +++ b/components/content/ArticleSystemDistributionPlot.vue @@ -0,0 +1,166 @@ +<script setup lang="ts"> +import { toValue } from '@vueuse/core'; +import * as d3 from "d3"; +import * as Plot from "@observablehq/plot"; +import { useDisplay } from "vuetify"; + +const { width } = useDisplay(); +const systemHits = ref(undefined) +const refseqTaxo = ref(undefined) +const marginRight = ref(50) +const selectedTaxoRank = ref("phylum") +const taxoRanks: Ref<string[]> = ref([ + "species", + "genus", + "family", + "order", + "class", + "phylum", + "Superkingdom" +]); +const { page } = useContent(); +// get the structures +const msIndexName = ref<string>("refseqsanitized") + +onBeforeMount(() => { + fetchSystemHits() + fetchRefSeqTaxo() +}) +onMounted(() => { + fetchSystemHits() + fetchRefSeqTaxo() +}) + + +const computedWidth = computed(() => { + const screenWidth = toValue(width) > 1280 ? 1280 : toValue(width) + return Math.max(screenWidth, 550); +}); + + + +const computedDistribution = computed(() => { + const toValSelectedTaxoRank = toValue(selectedTaxoRank) + const toValSystemHits = toValue(systemHits) + const toValRefseqTaxo = toValue(refseqTaxo) + if (toValSystemHits?.hits && toValSelectedTaxoRank && toValRefseqTaxo?.facetDistribution) { + console.log(toValRefseqTaxo) + const toValFacetsPerRank = toValRefseqTaxo.facetDistribution?.[toValSelectedTaxoRank] + // group per selected taxo rank and accession + const itemsPerGroup = d3.rollup(toValSystemHits.hits, D => D.length, d => d[toValSelectedTaxoRank], d => d.Assembly) + console.log(itemsPerGroup) + if (toValSelectedTaxoRank === "order"){ + console.log(itemsPerGroup.get("Oscillatoriales")) + } + const distribution = [] + for (const [taxo, values] of itemsPerGroup.entries()) { + console.log(toValFacetsPerRank[taxo]) + if (toValFacetsPerRank[taxo] && toValFacetsPerRank[taxo] > 0) { + distribution.push({ taxo, size: (values.size / toValFacetsPerRank[taxo]) * 100 }) + } + } + return distribution + } + return [] + +}) + +// const totalGenome = computed(() => { +// refseqTaxo?.estimatedTotalHits +// }) +const systemStatistics = computed(() => { + const toValSystemHits = toValue(systemHits) + const toValRefseqTaxo = toValue(refseqTaxo) + + let statistics: Record<string, number | undefined> = { totalGenome: undefined, genomeWithSystem: undefined, speciesWithSystem: undefined, percentGenome: undefined } + if (toValSystemHits?.facetDistribution) { + statistics = { + ...statistics, + genomeWithSystem: Object.entries(toValSystemHits.facetDistribution.Assembly)?.length, + speciesWithSystem: Object.entries(toValSystemHits.facetDistribution.species)?.length, + } + } + if (statistics.genomeWithSystem !== undefined && toValRefseqTaxo?.estimatedTotalHits) { + statistics = { + ...statistics, + totalGenome: toValRefseqTaxo.estimatedTotalHits, + percentGenome: (statistics.genomeWithSystem / toValRefseqTaxo.estimatedTotalHits) * 100 + } + } + return statistics +}) + +const distributionOptions = computed(() => { + return { + marginBottom: 100, + marginRight: marginRight.value, + y: { label: `% of genomes encoding ${toValue(page)?.title ?? 'the system'}` }, + x: { label: selectedTaxoRank.value, tickRotate: 45 }, + width: computedWidth.value - marginRight.value, + marks: [ + Plot.barY( + toValue(computedDistribution), + { + y: "size", + x: "taxo", + tip: true, + sort: { x: "-y" }, + }, + ), + ], + }; +}) + +console.log(computedDistribution) + +// ================================================= +// ASYNC PART +// ================================================= +async function fetchSystemHits() { + const { data, error } = await useAsyncMeiliSearch({ + index: toValue(msIndexName), + query: "", + params: { + facets: ["*"], + filter: [`type='${toValue(page).title}'`], + limit: 500000, + } + }) + systemHits.value = data.value + if (error.value) { + throw createError(`Cannot get hits on refseq for system: ${page.title}`) + } +} + +async function fetchRefSeqTaxo() { + const { data, error } = await useAsyncMeiliSearch({ + index: "refseqtaxo", + query: "", + params: { + facets: ["*"], + filter: [], + limit: 1, + } + }) + refseqTaxo.value = data.value + if (error.value) { + throw createError(`Cannot get refseq taxo: ${page.title}`) + } +} + +</script> +<template> + <v-card flat> + <v-select v-model="selectedTaxoRank" :items="taxoRanks" density="compact" label="Select taxonomic rank" + hide-details="auto" class="mx-2"></v-select> + + <v-card-text> + Among the {{ d3.format(",")(systemStatistics.totalGenome) }} complete genomes of RefSeq, the {{ page.title + }} is + detected in {{ d3.format(",")(systemStatistics.genomeWithSystem) }} genomes ({{ + d3.format(".2f")(systemStatistics.percentGenome) }} %). + The system was detected in {{ d3.format(",")(systemStatistics.speciesWithSystem) }} different species. + </v-card-text> + <PlotFigure ref="systemsDistributionPlot" :options="unref(distributionOptions)" defer></PlotFigure> + </v-card> +</template> \ No newline at end of file diff --git a/content/3.defense-systems/abia.md b/content/3.defense-systems/abia.md index b201de9d..f88d18f2 100644 --- a/content/3.defense-systems/abia.md +++ b/content/3.defense-systems/abia.md @@ -60,6 +60,10 @@ Proportion of genome encoding the AbiA system for the 14 phyla with more than 50 +::article-system-distribution-plot +:: + + ## Structure ### AbiA_large ##### Example 1 diff --git a/content/3.defense-systems/avs.md b/content/3.defense-systems/avs.md index c56bd771..7ef1438d 100644 --- a/content/3.defense-systems/avs.md +++ b/content/3.defense-systems/avs.md @@ -73,6 +73,10 @@ The system was detected in 366 different species. Proportion of genome encoding the Avs system for the 14 phyla with more than 50 genomes in the RefSeq database. + +::article-system-distribution-plot +:: + ## Structure ### AVAST_I ##### Example 1 diff --git a/content/3.defense-systems/rm.md b/content/3.defense-systems/rm.md index 86a78383..2cb561dc 100644 --- a/content/3.defense-systems/rm.md +++ b/content/3.defense-systems/rm.md @@ -70,6 +70,9 @@ The system was detected in 6137 different species. Proportion of genome encoding the RM system for the 14 phyla with more than 50 genomes in the RefSeq database. +::article-system-distribution-plot +:: + ## Structure ### Experimentaly determined structure diff --git a/packages/df-wiki-cli/df_wiki_cli/content/main.py b/packages/df-wiki-cli/df_wiki_cli/content/main.py index ac707257..db32af7e 100644 --- a/packages/df-wiki-cli/df_wiki_cli/content/main.py +++ b/packages/df-wiki-cli/df_wiki_cli/content/main.py @@ -341,6 +341,7 @@ def refseq_group_per_assembly_and_type( "species", ], as_index=False, + dropna=False ).size() df_final_grouped.reset_index().to_csv(output, index=False) @@ -380,6 +381,7 @@ def refseq_group_per_assembly( "species", ], as_index=False, + dropna=False, ).size() df_grouped.reset_index().to_csv(output, index=False) @@ -406,10 +408,7 @@ def refseq_type_count( ], ): df = pd.read_csv(input) - grouped_per_type = df.groupby( - ["type"], - as_index=False, - ).size() + grouped_per_type = df.groupby(["type"], as_index=False, dropna=False).size() grouped_per_type.reset_index().to_csv(output, index=False) @@ -442,12 +441,14 @@ def _sanitized_refseq_hits(df): "species", ], as_index=False, + dropna=False, ).size() # count each occurrence df_again_per_assembly = no_sys_assembly_by_size.groupby( "Assembly", as_index=False, + dropna=False ).size() # filter to keep only size > 1 (when == 1 it means that there is only "no system found for an assembly") # so we should keep it -- GitLab