From 6bf7b037aeddd6994b1cfb5f9970b45d3d4735d7 Mon Sep 17 00:00:00 2001
From: Remi  PLANEL <rplanel@pasteur.fr>
Date: Thu, 28 Mar 2024 21:12:38 +0100
Subject: [PATCH] WIP: generate system stat and distri

---
 .../content/ArticleSystemDistributionPlot.vue | 166 ++++++++++++++++++
 content/3.defense-systems/abia.md             |   4 +
 content/3.defense-systems/avs.md              |   4 +
 content/3.defense-systems/rm.md               |   3 +
 .../df-wiki-cli/df_wiki_cli/content/main.py   |   9 +-
 5 files changed, 182 insertions(+), 4 deletions(-)
 create mode 100644 components/content/ArticleSystemDistributionPlot.vue

diff --git a/components/content/ArticleSystemDistributionPlot.vue b/components/content/ArticleSystemDistributionPlot.vue
new file mode 100644
index 00000000..9b871639
--- /dev/null
+++ b/components/content/ArticleSystemDistributionPlot.vue
@@ -0,0 +1,166 @@
+<script setup lang="ts">
+import { toValue } from '@vueuse/core';
+import * as d3 from "d3";
+import * as Plot from "@observablehq/plot";
+import { useDisplay } from "vuetify";
+
+const { width } = useDisplay();
+const systemHits = ref(undefined)
+const refseqTaxo = ref(undefined)
+const marginRight = ref(50)
+const selectedTaxoRank = ref("phylum")
+const taxoRanks: Ref<string[]> = ref([
+    "species",
+    "genus",
+    "family",
+    "order",
+    "class",
+    "phylum",
+    "Superkingdom"
+]);
+const { page } = useContent();
+// get the structures
+const msIndexName = ref<string>("refseqsanitized")
+
+onBeforeMount(() => {
+    fetchSystemHits()
+    fetchRefSeqTaxo()
+})
+onMounted(() => {
+    fetchSystemHits()
+    fetchRefSeqTaxo()
+})
+
+
+const computedWidth = computed(() => {
+    const screenWidth = toValue(width) > 1280 ? 1280 : toValue(width)
+    return Math.max(screenWidth, 550);
+});
+
+
+
+const computedDistribution = computed(() => {
+    const toValSelectedTaxoRank = toValue(selectedTaxoRank)
+    const toValSystemHits = toValue(systemHits)
+    const toValRefseqTaxo = toValue(refseqTaxo)
+    if (toValSystemHits?.hits && toValSelectedTaxoRank && toValRefseqTaxo?.facetDistribution) {
+        console.log(toValRefseqTaxo)
+        const toValFacetsPerRank = toValRefseqTaxo.facetDistribution?.[toValSelectedTaxoRank]
+        // group per selected taxo rank and accession
+        const itemsPerGroup = d3.rollup(toValSystemHits.hits, D => D.length, d => d[toValSelectedTaxoRank], d => d.Assembly)
+        console.log(itemsPerGroup)
+        if (toValSelectedTaxoRank === "order"){
+            console.log(itemsPerGroup.get("Oscillatoriales"))
+        }
+        const distribution = []
+        for (const [taxo, values] of itemsPerGroup.entries()) {
+            console.log(toValFacetsPerRank[taxo])
+            if (toValFacetsPerRank[taxo] && toValFacetsPerRank[taxo] > 0) {
+                distribution.push({ taxo, size: (values.size / toValFacetsPerRank[taxo]) * 100 })
+            }
+        }
+        return distribution
+    }
+    return []
+
+})
+
+// const totalGenome = computed(() => {
+//     refseqTaxo?.estimatedTotalHits
+// })
+const systemStatistics = computed(() => {
+    const toValSystemHits = toValue(systemHits)
+    const toValRefseqTaxo = toValue(refseqTaxo)
+
+    let statistics: Record<string, number | undefined> = { totalGenome: undefined, genomeWithSystem: undefined, speciesWithSystem: undefined, percentGenome: undefined }
+    if (toValSystemHits?.facetDistribution) {
+        statistics = {
+            ...statistics,
+            genomeWithSystem: Object.entries(toValSystemHits.facetDistribution.Assembly)?.length,
+            speciesWithSystem: Object.entries(toValSystemHits.facetDistribution.species)?.length,
+        }
+    }
+    if (statistics.genomeWithSystem !== undefined && toValRefseqTaxo?.estimatedTotalHits) {
+        statistics = {
+            ...statistics,
+            totalGenome: toValRefseqTaxo.estimatedTotalHits,
+            percentGenome: (statistics.genomeWithSystem / toValRefseqTaxo.estimatedTotalHits) * 100
+        }
+    }
+    return statistics
+})
+
+const distributionOptions = computed(() => {
+    return {
+        marginBottom: 100,
+        marginRight: marginRight.value,
+        y: { label: `% of genomes encoding ${toValue(page)?.title ?? 'the system'}` },
+        x: { label: selectedTaxoRank.value, tickRotate: 45 },
+        width: computedWidth.value - marginRight.value,
+        marks: [
+            Plot.barY(
+                toValue(computedDistribution),
+                {
+                    y: "size",
+                    x: "taxo",
+                    tip: true,
+                    sort: { x: "-y" },
+                },
+            ),
+        ],
+    };
+})
+
+console.log(computedDistribution)
+
+// =================================================
+// ASYNC PART 
+// =================================================
+async function fetchSystemHits() {
+    const { data, error } = await useAsyncMeiliSearch({
+        index: toValue(msIndexName),
+        query: "",
+        params: {
+            facets: ["*"],
+            filter: [`type='${toValue(page).title}'`],
+            limit: 500000,
+        }
+    })
+    systemHits.value = data.value
+    if (error.value) {
+        throw createError(`Cannot get hits on refseq for system: ${page.title}`)
+    }
+}
+
+async function fetchRefSeqTaxo() {
+    const { data, error } = await useAsyncMeiliSearch({
+        index: "refseqtaxo",
+        query: "",
+        params: {
+            facets: ["*"],
+            filter: [],
+            limit: 1,
+        }
+    })
+    refseqTaxo.value = data.value
+    if (error.value) {
+        throw createError(`Cannot get refseq taxo: ${page.title}`)
+    }
+}
+
+</script>
+<template>
+    <v-card flat>
+        <v-select v-model="selectedTaxoRank" :items="taxoRanks" density="compact" label="Select taxonomic rank"
+            hide-details="auto" class="mx-2"></v-select>
+
+        <v-card-text>
+            Among the {{ d3.format(",")(systemStatistics.totalGenome) }} complete genomes of RefSeq, the {{ page.title
+            }} is
+            detected in {{ d3.format(",")(systemStatistics.genomeWithSystem) }} genomes ({{
+            d3.format(".2f")(systemStatistics.percentGenome) }} %).
+            The system was detected in {{ d3.format(",")(systemStatistics.speciesWithSystem) }} different species.
+        </v-card-text>
+        <PlotFigure ref="systemsDistributionPlot" :options="unref(distributionOptions)" defer></PlotFigure>
+    </v-card>
+</template>
\ No newline at end of file
diff --git a/content/3.defense-systems/abia.md b/content/3.defense-systems/abia.md
index b201de9d..f88d18f2 100644
--- a/content/3.defense-systems/abia.md
+++ b/content/3.defense-systems/abia.md
@@ -60,6 +60,10 @@ Proportion of genome encoding the AbiA system for the 14 phyla with more than 50
 
 
 
+::article-system-distribution-plot
+::
+
+
 ## Structure
 ### AbiA_large
 ##### Example 1
diff --git a/content/3.defense-systems/avs.md b/content/3.defense-systems/avs.md
index c56bd771..7ef1438d 100644
--- a/content/3.defense-systems/avs.md
+++ b/content/3.defense-systems/avs.md
@@ -73,6 +73,10 @@ The system was detected in 366 different species.
 
 Proportion of genome encoding the Avs system for the 14 phyla with more than 50 genomes in the RefSeq database.
 
+
+::article-system-distribution-plot
+::
+
 ## Structure
 ### AVAST_I
 ##### Example 1
diff --git a/content/3.defense-systems/rm.md b/content/3.defense-systems/rm.md
index 86a78383..2cb561dc 100644
--- a/content/3.defense-systems/rm.md
+++ b/content/3.defense-systems/rm.md
@@ -70,6 +70,9 @@ The system was detected in 6137 different species.
 Proportion of genome encoding the RM system for the 14 phyla with more than 50 genomes in the RefSeq database.
 
 
+::article-system-distribution-plot
+::
+
 ## Structure
 
 ### Experimentaly determined structure
diff --git a/packages/df-wiki-cli/df_wiki_cli/content/main.py b/packages/df-wiki-cli/df_wiki_cli/content/main.py
index ac707257..db32af7e 100644
--- a/packages/df-wiki-cli/df_wiki_cli/content/main.py
+++ b/packages/df-wiki-cli/df_wiki_cli/content/main.py
@@ -341,6 +341,7 @@ def refseq_group_per_assembly_and_type(
             "species",
         ],
         as_index=False,
+        dropna=False
     ).size()
     df_final_grouped.reset_index().to_csv(output, index=False)
 
@@ -380,6 +381,7 @@ def refseq_group_per_assembly(
             "species",
         ],
         as_index=False,
+        dropna=False,
     ).size()
     df_grouped.reset_index().to_csv(output, index=False)
 
@@ -406,10 +408,7 @@ def refseq_type_count(
     ],
 ):
     df = pd.read_csv(input)
-    grouped_per_type = df.groupby(
-        ["type"],
-        as_index=False,
-    ).size()
+    grouped_per_type = df.groupby(["type"], as_index=False, dropna=False).size()
     grouped_per_type.reset_index().to_csv(output, index=False)
 
 
@@ -442,12 +441,14 @@ def _sanitized_refseq_hits(df):
             "species",
         ],
         as_index=False,
+        dropna=False,
     ).size()
 
     # count each occurrence
     df_again_per_assembly = no_sys_assembly_by_size.groupby(
         "Assembly",
         as_index=False,
+        dropna=False
     ).size()
     # filter to keep only size > 1 (when == 1 it means that there is only "no system found for an assembly")
     # so we should keep it
-- 
GitLab