diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7a66c6829c8f3f5bce2eee73a1beaca8c095091f..4a947922d8233505c34350b1bcf8732ffc33f6a2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -192,18 +192,20 @@ lint: MEILI_HOST: "http://localhost:7700" script: # - rm data/list-systems.json + ###### REFSEQ + # delete all document refseq - > df-wiki-cli meilisearch --host ${MEILI_HOST} --key ${MEILI_MASTER_KEY} delete-all-documents refseq + # update refseq index - > - df-wiki-cli - content systems - --dir content/3.defense-systems/ - --pfam public/pfam-a-hmm.csv - --output data/list-systems.json + df-wiki-cli meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + index-update refseq sys_id - > df-wiki-cli meilisearch @@ -211,23 +213,92 @@ lint: --key ${MEILI_MASTER_KEY} update --file data/refseq_res.csv - --document refseq + --document refseq + # REF SEQ TAXO # + - > + df-wiki-cli + content + refseq-group-per-assembly + --input data/refseq_res.csv + --output data/refseqtaxo.csv + - > + df-wiki-cli + meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + delete-all-documents refseqtaxo + - > + df-wiki-cli meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + index-update refseqtaxo Assembly - > df-wiki-cli meilisearch --host ${MEILI_HOST} --key ${MEILI_MASTER_KEY} update - --file data/refseq_res.csv + --file data/refseqtaxo.csv --document refseqtaxo + # REFSEQ TAXO TYPE # + - > + df-wiki-cli + content + refseq-group-per-assembly-and-type + --input data/refseq_res.csv + --output data/refseqtaxotype.csv + - > + df-wiki-cli + meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + delete-all-documents refseqtaxotype + - > + df-wiki-cli meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + index-update refseqtaxotype index - > df-wiki-cli meilisearch --host ${MEILI_HOST} --key ${MEILI_MASTER_KEY} update - --file data/all_predictions_statistics_clean.csv - --document structure + --file data/refseqtaxotype.csv + --document 
refseqtaxotype + # SANITIZED REFSEQ + - > + df-wiki-cli + content + refseq-sanitized-hits + --input data/refseq_res.csv + --output data/refseq-sanitized.csv + - > + df-wiki-cli + meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + delete-all-documents refseqsanitized + - > + df-wiki-cli meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + index-update refseqsanitized index + - > + df-wiki-cli + meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + update + --file data/refseq-sanitized.csv + --document refseqsanitized + # SYSTEMS + - > + df-wiki-cli + content systems + --dir content/3.defense-systems/ + --pfam public/pfam-a-hmm.csv + --output data/list-systems.json - > df-wiki-cli meilisearch @@ -236,6 +307,16 @@ lint: update --file data/list-systems.json --document systems + # STRUCTURE + - > + df-wiki-cli + meilisearch + --host ${MEILI_HOST} + --key ${MEILI_MASTER_KEY} + update + --file data/all_predictions_statistics_clean.csv + --document structure + # ARTICLES - > df-wiki-cli meilisearch @@ -411,7 +492,7 @@ build:prod:wiki: deploy:dev: extends: .deploy rules: - - if: $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH == "rework-references" + - if: $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH == "distri-system-section" needs: - "build:dev:wiki" when: manual diff --git a/components/Nav/Navbar.vue b/components/Nav/Navbar.vue index e5a879a612b4b49c8d90009a22ec15c49ed4fc34..23ae1ca901c1815ff7f9a863b9a3a29c776d5dca 100644 --- a/components/Nav/Navbar.vue +++ b/components/Nav/Navbar.vue @@ -36,12 +36,11 @@ const sections = ref([ href: runtimeConfig.public.defenseFinderWebservice, }, { id: "wiki", label: "Wiki", to: '/', }, - { id: "refseq", label: "RefSeq DB", to: '/refseq/' }, - { id: "structure", label: "Structures DB", to: '/structure/' }, + { id: "refseq", label: "RefSeq DB", to: '/refseq' }, + { id: "structure", label: "Structures DB", to: '/structure' }, { id: "help", label: "Help", to: '/help/defensefinder' }, ]); - function 
toggleDrawer() { emit('update:drawer', !props.drawer) } @@ -63,7 +62,7 @@ function toggleDrawer() { {{ section.label }} </v-btn> <v-btn @click="toggleTheme" color="primary" :icon="theme.global.current.value.dark ? 'md:light_mode' : 'md:dark_mode' - "></v-btn> + "></v-btn> </template> <template v-else> <v-menu> diff --git a/components/content/MolstarPdbePlugin.vue b/components/content/MolstarPdbePlugin.vue index 82f8db51cd1b8741ccde6b7ff2e503ac872cbf74..83a69f03e78ee4ed7c27ed3b501330a3b176727a 100644 --- a/components/content/MolstarPdbePlugin.vue +++ b/components/content/MolstarPdbePlugin.vue @@ -17,7 +17,6 @@ export interface Props { } const { mobile } = useDisplay() -// const selectedPdb = ref('') const refinedDataUrls = computed(() => { function refinedUrl(url: string) { @@ -105,7 +104,6 @@ useHead({ }) const pdbeMolstarComponent = ref(null) -// const selectedPdb = ref("/wiki/avs/AVAST_I,AVAST_I__Avs1A,0,V-plddts_85.07081.pdb") const selectedPdb: Ref<string | null> = ref(null) const structureToDownload: Ref<string | null> = ref(null) const selectedPaePath = computed(() => { @@ -151,9 +149,6 @@ function setSelectedPdbToFirst() { } } -// const moleculeFormat = computed(() => { -// return toValue(selectedPdb)?.split(".")?.[-1]?.toLowerCase() ?? 
"pdb" -// }) const moleculeFormat: Ref<string> = ref("pdb") </script> @@ -191,7 +186,8 @@ const moleculeFormat: Ref<string> = ref("pdb") <v-card flat :rounded="false"> <v-toolbar> <v-toolbar-title>Structures</v-toolbar-title> - <v-select v-model="selectedPdb" label="Select PDB" :items="refinedDataUrls" hide-details="auto"></v-select> + <v-select v-model="selectedPdb" label="Select PDB" :items="refinedDataUrls" + hide-details="auto"></v-select> <v-spacer></v-spacer> <v-btn :disabled="!selectedPdb" icon="md:download" :href="structureToDownload"></v-btn> @@ -207,15 +203,12 @@ const moleculeFormat: Ref<string> = ref("pdb") class="d-flex align-center justify-center flex-wrap text-center mx-auto px-4 my-3" :height="computedHeight" :width="computedWidth" style="position:relative;"> <pdbe-molstar ref="pdbeMolstarComponent" :custom-data-url="selectedPdb" alphafold-view - sequence-panel="true" landscape="false" :custom-data-format="moleculeFormat"></pdbe-molstar> + sequence-panel="true" landscape="false" + :custom-data-format="moleculeFormat"></pdbe-molstar> </v-sheet> </v-col> <v-col v-if="moleculeFormat === 'cif'" :cols="mobile ? 
12 : undefined"> <v-img :src="selectedPaePath"></v-img> - - <!-- <PlotFigure v-if="sanitizedPaeData?.length > 0 && paeError === null" defer - :options="plotPaeOptions"></PlotFigure> - <v-alert v-else type="warning" variant="tonal">{{ paeError }}</v-alert> --> <v-card flat color="transparent"> <v-card-title>Model Confidence</v-card-title> <v-card-text> diff --git a/components/content/RefseqDb.vue b/components/content/RefseqDb.vue index 131a9200d371a7b07ceedb11a8d3513c9a0ee21a..f0528b29f410e78d468a0ec80c1223c41d2b23ca 100644 --- a/components/content/RefseqDb.vue +++ b/components/content/RefseqDb.vue @@ -201,24 +201,26 @@ async function getAllHits(params: { index: string, params: Record<string, any>, selectedTaxoRank.value = "Superkingdom" } - if (params.index === toValue(dbName)) { - - pendingAllHits.value = true - try { - const { data, error } = await useAsyncMeiliSearch({ - ...params, - params: { - ...params.params, - 'attributesToRetrieve': ['type', 'Assembly', ...toValue(availableTaxo)] - } - }) - allHits.value = data.value - } finally { - pendingAllHits.value = false + // if (params.index === toValue(dbName)) { - } + pendingAllHits.value = true + try { + const { data, error } = await useAsyncMeiliSearch({ + ...params, + index: "refseqsanitized", + params: { + ...params.params, + + 'attributesToRetrieve': ['type', 'Assembly', ...toValue(availableTaxo)] + } + }) + allHits.value = data.value + } finally { + pendingAllHits.value = false } + + // } } @@ -363,8 +365,8 @@ const binPlotOptions = computed(() => { marginBottom: 200, padding: 0, grid: true, + aspectRatio: true, x: { tickRotate: 90, label: "Systems", domain: toValue(sortedCellDomain) }, - // y: { tickFormat: 's' }, color: { scheme: "plasma", legend: true, label: `Proportion per ${selectedTaxoRank.value}`, domain: [0, 100] }, } }) @@ -375,12 +377,18 @@ const binPlotGroup = computed(() => { { label: (d) => d.fill, fill: { + /** + * + * @param I is the list of element index that are part of the same group
(cell) + * @param X is the list of all elements + */ reduceIndex: function (I, X) { const toValTaxonomyFacet = toValue(taxonomyFacet) if (toValTaxonomyFacet !== undefined) { const clade = X[I[0]][selectedTaxoRank.value] const system = X[I[0]].type - // Get the list of item for this group + // Get the list of all the items for this group (same cell) + // and group them per type and assembly const itemsPerGroup = d3.rollup(I.map(index => { return X[index] }), D => D.length, d => d.type, d => d.Assembly) @@ -408,22 +416,20 @@ const binPlotGroup = computed(() => { const binPlotDataOptions = computed(() => { const toValueAllHits = toValue(allHits) + const toValBinPlotGroup = toValue(binPlotGroup) const data = toValueAllHits?.hits ?? [] + const plotCellMark = Plot.cell(data, toValBinPlotGroup) return toValueAllHits?.hits?.length > 0 ? { ...binPlotOptions.value, width: width.value, - title: "Proportion of genomes with defense system X for a given clade", + title: `Proportion of genomes with defense system X per ${selectedTaxoRank.value} taxonomic rank`, color: { ...binPlotOptions.value.color, type: scaleType.value, tickFormat: '~s', ticks: scaleType.value === 'symlog' ? 3 : 5, }, - marks: [ - Plot.cell(data, - toValue(binPlotGroup) - ), - ], + marks: [plotCellMark], } : null }) diff --git a/content/2.general-concepts/3.defense-systems-effectors.md b/content/2.general-concepts/3.defense-systems-effectors.md index 461d46fecee0aef8ecd5ff66ac4f81fc6932d795..fa4a8904c4ea687a877502e92a4caaf6317d4863 100644 --- a/content/2.general-concepts/3.defense-systems-effectors.md +++ b/content/2.general-concepts/3.defense-systems-effectors.md @@ -11,7 +11,7 @@ contributors: Most of the anti-phage defense systems of bacteria can be described as a combination of two main components. First, a sensing component that detects phage infection to trigger the immune response -(see [defense-systems_trigger](/general-concepts/defense-systems_trigger/)). 
+(see [defense-systems_trigger](/general-concepts/defense-systems_trigger)). Second, an effector component that mediates the immune response following the detection of phage infection. The effector components of anti-phage systems are very diverse, and can be arbitrarily distributed in broad categories :ref{doi=10.1038/s41579-023-00934-x} : diff --git a/data/refseq_res.csv b/data/refseq_res.csv index becb696192d60bc89a646284da9d6d86ba120069..0a1eb9240b45c61ab360341a90d140ed28017fc9 100644 --- a/data/refseq_res.csv +++ b/data/refseq_res.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a7d382a7f767718dc48aa49ae3dd9b0159fdffd96e48946c7a167bcbc516deb -size 68772089 +oid sha256:cdd04a43d107190f9a80e39be9684f6a130908a45de88b65e31b821393e3b9be +size 68458331 diff --git a/data/refseq_res_sm.csv b/data/refseq_res_sm.csv deleted file mode 100644 index 0fc07a648f019b58b75e3834c9417aee56f6e363..0000000000000000000000000000000000000000 --- a/data/refseq_res_sm.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9d3cf3c0649a0a3158a686e1f4c79b8978faf4ba5e14d365bf8349553c25a55 -size 2254987 diff --git a/docker-compose.yml b/docker-compose.yml index 1d7e174e51d23ed142a117a8847c8a9ece08d9bc..0cedb3670df391be58a329e1d72b5524cde783c0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ services: context: . 
target: dev args: - BASE_URL: /wiki/ + BASE_URL: /wiki MEILI_HOST: http://localhost:7700 MEILI_API_KEY: f9cc073016cbb392365aae86517878cb3f3408bb85c1fafd06e27f73ccb35e3d HOST_URL: http://localhost:8082 diff --git a/nuxt.config.ts b/nuxt.config.ts index 3701ae9f48748f34b5c3bc8e888b0d231cd421e4..419aead0afc19593c45ce976bceb6ed602ebcb36 100644 --- a/nuxt.config.ts +++ b/nuxt.config.ts @@ -19,11 +19,12 @@ export default defineNuxtConfig({ "data-domaim": "defense-finder.dev.pasteur.cloud", src: "https://plausible.pasteur.cloud/js/script.js" } - - ] } }, + router: { + strict: true + }, site: { url: 'https://defensefinder.mdmlab.fr', name: 'DefenseFinder webservice and knowledge base', diff --git a/packages/df-wiki-cli/data/test/refseq.csv b/packages/df-wiki-cli/data/test/refseq.csv new file mode 100644 index 0000000000000000000000000000000000000000..63cae96a82b258d253725663ae0d4de7cf6a0d5f --- /dev/null +++ b/packages/df-wiki-cli/data/test/refseq.csv @@ -0,0 +1,19 @@ +sys_id,Assembly,replicon,type,subtype,sys_beg,sys_end,protein_in_syst,genes_count,name_of_profiles_in_sys,accession_in_sys,Superkingdom,phylum,class,order,family,genus,species +GCF_001602115_NZ_CP014352_AbiE_1,GCF_001602115.1,NZ_CP014352,AbiE,AbiE,GCF_001602115.1_NZ_CP014352_01750,GCF_001602115.1_NZ_CP014352_01751,"GCF_001602115.1_NZ_CP014352_01750, GCF_001602115.1_NZ_CP014352_01751",2.0,"AbiEii__AbiEii, AbiEii__AbiEi_1","WP_062819585.1, WP_062819586.1",Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014352_Thoeris_II_1,GCF_001602115.1,NZ_CP014352,Thoeris,Thoeris_II,GCF_001602115.1_NZ_CP014352_00483,GCF_001602115.1_NZ_CP014352_00484,"GCF_001602115.1_NZ_CP014352_00483, GCF_001602115.1_NZ_CP014352_00484",2.0,"Thoeris__ThsB_Global, Thoeris_II__ThsA_new_petit","WP_062818945.1, 
WP_082815879.1",Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014352_RM_Type_IIG_5,GCF_001602115.1,NZ_CP014352,RM,RM_Type_IIG,GCF_001602115.1_NZ_CP014352_00058,GCF_001602115.1_NZ_CP014352_00058,GCF_001602115.1_NZ_CP014352_00058,1.0,RM_Type_IIG__Type_IIG,WP_062818741.1,Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014352_RM_Type_I_1,GCF_001602115.1,NZ_CP014352,RM,RM_Type_I,GCF_001602115.1_NZ_CP014352_00060,GCF_001602115.1_NZ_CP014352_00062,"GCF_001602115.1_NZ_CP014352_00060, GCF_001602115.1_NZ_CP014352_00061, GCF_001602115.1_NZ_CP014352_00062",3.0,"RM__Type_I_REases, RM__Type_I_S, RM__Type_I_MTases","WP_062818743.1, WP_062818744.1, WP_062818745.1",Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014352_RM_Type_IIG_6,GCF_001602115.1,NZ_CP014352,RM,RM_Type_IIG,GCF_001602115.1_NZ_CP014352_00069,GCF_001602115.1_NZ_CP014352_00069,GCF_001602115.1_NZ_CP014352_00069,1.0,RM_Type_IIG__Type_IIG,WP_062818749.1,Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014352_RM_Type_I_2,GCF_001602115.1,NZ_CP014352,RM,RM_Type_I,GCF_001602115.1_NZ_CP014352_00475,GCF_001602115.1_NZ_CP014352_00477,"GCF_001602115.1_NZ_CP014352_00475, GCF_001602115.1_NZ_CP014352_00476, GCF_001602115.1_NZ_CP014352_00477",3.0,"RM__Type_I_MTases, RM__Type_I_S, RM__Type_I_REases","WP_062818938.1, WP_082815877.1, WP_062818940.1",Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici 
+GCF_001602115_NZ_CP014352_RM_Type_I_3,GCF_001602115.1,NZ_CP014352,RM,RM_Type_I,GCF_001602115.1_NZ_CP014352_02756,GCF_001602115.1_NZ_CP014352_02758,"GCF_001602115.1_NZ_CP014352_02756, GCF_001602115.1_NZ_CP014352_02757, GCF_001602115.1_NZ_CP014352_02758",3.0,"RM__Type_I_REases, RM__Type_I_S, RM__Type_I_MTases","WP_062820191.1, WP_062820192.1, WP_062820839.1",Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014352_RM_Type_IIG_7,GCF_001602115.1,NZ_CP014352,RM,RM_Type_IIG,GCF_001602115.1_NZ_CP014352_03005,GCF_001602115.1_NZ_CP014352_03005,GCF_001602115.1_NZ_CP014352_03005,1.0,RM_Type_IIG__Type_IIG,WP_198401420.1,Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014352_RM_Type_I_4,GCF_001602115.1,NZ_CP014352,RM,RM_Type_I,GCF_001602115.1_NZ_CP014352_03420,GCF_001602115.1_NZ_CP014352_03422,"GCF_001602115.1_NZ_CP014352_03420, GCF_001602115.1_NZ_CP014352_03421, GCF_001602115.1_NZ_CP014352_03422",3.0,"RM__Type_I_REases, RM__Type_I_S, RM__Type_I_MTases","WP_062818743.1, WP_062818744.1, WP_062818745.1",Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014352_RM_Type_IIG_8,GCF_001602115.1,NZ_CP014352,RM,RM_Type_IIG,GCF_001602115.1_NZ_CP014352_03427,GCF_001602115.1_NZ_CP014352_03427,GCF_001602115.1_NZ_CP014352_03427,1.0,RM_Type_IIG__Type_IIG,WP_157773967.1,Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_001602115_NZ_CP014353,GCF_001602115.1,NZ_CP014353,No system found,,,,,,,,Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici 
+GCF_001602115_NZ_CP014352_CAS_Class1-Subtype-I-G_1,GCF_001602115.1,NZ_CP014352,Cas,CAS_Class1-Subtype-I-G,GCF_001602115.1_NZ_CP014352_01614,GCF_001602115.1_NZ_CP014352_01620,"GCF_001602115_NZ_CP014352_01614, GCF_001602115_NZ_CP014352_01615, GCF_001602115_NZ_CP014352_01616, GCF_001602115_NZ_CP014352_01617, GCF_001602115_NZ_CP014352_01618, GCF_001602115_NZ_CP014352_01619, GCF_001602115_NZ_CP014352_01620",7.0,"DEDDh_I_II_III_IV_V_VI_1, csb1gr7_I-G_1, csb2gr5_I-G_1, cas3_I-G_3, csb3_I-G_1, cas1_I_II_III_IV_V_VI_1, cas2_I_II_III_IV_V_VI_3","WP_198401461.1, WP_062819507.1, WP_062819508.1, WP_062819509.1, WP_062819510.1, WP_062819511.1, WP_062819512.1",Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Propionibacteriaceae,Acidipropionibacterium,Acidipropionibacterium acidipropionici +GCF_000830055_NZ_CP010781_Gabija_1,GCF_000830055.1,NZ_CP010781,Gabija,Gabija,GCF_000830055.1_NZ_CP010781_01480,GCF_000830055.1_NZ_CP010781_01481,"GCF_000830055.1_NZ_CP010781_01480, GCF_000830055.1_NZ_CP010781_01481",2.0,"Gabija__GajA, Gabija__GajB_2","WP_005115822.1, WP_000073989.1",Bacteria,Pseudomonadota,Gammaproteobacteria,Moraxellales,Moraxellaceae,Acinetobacter,Acinetobacter baumannii +GCF_000830055_NZ_CP010781_Gao_Qat_1,GCF_000830055.1,NZ_CP010781,Gao_Qat,Gao_Qat,GCF_000830055.1_NZ_CP010781_00952,GCF_000830055.1_NZ_CP010781_00955,"GCF_000830055.1_NZ_CP010781_00952, GCF_000830055.1_NZ_CP010781_00953, GCF_000830055.1_NZ_CP010781_00954, GCF_000830055.1_NZ_CP010781_00955",4.0,"Gao_Qat__QatA, Gao_Qat__QatB, Gao_Qat__QatC, Gao_Qat__QatD","WP_000269396.1, WP_000537345.1, WP_041152179.1, WP_000937120.1",Bacteria,Pseudomonadota,Gammaproteobacteria,Moraxellales,Moraxellaceae,Acinetobacter,Acinetobacter baumannii +GCF_000830055_NZ_CP010781_RosmerTA_1,GCF_000830055.1,NZ_CP010781,RosmerTA,RosmerTA,GCF_000830055.1_NZ_CP010781_00651,GCF_000830055.1_NZ_CP010781_00652,"GCF_000830055.1_NZ_CP010781_00651, GCF_000830055.1_NZ_CP010781_00652",2.0,"RosmerTA__RmrA_2634932349, 
RosmerTA__RmrT_2634932349","WP_000482796.1, WP_001182927.1",Bacteria,Pseudomonadota,Gammaproteobacteria,Moraxellales,Moraxellaceae,Acinetobacter,Acinetobacter baumannii +GCF_000830055_NZ_CP010781_RM_Type_II_1,GCF_000830055.1,NZ_CP010781,RM,RM_Type_II,GCF_000830055.1_NZ_CP010781_03697,GCF_000830055.1_NZ_CP010781_03698,"GCF_000830055.1_NZ_CP010781_03697, GCF_000830055.1_NZ_CP010781_03698",2.0,"RM_Type_II__Type_II_REases, RM_Type_II__Type_II_MTases","WP_001062713.1, WP_000862934.1",Bacteria,Pseudomonadota,Gammaproteobacteria,Moraxellales,Moraxellaceae,Acinetobacter,Acinetobacter baumannii +GCF_000830055_NZ_CP010782,GCF_000830055.1,NZ_CP010782,No system found,,,,,,,,Bacteria,Pseudomonadota,Gammaproteobacteria,Moraxellales,Moraxellaceae,Acinetobacter,Acinetobacter baumannii +GCF_000830055_NZ_CP010781_CAS_Class1-Subtype-I-F_1,GCF_000830055.1,NZ_CP010781,Cas,CAS_Class1-Subtype-I-F,GCF_000830055.1_NZ_CP010781_02755,GCF_000830055.1_NZ_CP010781_02760,"GCF_000830055_NZ_CP010781_02755, GCF_000830055_NZ_CP010781_02756, GCF_000830055_NZ_CP010781_02757, GCF_000830055_NZ_CP010781_02758, GCF_000830055_NZ_CP010781_02759, GCF_000830055_NZ_CP010781_02760",6.0,"cas6f_I_II_III_IV_V_VI_3, cas7f_I-F_2, cas5f_I-F_4, cas8f_I-F_8, cas3f_I-F_1, cas1_I-F_1","WP_001104789.1, WP_001097003.1, WP_001215684.1, WP_000841022.1, WP_000637362.1, WP_000436801.1",Bacteria,Pseudomonadota,Gammaproteobacteria,Moraxellales,Moraxellaceae,Acinetobacter,Acinetobacter baumannii diff --git a/packages/df-wiki-cli/df_wiki_cli/content/main.py b/packages/df-wiki-cli/df_wiki_cli/content/main.py index 17b7865777a7117790fa90495b7ddd602bc903f7..ac7072571803c17ddcad4346904c0b401b35da15 100644 --- a/packages/df-wiki-cli/df_wiki_cli/content/main.py +++ b/packages/df-wiki-cli/df_wiki_cli/content/main.py @@ -244,7 +244,7 @@ def pae2png(tsv_file, png_file): plt.close() -@app.command() +@app.command(help="Remove version from sys_id") def refseq( input: Annotated[ Path, @@ -276,3 +276,189 @@ def refseq( 
console.print(f"[green]{row['sys_id']} -> {result}") row["sys_id"] = result writer.writerow(row) + + +@app.command( + help='Remove "No system found" hits if the are not the only hit for an assembly' +) +def refseq_sanitized_hits( + input: Annotated[ + Path, + typer.Option( + exists=False, + file_okay=True, + writable=True, + ), + ], + output: Annotated[ + Path, + typer.Option( + file_okay=True, + dir_okay=False, + writable=True, + resolve_path=True, + ), + ], +): + df = pd.read_csv(input) + df_final = _sanitized_refseq_hits(df) + df_final.reset_index().to_csv(output, index=False) + return df_final + + +@app.command(help="Group hits per assembly and types (from 'sanitized-hits')") +def refseq_group_per_assembly_and_type( + input: Annotated[ + Path, + typer.Option( + exists=False, + file_okay=True, + writable=True, + ), + ], + output: Annotated[ + Path, + typer.Option( + file_okay=True, + dir_okay=False, + writable=True, + resolve_path=True, + ), + ], +): + df = pd.read_csv(input) + df_final = _sanitized_refseq_hits(df) + df_final_grouped = df_final.groupby( + [ + "Assembly", + "type", + "Superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ], + as_index=False, + ).size() + df_final_grouped.reset_index().to_csv(output, index=False) + + +@app.command() +def refseq_group_per_assembly( + input: Annotated[ + Path, + typer.Option( + exists=False, + file_okay=True, + writable=True, + ), + ], + output: Annotated[ + Path, + typer.Option( + file_okay=True, + dir_okay=False, + writable=True, + resolve_path=True, + ), + ], +): + df = pd.read_csv(input) + + df["Assembly"] = df["Assembly"].apply(remove_version) + df_grouped = df.groupby( + [ + "Assembly", + "Superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ], + as_index=False, + ).size() + df_grouped.reset_index().to_csv(output, index=False) + + +@app.command() +def refseq_type_count( + input: Annotated[ + Path, + typer.Option( + exists=False, + 
file_okay=True, + writable=True, + help="csv file with type and taxo (No system found removed when other system are founded in the same assembly)", + ), + ], + output: Annotated[ + Path, + typer.Option( + file_okay=True, + dir_okay=False, + writable=True, + resolve_path=True, + ), + ], +): + df = pd.read_csv(input) + grouped_per_type = df.groupby( + ["type"], + as_index=False, + ).size() + grouped_per_type.reset_index().to_csv(output, index=False) + + +def remove_version(assembly): + return assembly.split(".")[0] + + +def _sanitized_refseq_hits(df): + df["Assembly"] = df["Assembly"].apply(remove_version) + # Lower type names + # df["type"] = df["type"].apply(lambda x: x.lower()) + + # Get all rows with no system type + df_no_system = df.loc[df["type"] == "No system found"] + # unique assembly with no sys + serie_assembly_with_no_sys = df_no_system["Assembly"].unique() + # filter assembly to have those with no sys + df_with_no_sys = df[df["Assembly"].isin(serie_assembly_with_no_sys)] + # Group them by assembly, type, taxo + no_sys_assembly_by_size = df_with_no_sys.groupby( + [ + "Assembly", + "type", + "Superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ], + as_index=False, + ).size() + + # count each occurrence + df_again_per_assembly = no_sys_assembly_by_size.groupby( + "Assembly", + as_index=False, + ).size() + # filter to keep only size > 1 (when == 1 it means that there is only "no system found" for an assembly) + # so we should keep it + df_size_sup_1 = df_again_per_assembly[df_again_per_assembly["size"] > 1] + assembly_where_should_remove_no_sys_found = df_size_sup_1["Assembly"].unique() + + # Construct new dataset to remove entries with no system found + # while found system on other replicon that belongs to the + # same assembly + df_filtered_assembly_only_with_sys = df[ + (df["type"] != "No system found") + | ~df.Assembly.isin(assembly_where_should_remove_no_sys_found) + ] + return df_filtered_assembly_only_with_sys
diff --git a/packages/df-wiki-cli/df_wiki_cli/meilisearch/__init__.py b/packages/df-wiki-cli/df_wiki_cli/meilisearch/__init__.py index 5707df46949c4fd609e35340458fb61e1a5c30aa..52668833951380349e912f84de1e06f3b9fe508e 100644 --- a/packages/df-wiki-cli/df_wiki_cli/meilisearch/__init__.py +++ b/packages/df-wiki-cli/df_wiki_cli/meilisearch/__init__.py @@ -38,6 +38,28 @@ class RefSeqCsv(BaseModel): species: str +class RefSeqTaxo(BaseModel): + index: int + Assembly: str + Superkingdom: str + phylum: str + class_: str = Field(..., alias="class") + order: str + family: str + genus: str + species: str + size: int + + +class RefSeqTaxoType(RefSeqTaxo): + type: str + + +class RefSeqTypeCount(BaseModel): + type: str + size: int + + class StructureTypes(str, Enum): Validated = "Validated" DF = "DF" @@ -84,28 +106,57 @@ def update_refseqtaxo(host: str, key: str, file: Path, document: str): documents = [] with open(file, "r") as csvfile: csvreader = csv.DictReader(csvfile) - assembly = {} for row in csvreader: - assembly_id = row["Assembly"] - assembly[row["Assembly"]] = { - k: row[k] - for k in ( - "Superkingdom", - "phylum", - "class", - "order", - "family", - "genus", - "species", - "Assembly", - ) - if k in row - } - assembly[assembly_id]["Assembly"] = assembly[assembly_id]["Assembly"].split('.')[0] - for item in assembly.values(): - documents.append(item) + doc = RefSeqTaxo(**row) + documents.append(doc.model_dump(by_alias=True)) tasks = index.add_documents_in_batches(documents, primary_key="Assembly") - print(tasks) + for task in tasks: + console.print(task) + index.update_pagination_settings({"maxTotalHits": 1000000}) + index.update_filterable_attributes( + body=[ + "Superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "Assembly", + ] + ) + index.update_sortable_attributes( + [ + "Superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "Assembly", + "size", + ] + ) + params = { + "maxValuesPerFacet": 
1000000, + "sortFacetValuesBy": {"*": "count"}, + } + index.update_faceting_settings(params) + + +def update_refseqtaxotype(host: str, key: str, file: Path, document: str): + client = meilisearch.Client(host, key) + index = client.index(document.lower()) + documents = [] + with open(file, "r") as csvfile: + csvreader = csv.DictReader(csvfile) + for row in csvreader: + doc = RefSeqTaxoType(**row) + documents.append(doc.model_dump(by_alias=True)) + tasks = index.add_documents_in_batches(documents, primary_key="index") + for task in tasks: + console.print(task) index.update_pagination_settings({"maxTotalHits": 1000000}) index.update_filterable_attributes( body=[ @@ -129,6 +180,35 @@ def update_refseqtaxo(host: str, key: str, file: Path, document: str): "genus", "species", "Assembly", + "type", + "size", + ] + ) + params = { + "maxValuesPerFacet": 1000000, + "sortFacetValuesBy": {"*": "count"}, + } + index.update_faceting_settings(params) + + +def update_refseqtypecount(host: str, key: str, file: Path, document: str): + client = meilisearch.Client(host, key) + index = client.index(document.lower()) + documents = [] + with open(file, "r") as csvfile: + csvreader = csv.DictReader(csvfile) + for row in csvreader: + doc = RefSeqTypeCount(**row) + documents.append(doc.model_dump(by_alias=True)) + tasks = index.add_documents_in_batches(documents, primary_key="type") + for task in tasks: + console.print(task) + index.update_pagination_settings({"maxTotalHits": 1000000}) + index.update_filterable_attributes(body=["type"]) + index.update_sortable_attributes( + [ + "type", + "size", ] ) params = { @@ -342,7 +422,6 @@ def update_articles( print(attr_task) - def split_on_comma(str_val: str) -> List[str]: for val in str_val.split(","): yield val.strip() diff --git a/packages/df-wiki-cli/df_wiki_cli/meilisearch/main.py b/packages/df-wiki-cli/df_wiki_cli/meilisearch/main.py index e6cb0e7b9c38fe0874f3efa988810e8494633c9f..b722a7a7cf965837c610584c1c2a086f660cd039 100644 --- 
a/packages/df-wiki-cli/df_wiki_cli/meilisearch/main.py +++ b/packages/df-wiki-cli/df_wiki_cli/meilisearch/main.py @@ -6,6 +6,8 @@ from df_wiki_cli.meilisearch import ( update_refseqtaxo, update_articles, update_refseq, + update_refseqtaxotype, + update_refseqtypecount, update_structure, update_systems, ) @@ -21,6 +23,9 @@ app = typer.Typer() class Documents(str, Enum): refseqtaxo = "refseqtaxo" + refseqtaxotype = "refseqtaxotype" + refseqtypecount = "refseqtypecount" + refseqsanitized = "refseqsanitized" refseq = "refseq" structure = "structure" systems = "systems" @@ -63,6 +68,7 @@ def update( ] = Documents.refseq, content_type: Annotated[str, typer.Option(help="Content-Type header")] = "text/csv", ): + if document == "refseqtaxo": update_refseqtaxo(ctx.obj.host, ctx.obj.key, file, document) if document == "refseq": @@ -73,6 +79,12 @@ def update( update_systems(ctx.obj.host, ctx.obj.key, file, document) if document == "article": update_articles(ctx.obj.host, ctx.obj.key, file, document) + if document == "refseqtaxotype": + update_refseqtaxotype(ctx.obj.host, ctx.obj.key, file, document) + if document == "refseqsanitized": + update_refseq(ctx.obj.host, ctx.obj.key, file, document) + if document == "refseqtypecount": + update_refseqtypecount(ctx.obj.host, ctx.obj.key, file, document) @app.command() @@ -97,6 +109,12 @@ def index_update(ctx: typer.Context, index: str, primary_key: str): console.print(task) +@app.command() +def index_delete(ctx: typer.Context, index: str): + client = meilisearch.Client(ctx.obj.host, ctx.obj.key) + client.index(index).delete() + + @app.command() def task(ctx: typer.Context, id: str): client = meilisearch.Client(ctx.obj.host, ctx.obj.key) diff --git a/packages/df-wiki-cli/pyproject.toml b/packages/df-wiki-cli/pyproject.toml index d0a3341443d8102a33b93a44f44f7b16797bb3ed..b6763ddc21e03ebb437cb3085a4e46142d434989 100644 --- a/packages/df-wiki-cli/pyproject.toml +++ b/packages/df-wiki-cli/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = 
"df-wiki-cli" -version = "0.1.6" +version = "0.1.7" description = "" authors = ["Remi PLANEL <rplanel@pasteur.fr>"] readme = "README.md"