Skip to content
Snippets Groups Projects
Commit ac32dcc0 authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

df-wiki-cli: can create a systems index in meilisearch

parent 47f443b8
No related branches found
No related tags found
1 merge request: !161 List system ms
Pipeline #118732 passed
This commit is part of merge request !161. Comments created here will be created in the context of that merge request.
import typer
import sys
import re
import json
import pandas as pd
from pandas.errors import InvalidIndexError
import shutil
from typing_extensions import Annotated
from typing import Optional, List
......@@ -111,3 +114,80 @@ def structure(
systemDir = output / system
systemDir.mkdir(parents=True, exist_ok=True)
shutil.copy2(f, systemDir)
@app.command()
def systems(
    dir: Annotated[
        Path,
        typer.Option(exists=False, file_okay=False, readable=True, dir_okay=True),
    ],
    pfam: Annotated[
        Path,
        typer.Option(
            exists=False,
            file_okay=True,
            writable=True,
        ),
    ],
    output: Annotated[
        Path,
        typer.Option(
            file_okay=True,
            dir_okay=False,
            writable=True,
            resolve_path=True,
        ),
    ],
):
    """Build a JSON list of systems from the front matter of markdown files.

    Every ``*.md`` file directly inside ``dir`` that has a ``tableColumns``
    front-matter entry yields one document: the remaining front-matter keys
    merged with the ``tableColumns`` keys, the article ``doi`` / ``abstract``
    lifted to the top level, and ``PFAM`` replaced by the list of matching
    rows of the ``pfam`` CSV file (indexed by its ``AC`` column).

    The resulting list is written to ``output``, replacing any previous
    content.

    Raises:
        KeyError: if an ``article`` entry has no ``doi`` key.
    """
    with open(pfam, "r") as pf:
        pfam_df = pd.read_csv(pf, index_col="AC", keep_default_na=False)

    documents = []
    # Sorted so that the generated JSON is stable from one run to the next.
    for file in sorted(dir.iterdir()):
        if file.suffix != ".md":
            continue
        console.rule(f"[bold blue]{file.name}", style="blue")
        with open(file, encoding="utf-8") as f:
            metadata, _ = frontmatter.parse(f.read())

        # "layout" is not wanted in the index; tolerate files that lack it.
        metadata.pop("layout", None)
        if "tableColumns" not in metadata:
            continue
        # Work on a copy so the parsed front matter is not mutated in place.
        table_data = dict(metadata.pop("tableColumns") or {})

        # Always start from an empty list: a file without a PFAM entry must
        # not inherit the records collected for the previous file.
        pfam_metadata = []
        if "PFAM" in table_data:
            accessions = [
                accession.strip()
                for accession in table_data["PFAM"].split(",")
                if accession.strip()
            ]
            pfam_metadata = _pfam_records(pfam_df, accessions)

        article = table_data.pop("article", None)
        if article is not None:
            metadata["doi"] = article["doi"]
            if "abstract" in article:
                metadata["abstract"] = article["abstract"]

        sanitized_metadata = {**metadata, **table_data}
        sanitized_metadata["PFAM"] = pfam_metadata
        documents.append(sanitized_metadata)

    # Truncate instead of appending: appending a second JSON array to an
    # existing file would produce a document that json.load cannot parse.
    with open(output, "w") as ty:
        ty.write(json.dumps(documents, indent=2))


def _pfam_records(pfam_df, accessions):
    """Return one dict per accession of ``accessions`` found in ``pfam_df``.

    Each dict holds the columns of the matching row plus an ``"AC"`` key
    with the accession itself. Accessions missing from the index are
    reported on the console and skipped.
    """
    records = []
    for accession in accessions:
        try:
            # List indexer: keeps a DataFrame and raises KeyError when absent.
            rows = pfam_df.loc[[accession]]
        except KeyError as err:
            console.print(f"[bold red]{err}", style="red")
            console.print(
                f"[bold red]No pfam entry for {accession}",
                style="red",
            )
            continue
        record = rows.to_dict(orient="index")[accession]
        record["AC"] = accession
        records.append(record)
    return records
import meilisearch
from pathlib import Path
import csv
import json
from typing import Annotated, List, Optional
from pydantic import BaseModel, Field, BeforeValidator
from enum import Enum
......@@ -175,6 +175,37 @@ def update_structure(
index.update_typo_tolerance({"enabled": False})
def update_systems(
    host: str,
    key: str,
    file: Path,
    document: str,
):
    """Push the documents of a JSON file to a Meilisearch index and set it up.

    The index is named after ``document`` lower-cased. Documents are added
    in batches with ``title`` as primary key, then the pagination,
    filterable-attribute, faceting, sortable-attribute and typo-tolerance
    settings are updated. The tasks returned for the document batches, the
    pagination update and the filterable-attribute update are printed.

    Args:
        host: passed as first argument to ``meilisearch.Client``.
        key: passed as second argument to ``meilisearch.Client``.
        file: path of the JSON file holding the documents.
        document: index name (lower-cased before use).
    """
    index = meilisearch.Client(host, key).index(document.lower())

    with open(file, "r") as handle:
        documents = json.load(handle)

    for batch_task in index.add_documents_in_batches(documents, primary_key="title"):
        print(batch_task)

    print(index.update_pagination_settings({"maxTotalHits": 100000}))

    filterable_task = index.update_filterable_attributes(
        body=["title", "Sensor", "Activator", "Effector", "PFAM.AC"]
    )
    index.update_faceting_settings(
        {
            "maxValuesPerFacet": 1000000,
            "sortFacetValuesBy": {"*": "count"},
        }
    )
    # Printed only after the faceting update has been submitted.
    print(filterable_task)

    index.update_sortable_attributes(["title", "Sensor", "Activator", "Effector"])
    index.update_typo_tolerance({"enabled": False})
def split_on_comma(str_val: str) -> List[str]:
    """Split ``str_val`` on commas and strip whitespace around each piece.

    Returns a real list, as the annotation promises, so the result can be
    indexed, measured with ``len`` and iterated more than once.

    Args:
        str_val: comma-separated text.

    Returns:
        The stripped pieces, in order. An empty input gives ``[""]``.
    """
    return [val.strip() for val in str_val.split(",")]
......@@ -2,7 +2,7 @@ import typer
import meilisearch
from typing_extensions import Annotated
from pathlib import Path
from df_wiki_cli.meilisearch import update_refseq, update_structure
from df_wiki_cli.meilisearch import update_refseq, update_structure, update_systems
from enum import Enum
from types import SimpleNamespace
......@@ -12,6 +12,7 @@ app = typer.Typer()
class Documents(str, Enum):
    """Kinds of documents this CLI knows how to handle.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (e.g. ``Documents.systems == "systems"``).
    """

    refseq = "refseq"
    structure = "structure"
    systems = "systems"
@app.callback()
......@@ -54,6 +55,8 @@ def update(
update_refseq(ctx.obj.host, ctx.obj.key, file, document)
if document == "structure":
update_structure(ctx.obj.host, ctx.obj.key, file, document)
if document == "systems":
update_systems(ctx.obj.host, ctx.obj.key, file, document)
@app.command()
......
......@@ -26,7 +26,10 @@ def fetch_pfam(output):
else:
kk, v = line.split(" ")
g, k = kk.split()
d[pfID][k] = v
if (k == 'AC'):
d[pfID][k] = v.split(".")[0]
else:
d[pfID][k] = v
df = pd.DataFrame(d).T
df.index.name = "ID"
......
[tool.poetry]
name = "df-wiki-cli"
version = "0.1.2"
version = "0.1.3"
description = ""
authors = ["Remi PLANEL <rplanel@pasteur.fr>"]
readme = "README.md"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment