get operon struct from multiple versions

5d50d1ba · Remi PLANEL · e70b3a91 · 5d50d1ba · 5d50d1ba · 5d50d1ba
Commit 5d50d1ba authored 1 year ago by Remi PLANEL
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -192,7 +192,42 @@ lint:
  variables:
    MEILI_HOST: "http://localhost:7700"
  script:
-    # - rm data/list-systems.json
+    # STRUCTURE
+    - >
+      df-wiki-cli 
+      meilisearch 
+      delete-all-documents structure
+    - >
+      df-wiki-cli
+      meilisearch 
+      --host ${MEILI_HOST}
+      --key "${MEILI_MASTER_KEY}"
+      update structure
+      --file data/all_predictions_statistics.tsv
+    # SYSTEM OPERON STRUCTURE
+    - >
+      df-wiki-cli 
+      meilisearch 
+      delete-all-documents system-operon-structure
+    - >
+      df-wiki-cli 
+      content 
+      system-operon-structure 
+      --tags "1.2.2"
+      --tags "1.2.3"
+      --tags "1.2.4"
+      --versions "1.2.2"
+      --versions "1.2.3"
+      --versions "v1.2.4"
+      --structure data/all_predictions_statistics.tsv
+      --output data/system-structures.csv
+    - >
+      df-wiki-cli 
+      meilisearch 
+      update system-operon-structure 
+      --file data/system-structures.csv
    ###### REFSEQ
    # delete all document refseq
    - >
@@ -303,26 +338,6 @@ lint:
      --key "${MEILI_MASTER_KEY}"
      update systems
      --file data/list-systems.json
-    # STRUCTURE
-    - >
-      df-wiki-cli
-      meilisearch 
-      --host ${MEILI_HOST}
-      --key "${MEILI_MASTER_KEY}"
-      update structure
-      --file data/all_predictions_statistics_clean.csv
-    # SYSTEM OPERON STRUCUTRE
-    - >
-      df-wiki-cli 
-      content 
-      system-operon-structure 
-      --version "1.2.4" --output data/system-structures.csv
-    - >
-      df-wiki-cli 
-      meilisearch 
-      update system-operon-structure 
-      --file data/system-structures.csv
    # ARTICLES
    - > 

--- a/packages/df-wiki-cli/df_wiki_cli/content/main.py
+++ b/packages/df-wiki-cli/df_wiki_cli/content/main.py
@@ -5,9 +5,10 @@ import pandas as pd
 import csv
 import tempfile
 import matplotlib.pyplot as plt
+from operator import itemgetter
 from pandas.errors import ParserError
 from typing_extensions import Annotated
-from typing import Optional, List
+from typing import Optional, List, Tuple
 from pathlib import Path
 from pydantic import BaseModel, ValidationError
 import frontmatter
@@ -254,53 +255,54 @@ def system_operon_structure(
            resolve_path=True,
        ),
    ] = "./system-structures.csv",
-    version: Annotated[str, typer.Option(help="Defense finder model")] = "v1.2.4",
+    structure: Annotated[
-    tag: Annotated[str, typer.Option(help="Defense finder model")] = "1.2.4",
+        Path,
+        typer.Option(
+            file_okay=True,
+            dir_okay=False,
+            writable=False,
+            resolve_path=True,
+        ),
+    ] = "./all_predictions_statistics.tsv",
+    versions: Annotated[List[str], typer.Option(help="Defense finder model")] = [
+        "1.2.2",
+        "1.2.3",
+        "v1.2.4",
+    ],
+    tags: Annotated[List[str], typer.Option(help="Defense finder model")] = [
+        "1.2.2",
+        "1.2.3",
+        "1.2.4",
+    ],
 ):
    # get defense finder model from github
+    releases = zip(versions, tags)
+    model_dirs = list(download_model_release(releases))
-    df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{tag}/defense-finder-models-{version}.tar.gz"
+    systems = []
-    _, tmp_path = tempfile.mkstemp()
+    with open(structure) as tsvfile:
-    tmp_root_dir = tempfile.gettempdir()
+        tsvreader = csv.DictReader(tsvfile, delimiter="\t")
-    df_model_dir = Path(f"{tmp_root_dir}/defense-finder-models-{version}")
+        for row in tsvreader:
-    df_model_definitions_dir = df_model_dir / "defense-finder-models" / "definitions"
+            systems.append({"system": row["system"], "subsystem": row["subsystem"]})
-    console.print(f"Download models: {df_model_url}")
-    response = requests.get(
-        df_model_url,
-        allow_redirects=True,
-    )
-    with open(tmp_path, mode="wb") as file:
-        file.write(response.content)
-    console.print("untar file")
-    with tarfile.open(tmp_path) as archive:
-        archive.extractall(df_model_dir)
-    # # extract foreach system and subsystem list genes
-    # set the order
    system_genes = []
-    for child in df_model_definitions_dir.iterdir():
+    for system_def in systems:
-        for system_path in child.iterdir():
+        system, subsystem = itemgetter("system", "subsystem")(system_def)
-            system = system_path.name
+        list_paths = list(gen_model_path(model_dirs))
-            # console.rule(system)
+        if system != "#N/A" and subsystem != "#N/A":
-            subsystem_list = (
+            def_path = find_model_definition(system, subsystem, list_paths)
-                s for s in system_path.iterdir() if str(s).endswith(".xml")
-            )
-            for subsystem in subsystem_list:
-                susbsystem_name = subsystem.stem
-                console.print(susbsystem_name)
            in_exchangeables = False
            current_gene = {}
            exchangeables = []
+            with open(def_path["path"]) as file:
-                with open(subsystem) as file:
                for event, elem in ET.iterparse(file, events=("start", "end")):
                    if event == "start":
                        if elem.tag == "gene" and not in_exchangeables:
                            current_gene = {
                                "system": system,
-                                    "subsystem": susbsystem_name,
+                                "subsystem": subsystem,
                                "gene": elem.attrib["name"],
+                                "version": def_path["version"],
                                "exchangeables": None,
                            }
                            system_genes.append(current_gene)
@@ -316,7 +318,7 @@ def system_operon_structure(
                            exchangeables = []
    with open(output, "w") as f:
-        fieldnames = ["id", "system", "subsystem", "gene", "exchangeables"]
+        fieldnames = ["id", "system", "subsystem", "version", "gene", "exchangeables"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for id, gene in enumerate(system_genes):
@@ -325,6 +327,30 @@ def system_operon_structure(
            writer.writerow(gene)
+def download_model_release(releases):
+    for version, tag in releases:
+        df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{tag}/defense-finder-models-{version}.tar.gz"
+        _, tmp_path = tempfile.mkstemp()
+        tmp_root_dir = tempfile.gettempdir()
+        df_model_dir = Path(f"{tmp_root_dir}/defense-finder-models-{version}")
+        df_model_definitions_dir = (
+            df_model_dir / "defense-finder-models" / "definitions"
+        )
+        console.print(f"Download models: {df_model_url}")
+        response = requests.get(
+            df_model_url,
+            allow_redirects=True,
+        )
+        with open(tmp_path, mode="wb") as file:
+            file.write(response.content)
+        console.print("untar file")
+        with tarfile.open(tmp_path) as archive:
+            archive.extractall(df_model_dir)
+        yield {"version": tag, "dir": df_model_definitions_dir}
 TMP_CIF = """
 #
 loop_
@@ -681,3 +707,29 @@ def _sanitized_refseq_hits(df):
        | ~df.Assembly.isin(assembly_where_should_remove_no_sys_found)
    ]
    return df_filtered_assembly_only_with_sys
+def find_model_definition(system, subsystem, list_paths):
+    found_path = None
+    for p in list_paths:
+        path = p["path"]
+        parts = path.parts
+        if path.stem == subsystem and parts[-2] == system:
+            console.rule(f"{system} - {subsystem}")
+            console.print(p)
+            found_path = {"path": path, "version": p["version"]}
+            break
+    if found_path is None:
+        raise FileNotFoundError
+    else:
+        return found_path
+def gen_model_path(model_dirs):
+    for model_dir in model_dirs:
+        for subdir in model_dir["dir"].iterdir():
+            for system_path in subdir.iterdir():
+                for subsystem_path in system_path.iterdir():
+                    if str(subsystem_path).endswith(".xml"):
+                        yield {"path": subsystem_path, "version": model_dir["version"]}
--- a/packages/df-wiki-cli/df_wiki_cli/meilisearch/update/main.py
+++ b/packages/df-wiki-cli/df_wiki_cli/meilisearch/update/main.py
@@ -108,6 +108,7 @@ class SystemOperonStructure(BaseModel):
    id: int
    system: str
    subsystem: str
+    version: str
    gene: str
    exchangeables: Optional[List[str]]

--- a/packages/df-wiki-cli/pyproject.toml
+++ b/packages/df-wiki-cli/pyproject.toml
 [tool.poetry]
 name = "df-wiki-cli"
-version = "0.1.9"
+version = "0.2.0"
 description = ""
 authors = ["Remi  PLANEL <rplanel@pasteur.fr>"]
 readme = "README.md"