diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5ad0b6ca06365e42271fc5d9ea13973f3d6536b7..cf8463f27addbf6032d2b19cfe344ef88195b3cd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -192,7 +192,42 @@ lint: variables: MEILI_HOST: "http://localhost:7700" script: - # - rm data/list-systems.json + # STRUCTURE + - > + df-wiki-cli + meilisearch + delete-all-documents structure + - > + df-wiki-cli + meilisearch + --host ${MEILI_HOST} + --key "${MEILI_MASTER_KEY}" + update structure + --file data/all_predictions_statistics.tsv + + # SYSTEM OPERON STRUCUTRE + - > + df-wiki-cli + meilisearch + delete-all-documents system-operon-structure + - > + df-wiki-cli + content + system-operon-structure + --tags "1.2.2" + --tags "1.2.3" + --tags "1.2.4" + --versions "1.2.2" + --versions "1.2.3" + --versions "v1.2.4" + --structure data/all_predictions_statistics.tsv + --output data/system-structures.csv + - > + df-wiki-cli + meilisearch + update system-operon-structure + --file data/system-structures.csv + ###### REFSEQ # delete all document refseq - > @@ -303,27 +338,7 @@ lint: --key "${MEILI_MASTER_KEY}" update systems --file data/list-systems.json - # STRUCTURE - - > - df-wiki-cli - meilisearch - --host ${MEILI_HOST} - --key "${MEILI_MASTER_KEY}" - update structure - --file data/all_predictions_statistics_clean.csv - - # SYSTEM OPERON STRUCUTRE - - > - df-wiki-cli - content - system-operon-structure - --version "1.2.4" --output data/system-structures.csv - - > - df-wiki-cli - meilisearch - update system-operon-structure - --file data/system-structures.csv - + # ARTICLES - > df-wiki-cli diff --git a/packages/df-wiki-cli/df_wiki_cli/content/main.py b/packages/df-wiki-cli/df_wiki_cli/content/main.py index 227fbfd01a2aa35f6a5bbc29fb51c1f29005d3a1..48a89a26ed9f91b7fb893e758d6c924eb3691821 100644 --- a/packages/df-wiki-cli/df_wiki_cli/content/main.py +++ b/packages/df-wiki-cli/df_wiki_cli/content/main.py @@ -5,9 +5,10 @@ import pandas as pd import csv import tempfile import 
matplotlib.pyplot as plt +from operator import itemgetter from pandas.errors import ParserError from typing_extensions import Annotated -from typing import Optional, List +from typing import Optional, List, Tuple from pathlib import Path from pydantic import BaseModel, ValidationError import frontmatter @@ -254,69 +255,70 @@ def system_operon_structure( resolve_path=True, ), ] = "./system-structures.csv", - version: Annotated[str, typer.Option(help="Defense finder model")] = "v1.2.4", - tag: Annotated[str, typer.Option(help="Defense finder model")] = "1.2.4", + structure: Annotated[ + Path, + typer.Option( + file_okay=True, + dir_okay=False, + writable=False, + resolve_path=True, + ), + ] = "./all_predictions_statistics.tsv", + versions: Annotated[List[str], typer.Option(help="Defense finder model")] = [ + "1.2.2", + "1.2.3", + "v1.2.4", + ], + tags: Annotated[List[str], typer.Option(help="Defense finder model")] = [ + "1.2.2", + "1.2.3", + "1.2.4", + ], ): # get defense finder model from github - - df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{tag}/defense-finder-models-{version}.tar.gz" - _, tmp_path = tempfile.mkstemp() - tmp_root_dir = tempfile.gettempdir() - df_model_dir = Path(f"{tmp_root_dir}/defense-finder-models-{version}") - df_model_definitions_dir = df_model_dir / "defense-finder-models" / "definitions" - console.print(f"Download models: {df_model_url}") - response = requests.get( - df_model_url, - allow_redirects=True, - ) - with open(tmp_path, mode="wb") as file: - file.write(response.content) - - console.print("untar file") - with tarfile.open(tmp_path) as archive: - archive.extractall(df_model_dir) - # # extract foreach system and subsystem list genes - # set the order + releases = zip(versions, tags) + model_dirs = list(download_model_release(releases)) + + systems = [] + with open(structure) as tsvfile: + tsvreader = csv.DictReader(tsvfile, delimiter="\t") + for row in tsvreader: + systems.append({"system": 
row["system"], "subsystem": row["subsystem"]}) system_genes = [] - for child in df_model_definitions_dir.iterdir(): - for system_path in child.iterdir(): - system = system_path.name - # console.rule(system) - subsystem_list = ( - s for s in system_path.iterdir() if str(s).endswith(".xml") - ) - for subsystem in subsystem_list: - susbsystem_name = subsystem.stem - console.print(susbsystem_name) - in_exchangeables = False - current_gene = {} - exchangeables = [] - - with open(subsystem) as file: - for event, elem in ET.iterparse(file, events=("start", "end")): - if event == "start": - if elem.tag == "gene" and not in_exchangeables: - current_gene = { - "system": system, - "subsystem": susbsystem_name, - "gene": elem.attrib["name"], - "exchangeables": None, - } - system_genes.append(current_gene) - if elem.tag == "gene" and in_exchangeables: - exchangeables.append(elem.attrib["name"]) - if elem.tag == "exchangeables": - in_exchangeables = True - exchangeables = [] - elif event == "end": - if elem.tag == "exchangeables": - in_exchangeables = False - current_gene["exchangeables"] = ",".join(exchangeables) - exchangeables = [] + for system_def in systems: + system, subsystem = itemgetter("system", "subsystem")(system_def) + list_paths = list(gen_model_path(model_dirs)) + if system != "#N/A" and subsystem != "#N/A": + def_path = find_model_definition(system, subsystem, list_paths) + in_exchangeables = False + current_gene = {} + exchangeables = [] + with open(def_path["path"]) as file: + for event, elem in ET.iterparse(file, events=("start", "end")): + if event == "start": + if elem.tag == "gene" and not in_exchangeables: + current_gene = { + "system": system, + "subsystem": subsystem, + "gene": elem.attrib["name"], + "version": def_path["version"], + "exchangeables": None, + } + system_genes.append(current_gene) + if elem.tag == "gene" and in_exchangeables: + exchangeables.append(elem.attrib["name"]) + if elem.tag == "exchangeables": + in_exchangeables = True + 
def download_model_release(releases):
    """Download and unpack DefenseFinder model releases from GitHub.

    This is a generator: each archive is fetched and extracted only when the
    corresponding item is consumed.

    Args:
        releases: iterable of ``(version, tag)`` pairs. ``tag`` selects the
            GitHub release and ``version`` is the suffix of the archive file
            name (``defense-finder-models-{version}.tar.gz``).

    Yields:
        dict with two keys: ``"version"`` (the release *tag*, not the archive
        version string) and ``"dir"`` (``Path`` of the extracted
        ``defense-finder-models/definitions`` directory).

    Raises:
        requests.HTTPError: if the release archive cannot be downloaded.
        tarfile.TarError: if the downloaded payload is not a valid archive.
    """
    tmp_root_dir = Path(tempfile.gettempdir())
    for version, tag in releases:
        df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{tag}/defense-finder-models-{version}.tar.gz"
        df_model_dir = tmp_root_dir / f"defense-finder-models-{version}"
        df_model_definitions_dir = (
            df_model_dir / "defense-finder-models" / "definitions"
        )
        console.print(f"Download models: {df_model_url}")
        response = requests.get(
            df_model_url,
            allow_redirects=True,
            # Without a timeout a stalled connection blocks the CI job forever.
            timeout=60,
        )
        # Fail here on 404/5xx instead of handing an HTML error page to tarfile.
        response.raise_for_status()

        console.print("untar file")
        # TemporaryFile is closed and removed automatically, so neither a file
        # descriptor nor a stray archive is left behind.
        with tempfile.TemporaryFile() as archive_file:
            archive_file.write(response.content)
            archive_file.seek(0)
            with tarfile.open(fileobj=archive_file) as archive:
                # NOTE(review): the archive comes from a fixed GitHub project;
                # consider extractall(..., filter="data") once the minimum
                # supported Python version provides extraction filters.
                archive.extractall(df_model_dir)
        yield {"version": tag, "dir": df_model_definitions_dir}


def find_model_definition(system, subsystem, list_paths):
    """Return the first model definition matching a system and subsystem.

    Args:
        system: system name, compared with the name of the directory that
            directly contains the definition file.
        subsystem: subsystem name, compared with the definition file stem.
        list_paths: iterable of dicts with a ``"path"`` (``Path``) and a
            ``"version"`` entry, as produced by ``gen_model_path``.

    Returns:
        dict with the ``"path"`` and ``"version"`` of the first match, in the
        order of ``list_paths``.

    Raises:
        FileNotFoundError: if no entry matches.
    """
    for candidate in list_paths:
        path = candidate["path"]
        if path.stem == subsystem and path.parent.name == system:
            console.rule(f"{system} - {subsystem}")
            console.print(candidate)
            return {"path": path, "version": candidate["version"]}

    raise FileNotFoundError(
        f"No model definition found for system {system!r}, "
        f"subsystem {subsystem!r}"
    )


def gen_model_path(model_dirs):
    """Yield every subsystem definition file found in the model directories.

    The expected layout below each ``"dir"`` is
    ``<category>/<system>/<subsystem>.xml``.

    Args:
        model_dirs: iterable of dicts with a ``"dir"`` (``Path`` of a
            ``definitions`` directory) and a ``"version"`` entry.

    Yields:
        dict with ``"path"`` (``Path`` of one ``.xml`` definition) and
        ``"version"`` (copied from the originating ``model_dirs`` entry).

    Raises:
        FileNotFoundError: if a ``"dir"`` entry is not an existing directory.
    """
    for model_dir in model_dirs:
        root = model_dir["dir"]
        if not root.is_dir():
            raise FileNotFoundError(
                f"Model definitions directory not found: {root}"
            )
        # glob() ignores stray files at the category/system levels, and
        # sorting makes the order reproducible: the consumer keeps the first
        # match, so filesystem ordering must not influence the result.
        for definition_path in sorted(root.glob("*/*/*.xml")):
            yield {"path": definition_path, "version": model_dir["version"]}