Parse DefenseFinder models to get the list of genes per system and subsystem

c9989ba1 · Remi PLANEL · 16410613 · c9989ba1
Commit c9989ba1 authored 11 months ago by Remi PLANEL
--- a/packages/df-wiki-cli/df_wiki_cli/content/main.py
+++ b/packages/df-wiki-cli/df_wiki_cli/content/main.py
@@ -16,6 +16,8 @@ from rich.console import Console
 import re
 import requests
 from Bio.PDB import PDBParser, MMCIFIO
+import tarfile
+import xml.etree.ElementTree as ET
 console = Console()
 app = typer.Typer()
@@ -241,6 +243,79 @@ def systems(
            ty.write(json_object)
+def _download_models(url, destination):
+    """Download the model archive at ``url`` and extract it into ``destination``.
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status.
+    """
+    response = requests.get(url, allow_redirects=True, timeout=60)
+    response.raise_for_status()
+    # The temporary file is closed and removed automatically, so neither a
+    # file descriptor nor a stray file is left behind.
+    with tempfile.TemporaryFile() as archive_file:
+        archive_file.write(response.content)
+        archive_file.seek(0)
+        console.print("untar file")
+        # NOTE(review): extractall() trusts the member paths stored in the
+        # archive; consider passing filter="data" once Python >= 3.12 is the
+        # minimum supported version.
+        with tarfile.open(fileobj=archive_file) as archive:
+            archive.extractall(destination)
+
+
+def _collect_model_genes(root, system, subsystem):
+    """Flatten one parsed model definition into CSV-ready gene rows.
+
+    Every element below ``root`` is visited in document order. Each ``gene``
+    element produces one row; an ``exchangeables`` element stores the
+    comma-joined names of the ``gene`` elements it contains in the
+    ``alternatives`` field of the most recently produced row.
+
+    Args:
+        root: Root element of a parsed model definition XML file.
+        system: System name copied into every row.
+        subsystem: Subsystem name copied into every row.
+
+    Returns:
+        A list of dicts with the keys ``system``, ``subsystem``, ``gene`` and
+        ``alternatives`` (``None`` when no exchangeables were found).
+    """
+    rows = []
+    current_row = None
+    for element in root.iter():
+        if element.tag == "gene":
+            # NOTE(review): genes nested inside <exchangeables> are visited
+            # here as well, so they also get a row of their own. Kept as in
+            # the original implementation -- confirm this is intended.
+            current_row = {
+                "system": system,
+                "subsystem": subsystem,
+                "gene": element.attrib["name"],
+                "alternatives": None,
+            }
+            rows.append(current_row)
+        elif element.tag == "exchangeables":
+            if current_row is not None:
+                current_row["alternatives"] = ",".join(
+                    ex_gene.attrib["name"] for ex_gene in element.iter("gene")
+                )
+            # Only the first exchangeables element after a gene is attached.
+            current_row = None
+    return rows
+
+
+@app.command()
+def system_operon_structure(
+    version: Annotated[str, typer.Option(help="Defense finder model")] = "1.2.4",
+    output: Annotated[
+        Path, typer.Option(help="CSV file the gene table is written to")
+    ] = Path("/tmp/log"),
+):
+    """Write a CSV listing the genes of every system/subsystem model.
+
+    The model definitions are read from
+    ``<tmp>/defense-finder-models-v{version}/defense-finder-models/definitions``.
+    When that directory does not exist, the release archive for ``version`` is
+    downloaded from GitHub and extracted first. The definitions tree is
+    walked as ``<dir>/<system>/<subsystem>.xml``; entries that are not
+    directories (or not ``.xml`` files at the last level) are skipped.
+
+    Args:
+        version: Release version of the defense-finder models to parse.
+        output: Destination CSV file with the columns ``system``,
+            ``subsystem``, ``gene`` and ``alternatives``.
+    """
+    # https://github.com/mdmparis/defense-finder-models/releases/download/1.2.4/defense-finder-models-v1.2.4.tar.gz
+    df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{version}/defense-finder-models-v{version}.tar.gz"
+    df_model_dir = Path(tempfile.gettempdir()) / f"defense-finder-models-v{version}"
+    df_model_definitions_dir = df_model_dir / "defense-finder-models" / "definitions"
+    # Reuse a previous extraction when present; otherwise fetch the release
+    # instead of failing on the missing directory.
+    if not df_model_definitions_dir.is_dir():
+        console.print(f"Download models: {df_model_url}")
+        _download_models(df_model_url, df_model_dir)
+
+    system_genes = []
+    # Sorting at every level makes the CSV row order deterministic.
+    for group_path in sorted(df_model_definitions_dir.iterdir()):
+        if not group_path.is_dir():
+            continue
+        for system_path in sorted(group_path.iterdir()):
+            if not system_path.is_dir():
+                continue
+            system = system_path.name
+            console.rule(system)
+            for subsystem_path in sorted(system_path.iterdir()):
+                if not subsystem_path.name.endswith(".xml"):
+                    continue
+                subsystem_name = subsystem_path.stem
+                console.print(subsystem_name)
+                # Binary mode lets the XML parser honour the encoding
+                # declared in the file instead of the locale default.
+                with open(subsystem_path, "rb") as xml_file:
+                    root = ET.parse(xml_file).getroot()
+                system_genes.extend(
+                    _collect_model_genes(root, system, subsystem_name)
+                )
+
+    # newline="" is required by the csv module to avoid doubled line endings.
+    with open(output, "w", newline="", encoding="utf-8") as csv_file:
+        fieldnames = ["system", "subsystem", "gene", "alternatives"]
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(system_genes)
 TMP_CIF = """
 #
 loop_
@@ -511,7 +586,7 @@ def markdown(
            with open(dst, "r+") as f:
                all_file = f.read()
                if (
                    re.search(
                        r"#{2}\s+Structure",