From f166a689120d2afbc21af0b66ad6de04184fcaf8 Mon Sep 17 00:00:00 2001
From: Remi  PLANEL <rplanel@pasteur.fr>
Date: Fri, 29 Mar 2024 13:13:09 +0100
Subject: [PATCH] add command to get structure data

---
 .../df-wiki-cli/df_wiki_cli/content/main.py   | 152 +++++++++++++-----
 packages/df-wiki-cli/poetry.lock              |  36 ++++-
 packages/df-wiki-cli/pyproject.toml           |   3 +-
 3 files changed, 147 insertions(+), 44 deletions(-)

diff --git a/packages/df-wiki-cli/df_wiki_cli/content/main.py b/packages/df-wiki-cli/df_wiki_cli/content/main.py
index db32af7e..25c47cc2 100644
--- a/packages/df-wiki-cli/df_wiki_cli/content/main.py
+++ b/packages/df-wiki-cli/df_wiki_cli/content/main.py
@@ -2,7 +2,6 @@ import typer
 import sys
 import json
 import pandas as pd
-import shutil
 import csv
 import matplotlib.pyplot as plt
 from pandas.errors import ParserError
@@ -14,6 +13,8 @@ import frontmatter
 from enum import Enum
 from rich.console import Console
 import re
+import requests
+from Bio.PDB import PDBParser, MMCIFIO
 
 console = Console()
 app = typer.Typer()
@@ -77,8 +78,12 @@ def lint(
             console.print("[green] Everything is alright")
 
 
-@app.command()
+@app.command(
+    help="Download all structures from statistics file. Generate PAE from raw data and organize these files per systems"
+)
 def structure(
+    user: Annotated[str, typer.Option(help="username credential")],
+    password: Annotated[str, typer.Option(help="password credential")],
     stat: Annotated[
         Path,
         typer.Option(
@@ -90,21 +95,10 @@ def structure(
             resolve_path=True,
         ),
     ],
-    dir: Annotated[
-        Path,
-        typer.Option(
-            exists=True,
-            file_okay=False,
-            dir_okay=True,
-            writable=True,
-            readable=True,
-            resolve_path=True,
-        ),
-    ],
     output: Annotated[
         Path,
         typer.Option(
-            exists=True,
+            exists=False,
             file_okay=False,
             dir_okay=True,
             writable=True,
@@ -115,41 +109,56 @@ def structure(
 ):
     with open(stat, "r") as stat_f:
         reader = csv.DictReader(stat_f)
+        count_row = 0
         for row in reader:
-            dir_name = row["System_name_ok"]
+            count_row += 1
+            system_dir_name = row["System_name_ok"]
             pdb_path_file = Path(row["pdb"])
-            cif_file_name = Path(str(pdb_path_file).split(".pdb")[0] + ".cif")
+            foldseek_html_file = Path(str(pdb_path_file).split(".pdb")[0] + ".html")
+            png_structure = Path(str(pdb_path_file).split(".pdb")[0] + ".png")
             files = [
                 {"f": pdb_path_file, "d": "PDB"},
-                {"f": cif_file_name, "d": "CIF"},
                 {"f": Path(row["pae_table"]), "d": "PAE"},
                 {"f": Path(row["fasta_file"]), "d": "Fastas"},
-                {"f": Path(row["Foldseek_name"]), "d": "foldseek_monomers_html"},
+                {"f": foldseek_html_file, "d": "foldseek_pdb_html"},
+                {"f": png_structure, "d": "png"},
             ]
-            target_dir = output / dir_name
+
+            target_dir = output / system_dir_name
             target_dir.mkdir(parents=True, exist_ok=True)
+            base_url = "https://data.atkinson-lab.com/DefenceFinder"
+            console.rule(f"[bold blue]{count_row} / {system_dir_name}", style="blue")
             for f in files:
+                console.print(f"[bold blue]{f['d']}", style="blue")
                 str_f = str(f["f"])
                 if str_f and str_f != "." and str_f != "" and str_f != "na":
-                    file_to_copy = dir / f["d"] / f["f"]
-                    if file_to_copy.exists():
-                        if f["d"] == "PAE":
-                            png_file = str(file_to_copy).split(".tsv")[0] + ".png"
-                            try:
-                                pae2png(file_to_copy, png_file)
-                            except ParserError as err:
-                                console.print(
-                                    f"[red] file {str(file_to_copy.name)} cannot be parsed"
-                                )
-                                print(err)
-                            else:
-                                shutil.copy2(png_file, target_dir)
-                        else:
-                            shutil.copy2(file_to_copy, target_dir)
-                    else:
-                        console.print(
-                            f"[red] file {str(file_to_copy.name)} does not exist"
-                        )
+                    # get the file from atkinson lab
+                    target_file = target_dir / f["f"]
+                    file_to_fetch = f"{base_url}/{f['d']}/{f['f']}"
+                    console.print(f"Fetch : {file_to_fetch}")
+                    response = requests.get(
+                        file_to_fetch,
+                        auth=(user, password),
+                        allow_redirects=True,
+                        stream=True,
+                    )
+                    with open(target_file, "wb") as fh:
+                        for chunk in response.iter_content(chunk_size=1024):
+                            if chunk:
+                                fh.write(chunk)
+
+                    if f["d"] == "PDB":
+                        # pdb2cif
+                        pdb2cif(target_file)
+                    if f["d"] == "PAE":
+                        png_file = str(target_file).split(".tsv")[0] + ".pae.png"
+                        try:
+                            pae2png(target_file, png_file)
+                        except ParserError as err:
+                            console.print(
+                                f"[red] file {str(target_file.name)} cannot be parsed"
+                            )
+                            print(err)
 
 
 @app.command()
@@ -231,7 +240,68 @@ def systems(
             ty.write(json_object)
 
 
+TMP_CIF = """
+#
+loop_
+_ma_qa_metric.id
+_ma_qa_metric.mode
+_ma_qa_metric.name
+_ma_qa_metric.software_group_id
+_ma_qa_metric.type
+1 global pLDDT 1 pLDDT 
+2 local  pLDDT 1 pLDDT 
+#
+_ma_qa_metric_global.metric_id    1
+_ma_qa_metric_global.metric_value {:.06}
+_ma_qa_metric_global.model_id     1
+_ma_qa_metric_global.ordinal_id   1
+#
+loop_
+_ma_qa_metric_local.label_asym_id
+_ma_qa_metric_local.label_comp_id
+_ma_qa_metric_local.label_seq_id
+_ma_qa_metric_local.metric_id
+_ma_qa_metric_local.metric_value
+_ma_qa_metric_local.model_id
+_ma_qa_metric_local.ordinal_id
+"""
+
+
+def pdb2cif(pdb):
+    cif = str(pdb).split(".pdb")[0] + ".cif"
+    console.print(f"convert {pdb} -> {cif}")
+    p = PDBParser()
+    struc = p.get_structure("", pdb)
+
+    list_atoms = []
+    for a in struc.get_atoms():
+        list_atoms.append(
+            [
+                a.parent.parent.id,
+                a.parent.resname,
+                a.parent.id[1],
+                "2",
+                a.bfactor,
+                1,
+                a.parent.id[1],
+            ]
+        )
+    df = pd.DataFrame(list_atoms).drop_duplicates()
+
+    with open(cif, "w") as of:
+        of.write(TMP_CIF.format(df[4].mean()))
+    df.to_csv(cif, index=False, header=False, mode="a", sep=" ")
+
+    io = MMCIFIO()
+    io.set_structure(struc)
+
+    with open(cif, "a") as of:
+        io.save(of)
+
+
 def pae2png(tsv_file, png_file):
+
+    console.print(f"Convert : {tsv_file} row data to {png_file}")
     v = pd.read_table(tsv_file, index_col=0, low_memory=False)
     fig, ax = plt.subplots(1, 1, figsize=(4, 3), facecolor=None)
     m = ax.matshow(v, cmap="Greens_r", vmin=0, vmax=35, aspect="auto", origin="lower")
@@ -341,7 +411,7 @@ def refseq_group_per_assembly_and_type(
             "species",
         ],
         as_index=False,
-        dropna=False
+        dropna=False,
     ).size()
     df_final_grouped.reset_index().to_csv(output, index=False)
 
@@ -446,9 +516,7 @@ def _sanitized_refseq_hits(df):
 
     # count each occurrence
     df_again_per_assembly = no_sys_assembly_by_size.groupby(
-        "Assembly",
-        as_index=False,
-        dropna=False
+        "Assembly", as_index=False, dropna=False
     ).size()
     # filter to keep only size > 1 (when == 1 it means that there is only "no system found for an assembly")
     # so we should keep it
diff --git a/packages/df-wiki-cli/poetry.lock b/packages/df-wiki-cli/poetry.lock
index 8d3b6e0c..34991255 100644
--- a/packages/df-wiki-cli/poetry.lock
+++ b/packages/df-wiki-cli/poetry.lock
@@ -24,6 +24,40 @@ files = [
 [package.dependencies]
 pyparsing = ">=2.0.3"
 
+[[package]]
+name = "biopython"
+version = "1.83"
+description = "Freely available tools for computational molecular biology."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "biopython-1.83-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e2cc737906d8de47eedbc4476f711b960c16a65daa8cdd021875398c81999a09"},
+    {file = "biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2df408be9816dd98c28fe181ea93fb6e0d375bf1763ad9ed503ac30bb2df5b1a"},
+    {file = "biopython-1.83-cp310-cp310-win32.whl", hash = "sha256:a0c1c70789c7e2a26563db5ba533fb9fea0cc1f2c7bc7ad240146cb223ba44a3"},
+    {file = "biopython-1.83-cp310-cp310-win_amd64.whl", hash = "sha256:56f03f43c183acb88c082bc31e5f047fcc6d0aceb5270fbd29c31ab769795b86"},
+    {file = "biopython-1.83-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a01dfdad7210f2fd5c4f36606278f91dbfdda6dac02347206d13cc618e79fe32"},
+    {file = "biopython-1.83-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c756c0b81702c705141c87c2805203df01c6d4cf290e8cefd48cbc61a3c85b82"},
+    {file = "biopython-1.83-cp311-cp311-win32.whl", hash = "sha256:0496f2a6e6e060d8ff0f34784ad15ed342b10cfe282020efe168286f0c14c479"},
+    {file = "biopython-1.83-cp311-cp311-win_amd64.whl", hash = "sha256:8552cc467429b555c604b84fc174c33923bf7e4c735774eda505f1d5a9c2feab"},
+    {file = "biopython-1.83-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0d5ce14755a6b49dea4743cf6929570afe5becb66ad222194984c7bf04218f86"},
+    {file = "biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b35aa095de0fa8339b70664797d0e83322a1a9d512e2fd52d4e872df5189f56"},
+    {file = "biopython-1.83-cp312-cp312-win32.whl", hash = "sha256:118425a210cb3d184c7a78154c5646089366faf124cd46c6056ca7f9302b94ad"},
+    {file = "biopython-1.83-cp312-cp312-win_amd64.whl", hash = "sha256:ca94e8ea8907de841a515af55acb1922a9de99b3144c738a193f2a75e4726078"},
+    {file = "biopython-1.83-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e37884fe39e4560bf5934a4ec4ba7f7fe0e7c091053d03d05b20a70557167717"},
+    {file = "biopython-1.83-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd9bc6fef3f6a10043635a75e1a77c9dce877375140e81059c67c73d4ce65c4c"},
+    {file = "biopython-1.83-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c3584122a5daca25b3914a32c52785b051c11518cd5e111e9e89ee04a6234fe"},
+    {file = "biopython-1.83-cp38-cp38-win32.whl", hash = "sha256:641c1a860705d6740eb16c6147b2b730b05a8f5974db804c14d5faa8a1446085"},
+    {file = "biopython-1.83-cp38-cp38-win_amd64.whl", hash = "sha256:94b68e550619e1b6e3784ed8cecb62f201d70d8b87d3a90365291f065ab42bd9"},
+    {file = "biopython-1.83-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:81d1e2515b380e1876720ba79dbf50f8ef3a38cc38ba5953ef61ec20d0934ee2"},
+    {file = "biopython-1.83-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec82350c24cdcf34a8d4a5f189d0ff7dc025658098a60e6f0e681d24b6a1414e"},
+    {file = "biopython-1.83-cp39-cp39-win32.whl", hash = "sha256:e914f7161b3831d7c58db33cc5c7ca64b42c9877c5a776a8313e7a5fd494f8de"},
+    {file = "biopython-1.83-cp39-cp39-win_amd64.whl", hash = "sha256:aae1b156a76907c2abfe9d141776b0aead65695ea914eaecdf12bd1e8991f869"},
+    {file = "biopython-1.83.tar.gz", hash = "sha256:78e6bfb78de63034037afd35fe77cb6e0a9e5b62706becf78a7d922b16ed83f7"},
+]
+
+[package.dependencies]
+numpy = "*"
+
 [[package]]
 name = "black"
 version = "23.11.0"
@@ -1418,4 +1452,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.13"
-content-hash = "d526b2f2acfadf5a8eccfae5e902ab702eaedd8ca88ec310d2f6c4dab88bea8a"
+content-hash = "8834fc25ed4a598a81158cb81ef55d0fb6d950ee29a02caa62e3241dcefc3ca0"
diff --git a/packages/df-wiki-cli/pyproject.toml b/packages/df-wiki-cli/pyproject.toml
index b6763ddc..e32dd2b3 100644
--- a/packages/df-wiki-cli/pyproject.toml
+++ b/packages/df-wiki-cli/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "df-wiki-cli"
-version = "0.1.7"
+version = "0.1.8"
 description = ""
 authors = ["Remi  PLANEL <rplanel@pasteur.fr>"]
 readme = "README.md"
@@ -19,6 +19,7 @@ pydantic-yaml = "^1.2.0"
 python-frontmatter = "^1.0.1"
 matplotlib = "^3.8.2"
 habanero = "^1.2.6"
+biopython = "^1.83"
 
 
 [tool.poetry.group.dev.dependencies]
-- 
GitLab