From f166a689120d2afbc21af0b66ad6de04184fcaf8 Mon Sep 17 00:00:00 2001 From: Remi PLANEL <rplanel@pasteur.fr> Date: Fri, 29 Mar 2024 13:13:09 +0100 Subject: [PATCH] add command to get structure data --- .../df-wiki-cli/df_wiki_cli/content/main.py | 152 +++++++++++++----- packages/df-wiki-cli/poetry.lock | 36 ++++- packages/df-wiki-cli/pyproject.toml | 3 +- 3 files changed, 147 insertions(+), 44 deletions(-) diff --git a/packages/df-wiki-cli/df_wiki_cli/content/main.py b/packages/df-wiki-cli/df_wiki_cli/content/main.py index db32af7e..25c47cc2 100644 --- a/packages/df-wiki-cli/df_wiki_cli/content/main.py +++ b/packages/df-wiki-cli/df_wiki_cli/content/main.py @@ -2,7 +2,6 @@ import typer import sys import json import pandas as pd -import shutil import csv import matplotlib.pyplot as plt from pandas.errors import ParserError @@ -14,6 +13,8 @@ import frontmatter from enum import Enum from rich.console import Console import re +import requests +from Bio.PDB import PDBParser, MMCIFIO console = Console() app = typer.Typer() @@ -77,8 +78,12 @@ def lint( console.print("[green] Everything is alright") -@app.command() +@app.command( + help="Download all structures from statistics file. Generate PAE from raw data and organize these files per systems" +) def structure( + user: Annotated[str, typer.Option(help="username credential")], + password: Annotated[str, typer.Option(help="password credential")], stat: Annotated[ Path, typer.Option( @@ -90,21 +95,10 @@ def structure( resolve_path=True, ), ], - dir: Annotated[ - Path, - typer.Option( - exists=True, - file_okay=False, - dir_okay=True, - writable=True, - readable=True, - resolve_path=True, - ), - ], output: Annotated[ Path, typer.Option( - exists=True, + exists=False, file_okay=False, dir_okay=True, writable=True, @@ -115,41 +109,56 @@ def structure( ): with open(stat, "r") as stat_f: reader = csv.DictReader(stat_f) + count_row = 0 for row in reader: - dir_name = row["System_name_ok"] + count_row += 1 + system_dir_name = row["System_name_ok"] pdb_path_file = Path(row["pdb"]) - cif_file_name = Path(str(pdb_path_file).split(".pdb")[0] + ".cif") + foldseek_html_file = Path(str(pdb_path_file).split(".pdb")[0] + ".html") + png_structure = Path(str(pdb_path_file).split(".pdb")[0] + ".png") files = [ {"f": pdb_path_file, "d": "PDB"}, - {"f": cif_file_name, "d": "CIF"}, {"f": Path(row["pae_table"]), "d": "PAE"}, {"f": Path(row["fasta_file"]), "d": "Fastas"}, - {"f": Path(row["Foldseek_name"]), "d": "foldseek_monomers_html"}, + {"f": foldseek_html_file, "d": "foldseek_pdb_html"}, + {"f": png_structure, "d": "png"}, ] - target_dir = output / dir_name + + target_dir = output / system_dir_name target_dir.mkdir(parents=True, exist_ok=True) + base_url = "https://data.atkinson-lab.com/DefenceFinder" + console.rule(f"[bold blue]{count_row} / {system_dir_name}", style="blue") for f in files: + console.print(f"[bold blue]{f['d']}", style="blue") str_f = str(f["f"]) if str_f and str_f != "." and str_f != "" and str_f != "na": - file_to_copy = dir / f["d"] / f["f"] - if file_to_copy.exists(): - if f["d"] == "PAE": - png_file = str(file_to_copy).split(".tsv")[0] + ".png" - try: - pae2png(file_to_copy, png_file) - except ParserError as err: - console.print( - f"[red] file {str(file_to_copy.name)} cannot be parsed" - ) - print(err) - else: - shutil.copy2(png_file, target_dir) - else: - shutil.copy2(file_to_copy, target_dir) - else: - console.print( - f"[red] file {str(file_to_copy.name)} does not exist" - ) + # get the file from atkinson lab + target_file = target_dir / f["f"] + file_to_fetch = f"{base_url}/{f['d']}/{f['f']}" + console.print(f"Fetch : {file_to_fetch}") + response = requests.get( + file_to_fetch, + auth=(user, password), + allow_redirects=True, + stream=True, + ) + with open(target_file, "wb") as fh: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + fh.write(chunk) + + if f["d"] == "PDB": + # pdb2cif + pdb2cif(target_file) + if f["d"] == "PAE": + png_file = str(target_file).split(".tsv")[0] + ".pae.png" + try: + pae2png(target_file, png_file) + except ParserError as err: + console.print( + f"[red] file {str(target_file.name)} cannot be parsed" + ) + print(err) @app.command() @@ -231,7 +240,68 @@ def systems( ty.write(json_object) +TMP_CIF = """ +# +loop_ +_ma_qa_metric.id +_ma_qa_metric.mode +_ma_qa_metric.name +_ma_qa_metric.software_group_id +_ma_qa_metric.type +1 global pLDDT 1 pLDDT +2 local pLDDT 1 pLDDT +# +_ma_qa_metric_global.metric_id 1 +_ma_qa_metric_global.metric_value {:.06} +_ma_qa_metric_global.model_id 1 +_ma_qa_metric_global.ordinal_id 1 +# +loop_ +_ma_qa_metric_local.label_asym_id +_ma_qa_metric_local.label_comp_id +_ma_qa_metric_local.label_seq_id +_ma_qa_metric_local.metric_id +_ma_qa_metric_local.metric_value +_ma_qa_metric_local.model_id +_ma_qa_metric_local.ordinal_id +""" + + +def pdb2cif(pdb): + cif = str(pdb).split(".pdb")[0] + ".cif" + console.print(f"convert {pdb} -> {cif}") + p = PDBParser() + struc = p.get_structure("", pdb) + + list_atoms = [] + for a in struc.get_atoms(): + list_atoms.append( + [ + a.parent.parent.id, + a.parent.resname, + a.parent.id[1], + "2", + a.bfactor, + 1, + a.parent.id[1], + ] + ) + df = pd.DataFrame(list_atoms).drop_duplicates() + + with open(cif, "w") as of: + of.write(TMP_CIF.format(df[4].mean())) + df.to_csv(cif, index=False, header=False, mode="a", sep=" ") + + io = MMCIFIO() + io.set_structure(struc) + + with open(cif, "a") as of: + io.save(of) + + def pae2png(tsv_file, png_file): + + console.print(f"Convert : {tsv_file} row data to {png_file}") v = pd.read_table(tsv_file, index_col=0, low_memory=False) fig, ax = plt.subplots(1, 1, figsize=(4, 3), facecolor=None) m = ax.matshow(v, cmap="Greens_r", vmin=0, vmax=35, aspect="auto", origin="lower") @@ -341,7 +411,7 @@ def refseq_group_per_assembly_and_type( "species", ], as_index=False, - dropna=False + dropna=False, ).size() df_final_grouped.reset_index().to_csv(output, index=False) @@ -446,9 +516,7 @@ def _sanitized_refseq_hits(df): # count each occurrence df_again_per_assembly = no_sys_assembly_by_size.groupby( - "Assembly", - as_index=False, - dropna=False + "Assembly", as_index=False, dropna=False ).size() # filter to keep only size > 1 (when == 1 it means that there is only "no system found for an assembly") # so we should keep it diff --git a/packages/df-wiki-cli/poetry.lock b/packages/df-wiki-cli/poetry.lock index 8d3b6e0c..34991255 100644 --- a/packages/df-wiki-cli/poetry.lock +++ b/packages/df-wiki-cli/poetry.lock @@ -24,6 +24,40 @@ files = [ [package.dependencies] pyparsing = ">=2.0.3" +[[package]] +name = "biopython" +version = "1.83" +description = "Freely available tools for computational molecular biology." +optional = false +python-versions = ">=3.8" +files = [ + {file = "biopython-1.83-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e2cc737906d8de47eedbc4476f711b960c16a65daa8cdd021875398c81999a09"}, + {file = "biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2df408be9816dd98c28fe181ea93fb6e0d375bf1763ad9ed503ac30bb2df5b1a"}, + {file = "biopython-1.83-cp310-cp310-win32.whl", hash = "sha256:a0c1c70789c7e2a26563db5ba533fb9fea0cc1f2c7bc7ad240146cb223ba44a3"}, + {file = "biopython-1.83-cp310-cp310-win_amd64.whl", hash = "sha256:56f03f43c183acb88c082bc31e5f047fcc6d0aceb5270fbd29c31ab769795b86"}, + {file = "biopython-1.83-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a01dfdad7210f2fd5c4f36606278f91dbfdda6dac02347206d13cc618e79fe32"}, + {file = "biopython-1.83-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c756c0b81702c705141c87c2805203df01c6d4cf290e8cefd48cbc61a3c85b82"}, + {file = "biopython-1.83-cp311-cp311-win32.whl", hash = "sha256:0496f2a6e6e060d8ff0f34784ad15ed342b10cfe282020efe168286f0c14c479"}, + {file = "biopython-1.83-cp311-cp311-win_amd64.whl", hash = "sha256:8552cc467429b555c604b84fc174c33923bf7e4c735774eda505f1d5a9c2feab"}, + {file = "biopython-1.83-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0d5ce14755a6b49dea4743cf6929570afe5becb66ad222194984c7bf04218f86"}, + {file = "biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b35aa095de0fa8339b70664797d0e83322a1a9d512e2fd52d4e872df5189f56"}, + {file = "biopython-1.83-cp312-cp312-win32.whl", hash = "sha256:118425a210cb3d184c7a78154c5646089366faf124cd46c6056ca7f9302b94ad"}, + {file = "biopython-1.83-cp312-cp312-win_amd64.whl", hash = "sha256:ca94e8ea8907de841a515af55acb1922a9de99b3144c738a193f2a75e4726078"}, + {file = "biopython-1.83-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e37884fe39e4560bf5934a4ec4ba7f7fe0e7c091053d03d05b20a70557167717"}, + {file = "biopython-1.83-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd9bc6fef3f6a10043635a75e1a77c9dce877375140e81059c67c73d4ce65c4c"}, + {file = "biopython-1.83-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c3584122a5daca25b3914a32c52785b051c11518cd5e111e9e89ee04a6234fe"}, + {file = "biopython-1.83-cp38-cp38-win32.whl", hash = "sha256:641c1a860705d6740eb16c6147b2b730b05a8f5974db804c14d5faa8a1446085"}, + {file = "biopython-1.83-cp38-cp38-win_amd64.whl", hash = "sha256:94b68e550619e1b6e3784ed8cecb62f201d70d8b87d3a90365291f065ab42bd9"}, + {file = "biopython-1.83-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:81d1e2515b380e1876720ba79dbf50f8ef3a38cc38ba5953ef61ec20d0934ee2"}, + {file = "biopython-1.83-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec82350c24cdcf34a8d4a5f189d0ff7dc025658098a60e6f0e681d24b6a1414e"}, + {file = "biopython-1.83-cp39-cp39-win32.whl", hash = "sha256:e914f7161b3831d7c58db33cc5c7ca64b42c9877c5a776a8313e7a5fd494f8de"}, + {file = "biopython-1.83-cp39-cp39-win_amd64.whl", hash = "sha256:aae1b156a76907c2abfe9d141776b0aead65695ea914eaecdf12bd1e8991f869"}, + {file = "biopython-1.83.tar.gz", hash = "sha256:78e6bfb78de63034037afd35fe77cb6e0a9e5b62706becf78a7d922b16ed83f7"}, +] + +[package.dependencies] +numpy = "*" + [[package]] name = "black" version = "23.11.0" @@ -1418,4 +1452,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.13" -content-hash = "d526b2f2acfadf5a8eccfae5e902ab702eaedd8ca88ec310d2f6c4dab88bea8a" +content-hash = "8834fc25ed4a598a81158cb81ef55d0fb6d950ee29a02caa62e3241dcefc3ca0" diff --git a/packages/df-wiki-cli/pyproject.toml b/packages/df-wiki-cli/pyproject.toml index b6763ddc..e32dd2b3 100644 --- a/packages/df-wiki-cli/pyproject.toml +++ b/packages/df-wiki-cli/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "df-wiki-cli" -version = "0.1.7" +version = "0.1.8" description = "" authors = ["Remi PLANEL <rplanel@pasteur.fr>"] readme = "README.md" @@ -19,6 +19,7 @@ pydantic-yaml = "^1.2.0" python-frontmatter = "^1.0.1" matplotlib = "^3.8.2" habanero = "^1.2.6" +biopython = "^1.83" [tool.poetry.group.dev.dependencies] -- GitLab