Skip to content
Snippets Groups Projects
Commit f166a689 authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

add command to get structure data

parent dd746c6c
No related branches found
No related tags found
1 merge request!231System distribution plot edit article
......@@ -2,7 +2,6 @@ import typer
import sys
import json
import pandas as pd
import shutil
import csv
import matplotlib.pyplot as plt
from pandas.errors import ParserError
......@@ -14,6 +13,8 @@ import frontmatter
from enum import Enum
from rich.console import Console
import re
import requests
from Bio.PDB import PDBParser, MMCIFIO
console = Console()
app = typer.Typer()
......@@ -77,8 +78,12 @@ def lint(
console.print("[green] Everything is alright")
@app.command()
@app.command(
help="Download all structures from statistics file. Generate PAE from raw data and organize these files per systems"
)
def structure(
user: Annotated[str, typer.Option(help="username credential")],
password: Annotated[str, typer.Option(help="password credential")],
stat: Annotated[
Path,
typer.Option(
......@@ -90,21 +95,10 @@ def structure(
resolve_path=True,
),
],
dir: Annotated[
Path,
typer.Option(
exists=True,
file_okay=False,
dir_okay=True,
writable=True,
readable=True,
resolve_path=True,
),
],
output: Annotated[
Path,
typer.Option(
exists=True,
exists=False,
file_okay=False,
dir_okay=True,
writable=True,
......@@ -115,41 +109,56 @@ def structure(
):
with open(stat, "r") as stat_f:
reader = csv.DictReader(stat_f)
count_row = 0
for row in reader:
dir_name = row["System_name_ok"]
count_row += 1
system_dir_name = row["System_name_ok"]
pdb_path_file = Path(row["pdb"])
cif_file_name = Path(str(pdb_path_file).split(".pdb")[0] + ".cif")
foldseek_html_file = Path(str(pdb_path_file).split(".pdb")[0] + ".html")
png_structure = Path(str(pdb_path_file).split(".pdb")[0] + ".png")
files = [
{"f": pdb_path_file, "d": "PDB"},
{"f": cif_file_name, "d": "CIF"},
{"f": Path(row["pae_table"]), "d": "PAE"},
{"f": Path(row["fasta_file"]), "d": "Fastas"},
{"f": Path(row["Foldseek_name"]), "d": "foldseek_monomers_html"},
{"f": foldseek_html_file, "d": "foldseek_pdb_html"},
{"f": png_structure, "d": "png"},
]
target_dir = output / dir_name
target_dir = output / system_dir_name
target_dir.mkdir(parents=True, exist_ok=True)
base_url = "https://data.atkinson-lab.com/DefenceFinder"
console.rule(f"[bold blue]{count_row} / {system_dir_name}", style="blue")
for f in files:
console.print(f"[bold blue]{f['d']}", style="blue")
str_f = str(f["f"])
if str_f and str_f != "." and str_f != "" and str_f != "na":
file_to_copy = dir / f["d"] / f["f"]
if file_to_copy.exists():
if f["d"] == "PAE":
png_file = str(file_to_copy).split(".tsv")[0] + ".png"
try:
pae2png(file_to_copy, png_file)
except ParserError as err:
console.print(
f"[red] file {str(file_to_copy.name)} cannot be parsed"
)
print(err)
else:
shutil.copy2(png_file, target_dir)
else:
shutil.copy2(file_to_copy, target_dir)
else:
console.print(
f"[red] file {str(file_to_copy.name)} does not exist"
)
# get the file from atkinson lab
target_file = target_dir / f["f"]
file_to_fetch = f"{base_url}/{f['d']}/{f['f']}"
console.print(f"Fetch : {file_to_fetch}")
response = requests.get(
file_to_fetch,
auth=(user, password),
allow_redirects=True,
stream=True,
)
with open(target_file, "wb") as fh:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
fh.write(chunk)
if f["d"] == "PDB":
# pdb2cif
pdb2cif(target_file)
if f["d"] == "PAE":
png_file = str(target_file).split(".tsv")[0] + ".pae.png"
try:
pae2png(target_file, png_file)
except ParserError as err:
console.print(
f"[red] file {str(target_file.name)} cannot be parsed"
)
print(err)
@app.command()
......@@ -231,7 +240,68 @@ def systems(
ty.write(json_object)
TMP_CIF = """
#
loop_
_ma_qa_metric.id
_ma_qa_metric.mode
_ma_qa_metric.name
_ma_qa_metric.software_group_id
_ma_qa_metric.type
1 global pLDDT 1 pLDDT
2 local pLDDT 1 pLDDT
#
_ma_qa_metric_global.metric_id 1
_ma_qa_metric_global.metric_value {:.06}
_ma_qa_metric_global.model_id 1
_ma_qa_metric_global.ordinal_id 1
#
loop_
_ma_qa_metric_local.label_asym_id
_ma_qa_metric_local.label_comp_id
_ma_qa_metric_local.label_seq_id
_ma_qa_metric_local.metric_id
_ma_qa_metric_local.metric_value
_ma_qa_metric_local.model_id
_ma_qa_metric_local.ordinal_id
"""
def pdb2cif(pdb):
cif = str(pdb).split(".pdb")[0] + ".cif"
console.print(f"convert {pdb} -> {cif}")
p = PDBParser()
struc = p.get_structure("", pdb)
list_atoms = []
for a in struc.get_atoms():
list_atoms.append(
[
a.parent.parent.id,
a.parent.resname,
a.parent.id[1],
"2",
a.bfactor,
1,
a.parent.id[1],
]
)
df = pd.DataFrame(list_atoms).drop_duplicates()
with open(cif, "w") as of:
of.write(TMP_CIF.format(df[4].mean()))
df.to_csv(cif, index=False, header=False, mode="a", sep=" ")
io = MMCIFIO()
io.set_structure(struc)
with open(cif, "a") as of:
io.save(of)
def pae2png(tsv_file, png_file):
console.print(f"Convert : {tsv_file} row data to {png_file}")
v = pd.read_table(tsv_file, index_col=0, low_memory=False)
fig, ax = plt.subplots(1, 1, figsize=(4, 3), facecolor=None)
m = ax.matshow(v, cmap="Greens_r", vmin=0, vmax=35, aspect="auto", origin="lower")
......@@ -341,7 +411,7 @@ def refseq_group_per_assembly_and_type(
"species",
],
as_index=False,
dropna=False
dropna=False,
).size()
df_final_grouped.reset_index().to_csv(output, index=False)
......@@ -446,9 +516,7 @@ def _sanitized_refseq_hits(df):
# count each occurrence
df_again_per_assembly = no_sys_assembly_by_size.groupby(
"Assembly",
as_index=False,
dropna=False
"Assembly", as_index=False, dropna=False
).size()
# filter to keep only size > 1 (when == 1 it means that there is only "no system found for an assembly")
# so we should keep it
......
......@@ -24,6 +24,40 @@ files = [
[package.dependencies]
pyparsing = ">=2.0.3"
[[package]]
name = "biopython"
version = "1.83"
description = "Freely available tools for computational molecular biology."
optional = false
python-versions = ">=3.8"
files = [
{file = "biopython-1.83-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e2cc737906d8de47eedbc4476f711b960c16a65daa8cdd021875398c81999a09"},
{file = "biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2df408be9816dd98c28fe181ea93fb6e0d375bf1763ad9ed503ac30bb2df5b1a"},
{file = "biopython-1.83-cp310-cp310-win32.whl", hash = "sha256:a0c1c70789c7e2a26563db5ba533fb9fea0cc1f2c7bc7ad240146cb223ba44a3"},
{file = "biopython-1.83-cp310-cp310-win_amd64.whl", hash = "sha256:56f03f43c183acb88c082bc31e5f047fcc6d0aceb5270fbd29c31ab769795b86"},
{file = "biopython-1.83-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a01dfdad7210f2fd5c4f36606278f91dbfdda6dac02347206d13cc618e79fe32"},
{file = "biopython-1.83-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c756c0b81702c705141c87c2805203df01c6d4cf290e8cefd48cbc61a3c85b82"},
{file = "biopython-1.83-cp311-cp311-win32.whl", hash = "sha256:0496f2a6e6e060d8ff0f34784ad15ed342b10cfe282020efe168286f0c14c479"},
{file = "biopython-1.83-cp311-cp311-win_amd64.whl", hash = "sha256:8552cc467429b555c604b84fc174c33923bf7e4c735774eda505f1d5a9c2feab"},
{file = "biopython-1.83-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0d5ce14755a6b49dea4743cf6929570afe5becb66ad222194984c7bf04218f86"},
{file = "biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b35aa095de0fa8339b70664797d0e83322a1a9d512e2fd52d4e872df5189f56"},
{file = "biopython-1.83-cp312-cp312-win32.whl", hash = "sha256:118425a210cb3d184c7a78154c5646089366faf124cd46c6056ca7f9302b94ad"},
{file = "biopython-1.83-cp312-cp312-win_amd64.whl", hash = "sha256:ca94e8ea8907de841a515af55acb1922a9de99b3144c738a193f2a75e4726078"},
{file = "biopython-1.83-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e37884fe39e4560bf5934a4ec4ba7f7fe0e7c091053d03d05b20a70557167717"},
{file = "biopython-1.83-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd9bc6fef3f6a10043635a75e1a77c9dce877375140e81059c67c73d4ce65c4c"},
{file = "biopython-1.83-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c3584122a5daca25b3914a32c52785b051c11518cd5e111e9e89ee04a6234fe"},
{file = "biopython-1.83-cp38-cp38-win32.whl", hash = "sha256:641c1a860705d6740eb16c6147b2b730b05a8f5974db804c14d5faa8a1446085"},
{file = "biopython-1.83-cp38-cp38-win_amd64.whl", hash = "sha256:94b68e550619e1b6e3784ed8cecb62f201d70d8b87d3a90365291f065ab42bd9"},
{file = "biopython-1.83-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:81d1e2515b380e1876720ba79dbf50f8ef3a38cc38ba5953ef61ec20d0934ee2"},
{file = "biopython-1.83-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec82350c24cdcf34a8d4a5f189d0ff7dc025658098a60e6f0e681d24b6a1414e"},
{file = "biopython-1.83-cp39-cp39-win32.whl", hash = "sha256:e914f7161b3831d7c58db33cc5c7ca64b42c9877c5a776a8313e7a5fd494f8de"},
{file = "biopython-1.83-cp39-cp39-win_amd64.whl", hash = "sha256:aae1b156a76907c2abfe9d141776b0aead65695ea914eaecdf12bd1e8991f869"},
{file = "biopython-1.83.tar.gz", hash = "sha256:78e6bfb78de63034037afd35fe77cb6e0a9e5b62706becf78a7d922b16ed83f7"},
]
[package.dependencies]
numpy = "*"
[[package]]
name = "black"
version = "23.11.0"
......@@ -1418,4 +1452,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata]
lock-version = "2.0"
python-versions = ">=3.11,<3.13"
content-hash = "d526b2f2acfadf5a8eccfae5e902ab702eaedd8ca88ec310d2f6c4dab88bea8a"
content-hash = "8834fc25ed4a598a81158cb81ef55d0fb6d950ee29a02caa62e3241dcefc3ca0"
[tool.poetry]
name = "df-wiki-cli"
version = "0.1.7"
version = "0.1.8"
description = ""
authors = ["Remi PLANEL <rplanel@pasteur.fr>"]
readme = "README.md"
......@@ -19,6 +19,7 @@ pydantic-yaml = "^1.2.0"
python-frontmatter = "^1.0.1"
matplotlib = "^3.8.2"
habanero = "^1.2.6"
biopython = "^1.83"
[tool.poetry.group.dev.dependencies]
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment