Skip to content
Snippets Groups Projects
Commit 9a2e57e1 authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

Merge branch 'operon-struct-type' into operon-struct-type-article-update

parents f45f0cbb 5d50d1ba
No related branches found
No related tags found
No related merge requests found
Pipeline #129132 failed with stages
in 6 minutes and 22 seconds
......@@ -187,7 +187,19 @@ lint:
variables:
MEILI_HOST: "http://localhost:7700"
script:
# - rm data/list-systems.json
# STRUCTURE
- >
df-wiki-cli
meilisearch
delete-all-documents structure
- >
df-wiki-cli
meilisearch
--host ${MEILI_HOST}
--key "${MEILI_MASTER_KEY}"
update structure
--file data/all_predictions_statistics.tsv
# SYSTEM OPERON STRUCTURE
- >
df-wiki-cli
......@@ -197,8 +209,13 @@ lint:
df-wiki-cli
content
system-operon-structure
--tag "1.2.2"
--version "1.2.2"
--tags "1.2.2"
--tags "1.2.3"
--tags "1.2.4"
--versions "1.2.2"
--versions "1.2.3"
--versions "v1.2.4"
--structure data/all_predictions_statistics.tsv
--output data/system-structures.csv
- >
df-wiki-cli
......@@ -317,20 +334,7 @@ lint:
--key "${MEILI_MASTER_KEY}"
update systems
--file data/list-systems.json
# STRUCTURE
- >
df-wiki-cli
meilisearch
delete-all-documents structure
- >
df-wiki-cli
meilisearch
--host ${MEILI_HOST}
--key "${MEILI_MASTER_KEY}"
update structure
--file data/all_predictions_statistics.tsv
# ARTICLES
- >
df-wiki-cli
......
......@@ -5,9 +5,10 @@ import pandas as pd
import csv
import tempfile
import matplotlib.pyplot as plt
from operator import itemgetter
from pandas.errors import ParserError
from typing_extensions import Annotated
from typing import Optional, List
from typing import Optional, List, Tuple
from pathlib import Path
from pydantic import BaseModel, ValidationError
import frontmatter
......@@ -254,69 +255,70 @@ def system_operon_structure(
resolve_path=True,
),
] = "./system-structures.csv",
version: Annotated[str, typer.Option(help="Defense finder model")] = "v1.2.4",
tag: Annotated[str, typer.Option(help="Defense finder model")] = "1.2.4",
structure: Annotated[
Path,
typer.Option(
file_okay=True,
dir_okay=False,
writable=False,
resolve_path=True,
),
] = "./all_predictions_statistics.tsv",
versions: Annotated[List[str], typer.Option(help="Defense finder model")] = [
"1.2.2",
"1.2.3",
"v1.2.4",
],
tags: Annotated[List[str], typer.Option(help="Defense finder model")] = [
"1.2.2",
"1.2.3",
"1.2.4",
],
):
# get defense finder model from github
df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{tag}/defense-finder-models-{version}.tar.gz"
_, tmp_path = tempfile.mkstemp()
tmp_root_dir = tempfile.gettempdir()
df_model_dir = Path(f"{tmp_root_dir}/defense-finder-models-{version}")
df_model_definitions_dir = df_model_dir / "defense-finder-models" / "definitions"
console.print(f"Download models: {df_model_url}")
response = requests.get(
df_model_url,
allow_redirects=True,
)
with open(tmp_path, mode="wb") as file:
file.write(response.content)
console.print("untar file")
with tarfile.open(tmp_path) as archive:
archive.extractall(df_model_dir)
# # extract foreach system and subsystem list genes
# set the order
releases = zip(versions, tags)
model_dirs = list(download_model_release(releases))
systems = []
with open(structure) as tsvfile:
tsvreader = csv.DictReader(tsvfile, delimiter="\t")
for row in tsvreader:
systems.append({"system": row["system"], "subsystem": row["subsystem"]})
system_genes = []
for child in df_model_definitions_dir.iterdir():
for system_path in child.iterdir():
system = system_path.name
# console.rule(system)
subsystem_list = (
s for s in system_path.iterdir() if str(s).endswith(".xml")
)
for subsystem in subsystem_list:
susbsystem_name = subsystem.stem
console.print(susbsystem_name)
in_exchangeables = False
current_gene = {}
exchangeables = []
with open(subsystem) as file:
for event, elem in ET.iterparse(file, events=("start", "end")):
if event == "start":
if elem.tag == "gene" and not in_exchangeables:
current_gene = {
"system": system,
"subsystem": susbsystem_name,
"gene": elem.attrib["name"],
"exchangeables": None,
}
system_genes.append(current_gene)
if elem.tag == "gene" and in_exchangeables:
exchangeables.append(elem.attrib["name"])
if elem.tag == "exchangeables":
in_exchangeables = True
exchangeables = []
elif event == "end":
if elem.tag == "exchangeables":
in_exchangeables = False
current_gene["exchangeables"] = ",".join(exchangeables)
exchangeables = []
for system_def in systems:
system, subsystem = itemgetter("system", "subsystem")(system_def)
list_paths = list(gen_model_path(model_dirs))
if system != "#N/A" and subsystem != "#N/A":
def_path = find_model_definition(system, subsystem, list_paths)
in_exchangeables = False
current_gene = {}
exchangeables = []
with open(def_path["path"]) as file:
for event, elem in ET.iterparse(file, events=("start", "end")):
if event == "start":
if elem.tag == "gene" and not in_exchangeables:
current_gene = {
"system": system,
"subsystem": subsystem,
"gene": elem.attrib["name"],
"version": def_path["version"],
"exchangeables": None,
}
system_genes.append(current_gene)
if elem.tag == "gene" and in_exchangeables:
exchangeables.append(elem.attrib["name"])
if elem.tag == "exchangeables":
in_exchangeables = True
exchangeables = []
elif event == "end":
if elem.tag == "exchangeables":
in_exchangeables = False
current_gene["exchangeables"] = ",".join(exchangeables)
exchangeables = []
with open(output, "w") as f:
fieldnames = ["id", "system", "subsystem", "gene", "exchangeables"]
fieldnames = ["id", "system", "subsystem", "version", "gene", "exchangeables"]
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for id, gene in enumerate(system_genes):
......@@ -325,6 +327,30 @@ def system_operon_structure(
writer.writerow(gene)
def download_model_release(releases):
    """Download and unpack defense-finder model releases, one at a time.

    Args:
        releases: Iterable of ``(version, tag)`` pairs. ``tag`` selects the
            GitHub release and ``version`` is the suffix used in the archive
            name (``defense-finder-models-{version}.tar.gz``).

    Yields:
        dict: ``{"version": tag, "dir": definitions_dir}`` where
        ``definitions_dir`` is the ``defense-finder-models/definitions`` path
        inside the directory the archive was extracted to. Note that the
        ``"version"`` key carries the release *tag*, not the archive version.

    Raises:
        requests.HTTPError: If the release archive cannot be downloaded.
    """
    for version, tag in releases:
        df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{tag}/defense-finder-models-{version}.tar.gz"
        df_model_dir = Path(tempfile.gettempdir()) / f"defense-finder-models-{version}"
        df_model_definitions_dir = (
            df_model_dir / "defense-finder-models" / "definitions"
        )
        console.print(f"Download models: {df_model_url}")
        response = requests.get(
            df_model_url,
            allow_redirects=True,
        )
        # Fail here with a clear HTTP error instead of letting an error page
        # reach tarfile and blow up as an unreadable archive.
        response.raise_for_status()
        # An anonymous temporary file is closed and removed automatically, so
        # neither a file descriptor nor a stray archive is left behind.
        with tempfile.TemporaryFile() as archive_file:
            archive_file.write(response.content)
            archive_file.seek(0)
            console.print("untar file")
            with tarfile.open(fileobj=archive_file) as archive:
                # NOTE(review): extractall trusts member paths of the archive;
                # consider an extraction filter if the source is ever untrusted.
                archive.extractall(df_model_dir)
        yield {"version": tag, "dir": df_model_definitions_dir}
TMP_CIF = """
#
loop_
......@@ -681,3 +707,29 @@ def _sanitized_refseq_hits(df):
| ~df.Assembly.isin(assembly_where_should_remove_no_sys_found)
]
return df_filtered_assembly_only_with_sys
def find_model_definition(system, subsystem, list_paths):
    """Return the first model definition matching a system/subsystem pair.

    A candidate matches when its file stem equals ``subsystem`` and the name
    of its parent directory equals ``system``.

    Args:
        system: Name of the directory that directly contains the definition.
        subsystem: Expected stem (file name without suffix) of the definition.
        list_paths: Iterable of ``{"path": Path, "version": ...}`` dicts,
            searched in order.

    Returns:
        dict: ``{"path": path, "version": version}`` of the first match.

    Raises:
        FileNotFoundError: If no candidate matches; the message names the
            system and subsystem that were looked up.
    """
    for candidate in list_paths:
        path = candidate["path"]
        # parent.name is safe even for a single-component path, where
        # indexing parts[-2] would raise IndexError.
        if path.stem == subsystem and path.parent.name == system:
            console.rule(f"{system} - {subsystem}")
            console.print(candidate)
            return {"path": path, "version": candidate["version"]}
    raise FileNotFoundError(
        f"No model definition found for system {system!r}, subsystem {subsystem!r}"
    )
def gen_model_path(model_dirs):
    """Yield every ``.xml`` definition file found under the given model dirs.

    Each entry of ``model_dirs`` is a dict with a ``"dir"`` path and a
    ``"version"`` value. The directory is walked exactly three levels deep
    (sub-directory / system directory / file) and each file whose path ends
    with ``.xml`` is yielded as ``{"path": file_path, "version": version}``.
    """
    for entry in model_dirs:
        release_version = entry["version"]
        for family_dir in entry["dir"].iterdir():
            for system_dir in family_dir.iterdir():
                for candidate in system_dir.iterdir():
                    # Only XML files are model definitions; skip the rest.
                    if not candidate.name.endswith(".xml"):
                        continue
                    yield {"path": candidate, "version": release_version}
......@@ -108,6 +108,7 @@ class SystemOperonStructure(BaseModel):
id: int
system: str
subsystem: str
version: str
gene: str
exchangeables: Optional[List[str]]
......
[tool.poetry]
name = "df-wiki-cli"
version = "0.1.9"
version = "0.2.0"
description = ""
authors = ["Remi PLANEL <rplanel@pasteur.fr>"]
readme = "README.md"
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment