Skip to content
Snippets Groups Projects
Commit 5d50d1ba authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

get operon struct from multiple version

parent e70b3a91
Branches
No related tags found
No related merge requests found
Pipeline #129131 failed
...@@ -192,7 +192,42 @@ lint: ...@@ -192,7 +192,42 @@ lint:
variables: variables:
MEILI_HOST: "http://localhost:7700" MEILI_HOST: "http://localhost:7700"
script: script:
# - rm data/list-systems.json # STRUCTURE
- >
df-wiki-cli
meilisearch
delete-all-documents structure
- >
df-wiki-cli
meilisearch
--host ${MEILI_HOST}
--key "${MEILI_MASTER_KEY}"
update structure
--file data/all_predictions_statistics.tsv
# SYSTEM OPERON STRUCTURE
- >
df-wiki-cli
meilisearch
delete-all-documents system-operon-structure
- >
df-wiki-cli
content
system-operon-structure
--tags "1.2.2"
--tags "1.2.3"
--tags "1.2.4"
--versions "1.2.2"
--versions "1.2.3"
--versions "v1.2.4"
--structure data/all_predictions_statistics.tsv
--output data/system-structures.csv
- >
df-wiki-cli
meilisearch
update system-operon-structure
--file data/system-structures.csv
###### REFSEQ ###### REFSEQ
# delete all document refseq # delete all document refseq
- > - >
...@@ -303,26 +338,6 @@ lint: ...@@ -303,26 +338,6 @@ lint:
--key "${MEILI_MASTER_KEY}" --key "${MEILI_MASTER_KEY}"
update systems update systems
--file data/list-systems.json --file data/list-systems.json
# STRUCTURE
- >
df-wiki-cli
meilisearch
--host ${MEILI_HOST}
--key "${MEILI_MASTER_KEY}"
update structure
--file data/all_predictions_statistics_clean.csv
# SYSTEM OPERON STRUCTURE
- >
df-wiki-cli
content
system-operon-structure
--version "1.2.4" --output data/system-structures.csv
- >
df-wiki-cli
meilisearch
update system-operon-structure
--file data/system-structures.csv
# ARTICLES # ARTICLES
- > - >
......
...@@ -5,9 +5,10 @@ import pandas as pd ...@@ -5,9 +5,10 @@ import pandas as pd
import csv import csv
import tempfile import tempfile
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from operator import itemgetter
from pandas.errors import ParserError from pandas.errors import ParserError
from typing_extensions import Annotated from typing_extensions import Annotated
from typing import Optional, List from typing import Optional, List, Tuple
from pathlib import Path from pathlib import Path
from pydantic import BaseModel, ValidationError from pydantic import BaseModel, ValidationError
import frontmatter import frontmatter
...@@ -254,53 +255,54 @@ def system_operon_structure( ...@@ -254,53 +255,54 @@ def system_operon_structure(
resolve_path=True, resolve_path=True,
), ),
] = "./system-structures.csv", ] = "./system-structures.csv",
version: Annotated[str, typer.Option(help="Defense finder model")] = "v1.2.4", structure: Annotated[
tag: Annotated[str, typer.Option(help="Defense finder model")] = "1.2.4", Path,
typer.Option(
file_okay=True,
dir_okay=False,
writable=False,
resolve_path=True,
),
] = "./all_predictions_statistics.tsv",
versions: Annotated[List[str], typer.Option(help="Defense finder model")] = [
"1.2.2",
"1.2.3",
"v1.2.4",
],
tags: Annotated[List[str], typer.Option(help="Defense finder model")] = [
"1.2.2",
"1.2.3",
"1.2.4",
],
): ):
# get defense finder model from github # get defense finder model from github
releases = zip(versions, tags)
model_dirs = list(download_model_release(releases))
df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{tag}/defense-finder-models-{version}.tar.gz" systems = []
_, tmp_path = tempfile.mkstemp() with open(structure) as tsvfile:
tmp_root_dir = tempfile.gettempdir() tsvreader = csv.DictReader(tsvfile, delimiter="\t")
df_model_dir = Path(f"{tmp_root_dir}/defense-finder-models-{version}") for row in tsvreader:
df_model_definitions_dir = df_model_dir / "defense-finder-models" / "definitions" systems.append({"system": row["system"], "subsystem": row["subsystem"]})
console.print(f"Download models: {df_model_url}")
response = requests.get(
df_model_url,
allow_redirects=True,
)
with open(tmp_path, mode="wb") as file:
file.write(response.content)
console.print("untar file")
with tarfile.open(tmp_path) as archive:
archive.extractall(df_model_dir)
# # extract foreach system and subsystem list genes
# set the order
system_genes = [] system_genes = []
for child in df_model_definitions_dir.iterdir(): for system_def in systems:
for system_path in child.iterdir(): system, subsystem = itemgetter("system", "subsystem")(system_def)
system = system_path.name list_paths = list(gen_model_path(model_dirs))
# console.rule(system) if system != "#N/A" and subsystem != "#N/A":
subsystem_list = ( def_path = find_model_definition(system, subsystem, list_paths)
s for s in system_path.iterdir() if str(s).endswith(".xml")
)
for subsystem in subsystem_list:
susbsystem_name = subsystem.stem
console.print(susbsystem_name)
in_exchangeables = False in_exchangeables = False
current_gene = {} current_gene = {}
exchangeables = [] exchangeables = []
with open(def_path["path"]) as file:
with open(subsystem) as file:
for event, elem in ET.iterparse(file, events=("start", "end")): for event, elem in ET.iterparse(file, events=("start", "end")):
if event == "start": if event == "start":
if elem.tag == "gene" and not in_exchangeables: if elem.tag == "gene" and not in_exchangeables:
current_gene = { current_gene = {
"system": system, "system": system,
"subsystem": susbsystem_name, "subsystem": subsystem,
"gene": elem.attrib["name"], "gene": elem.attrib["name"],
"version": def_path["version"],
"exchangeables": None, "exchangeables": None,
} }
system_genes.append(current_gene) system_genes.append(current_gene)
...@@ -316,7 +318,7 @@ def system_operon_structure( ...@@ -316,7 +318,7 @@ def system_operon_structure(
exchangeables = [] exchangeables = []
with open(output, "w") as f: with open(output, "w") as f:
fieldnames = ["id", "system", "subsystem", "gene", "exchangeables"] fieldnames = ["id", "system", "subsystem", "version", "gene", "exchangeables"]
writer = csv.DictWriter(f, fieldnames=fieldnames) writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader() writer.writeheader()
for id, gene in enumerate(system_genes): for id, gene in enumerate(system_genes):
...@@ -325,6 +327,30 @@ def system_operon_structure( ...@@ -325,6 +327,30 @@ def system_operon_structure(
writer.writerow(gene) writer.writerow(gene)
def download_model_release(releases):
    """Download and unpack defense-finder model releases, one per iteration.

    Args:
        releases: iterable of ``(version, tag)`` pairs. ``tag`` selects the
            GitHub release and ``version`` is the suffix of the archive file
            name inside that release.

    Yields:
        A dict with two keys: ``"version"`` (the release *tag*) and ``"dir"``
        (the ``defense-finder-models/definitions`` directory inside the
        archive extracted under the system temporary directory).

    Raises:
        requests.HTTPError: if the archive cannot be downloaded.
    """
    tmp_root_dir = tempfile.gettempdir()
    for version, tag in releases:
        df_model_url = f"https://github.com/mdmparis/defense-finder-models/releases/download/{tag}/defense-finder-models-{version}.tar.gz"
        df_model_dir = Path(f"{tmp_root_dir}/defense-finder-models-{version}")
        df_model_definitions_dir = (
            df_model_dir / "defense-finder-models" / "definitions"
        )
        console.print(f"Download models: {df_model_url}")
        response = requests.get(
            df_model_url,
            allow_redirects=True,
        )
        # Fail here with a clear HTTP error instead of later, when tarfile
        # would choke on an HTML error page.
        response.raise_for_status()
        # TemporaryFile is closed and removed automatically, so neither a
        # file descriptor nor the downloaded archive is left behind.
        with tempfile.TemporaryFile() as archive_file:
            archive_file.write(response.content)
            archive_file.seek(0)
            console.print("untar file")
            # NOTE(review): the archive comes from the network; consider an
            # extraction filter if the supported Python versions allow it.
            with tarfile.open(fileobj=archive_file) as archive:
                archive.extractall(df_model_dir)
        yield {"version": tag, "dir": df_model_definitions_dir}
TMP_CIF = """ TMP_CIF = """
# #
loop_ loop_
...@@ -681,3 +707,29 @@ def _sanitized_refseq_hits(df): ...@@ -681,3 +707,29 @@ def _sanitized_refseq_hits(df):
| ~df.Assembly.isin(assembly_where_should_remove_no_sys_found) | ~df.Assembly.isin(assembly_where_should_remove_no_sys_found)
] ]
return df_filtered_assembly_only_with_sys return df_filtered_assembly_only_with_sys
def find_model_definition(system, subsystem, list_paths):
    """Return the first model definition matching a system/subsystem pair.

    Args:
        system: name of the directory that must directly contain the
            definition file.
        subsystem: name of the definition file without its extension.
        list_paths: iterable of ``{"path": Path, "version": ...}`` dicts,
            such as those yielded by ``gen_model_path``.

    Returns:
        ``{"path": ..., "version": ...}`` for the first matching entry, in
        the iteration order of ``list_paths``.

    Raises:
        FileNotFoundError: if no entry matches; the message names the
            system and subsystem that could not be found.
    """
    for candidate in list_paths:
        path = candidate["path"]
        # parent.name is the containing directory; unlike parts[-2] it does
        # not raise on a path that has a single component.
        if path.stem == subsystem and path.parent.name == system:
            console.rule(f"{system} - {subsystem}")
            console.print(candidate)
            return {"path": path, "version": candidate["version"]}
    raise FileNotFoundError(
        f"No model definition found for system '{system}' "
        f"and subsystem '{subsystem}'"
    )
def gen_model_path(model_dirs):
    """Lazily list every ``.xml`` file found three levels below each model dir.

    Args:
        model_dirs: iterable of dicts with a ``"dir"`` entry (a directory
            path object) and a ``"version"`` entry.

    Yields:
        ``{"path": <xml file>, "version": <version of its model dir>}`` for
        each file whose name ends with ``.xml``, walking
        ``dir/<subdir>/<system>/<file>``.
    """
    for release in model_dirs:
        for category_dir in release["dir"].iterdir():
            for system_dir in category_dir.iterdir():
                yield from (
                    {"path": candidate, "version": release["version"]}
                    for candidate in system_dir.iterdir()
                    if str(candidate).endswith(".xml")
                )
...@@ -108,6 +108,7 @@ class SystemOperonStructure(BaseModel): ...@@ -108,6 +108,7 @@ class SystemOperonStructure(BaseModel):
id: int id: int
system: str system: str
subsystem: str subsystem: str
version: str
gene: str gene: str
exchangeables: Optional[List[str]] exchangeables: Optional[List[str]]
......
[tool.poetry] [tool.poetry]
name = "df-wiki-cli" name = "df-wiki-cli"
version = "0.1.9" version = "0.2.0"
description = "" description = ""
authors = ["Remi PLANEL <rplanel@pasteur.fr>"] authors = ["Remi PLANEL <rplanel@pasteur.fr>"]
readme = "README.md" readme = "README.md"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment