diff --git a/parsers/README.md b/parsers/README.md index 609e0ec9910214950838a4c275a8d7dc276d0ee0..5cb6d9b1fcca54af21a8e90b359afe1c03673f2a 100755 --- a/parsers/README.md +++ b/parsers/README.md @@ -7,7 +7,7 @@ The RAW folder contains files from other databases than are parsed to extract in #### AbDb.fasta -Data from <http://www.abybank.org/abdb/> (**Download Dataset** tab) can be downloaded from the **Complete Dataset** of **Redundant Antibody List** (<http://www.abybank.org/abdb/Data/Redundant_files/Redundant_LH_Combined_Martin.txt>). +Data from <http://www.abybank.org/abdb/> (**Download Dataset** tab) can be downloaded from the **Complete Dataset** of **Redundant Antibody List** (<http://www.abybank.org/abdb/Data/Redundant_files/Redundant_LH_Combined_Martin.txt>). A newer version of the dataset appears to be available at <http://www.abybank.org/abdb/snapshots/>. Inside, the file **antibodies.txt** is the easiest to parse. From this file, retrieve all PDB IDs (4 characters before the **_**) to create a temporary file (*tmp.txt*), containing one ID per line. 
diff --git a/parsers/scripts/getters/get_ABDB.py b/parsers/scripts/getters/get_ABDB.py
index d7b387fedf26c45c9c49917409e9926479af0aaf..e35ff164497ffb2875a56ff76e12d4430c14e42c 100644
--- a/parsers/scripts/getters/get_ABDB.py
+++ b/parsers/scripts/getters/get_ABDB.py
@@ -25,6 +25,8 @@
 import os
 import sys
 import subprocess
+import re
+import zipfile
 
 filename = "AbDb"
 
@@ -34,10 +36,43 @@ print(f"Starting {filename}...")
 # Create the run dir if necessary
 if not os.path.exists("run"):
     os.makedirs("run")
 
+if not os.path.exists("../RAW"):
+    os.makedirs("../RAW")
+
+# Get the index page listing the available snapshots
+try:
+    subprocess.run("wget -q http://www.abybank.org/abdb/snapshots/ -O run/indexof.txt",
+                   shell=True,
+                   check=True)
+except subprocess.CalledProcessError as err:
+    print(f"command '{err.cmd}' return with error (code "\
+          f"{err.returncode}): {err.stderr}", file=sys.stderr)
+    sys.exit(1)
+
+# Keep the archive name of every line having a download link. Names are
+# restricted to [A-Za-z0-9_.-] so they are safe inside the wget command below
+zip_re = re.compile(r'(abdb[A-Za-z0-9_.-]*\.zip)"')
+all_files = []
+with open("run/indexof.txt", encoding="utf-8", errors="replace") as ind_f:
+    for line in ind_f:
+        m = zip_re.search(line)
+        if m:
+            all_files.append(m.group(1))
+
+# Clean the initial downloaded file
+os.remove("run/indexof.txt")
+
+# Get the name of the latest file (the last one listed in the index)
+if not all_files:
+    print("no snapshot archive (abdb*.zip) found in the index page",
+          file=sys.stderr)
+    sys.exit(1)
+latest_file = all_files[-1]
+
 # Get the raw data
 try:
-    subprocess.run(f"wget http://www.abybank.org/abdb/Data/Redundant_files/Redundant_LH_Combined_Martin.txt -O run/{filename}.txt",
+    subprocess.run(f"wget -N http://www.abybank.org/abdb/snapshots/{latest_file} -P ../RAW/",
                    shell=True,
                    check=True)
 except subprocess.CalledProcessError as err:
@@ -45,16 +80,23 @@ except subprocess.CalledProcessError as err:
           f"{err.returncode}): {err.stderr}", file=sys.stderr)
     sys.exit(1)
 
+
 # Get all unique ids
 all_ids = set()
-with open(f"run/{filename}.txt", encoding="utf-8") as inp_f:
-    for line in inp_f:
-        ids = line.split(", ")
-        for id_ in ids:
-            all_ids.add(id_.split("_")[0])
-
-# Clean the initial downloaded file
-os.remove(f"run/{filename}.txt")
+# Open the zip
+with zipfile.ZipFile(f"../RAW/{latest_file}") as zip_f:
+    # Locate antibodies.txt whatever the dated folder name of the snapshot is
+    members = [name for name in zip_f.namelist()
+               if name.split("/")[-1] == "antibodies.txt"]
+    if not members:
+        print(f"antibodies.txt not found in {latest_file}", file=sys.stderr)
+        sys.exit(1)
+    # Get the file, line by line
+    for line in zip_f.read(members[0]).decode("utf-8").splitlines():
+        # Line with ids
+        if len(line) > 4:
+            # Add it (the id is the part before the first "_")
+            all_ids.add(line.split("_")[0])
 
 # Create a clean file
 with open(f"run/{filename}.txt", "w", encoding="utf-8") as out_f: