...
 
Commits (4)
genomes_proks815-genomesonnovember.csv
Project-14607.pdf
output.csv
.idea
.venv
......
......@@ -4,6 +4,7 @@ import argparse
import csv
import json
import os
from xml.etree.ElementTree import fromstring, fromstringlist
from Bio import Entrez
......@@ -13,7 +14,7 @@ Entrez.api_key = os.environ.get("NCBI_API_KEY", None)
# Decorator that caches the wrapped function's JSON-serialisable result in a file under ".cached/".
# NOTE(review): this span reads like a diff whose +/- markers and indentation were stripped; the two
# `path` assignments are presumably the removed and the added version of the same line — confirm.
def cache_json(function):
def wrapper(*args):
# Cache file named after the positional arguments joined with "--" (str.join requires string args).
path = ".cached/%s.json" % "--".join(args)
# Same name prefixed with the wrapped function's __name__; being last, this assignment takes effect.
path = ".cached/%s.%s.json" % (function.__name__, "--".join(args))
try:
# Cache hit: return the parsed JSON content of the cache file.
with open(path, 'r') as file:
return json.loads(file.read())
......@@ -23,6 +24,7 @@ def cache_json(function):
# `rv` is assigned in lines hidden by the marker above — presumably the wrapped function's result; confirm.
# Store the value as indented JSON in the cache file, then return it.
with open(path, 'w') as file:
file.write(json.dumps(rv, indent=4))
return rv
return wrapper
......@@ -44,27 +46,47 @@ def assembly_record(identifier):
with Entrez.esearch(db="assembly", retmax=2, term=identifier) as handle:
record = Entrez.read(handle)
if int(record["Count"]) > 1:
raise NotImplementedError
count = int(record["Count"])
if count > 1:
raise NotImplementedError("Too much hit on assembly for %s" % identifier)
if count == 0:
return {}
record_id = record["IdList"]
with Entrez.efetch(db="assembly", id=record_id, retmode="xml", rettype="docsum") as handle:
return Entrez.read(handle)
@cache_json
def sra_records(identifier):
    """Return the SRA document summaries found for ``identifier``.

    Searches the "sra" Entrez database for ``identifier`` and then fetches
    the docsum of every id listed in the search result, one request per id.
    Unlike the assembly lookup, several hits are accepted: no error is
    raised when the reported count is greater than one.

    Returns an empty dict when the search reports zero hits, otherwise a
    list holding every item produced by ``Entrez.read`` for each fetched id.
    """
    # NOTE(review): retmax=2 presumably caps the returned IdList at two ids
    # even when "Count" is larger — confirm this is intended.
    with Entrez.esearch(db="sra", retmax=2, term=identifier) as search_handle:
        search_result = Entrez.read(search_handle)

    hit_count = int(search_result["Count"])
    if hit_count == 0:
        # NOTE(review): a dict is returned here but a list below; callers
        # that only iterate see "no entries" either way.
        return {}

    summaries = []
    for sra_id in search_result["IdList"]:
        with Entrez.efetch(db="sra", id=sra_id, retmode="xml", rettype="docsum") as fetch_handle:
            summaries.extend(Entrez.read(fetch_handle))
    return summaries
class StrainWrapped:
# Store the three identifiers of a strain: its name, its BioSample id and its assembly id.
# NOTE(review): each attribute is assigned twice and the second, whitespace-stripped assignment wins.
# The first three assignments are presumably the removed lines of a diff — confirm.
def __init__(self, strain, bio_sample, assembly):
self.strain = strain
self.bio_sample = bio_sample
self.assembly = assembly
# Stripped versions; calling .strip() means the three arguments must be strings.
self.strain = strain.strip()
self.bio_sample = bio_sample.strip()
self.assembly = assembly.strip()
# Property returning the "SampleData" field of the first document summary of this strain's
# BioSample record, parsed into an XML element.
# NOTE(review): two `def` headers (`_sample_data`, `_bio_sample_sample_data`) and two parse/return
# pairs (`myxml`, `my_xml`) appear back to back. The text looks like a diff whose +/- markers were
# stripped, so `_sample_data` / `myxml` are presumably the removed lines — confirm.
@property
def _sample_data(self):
# print(json.dumps(record, indent=4))
def _bio_sample_sample_data(self):
# bio_sample_record is defined outside this view; presumably it returns the Entrez docsum — confirm.
sample_data = bio_sample_record(self.bio_sample)["DocumentSummarySet"]["DocumentSummary"][0]["SampleData"]
from xml.etree.ElementTree import fromstring
# fromstring requires sample_data to be a well-formed XML string with a single root element.
myxml = fromstring(sample_data)
return myxml
my_xml = fromstring(sample_data)
return my_xml
@property
def host(self):
......@@ -72,15 +94,25 @@ class StrainWrapped:
@property
def location(self):
# where was obtained the sample, ville, hospital
return self._bio_sample_sample_data_attr("geo_loc_name")
# Property giving a date for this strain's BioSample.
# NOTE(review): two return statements follow one another. The text looks like a diff whose +/-
# markers were stripped, so the first return (summary-level "Date") is presumably the removed line
# and the last one ("collection_date" sample attribute) the current behaviour — confirm.
@property
def isolation_date(self):
return bio_sample_record(self.bio_sample)["DocumentSummarySet"]["DocumentSummary"][0]["Date"]
# Sampling date.
# return bio_sample_record(self.bio_sample)["DocumentSummarySet"]["DocumentSummary"][0]["Date"]
return self._bio_sample_sample_data_attr("collection_date")
# Property describing the source of the sample.
# NOTE(review): two return statements follow one another. The text looks like a diff whose +/-
# markers were stripped, so the "SourceSample" lookup is presumably the removed line and the
# placeholder string "TODO" the current behaviour — confirm.
@property
def source(self):
return bio_sample_record(self.bio_sample)["DocumentSummarySet"]["DocumentSummary"][0]["SourceSample"]
# City / countryside.
# return bio_sample_record(self.bio_sample)["DocumentSummarySet"]["DocumentSummary"][0]["SourceSample"]
return "TODO"
@property
def origin(self):
    """Return the BioSample "isolation_source" attribute, or None when it is absent.

    Per the original author's note: the kind of sample taken
    (blood sample? oral? ...).
    """
    isolation_source = self._bio_sample_sample_data_attr("isolation_source")
    return isolation_source
@property
def coverage(self):
......@@ -88,20 +120,37 @@ class StrainWrapped:
# Property describing how the strain was sequenced.
# NOTE(review): two return statements follow one another. The text looks like a diff whose +/-
# markers were stripped, so the placeholder `return "TODO"` is presumably the removed line — confirm.
@property
def sequencing(self):
return "TODO"
# NGS type (PacBio, long read). Gives the machine, the sequencing technique, the read length,
# Illumina or other, the machine name, the kit name.
# Only the "instrument_model" values are reported for now, joined with ", ".
return ", ".join(self._sra_attrs("instrument_model"))
# Property giving the assembly status taken from the assembly record's "Meta" field.
# NOTE(review): the text looks like a diff whose +/- markers were stripped. The first
# `return "TODO"` and the `origin` stub that follows are presumably removed lines, and the XML
# lookup below is presumably the current body of `assembling` — confirm.
@property
def assembling(self):
return "TODO"
@property
def origin(self):
return "TODO"
# Examples of the elements expected inside "Meta":
# <assembly-level>5</assembly-level>
# <assembly-status>Complete Genome</assembly-status>
# "Meta" is wrapped in a <wrap> root element before parsing — presumably because it holds several
# sibling elements without a single root; confirm.
# NOTE(review): the assembly lookup visible earlier in this file returns {} when nothing is
# found; in that case the subscripts below raise KeyError.
my_xml = fromstringlist([
"<wrap>",
assembly_record(self.assembly)["DocumentSummarySet"]["DocumentSummary"][0]["Meta"],
"</wrap>",
])
# Return the text of the first <assembly-status> element; with no match the loop ends without
# a return statement, so the result is None.
for node in my_xml.iterfind(".//assembly-status"):
return node.text
# Return the text of the first sample-data element whose "attribute_name" XML attribute equals
# attr_name, or None when no such element exists.
# NOTE(review): two lookups follow one another and the second overrides the first. The text looks
# like a diff whose +/- markers were stripped, so the `_sample_data` line is presumably the removed
# version — confirm.
def _bio_sample_sample_data_attr(self, attr_name):
# Matches only <Attribute> children of an <Attributes> child of the root.
attr = self._sample_data.find("Attributes/Attribute[@attribute_name=\"" + attr_name + "\"]")
# Matches any descendant element, whatever its tag, carrying the attribute_name value.
# attr_name is concatenated into the path unescaped, so it must not contain a double quote.
attr = self._bio_sample_sample_data.find(".//*[@attribute_name=\"" + attr_name + "\"]")
return attr.text if attr is not None else None
def _sra_attrs(self, attr_name):
    """Yield the value of XML attribute ``attr_name`` from this strain's SRA entries.

    For every entry returned by ``sra_records`` for the BioSample id, the
    entry's "ExpXml" fragment is wrapped in a ``<wrap>`` root element and
    parsed; each descendant element carrying ``attr_name`` contributes its
    value, in document order. Nothing is yielded when ``sra_records``
    returns an empty container.
    """
    for sra_entry in sra_records(self.bio_sample):
        fragments = ["<wrap>", sra_entry["ExpXml"], "</wrap>"]
        wrapped = fromstringlist(fragments)
        xpath = ".//*[@" + attr_name + "]"
        yield from (element.attrib[attr_name] for element in wrapped.iterfind(xpath))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
......