Commit 19e77add authored by Bryan  BRANCOTTE's avatar Bryan BRANCOTTE
Browse files

prevent importing duplicated host/virus, WIP #240

 * add documentation about duplicate not allowed
 * use update_or_create to overwrite duplicate
 * check when parsing, and notify of duplication
 * add example on how it should work
parent e6f79acc
......@@ -5,7 +5,7 @@ What is a compatible file?
Short version
-------------
A compatible file is an Excel spreadsheet (.xlsx) in which :term:`hosts <host>` are indicated in columns and :term:`viruses <virus>` are indicated in rows. Each cell is filled with the :term:`response` (a digit) of the interaction between one :term:`host` and one :term:`virus` (see example below). Warning: only data located in the first Excel sheet will be taken into account!
A compatible file is an Excel spreadsheet (.xlsx) in which :term:`hosts <host>` are indicated in columns and :term:`viruses <virus>` are indicated in rows. Each cell is filled with the :term:`response` (a digit) of the interaction between one :term:`host` and one :term:`virus` (see example below). Warning: only data located in the first Excel sheet will be taken into account, and each :term:`host` and each :term:`virus` must appear only once (duplicates are not allowed).
+---------------------------+-------------------------------+------------------------------+
| | E. coli MG1655 (NC_000913.3) | E. coli O157:H7 (AE005174.2) |
......@@ -100,6 +100,11 @@ Variants in the file disposition
| Batch | 11603 | Jane Doe |
+-----------------------+--------------------+---------------------+
Duplication of virus or host
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Providing a file in which a :term:`host` or a :term:`virus` is duplicated is not allowed. If you do so, the import will keep only one occurrence of it. Whether the first or the last occurrence is kept depends on implementation details and may change at any time. When a file containing duplicates is imported, a warning is displayed indicating which occurrence is kept.
Robustness of file import
~~~~~~~~~~~~~~~~~~~~~~~~~
The resilience of the import module when reading and interpreting a file is of paramount importance. We therefore generated multiple configurations in which a file could be written, together with how each of them should be read. At each change in the programme we test that each file is still read as expected. The file collection can be browsed at https://gitlab.pasteur.fr/hub/viralhostrangedb/tree/master/src/viralhostrange/test_data, where, for an input file ``<filename>.xlsx``, the data we extract from it is given in ``<filename>.xlsx.json``.
......
{
"C1": {
"r1": "0",
"r2": "0",
"r2": "2",
"r3": "0"
},
"C2": {
......
......@@ -4,7 +4,7 @@
"r2": "0"
},
"C2": {
"r1": "1",
"r1": "2",
"r2": "1"
},
"C3": {
......@@ -12,7 +12,7 @@
"r2": "2"
},
"C4": {
"r1": "2",
"r1": "0",
"r2": "0"
}
}
\ No newline at end of file
......@@ -132,7 +132,7 @@ class MessageImportationObserver(ImportationObserver):
if column_id in self.host_warned:
return
self.host_warned.add(column_id)
msg = "[ImportErr1] " + gettext("Could not parse host \"%(host)s\" at column \"%(column_id)s\" "
msg = "[ImportErr1] " + gettext("Issue with host \"%(host)s\" at column \"%(column_id)s\" "
"(i.e column \"%(column_index)s\")") % dict(
host=str(host),
column_id=str(column_id),
......@@ -150,7 +150,7 @@ class MessageImportationObserver(ImportationObserver):
if row_id in self.virus_warned:
return
self.virus_warned.add(row_id)
msg = "[ImportErr3] " + gettext("Could not parse virus \"%(virus)s\" at row \"%(row_id)s\"") % dict(
msg = "[ImportErr3] " + gettext("Issue with virus \"%(virus)s\" at row \"%(row_id)s\"") % dict(
virus=str(virus),
row_id=str(row_id),
)
......@@ -300,13 +300,15 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
elif filename.endswith("ods"):
my_reader = ods_to_csv(file)
else:
content = pd.read_excel(file, index_col=0, header=0, sheet_name=sheet_name, engine='openpyxl')
content = pd.read_excel(file, index_col=0, header=None, sheet_name=sheet_name, engine='openpyxl')
my_reader = csv.reader(content.to_csv(sep=';').split('\n'), delimiter=';')
next(my_reader)
header = None
start_at = 0
has_seen_data = False
row_id = 0
blank_line = 0
virus_set = set()
for row in my_reader:
row_id += 1
sub_row = row[start_at:]
......@@ -322,8 +324,19 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
cell = cell.strip()
if header is None or not has_seen_data and id_col == 1 and sub_row[0] == "":
if cell != "" and not cell.startswith("Unnamed: "):
header = row
header = [h[:-2] if h.endswith(".0") else h for h in row]
start_at = id_col - 1 + start_at
header_set = set(header[1:])
if importation_observer:
for header_col, h in reversed(list(enumerate(header[1:]))):
try:
header_set.remove(h)
except KeyError:
importation_observer.notify_host_error(
h,
header_col + start_at,
reason=gettext("Duplicated host, this occurrence will not be imported"),
)
break
elif id_col > 0 and sub_row[0] != "":
cell = cell.strip()
......@@ -336,11 +349,23 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
continue
if not virus:
virus, virus_identifiers = extract_name_and_identifiers(sub_row[0])
former_len = len(virus_set)
virus_set.add((virus, str(virus_identifiers)))
if importation_observer and former_len == len(virus_set):
importation_observer.notify_virus_error(
sub_row[0],
row_id,
reason=gettext("Duplicated virus, this occurrence will overwrite the previous row"),
)
has_seen_data = True
h = header[id_col + start_at]
if h == "" or h.startswith("Unnamed: "):
if importation_observer:
importation_observer.notify_host_error(h, id_col + start_at)
importation_observer.notify_host_error(
h,
id_col + start_at,
reason=gettext("Empty host, not imported"),
)
continue
host, host_identifiers = extract_name_and_identifiers(h)
yield ViralHostResponse(
......@@ -569,35 +594,18 @@ def import_file(*, data_source, file, importation_observer: ImportationObserver
importation_observer.notify_response_error(vhr.virus, vhr.host, vhr.response, raw_response)
else:
raise e
try:
# search response in db
response = models.ViralHostResponseValueInDataSource.objects.get(
data_source=data_source,
virus=virus,
host=host,
)
if response.raw_response != raw_response:
# when raw response have changed, update it an try to keep the mapping from what it was
response.raw_response = raw_response
response.response_id = former_mapping.get(raw_response, not_mapped_yet.pk)
response.save()
old_virus_pk.discard(virus.pk)
old_host_pk.discard(host.pk)
except models.ViralHostResponseValueInDataSource.DoesNotExist:
# response is missing creating a new response, to save later
responses_to_create.append(
models.ViralHostResponseValueInDataSource(
data_source=data_source,
virus=virus,
host=host,
raw_response=raw_response,
response_id=former_mapping.get(raw_response, not_mapped_yet.pk),
)
)
# save new responses
models.ViralHostResponseValueInDataSource.objects.bulk_create(responses_to_create)
# update or create response in db
models.ViralHostResponseValueInDataSource.objects.update_or_create(
data_source=data_source,
virus=virus,
host=host,
defaults=dict(
raw_response=raw_response,
response_id=former_mapping.get(raw_response, not_mapped_yet.pk),
),
)
old_virus_pk.discard(virus.pk)
old_host_pk.discard(host.pk)
# remove responses associated to virus that we did not see
models.ViralHostResponseValueInDataSource.objects.filter(
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment