Commit 03f72d2c authored by Bryan  BRANCOTTE's avatar Bryan BRANCOTTE
Browse files

Merge branch 'prevent-duplication' into 'master'

Prevent issue from duplicated entries

Closes #240

See merge request !92
parents 6aab2865 faa5cc65
...@@ -5,7 +5,7 @@ What is a compatible file? ...@@ -5,7 +5,7 @@ What is a compatible file?
Short version Short version
------------- -------------
A compatible file is an Excel spreadsheet (.xlsx) in which :term:`hosts <host>` are indicated in columns and :term:`viruses <virus>` are indicated in rows. Each cell is filled with the :term:`response` (a digit) of the interaction between one :term:`host` and one :term:`virus` (see example below). Warning: only data located in the first Excel sheet will be taken into account! A compatible file is an Excel spreadsheet (.xlsx) in which :term:`hosts <host>` are indicated in columns and :term:`viruses <virus>` are indicated in rows. Each cell is filled with the :term:`response` (a digit) of the interaction between one :term:`host` and one :term:`virus` (see example below). Warning: only data located in the first Excel sheet will be taken into account, and a :term:`host` or :term:`virus` must not appear more than once.
+---------------------------+-------------------------------+------------------------------+ +---------------------------+-------------------------------+------------------------------+
| | E. coli MG1655 (NC_000913.3) | E. coli O157:H7 (AE005174.2) | | | E. coli MG1655 (NC_000913.3) | E. coli O157:H7 (AE005174.2) |
...@@ -100,6 +100,11 @@ Variants in the file disposition ...@@ -100,6 +100,11 @@ Variants in the file disposition
| Batch | 11603 | Jane Doe | | Batch | 11603 | Jane Doe |
+-----------------------+--------------------+---------------------+ +-----------------------+--------------------+---------------------+
Duplication of virus or host
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Providing a file in which a :term:`host` or a :term:`virus` is duplicated is not allowed. If you do so, the import will keep only one occurrence of it. Whether the first or the last occurrence is kept depends on implementation details and may change at any time. When a file containing duplicates is imported, a warning is displayed indicating which occurrence is kept.
Robustness of file import Robustness of file import
~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~
The resilience of the import module when reading and interpreting a file is of paramount importance, so we generated multiple configurations in which a file could be written, together with how each should be read. At each change in the programme, we test that each file is still read as expected. The file collection can be browsed at https://gitlab.pasteur.fr/hub/viralhostrangedb/tree/master/src/viralhostrange/test_data, where for an input file ``<filename>.xlsx`` the data we extract from it is ``<filename>.xlsx.json``. The resilience of the import module when reading and interpreting a file is of paramount importance, so we generated multiple configurations in which a file could be written, together with how each should be read. At each change in the programme, we test that each file is still read as expected. The file collection can be browsed at https://gitlab.pasteur.fr/hub/viralhostrangedb/tree/master/src/viralhostrange/test_data, where for an input file ``<filename>.xlsx`` the data we extract from it is ``<filename>.xlsx.json``.
......
...@@ -53,12 +53,21 @@ msgstr "" ...@@ -53,12 +53,21 @@ msgstr ""
#, python-format #, python-format
msgid "" msgid ""
"Could not parse host \"%(host)s\" at column \"%(column_id)s\" (i.e column " "Issue with host \"%(host)s\" at column \"%(column_id)s\" (i.e column "
"\"%(column_index)s\")" "\"%(column_index)s\")"
msgstr "" msgstr ""
#, python-format #, python-format
msgid "Could not parse virus \"%(virus)s\" at row \"%(row_id)s\"" msgid "Issue with virus \"%(virus)s\" at row \"%(row_id)s\""
msgstr ""
msgid "Duplicated host, this occurrence will not be imported"
msgstr ""
msgid "Duplicated virus, this occurrence will overwrite the previous row"
msgstr ""
msgid "Empty host, not imported"
msgstr "" msgstr ""
#, python-format #, python-format
...@@ -190,6 +199,10 @@ msgstr "" ...@@ -190,6 +199,10 @@ msgstr ""
msgid "Field is required" msgid "Field is required"
msgstr "" msgstr ""
#, python-format
msgid "Duplicated entry <b>%s</b> have been found but is not allowed."
msgstr ""
msgid "Only published data" msgid "Only published data"
msgstr "" msgstr ""
......
...@@ -2,7 +2,7 @@ Django<4.0 ...@@ -2,7 +2,7 @@ Django<4.0
psycopg2 psycopg2
psycopg2-binary psycopg2-binary
mod_wsgi mod_wsgi
pandas<=1.3.5 pandas==1.4.*
jinja2 jinja2
openpyxl openpyxl
django-crispy-forms django-crispy-forms
......
{
"C1": {
"r1": "0",
"r2": "2",
"r3": "0"
},
"C2": {
"r1": "2",
"r2": "1",
"r3": "2"
},
"C4": {
"r1": "0",
"r2": "0",
"r3": "x"
}
}
\ No newline at end of file
{
"C1": {
"r1": "0",
"r2": "2",
"r3": "0"
},
"C2": {
"r1": "1",
"r2": "1",
"r3": "2"
},
"C4": {
"r1": "2",
"r2": "0",
"r3": "0"
}
}
\ No newline at end of file
{
"C1": {
"r1": "0",
"r2": "0",
"r3": "0"
},
"C2": {
"r1": "1",
"r2": "1",
"r3": "2"
},
"c1": {
"r1": "0",
"r2": "2",
"r3": "0"
},
"C4": {
"r1": "2",
"r2": "0",
"r3": "0"
}
}
\ No newline at end of file
{
"C1": {
"r1": "0",
"r2": "0"
},
"C2": {
"r1": "2",
"r2": "1"
},
"C3": {
"r1": "0",
"r2": "2"
},
"C4": {
"r1": "0",
"r2": "0"
}
}
\ No newline at end of file
{
"C1": {
"r1": "0",
"r2": "0",
"R1": "0"
},
"C2": {
"r1": "1",
"r2": "1",
"R1": "2"
},
"C3": {
"r1": "0",
"r2": "2",
"R1": "0"
},
"C4": {
"r1": "2",
"r2": "0",
"R1": "0"
}
}
\ No newline at end of file
...@@ -47,6 +47,7 @@ ViralHostResponse = namedtuple( ...@@ -47,6 +47,7 @@ ViralHostResponse = namedtuple(
"host", "host",
"host_identifiers", "host_identifiers",
"response", "response",
"parsed_response",
"row_id", "row_id",
"col_id", "col_id",
]) ])
...@@ -70,13 +71,16 @@ class ImportationObserver: ...@@ -70,13 +71,16 @@ class ImportationObserver:
class Meta: class Meta:
abstract = True abstract = True
def notify_response_error(self, virus, host, response_str, replaced): DUPLICATED = 1
EMPTY_NAME = 2
def notify_response_error(self, virus, host, response_str):
pass pass
def notify_host_error(self, host, column_id, reason=None): def notify_host_error(self, host, column_id, reason=None, reason_id=None):
pass pass
def notify_virus_error(self, virus, row_id, reason=None): def notify_virus_error(self, virus, row_id, reason=None, reason_id=None):
pass pass
@staticmethod @staticmethod
...@@ -96,22 +100,27 @@ class MessageImportationObserver(ImportationObserver): ...@@ -96,22 +100,27 @@ class MessageImportationObserver(ImportationObserver):
self.host_warned = set() self.host_warned = set()
self.virus_warned = set() self.virus_warned = set()
def notify_response_error(self, virus, host, response_str, replaced): def add_message(self, request, level, message):
messages.add_message( messages.add_message(request=request, level=level, message=message)
def notify_response_error(self, virus, host, response_str):
self.add_message(
self.request, self.request,
messages.WARNING, messages.WARNING,
"[ImportErr2] " + gettext( "[ImportErr2] " + gettext(
"Could not import response \"%(response)s\" for virus \"%(virus)s\", host\"%(host)s\", " "Could not import response \"%(response)s\" for virus \"%(virus)s\", host\"%(host)s\"") % dict(
"replacing it with \"%(rpl)s\"") % dict(
response=str(response_str), response=str(response_str),
virus=str(virus), virus=str(virus),
host=str(host), host=str(host),
rpl=str(replaced),
) )
) )
@staticmethod @staticmethod
def reason_to_str(msg, reason): def reason_to_str(msg, reason, reason_id):
if reason_id == ImportationObserver.DUPLICATED:
return msg + gettext(": ") + gettext("Duplicated")
if reason_id == ImportationObserver.EMPTY_NAME:
return msg + gettext(": ") + gettext("Empty name")
msg = [msg, "<br/>"] msg = [msg, "<br/>"]
if type(reason) == ValidationError: if type(reason) == ValidationError:
reason = reason.error_dict reason = reason.error_dict
...@@ -128,41 +137,50 @@ class MessageImportationObserver(ImportationObserver): ...@@ -128,41 +137,50 @@ class MessageImportationObserver(ImportationObserver):
msg.append(str(reason)) msg.append(str(reason))
return "".join(msg) return "".join(msg)
def notify_host_error(self, host, column_id, reason=None): def notify_host_error(self, host, column_id, reason=None, reason_id=None):
if column_id in self.host_warned: if column_id in self.host_warned:
return return
self.host_warned.add(column_id) self.host_warned.add(column_id)
msg = "[ImportErr1] " + gettext("Could not parse host \"%(host)s\" at column \"%(column_id)s\" " msg = "[ImportErr1] " + gettext("Issue with host \"%(host)s\" at column \"%(column_id)s\" "
"(i.e column \"%(column_index)s\")") % dict( "(i.e column \"%(column_index)s\")") % dict(
host=str(host), host=str(host),
column_id=str(column_id), column_id=str(column_id),
column_index=str(self.id_to_excel_index(column_id)), column_index=str(self.id_to_excel_index(column_id)),
) )
if reason: if reason or reason_id:
msg = self.reason_to_str(msg, reason) msg = self.reason_to_str(msg, reason, reason_id)
messages.add_message( self.add_message(
self.request, self.request,
messages.WARNING, messages.WARNING,
mark_safe(msg), mark_safe(msg),
) )
def notify_virus_error(self, virus, row_id, reason=None): def notify_virus_error(self, virus, row_id, reason=None, reason_id=None):
if row_id in self.virus_warned: if row_id in self.virus_warned:
return return
self.virus_warned.add(row_id) self.virus_warned.add(row_id)
msg = "[ImportErr3] " + gettext("Could not parse virus \"%(virus)s\" at row \"%(row_id)s\"") % dict( msg = "[ImportErr3] " + gettext("Issue with virus \"%(virus)s\" at row \"%(row_id)s\"") % dict(
virus=str(virus), virus=str(virus),
row_id=str(row_id), row_id=str(row_id),
) )
if reason: if reason or reason_id:
msg = self.reason_to_str(msg, reason) msg = self.reason_to_str(msg, reason, reason_id)
messages.add_message( self.add_message(
self.request, self.request,
messages.WARNING, messages.WARNING,
mark_safe(msg), mark_safe(msg),
) )
class StackErrorImportationObserver(MessageImportationObserver):
    """Importation observer that stacks its messages in a list.

    Instead of attaching warnings to an HTTP request, every message passed
    to ``add_message`` is appended to ``self.errors`` so that the caller can
    inspect the messages afterwards.
    """

    def __init__(self):
        # No request is available in this context, hence request=None.
        super().__init__(request=None)
        # Messages collected so far, in emission order.
        self.errors = list()

    def add_message(self, request, level, message):
        """Record ``message``; ``request`` and ``level`` are not used."""
        self.errors.append(message)
def panda_color_mapping(v): def panda_color_mapping(v):
key = 'html_color_%s' % str(v) key = 'html_color_%s' % str(v)
color = cache.get(key) color = cache.get(key)
...@@ -300,13 +318,15 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n ...@@ -300,13 +318,15 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
elif filename.endswith("ods"): elif filename.endswith("ods"):
my_reader = ods_to_csv(file) my_reader = ods_to_csv(file)
else: else:
content = pd.read_excel(file, index_col=0, header=0, sheet_name=sheet_name, engine='openpyxl') content = pd.read_excel(file, index_col=0, header=None, sheet_name=sheet_name, engine='openpyxl')
my_reader = csv.reader(content.to_csv(sep=';').split('\n'), delimiter=';') my_reader = csv.reader(content.to_csv(sep=';').split('\n'), delimiter=';')
next(my_reader)
header = None header = None
start_at = 0 start_at = 0
has_seen_data = False has_seen_data = False
row_id = 0 row_id = 0
blank_line = 0 blank_line = 0
virus_set = set()
for row in my_reader: for row in my_reader:
row_id += 1 row_id += 1
sub_row = row[start_at:] sub_row = row[start_at:]
...@@ -322,8 +342,19 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n ...@@ -322,8 +342,19 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
cell = cell.strip() cell = cell.strip()
if header is None or not has_seen_data and id_col == 1 and sub_row[0] == "": if header is None or not has_seen_data and id_col == 1 and sub_row[0] == "":
if cell != "" and not cell.startswith("Unnamed: "): if cell != "" and not cell.startswith("Unnamed: "):
header = row header = [h[:-2] if h.endswith(".0") else h for h in row]
start_at = id_col - 1 + start_at start_at = id_col - 1 + start_at
header_set = set(header[1:])
if importation_observer:
for header_col, h in enumerate(header[1:]):
try:
header_set.remove(h)
except KeyError:
importation_observer.notify_host_error(
h,
header_col + start_at + 2,
reason_id=ImportationObserver.DUPLICATED,
)
break break
elif id_col > 0 and sub_row[0] != "": elif id_col > 0 and sub_row[0] != "":
cell = cell.strip() cell = cell.strip()
...@@ -336,22 +367,47 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n ...@@ -336,22 +367,47 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
continue continue
if not virus: if not virus:
virus, virus_identifiers = extract_name_and_identifiers(sub_row[0]) virus, virus_identifiers = extract_name_and_identifiers(sub_row[0])
former_len = len(virus_set)
virus_set.add((virus, str(virus_identifiers)))
if importation_observer and former_len == len(virus_set):
importation_observer.notify_virus_error(
sub_row[0],
row_id,
reason_id=ImportationObserver.DUPLICATED,
)
has_seen_data = True has_seen_data = True
h = header[id_col + start_at] h = header[id_col + start_at]
if h == "" or h.startswith("Unnamed: "): if h == "" or h.startswith("Unnamed: "):
if importation_observer: if importation_observer:
importation_observer.notify_host_error(h, id_col + start_at) importation_observer.notify_host_error(
h,
id_col + start_at,
reason_id=ImportationObserver.EMPTY_NAME,
)
continue continue
host, host_identifiers = extract_name_and_identifiers(h) host, host_identifiers = extract_name_and_identifiers(h)
try:
parsed_response = float(cell)
except ValueError:
parsed_response = -1000
if importation_observer:
importation_observer.notify_response_error(virus, host, cell)
yield ViralHostResponse( yield ViralHostResponse(
virus=virus, virus=virus,
virus_identifiers=virus_identifiers, virus_identifiers=virus_identifiers,
host=host, host=host,
host_identifiers=host_identifiers, host_identifiers=host_identifiers,
response=cell, response=cell,
parsed_response=parsed_response,
row_id=row_id, row_id=row_id,
col_id=id_col + start_at, col_id=id_col + start_at,
) )
elif id_col > 0 and importation_observer:
importation_observer.notify_virus_error(
sub_row[0],
row_id,
reason_id=ImportationObserver.EMPTY_NAME,
)
if input_file is not None: if input_file is not None:
input_file.close() input_file.close()
...@@ -449,8 +505,23 @@ def restore_backup(*, data_source, log_entry, importation_observer: ImportationO ...@@ -449,8 +505,23 @@ def restore_backup(*, data_source, log_entry, importation_observer: ImportationO
) )
def import_file_later(*, file):
    """Parse ``file`` immediately and defer the actual import.

    The file is parsed in full with a ``StackErrorImportationObserver`` so
    that the messages emitted while parsing are available before anything
    is imported.

    :param file: the file to parse, forwarded unchanged to ``__parse_file``
    :return: a 2-tuple ``(errors, actually_import_file)``; ``errors`` is the
        list of messages collected while parsing, and
        ``actually_import_file(data_source, importation_observer)`` passes
        the already parsed content to ``__import_file`` and returns its
        result
    """
    parsing_observer = StackErrorImportationObserver()
    # Consume the parser right now: the resulting rows are reused by the
    # closure below, possibly long after this function has returned.
    parsed_rows = list(__parse_file(file, importation_observer=parsing_observer))

    def actually_import_file(data_source, importation_observer):
        # Second step: import what was parsed above.
        return __import_file(
            data_source=data_source,
            parsed_file=parsed_rows,
            importation_observer=importation_observer,
        )

    collected_errors = parsing_observer.errors
    return collected_errors, actually_import_file
@transaction.atomic @transaction.atomic
def import_file(*, data_source, file, importation_observer: ImportationObserver = None): def import_file(*, data_source, file, importation_observer: ImportationObserver = None):
parsed_file = parse_file(file, importation_observer)
return __import_file(data_source=data_source, parsed_file=parsed_file, importation_observer=importation_observer)
def __import_file(*, data_source, parsed_file, importation_observer: ImportationObserver = None):
""" """
Import the file and associate responses to the data source provided. If responses are already present there are Import the file and associate responses to the data source provided. If responses are already present there are
overwritten with the one from the file. Updated response are automatically map following the mapping observed in db overwritten with the one from the file. Updated response are automatically map following the mapping observed in db
...@@ -473,7 +544,7 @@ def import_file(*, data_source, file, importation_observer: ImportationObserver ...@@ -473,7 +544,7 @@ def import_file(*, data_source, file, importation_observer: ImportationObserver
responses_to_create = [] responses_to_create = []
# former mapping, if present empty dict otherwise # former mapping, if present empty dict otherwise
former_mapping = dict(data_source.get_mapping(only_pk=True)) former_mapping = dict(data_source.get_mapping(only_pk=True))
for vhr in parse_file(file, importation_observer): for vhr in parsed_file:
# if vhr.virus == "" or vhr.host == "" or vhr.response == "": # if vhr.virus == "" or vhr.host == "" or vhr.response == "":
# continue # continue
explicit_virus = explicit_item( explicit_virus = explicit_item(
...@@ -562,42 +633,24 @@ def import_file(*, data_source, file, importation_observer: ImportationObserver ...@@ -562,42 +633,24 @@ def import_file(*, data_source, file, importation_observer: ImportationObserver
host_dict[explicit_host] = host host_dict[explicit_host] = host
try: try:
raw_response = float(vhr.response) float(vhr.response)
except ValueError as e: except ValueError as e:
if importation_observer: if importation_observer:
raw_response = -1000 importation_observer.notify_response_error(vhr.virus, vhr.host, vhr.response)
importation_observer.notify_response_error(vhr.virus, vhr.host, vhr.response, raw_response)
else: else:
raise e raise e
try: # update or create response in db
# search response in db models.ViralHostResponseValueInDataSource.objects.update_or_create(
response = models.ViralHostResponseValueInDataSource.objects.get( data_source=data_source,
data_source=data_source, virus=virus,
virus=virus, host=host,
host=host, defaults=dict(
) raw_response=vhr.parsed_response,
response_id=former_mapping.get(vhr.parsed_response, not_mapped_yet.pk),
if response.raw_response != raw_response: ),
# when raw response have changed, update it an try to keep the mapping from what it was )
response.raw_response = raw_response old_virus_pk.discard(virus.pk)
response.response_id = former_mapping.get(raw_response, not_mapped_yet.pk) old_host_pk.discard(host.pk)
response.save()
old_virus_pk.discard(virus.pk)
old_host_pk.discard(host.pk)
except models.ViralHostResponseValueInDataSource.DoesNotExist:
# response is missing creating a new response, to save later
responses_to_create.append(
models.ViralHostResponseValueInDataSource(
data_source=data_source,
virus=virus,
host=host,
raw_response=raw_response,