Commit 03f72d2c authored by Bryan BRANCOTTE

Merge branch 'prevent-duplication' into 'master'

Prevent issues from duplicated entries

Closes #240

See merge request !92
parents 6aab2865 faa5cc65
......@@ -5,7 +5,7 @@ What is a compatible file?
Short version
-------------
A compatible file is an Excel spreadsheet (.xlsx) in which :term:`hosts <host>` are indicated in columns and :term:`viruses <virus>` are indicated in rows. Each cell is filled with the :term:`response` (a digit) of the interaction between one :term:`host` and one :term:`virus` (see example below). Warning: only data located in the first Excel sheet will be taken into account!
A compatible file is an Excel spreadsheet (.xlsx) in which :term:`hosts <host>` are indicated in columns and :term:`viruses <virus>` are indicated in rows. Each cell is filled with the :term:`response` (a digit) of the interaction between one :term:`host` and one :term:`virus` (see example below). Warning: only data located in the first Excel sheet will be taken into account, and :term:`host`/:term:`virus` entries must not be duplicated.
+---------------------------+-------------------------------+------------------------------+
| | E. coli MG1655 (NC_000913.3) | E. coli O157:H7 (AE005174.2) |
......@@ -100,6 +100,11 @@ Variants in the file disposition
| Batch | 11603 | Jane Doe |
+-----------------------+--------------------+---------------------+
Duplication of virus or host
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
It is not allowed to provide a file with a duplicated :term:`host` or :term:`virus`. If you do so, the import will only keep one occurrence. Whether the first or the last occurrence is kept depends on implementation details and can change at any moment. When importing a file with duplications, a warning is rendered indicating which occurrence is kept.
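As an illustration only, the sketch below shows the general idea with a minimal, hypothetical ``detect_duplicates`` helper (not the actual import code): already-seen names are tracked in a set and any repeated name is reported so that a warning can be shown.

.. code-block:: python

    def detect_duplicates(names):
        # Keep only one occurrence of each name (here the first one, purely as an
        # example) and report every further occurrence as a duplicate.
        seen = set()
        kept, duplicated = [], []
        for name in names:
            if name in seen:
                duplicated.append(name)
            else:
                seen.add(name)
                kept.append(name)
        return kept, duplicated

    hosts = ["E. coli MG1655", "E. coli O157:H7", "E. coli MG1655"]
    print(detect_duplicates(hosts))
    # (['E. coli MG1655', 'E. coli O157:H7'], ['E. coli MG1655'])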
Robustness of file import
~~~~~~~~~~~~~~~~~~~~~~~~~
The resilience of the import module when reading and interpreting files is of paramount importance, so we generated multiple configurations in which a file could be written, together with how each one should be read. At each change in the programme we test that each file is still read as expected. The file collection can be browsed at https://gitlab.pasteur.fr/hub/viralhostrangedb/tree/master/src/viralhostrange/test_data, where for an input file ``<filename>.xlsx`` the data we extract from it is stored in ``<filename>.xlsx.json``.
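A minimal sketch of such a regression check is shown below; the ``parse_to_dict`` helper is illustrative and far rougher than the project's real parser (which handles the layout variants described above), but it shows how each ``<filename>.xlsx`` can be compared against its ``<filename>.xlsx.json`` reference.

.. code-block:: python

    import json
    from pathlib import Path

    import pandas as pd

    def parse_to_dict(xlsx_path):
        # Rough stand-in for the real parser: first sheet only, first column as
        # row labels, header row as column labels, every value kept as a string.
        frame = pd.read_excel(xlsx_path, index_col=0, header=0, engine="openpyxl")
        return {
            str(col): {str(row): str(val) for row, val in frame[col].items()}
            for col in frame.columns
        }

    def check_file_still_parses_as_expected(xlsx_path):
        # Compare the parsed content of an .xlsx file to its .json reference.
        expected = json.loads(Path(str(xlsx_path) + ".json").read_text())
        assert parse_to_dict(xlsx_path) == expected, (
            "%s is no longer read as expected" % xlsx_path)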
......
......@@ -53,12 +53,21 @@ msgstr ""
#, python-format
msgid ""
"Could not parse host \"%(host)s\" at column \"%(column_id)s\" (i.e column "
"Issue with host \"%(host)s\" at column \"%(column_id)s\" (i.e column "
"\"%(column_index)s\")"
msgstr ""
#, python-format
msgid "Could not parse virus \"%(virus)s\" at row \"%(row_id)s\""
msgid "Issue with virus \"%(virus)s\" at row \"%(row_id)s\""
msgstr ""
msgid "Duplicated host, this occurrence will not be imported"
msgstr ""
msgid "Duplicated virus, this occurrence will overwrite the previous row"
msgstr ""
msgid "Empty host, not imported"
msgstr ""
#, python-format
......@@ -190,6 +199,10 @@ msgstr ""
msgid "Field is required"
msgstr ""
#, python-format
msgid "Duplicated entry <b>%s</b> have been found but is not allowed."
msgstr ""
msgid "Only published data"
msgstr ""
......
......@@ -2,7 +2,7 @@ Django<4.0
psycopg2
psycopg2-binary
mod_wsgi
pandas<=1.3.5
pandas==1.4.*
jinja2
openpyxl
django-crispy-forms
......
{
"C1": {
"r1": "0",
"r2": "2",
"r3": "0"
},
"C2": {
"r1": "2",
"r2": "1",
"r3": "2"
},
"C4": {
"r1": "0",
"r2": "0",
"r3": "x"
}
}
\ No newline at end of file
{
"C1": {
"r1": "0",
"r2": "2",
"r3": "0"
},
"C2": {
"r1": "1",
"r2": "1",
"r3": "2"
},
"C4": {
"r1": "2",
"r2": "0",
"r3": "0"
}
}
\ No newline at end of file
{
"C1": {
"r1": "0",
"r2": "0",
"r3": "0"
},
"C2": {
"r1": "1",
"r2": "1",
"r3": "2"
},
"c1": {
"r1": "0",
"r2": "2",
"r3": "0"
},
"C4": {
"r1": "2",
"r2": "0",
"r3": "0"
}
}
\ No newline at end of file
{
"C1": {
"r1": "0",
"r2": "0"
},
"C2": {
"r1": "2",
"r2": "1"
},
"C3": {
"r1": "0",
"r2": "2"
},
"C4": {
"r1": "0",
"r2": "0"
}
}
\ No newline at end of file
{
"C1": {
"r1": "0",
"r2": "0",
"R1": "0"
},
"C2": {
"r1": "1",
"r2": "1",
"R1": "2"
},
"C3": {
"r1": "0",
"r2": "2",
"R1": "0"
},
"C4": {
"r1": "2",
"r2": "0",
"R1": "0"
}
}
\ No newline at end of file
......@@ -47,6 +47,7 @@ ViralHostResponse = namedtuple(
"host",
"host_identifiers",
"response",
"parsed_response",
"row_id",
"col_id",
])
......@@ -70,13 +71,16 @@ class ImportationObserver:
class Meta:
abstract = True
def notify_response_error(self, virus, host, response_str, replaced):
DUPLICATED = 1
EMPTY_NAME = 2
def notify_response_error(self, virus, host, response_str):
pass
def notify_host_error(self, host, column_id, reason=None):
def notify_host_error(self, host, column_id, reason=None, reason_id=None):
pass
def notify_virus_error(self, virus, row_id, reason=None):
def notify_virus_error(self, virus, row_id, reason=None, reason_id=None):
pass
@staticmethod
......@@ -96,22 +100,27 @@ class MessageImportationObserver(ImportationObserver):
self.host_warned = set()
self.virus_warned = set()
def notify_response_error(self, virus, host, response_str, replaced):
messages.add_message(
def add_message(self, request, level, message):
messages.add_message(request=request, level=level, message=message)
def notify_response_error(self, virus, host, response_str):
self.add_message(
self.request,
messages.WARNING,
"[ImportErr2] " + gettext(
"Could not import response \"%(response)s\" for virus \"%(virus)s\", host\"%(host)s\", "
"replacing it with \"%(rpl)s\"") % dict(
"Could not import response \"%(response)s\" for virus \"%(virus)s\", host\"%(host)s\"") % dict(
response=str(response_str),
virus=str(virus),
host=str(host),
rpl=str(replaced),
)
)
@staticmethod
def reason_to_str(msg, reason):
def reason_to_str(msg, reason, reason_id):
if reason_id == ImportationObserver.DUPLICATED:
return msg + gettext(": ") + gettext("Duplicated")
if reason_id == ImportationObserver.EMPTY_NAME:
return msg + gettext(": ") + gettext("Empty name")
msg = [msg, "<br/>"]
if type(reason) == ValidationError:
reason = reason.error_dict
......@@ -128,41 +137,50 @@ class MessageImportationObserver(ImportationObserver):
msg.append(str(reason))
return "".join(msg)
def notify_host_error(self, host, column_id, reason=None):
def notify_host_error(self, host, column_id, reason=None, reason_id=None):
if column_id in self.host_warned:
return
self.host_warned.add(column_id)
msg = "[ImportErr1] " + gettext("Could not parse host \"%(host)s\" at column \"%(column_id)s\" "
msg = "[ImportErr1] " + gettext("Issue with host \"%(host)s\" at column \"%(column_id)s\" "
"(i.e column \"%(column_index)s\")") % dict(
host=str(host),
column_id=str(column_id),
column_index=str(self.id_to_excel_index(column_id)),
)
if reason:
msg = self.reason_to_str(msg, reason)
messages.add_message(
if reason or reason_id:
msg = self.reason_to_str(msg, reason, reason_id)
self.add_message(
self.request,
messages.WARNING,
mark_safe(msg),
)
def notify_virus_error(self, virus, row_id, reason=None):
def notify_virus_error(self, virus, row_id, reason=None, reason_id=None):
if row_id in self.virus_warned:
return
self.virus_warned.add(row_id)
msg = "[ImportErr3] " + gettext("Could not parse virus \"%(virus)s\" at row \"%(row_id)s\"") % dict(
msg = "[ImportErr3] " + gettext("Issue with virus \"%(virus)s\" at row \"%(row_id)s\"") % dict(
virus=str(virus),
row_id=str(row_id),
)
if reason:
msg = self.reason_to_str(msg, reason)
messages.add_message(
if reason or reason_id:
msg = self.reason_to_str(msg, reason, reason_id)
self.add_message(
self.request,
messages.WARNING,
mark_safe(msg),
)
class StackErrorImportationObserver(MessageImportationObserver):
def __init__(self):
super().__init__(request=None)
self.errors = []
def add_message(self, request, level, message):
self.errors.append(message)
def panda_color_mapping(v):
key = 'html_color_%s' % str(v)
color = cache.get(key)
......@@ -300,13 +318,15 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
elif filename.endswith("ods"):
my_reader = ods_to_csv(file)
else:
content = pd.read_excel(file, index_col=0, header=0, sheet_name=sheet_name, engine='openpyxl')
content = pd.read_excel(file, index_col=0, header=None, sheet_name=sheet_name, engine='openpyxl')
my_reader = csv.reader(content.to_csv(sep=';').split('\n'), delimiter=';')
next(my_reader)
header = None
start_at = 0
has_seen_data = False
row_id = 0
blank_line = 0
virus_set = set()
for row in my_reader:
row_id += 1
sub_row = row[start_at:]
......@@ -322,8 +342,19 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
cell = cell.strip()
if header is None or not has_seen_data and id_col == 1 and sub_row[0] == "":
if cell != "" and not cell.startswith("Unnamed: "):
header = row
header = [h[:-2] if h.endswith(".0") else h for h in row]
start_at = id_col - 1 + start_at
header_set = set(header[1:])
if importation_observer:
for header_col, h in enumerate(header[1:]):
try:
header_set.remove(h)
except KeyError:
importation_observer.notify_host_error(
h,
header_col + start_at + 2,
reason_id=ImportationObserver.DUPLICATED,
)
break
elif id_col > 0 and sub_row[0] != "":
cell = cell.strip()
......@@ -336,22 +367,47 @@ def __parse_file(file, importation_observer: ImportationObserver = None, sheet_n
continue
if not virus:
virus, virus_identifiers = extract_name_and_identifiers(sub_row[0])
former_len = len(virus_set)
virus_set.add((virus, str(virus_identifiers)))
if importation_observer and former_len == len(virus_set):
importation_observer.notify_virus_error(
sub_row[0],
row_id,
reason_id=ImportationObserver.DUPLICATED,
)
has_seen_data = True
h = header[id_col + start_at]
if h == "" or h.startswith("Unnamed: "):
if importation_observer:
importation_observer.notify_host_error(h, id_col + start_at)
importation_observer.notify_host_error(
h,
id_col + start_at,
reason_id=ImportationObserver.EMPTY_NAME,
)
continue
host, host_identifiers = extract_name_and_identifiers(h)
try:
parsed_response = float(cell)
except ValueError:
parsed_response = -1000
if importation_observer:
importation_observer.notify_response_error(virus, host, cell)
yield ViralHostResponse(
virus=virus,
virus_identifiers=virus_identifiers,
host=host,
host_identifiers=host_identifiers,
response=cell,
parsed_response=parsed_response,
row_id=row_id,
col_id=id_col + start_at,
)
elif id_col > 0 and importation_observer:
importation_observer.notify_virus_error(
sub_row[0],
row_id,
reason_id=ImportationObserver.EMPTY_NAME,
)
if input_file is not None:
input_file.close()
......@@ -449,8 +505,23 @@ def restore_backup(*, data_source, log_entry, importation_observer: ImportationO
)
def import_file_later(*, file):
observer = StackErrorImportationObserver()
data = list(__parse_file(file, importation_observer=observer))
def actually_import_file(data_source, importation_observer):
return __import_file(data_source=data_source, parsed_file=data, importation_observer=importation_observer)
return observer.errors, actually_import_file
@transaction.atomic
def import_file(*, data_source, file, importation_observer: ImportationObserver = None):
parsed_file = parse_file(file, importation_observer)
return __import_file(data_source=data_source, parsed_file=parsed_file, importation_observer=importation_observer)
def __import_file(*, data_source, parsed_file, importation_observer: ImportationObserver = None):
"""
Import the file and associate responses to the data source provided. If responses are already present, they are
overwritten with the ones from the file. Updated responses are automatically mapped following the mapping observed in db
......@@ -473,7 +544,7 @@ def import_file(*, data_source, file, importation_observer: ImportationObserver
responses_to_create = []
# former mapping, if present empty dict otherwise
former_mapping = dict(data_source.get_mapping(only_pk=True))
for vhr in parse_file(file, importation_observer):
for vhr in parsed_file:
# if vhr.virus == "" or vhr.host == "" or vhr.response == "":
# continue
explicit_virus = explicit_item(
......@@ -562,42 +633,24 @@ def import_file(*, data_source, file, importation_observer: ImportationObserver
host_dict[explicit_host] = host
try:
raw_response = float(vhr.response)
float(vhr.response)
except ValueError as e:
if importation_observer:
raw_response = -1000
importation_observer.notify_response_error(vhr.virus, vhr.host, vhr.response, raw_response)
importation_observer.notify_response_error(vhr.virus, vhr.host, vhr.response)
else:
raise e
try:
# search response in db
response = models.ViralHostResponseValueInDataSource.objects.get(
# update or create response in db
models.ViralHostResponseValueInDataSource.objects.update_or_create(
data_source=data_source,
virus=virus,
host=host,
defaults=dict(
raw_response=vhr.parsed_response,
response_id=former_mapping.get(vhr.parsed_response, not_mapped_yet.pk),
),
)
if response.raw_response != raw_response:
# when the raw response has changed, update it and try to keep the mapping from what it was
response.raw_response = raw_response
response.response_id = former_mapping.get(raw_response, not_mapped_yet.pk)
response.save()
old_virus_pk.discard(virus.pk)
old_host_pk.discard(host.pk)
except models.ViralHostResponseValueInDataSource.DoesNotExist:
# response is missing creating a new response, to save later
responses_to_create.append(
models.ViralHostResponseValueInDataSource(
data_source=data_source,
virus=virus,
host=host,
raw_response=raw_response,
response_id=former_mapping.get(raw_response, not_mapped_yet.pk),
)
)
# save new responses
models.ViralHostResponseValueInDataSource.objects.bulk_create(responses_to_create)
# remove responses associated to virus that we did not see
models.ViralHostResponseValueInDataSource.objects.filter(
......
......@@ -14,7 +14,7 @@ from crispy_forms.layout import HTML, Layout, Row, Column
from django import forms
from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.exceptions import FieldDoesNotExist
from django.core.exceptions import FieldDoesNotExist, ValidationError
from django.db.models import Q, Min, Max
from django.forms import widgets
from django.utils import timezone
......@@ -55,11 +55,19 @@ class ImportDataSourceForm(forms.ModelForm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.on_disk_file = None
self.import_file_fcn = None
def full_clean(self):
super().full_clean()
if not self.is_bound or not self.make_upload_mandatory: # Stop further processing.
if not self.is_bound: # Stop further processing.
return
if self.files is not None and len(self.files) == 1:
file = list(self.files.values())[0]
errors, self.import_file_fcn = business_process.import_file_later(file=file)
for e in errors:
self.add_error('file', e)
# else:
# self.add_error('file', "No file provided")
# if (self.files is None or len(self.files) == 0) and len(self.cleaned_data["url"]) == 0 or \
# (self.files is not None and len(self.files) > 0) and len(self.cleaned_data["url"]) > 0:
# self.add_error("url", _("You have to either provide a file or an URL, and not both."))
......@@ -70,30 +78,10 @@ class ImportDataSourceForm(forms.ModelForm):
if owner is not None:
instance.owner = owner
instance.save()
file = None
if self.files is not None and len(self.files) == 1:
file = list(self.files.values())[0]
if (instance.raw_name or "") == "":
instance.raw_name = file.name
instance.raw_name = list(self.files.values())[0].name
instance.kind = "FILE"
# if len(self.cleaned_data["url"]) > 0:
# try:
# instance.kind = "URL"
# url = self.cleaned_data["url"]
# instance.raw_name = url
# raise NotImplementedError("Downloading from an URL is not available yet")
# except KeyError:
# pass
if file is not None:
# with NamedTemporaryFile(mode='wb+', suffix="-%s" % file.name, delete=False) as destination:
# print(destination.name)
# for chunk in file.chunks():
# destination.write(chunk)
# file = destination.name
# with open(file, "rb") as input_file:
business_process.import_file(
file=file,
self.import_file_fcn(
data_source=instance,
importation_observer=importation_observer,
)
......@@ -448,12 +436,6 @@ class AutoMergeSplitUpdateFormSet(forms.BaseModelFormSet):
self.has_her_identifier = False
def save(self, commit=True):
# objects = super().save(commit=False)
# self.save_m2m()
# for obj, reason in self.changed_objects:
# obj.save()
#
# def saveXXX(self, commit=True):
assert (self.__data_source is not None)
assert commit, "It has to be actually saved in DB"
objects = super().save(commit=False)
......@@ -469,20 +451,30 @@ class AutoMergeSplitUpdateFormSet(forms.BaseModelFormSet):
.filter(~Q(pk=obj.pk))
if self.has_her_identifier:
alter_egos = alter_egos.filter(her_identifier=obj.her_identifier)
# alter_ego is the existing object to which obj has to be renamed
alter_ego = alter_egos.first()
# If there is no alter_ego we may have to build one new
if alter_ego is None:
# If we can get the is_ncbi_identifier_value elsewhere then we re-use it
if 'identifier' in reason:
identifier_alter_ego = obj.__class__.objects.filter(identifier=obj.identifier).first()
if identifier_alter_ego is None:
obj.is_ncbi_identifier_value = None
else:
obj.is_ncbi_identifier_value = identifier_alter_ego.is_ncbi_identifier_value
# If the object is not used by someone else then we just change it
if obj.data_source.count() == 1:
obj.save()
continue
# obj is used, we clone it
obj.pk = None
obj.save()
# And use this copy as its alter_ego
alter_ego = obj
elif alter_ego.data_source.filter(pk=self.__data_source.pk).exists():
raise ValidationError(
mark_safe(ugettext("Duplicated entry <b>%s</b> have been found but is not allowed.")
% alter_ego.explicit_name_html))
# remove the old instance from the current data_source
old_o.data_source.remove(self.__data_source)
......@@ -490,7 +482,7 @@ class AutoMergeSplitUpdateFormSet(forms.BaseModelFormSet):
# in any case, replace the instance that was about to be returned by the alter ego/newly created one
indexed_objects[old_o.pk] = alter_ego
# obj is now an other db instance, while old_o is still pointing to the original db instance
# obj is now a new db instance, while old_o is still pointing to the original db instance
# add the new instance to the data source
alter_ego.data_source.add(self.__data_source)
......
......@@ -9,7 +9,7 @@ from django.test import TestCase
from django.utils import timezone
from viralhostrangedb import business_process, models