From bf724ffac688e051a1d8fccb8d7baf375e04bd1e Mon Sep 17 00:00:00 2001
From: Remi PLANEL <rplanel@pasteur.fr>
Date: Thu, 7 Mar 2024 16:30:30 +0100
Subject: [PATCH] fix bug display proteins when multiple nt contigs as input

When the proteins come from several nucleotide contigs, shift the
start/end of each contig so that it begins where the last protein of
the previous contig ended.

---
 backend/analysis/models.py | 40 ++++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/backend/analysis/models.py b/backend/analysis/models.py
index 3d42eb6..b72fc81 100644
--- a/backend/analysis/models.py
+++ b/backend/analysis/models.py
@@ -261,11 +261,10 @@ class Analysis(Invocation):
         queryset = Protein.objects.filter(analysis=self)
         if self.status == DONE and queryset.count() == 0:
             file_path = self.load_dataset("proteins", "proteins")
-            proteins = self.read_fasta_file(file_path)
+            proteins = self.read_fasta_file(file_path, isFromNt=True)
             if len(proteins) <= 1:
                 file_path = self.load_dataset("sequences", "proteins")
                 proteins = self.read_fasta_file(file_path)
-            # print(proteins)
 
             json_prots = [ProteinEntry(**prot).dict() for prot in proteins]
             prots = Protein(analysis=self, proteins=json_prots)
@@ -283,24 +282,53 @@ class Analysis(Invocation):
             self.stderr = job.wrapped["tool_stderr"]
         self.save()
 
-    def read_fasta_file(self, file_path):
+    def read_fasta_file(self, file_path, isFromNt=False):
+        """Parse a FASTA file into a list of protein dicts.
+
+        Each dict holds ``id``, ``length`` and ``strand``; ``start`` and
+        ``end`` are added when the record description has exactly five
+        " # "-separated fields.
+
+        With ``isFromNt`` the records are grouped by contig (the record id
+        without its trailing ``_<n>``) and ``start``/``end`` are shifted so
+        that each contig begins where the last protein of the previous one
+        ended. Returns an empty list when ``file_path`` is None.
+        """
         sequences = []
         if file_path is not None:
             with open(file_path) as handle:
-
+                current_contig = None
+                offset = 0
+                last_prot_end = 0
                 for record in SeqIO.parse(handle, "fasta"):
                     prot = {"id": record.id, "length": len(record), "strand": None}
+
+                    if isFromNt:
+                        # Contig name: the id without its trailing "_<n>".
+                        contig = record.id.rpartition("_")[0]
+                        if contig != current_contig:
+                            current_contig = contig
+                            # last_prot_end is already a shifted position,
+                            # so the new contig starts exactly there.
+                            offset = last_prot_end
+
                     description_list = record.description.split(" # ")
                     if len(description_list) == 5:
                         start = description_list[1]
                         end = description_list[2]
                         strand = description_list[3]
+
                         if strand == "1" or strand == "-1":
                             prot["strand"] = int(strand)
-                            prot["start"] = int(start)
-                            prot["end"] = int(end)
                         else:
                             strand = None
+                        if isFromNt:
+                            prot["start"] = offset + int(start)
+                            prot["end"] = offset + int(end)
+                            last_prot_end = prot["end"]
+                        else:
+                            prot["start"] = int(start)
+                            prot["end"] = int(end)
                     sequences.append(prot)
         return sequences
 
-- 
GitLab