def read_fasta_file(self, file_path, isFromNt=False):
    """Parse a FASTA file into a list of protein dictionaries.

    Every record produces a dict with the keys ``id`` (the record id),
    ``length`` (``len(record)``) and ``strand`` (``None`` by default).
    When the record description splits into exactly five `` # ``-separated
    fields, fields 1, 2 and 3 are read as start, end and strand: ``start``
    and ``end`` are added as integers, and ``strand`` is set to ``1`` or
    ``-1`` when the field holds one of those values.
    NOTE(review): this looks like the Prodigal header layout -- confirm.

    Args:
        file_path: Path of the FASTA file to read, or ``None``.
        isFromNt: When true, the proteins are assumed to come from several
            nucleotide contigs. The contig of a protein is its id with the
            trailing ``_<suffix>`` removed, and the coordinates of each new
            contig are shifted so that it starts where the previous contig's
            last positioned protein ended.

    Returns:
        A list of protein dicts in file order; empty when ``file_path`` is
        ``None``.

    Raises:
        ValueError: If a start or end field is not a valid integer.
    """
    sequences = []
    if file_path is None:
        return sequences

    with open(file_path) as handle:
        current_contig = None
        # Shift applied to the coordinates of the contig being read. It
        # stays at 0 when isFromNt is false, leaving coordinates untouched.
        offset = 0
        # Absolute (already shifted) end of the last positioned protein.
        last_prot_end = 0
        for record in SeqIO.parse(handle, "fasta"):
            prot = {"id": record.id, "length": len(record), "strand": None}

            if isFromNt:
                # Everything before the last "_" identifies the contig;
                # ids without "_" all map to the empty string.
                contig = record.id.rpartition("_")[0]
                if contig != current_contig:
                    current_contig = contig
                    # last_prot_end is already an absolute coordinate, so
                    # it is the new offset as-is; adding it to the previous
                    # offset would count the earlier contigs twice.
                    offset = last_prot_end

            fields = record.description.split(" # ")
            if len(fields) == 5:
                _, start, end, strand, _ = fields
                if strand in ("1", "-1"):
                    prot["strand"] = int(strand)
                prot["start"] = offset + int(start)
                prot["end"] = offset + int(end)
                last_prot_end = prot["end"]
            sequences.append(prot)
    return sequences