diff --git a/module/utils.py b/module/utils.py index 0438cd4e2b917fdd66d00a7ea9e6d853f57677f1..8fbe91a82520c9b0249af6474a31bc4d7df5f2b8 100644 --- a/module/utils.py +++ b/module/utils.py @@ -137,32 +137,39 @@ def armfinder_to_table(data_resistance:pd.DataFrame) -> pd.DataFrame: 'POINTX' : "!", 'BLASTX' : "*", 'PARTIALX' : "?", - 'PARTIAL_CONTIG_ENDX' : "?$", #The PARTIAL_CONTIG_ENDX method is only attributedd when the start or end position of the sequence being searched coincides exactly with the start or end of the contig. + 'PARTIAL_CONTIG_ENDX' : "_end_of_contig", #The PARTIAL_CONTIG_ENDX method is only attributedd when the start or end position of the sequence being searched coincides exactly with the start or end of the contig. + 'CTRL_CONTIG_END' : "_end_of_contig", 'INTERNAL_STOP' : "#"} + + avoid_NTTB_prediction = ['PARTIAL_CONTIG_ENDX', + 'CTRL_CONTIG_END'] + data_resistance['Class'] = data_resistance['Class'].fillna ('NoClass') Class = data_resistance['Class'].value_counts().keys() Strains = data_resistance['Name'].value_counts().keys() table = pd.DataFrame('',index=Strains, columns=Class) for res in data_resistance.index : - gene = data_resistance['Gene symbol'][res] + dico_Method[data_resistance['Method'][res]] - - if 'tox' in data_resistance['Gene symbol'][res] : - if float(data_resistance['% Coverage of reference sequence'][res]) != 100.00 : - if (data_resistance['Method'][res] == 'BLASTX') : - gene = data_resistance['Gene symbol'][res] + "-NTTB?-"+str(round(100-float(data_resistance['% Coverage of reference sequence'][res])))+"%" - else : - gene = data_resistance['Gene symbol'][res] + "-NTTB" + dico_Method[data_resistance['Method'][res]] - - if is_contig_edge(data_resistance.iloc[res]) : # Used to find certain cases of interruption due to a contig end that AMRfinder is unable to find. - gene = f"{data_resistance['Gene symbol'][res]}_end_of_contig" - + gene = data_resistance['Gene symbol'][res] + dico_Method[data_resistance['Method'][res]] + # Search for certain cases of interruption due to a contig end that AMRfinder is unable to find. + if is_contig_edge(data_resistance.iloc[res]) : + data_resistance['Method'][res] = "CTRL_CONTIG_END" + + if ('tox' in data_resistance['Gene symbol'][res]) and \ + (float(data_resistance['% Coverage of reference sequence'][res]) != 100.00) and \ + (data_resistance['Method'][res] not in avoid_NTTB_prediction) : + gene = data_resistance['Gene symbol'][res] + "-NTTB" + + # For all methods where coverage can be < 100%, display the %age of missing coverage if (data_resistance['Method'][res] == 'PARTIALX') or \ + (data_resistance['Method'][res] == 'BLASTX') or \ (data_resistance['Method'][res] == 'PARTIAL_CONTIG_ENDX') or \ - ("end_of_contig" in gene) or \ - (data_resistance['Method'][res] == 'INTERNAL_STOP') : - gene += "-"+str(round(100-float(data_resistance['% Coverage of reference sequence'][res])))+"%" - + (data_resistance['Method'][res] == 'CTRL_CONTIG_END') or \ + (data_resistance['Method'][res] == 'INTERNAL_STOP') : + missing_coverage = round(100-float(data_resistance['% Coverage of reference sequence'][res]),1) + if (100 - missing_coverage) < 100 : + gene = f"{gene}-{missing_coverage}%" + print(gene) strain = data_resistance['Name'][res] family = data_resistance['Class'][res]