Skip to content
Snippets Groups Projects
Commit 84066732 authored by Amandine PERRIN
Browse files

Fix hand back xls remove when vname changes

parent e133c0d8
Branches
No related tags found
No related merge requests found
...@@ -29,19 +29,19 @@ import logging ...@@ -29,19 +29,19 @@ import logging
logger = logging.getLogger("GC.final") logger = logging.getLogger("GC.final")
def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_list): def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, new_old, vnames_list):
""" """
Change fasta file: Change fasta file:
- if sequences in to_hand_back: remove them - if sequences in to_hand_back: remove them
- if sequence in virus_IDs: change its ID - if sequence in virus_IDs: change its ID
to_hand_back = {seq: reason} seq with old name to_hand_back = {seq: reason} seq with old name
vname_changes = {new_id: orig_id} new_old = {new_id: orig_id}
vnames_list : list of new vnames vnames_list : list of new vnames
""" """
problem_sequence_IDs = False problem_sequence_IDs = False
remain_vnames_list = vnames_list remain_vnames_list = vnames_list
change_vnames = {old:new for new, old in vname_changes.items()} old_new = {old:new for new, old in new_old.items()}
init_nb_seq = 0 # number of sequences in original fasta file init_nb_seq = 0 # number of sequences in original fasta file
final_nb_seq = 0 # number of sequences in fasta file final_nb_seq = 0 # number of sequences in fasta file
...@@ -64,8 +64,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l ...@@ -64,8 +64,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
elems = header_text.split(header) elems = header_text.split(header)
# ex1: ["", "|virus other info"] # ex1: ["", "|virus other info"]
# ex2: ["", "other info2"] # ex2: ["", "other info2"]
# If header not found in vnames, nor in changed vnames, log error and skip line # If header not found in vnames (header is original name), nor in changed vnames, log error and skip line
if header not in vnames_list and header not in change_vnames: if header not in vnames_list and header not in old_new:
logger.error(f"{header} entry in fasta file does not correspond to any vname. " logger.error(f"{header} entry in fasta file does not correspond to any vname. "
"This sequence will be removed in the curated fasta file.") "This sequence will be removed in the curated fasta file.")
problem_sequence_IDs = True problem_sequence_IDs = True
...@@ -84,8 +84,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l ...@@ -84,8 +84,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
to_write = False to_write = False
# Now, remove header from remaining sequences. # Now, remove header from remaining sequences.
# If header was changed, remove new name # If header was changed, remove new name
if header in change_vnames: if header in old_new:
header = change_vnames[header] header = old_new[header]
remain_vnames_list.remove(header) remain_vnames_list.remove(header)
# if header was not changed, remove current name # if header was not changed, remove current name
else: else:
...@@ -93,8 +93,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l ...@@ -93,8 +93,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
continue # go to next sequence, as this one will be removed continue # go to next sequence, as this one will be removed
# If header in changed vname, replace by new vname # If header in changed vname, replace by new vname
if header in change_vnames : if header in old_new :
header = change_vnames[header] header = old_new[header]
remain_vnames_list.remove(header) remain_vnames_list.remove(header)
# If header != ori_header, it means that it was in change_vnames. So, write new header # If header != ori_header, it means that it was in change_vnames. So, write new header
...@@ -115,8 +115,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l ...@@ -115,8 +115,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
# Write sequences missing in fasta file # Write sequences missing in fasta file
if remain_vnames_list: if remain_vnames_list:
for vname in remain_vnames_list: for vname in remain_vnames_list:
if vname in vname_changes: if vname in new_old:
logger.error(f"{vname} sequence (previously called {vname_changes[vname]}) is missing in fasta file. " logger.error(f"{vname} sequence (previously called {new_old[vname]}) is missing in fasta file. "
"This line will be removed in curated xls file.") "This line will be removed in curated xls file.")
problem_sequence_IDs = True problem_sequence_IDs = True
else: else:
...@@ -133,14 +133,14 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l ...@@ -133,14 +133,14 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
return remain_vnames_list, init_nb_seq, final_nb_seq return remain_vnames_list, init_nb_seq, final_nb_seq
def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain_vnames_list): def complete_xls(md, report, to_hand_back, new_old, cur_fasta_file, remain_vnames_list):
""" """
From curated metadatas and covsurver report, complete curated output file From curated metadatas and covsurver report, complete curated output file
md : dataframe with all metadata curated md : dataframe with all metadata curated. vnames are the new ones
report : dataframe with comments and symbols to add to md report : dataframe with comments and symbols to add to md
to_hand_back : dict {vname: reason} to_hand_back : dict {vname: reason}
vname_changes: dict {new_name: old_name} new_old: dict {new_name: old_name}
remain_vnames_list : list of xls vnames not found in fasta remain_vnames_list : list of xls vnames not found in fasta
...@@ -149,6 +149,7 @@ def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain ...@@ -149,6 +149,7 @@ def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain
hb: dataframe with lines of xls metadata removed hb: dataframe with lines of xls metadata removed
nb_removed : number of lines removed from xls (because hand back, or no fasta sequence) nb_removed : number of lines removed from xls (because hand back, or no fasta sequence)
""" """
old_new = {old:new for new, old in new_old.items()}
init_nb = 0 # number of lines in original xls file init_nb = 0 # number of lines in original xls file
nb_removed = 0 # number of lines removed in curated xls file nb_removed = 0 # number of lines removed in curated xls file
# Create new dataframe, where handed-back lines are put. # Create new dataframe, where handed-back lines are put.
...@@ -182,26 +183,32 @@ def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain ...@@ -182,26 +183,32 @@ def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain
if "FASTA" not in line["fn"]: if "FASTA" not in line["fn"]:
line["fn"] = os.path.basename(cur_fasta_file) line["fn"] = os.path.basename(cur_fasta_file)
# For each sequence to hand back, remove its line from md, and copy it to hand-back dataframe # Check if vname (new vname) in remain_vnames_list -> if yes, remove line
for vname in to_hand_back: # Remove lines which do not have a corresponding sequence in fasta file
for vname in remain_vnames_list:
# Remove line if it exists (not already removed by hand back for example)
line = md.loc[md.covv_virus_name == vname] line = md.loc[md.covv_virus_name == vname]
# add line to handback dataframe
# remove line from initial xls file if it exists
if not line.empty: if not line.empty:
hb = hb.append(line)
nb_removed += 1 nb_removed += 1
out_md = out_md.drop(md[md.covv_virus_name == vname].index) out_md = out_md.drop(md[md.covv_virus_name == vname].index)
else:
to_hand_back[vname] += "; no corresponding fasta sequence."
# Remove lines which do not have a corresponding sequence in fasta file
for vname in remain_vnames_list: # For each sequence to hand back, remove its line from md, and copy it to hand-back dataframe
# Remove line if it exists (not already removed by hand back for example) # hb_vname = vname in to_hand_back list, = original name
for hb_vname in to_hand_back:
# if hb_name has been changed, remove xls entry of corresponding new name
vname = hb_vname
if hb_vname in old_new:
vname = old_new[hb_vname]
line = md.loc[md.covv_virus_name == vname] line = md.loc[md.covv_virus_name == vname]
# add line to handback dataframe
# remove line from initial xls file if it exists
if not line.empty: if not line.empty:
hb = hb.append(line)
nb_removed += 1 nb_removed += 1
out_md = out_md.drop(md[md.covv_virus_name == vname].index) out_md = out_md.drop(md[md.covv_virus_name == vname].index)
return out_md, hb, init_nb, nb_removed return out_md, hb, init_nb, nb_removed
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment