Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
curation_tool
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Amandine PERRIN
curation_tool
Commits
84066732
Commit
84066732
authored
5 years ago
by
Amandine PERRIN
Browse files
Options
Downloads
Patches
Plain Diff
Fix hand back xls remove when vname changes
parent
e133c0d8
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
gisaid_curation/finalization_curation.py
+31
-24
31 additions, 24 deletions
gisaid_curation/finalization_curation.py
with
31 additions
and
24 deletions
gisaid_curation/finalization_curation.py
+
31
−
24
View file @
84066732
...
@@ -29,19 +29,19 @@ import logging
...
@@ -29,19 +29,19 @@ import logging
logger
=
logging
.
getLogger
(
"
GC.final
"
)
logger
=
logging
.
getLogger
(
"
GC.final
"
)
def
cure_fasta
(
fasta_file
,
cur_fasta_file
,
to_hand_back
,
vname_changes
,
vnames_list
):
def
cure_fasta
(
fasta_file
,
cur_fasta_file
,
to_hand_back
,
new_old
,
vnames_list
):
"""
"""
Change fasta file:
Change fasta file:
- if sequences in to_hand_back: remove them
- if sequences in to_hand_back: remove them
- if sequence in virus_IDs: change its ID
- if sequence in virus_IDs: change its ID
to_hand_back = {seq: reason} seq with old name
to_hand_back = {seq: reason} seq with old name
vname_changes
= {new_id: orig_id}
new_old
= {new_id: orig_id}
vnames_list : list of new vnames
vnames_list : list of new vnames
"""
"""
problem_sequence_IDs
=
False
problem_sequence_IDs
=
False
remain_vnames_list
=
vnames_list
remain_vnames_list
=
vnames_list
change_vnames
=
{
old
:
new
for
new
,
old
in
vname_changes
.
items
()}
old_new
=
{
old
:
new
for
new
,
old
in
new_old
.
items
()}
init_nb_seq
=
0
# number of sequences in original fasta file
init_nb_seq
=
0
# number of sequences in original fasta file
final_nb_seq
=
0
# number of sequences in fasta file
final_nb_seq
=
0
# number of sequences in fasta file
...
@@ -64,8 +64,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
...
@@ -64,8 +64,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
elems
=
header_text
.
split
(
header
)
elems
=
header_text
.
split
(
header
)
# ex1: ["", "|virus other info"]
# ex1: ["", "|virus other info"]
# ex2: ["", "other info2"]
# ex2: ["", "other info2"]
# If header not found in vnames, nor in changed vnames, log error and skip line
# If header not found in vnames
(header is original name)
, nor in changed vnames, log error and skip line
if
header
not
in
vnames_list
and
header
not
in
change_vnames
:
if
header
not
in
vnames_list
and
header
not
in
old_new
:
logger
.
error
(
f
"
{
header
}
entry in fasta file does not correspond to any vname.
"
logger
.
error
(
f
"
{
header
}
entry in fasta file does not correspond to any vname.
"
"
This sequence will be removed in the curated fasta file.
"
)
"
This sequence will be removed in the curated fasta file.
"
)
problem_sequence_IDs
=
True
problem_sequence_IDs
=
True
...
@@ -84,8 +84,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
...
@@ -84,8 +84,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
to_write
=
False
to_write
=
False
# Now, remove header from remaining sequences.
# Now, remove header from remaining sequences.
# If header was changed, remove new name
# If header was changed, remove new name
if
header
in
change_vnames
:
if
header
in
old_new
:
header
=
change_vnames
[
header
]
header
=
old_new
[
header
]
remain_vnames_list
.
remove
(
header
)
remain_vnames_list
.
remove
(
header
)
# if header was not changed, remove curent name
# if header was not changed, remove curent name
else
:
else
:
...
@@ -93,8 +93,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
...
@@ -93,8 +93,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
continue
# go to next sequence, as this one will be removed
continue
# go to next sequence, as this one will be removed
# If header in changed vname, replace by new vname
# If header in changed vname, replace by new vname
if
header
in
change_vnames
:
if
header
in
old_new
:
header
=
change_vnames
[
header
]
header
=
old_new
[
header
]
remain_vnames_list
.
remove
(
header
)
remain_vnames_list
.
remove
(
header
)
# If header != ori_header, it means that it was in change_vnames. So, write new header
# If header != ori_header, it means that it was in change_vnames. So, write new header
...
@@ -115,8 +115,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
...
@@ -115,8 +115,8 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
# Write sequences missing in fasta file
# Write sequences missing in fasta file
if
remain_vnames_list
:
if
remain_vnames_list
:
for
vname
in
remain_vnames_list
:
for
vname
in
remain_vnames_list
:
if
vname
in
vname_changes
:
if
vname
in
new_old
:
logger
.
error
(
f
"
{
vname
}
sequence (previously called
{
vname_changes
[
vname
]
}
) is missing in fasta file.
"
logger
.
error
(
f
"
{
vname
}
sequence (previously called
{
new_old
[
vname
]
}
) is missing in fasta file.
"
"
This line will be removed in curated xls file.
"
)
"
This line will be removed in curated xls file.
"
)
problem_sequence_IDs
=
True
problem_sequence_IDs
=
True
else
:
else
:
...
@@ -133,14 +133,14 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
...
@@ -133,14 +133,14 @@ def cure_fasta(fasta_file, cur_fasta_file, to_hand_back, vname_changes, vnames_l
return
remain_vnames_list
,
init_nb_seq
,
final_nb_seq
return
remain_vnames_list
,
init_nb_seq
,
final_nb_seq
def
complete_xls
(
md
,
report
,
to_hand_back
,
vname_changes
,
cur_fasta_file
,
remain_vnames_list
):
def
complete_xls
(
md
,
report
,
to_hand_back
,
new_old
,
cur_fasta_file
,
remain_vnames_list
):
"""
"""
From curated metadatas and covsurver report, complete curated output file
From curated metadatas and covsurver report, complete curated output file
md : dataframe with all metadata curated
md : dataframe with all metadata curated
. vnames are the new ones
report : dataframe with comments and symbols to add to md
report : dataframe with comments and symbols to add to md
to_hand_back : dict {vname: reason}
to_hand_back : dict {vname: reason}
vname_changes
: dict {new_name: old_name}
new_old
: dict {new_name: old_name}
remain_vnames_list : list of xls vnames not found in fasta
remain_vnames_list : list of xls vnames not found in fasta
...
@@ -149,6 +149,7 @@ def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain
...
@@ -149,6 +149,7 @@ def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain
hb: dataframe with lines of xls metadata removed
hb: dataframe with lines of xls metadata removed
nb_removed : number of lines removed from xls (because hand back, or no fasta sequence)
nb_removed : number of lines removed from xls (because hand back, or no fasta sequence)
"""
"""
old_new
=
{
old
:
new
for
new
,
old
in
new_old
.
items
()}
init_nb
=
0
# number of lines in original xls file
init_nb
=
0
# number of lines in original xls file
nb_removed
=
0
# number of lines removed in curated xls file
nb_removed
=
0
# number of lines removed in curated xls file
# Create new dataframe, where handed-back lines are put.
# Create new dataframe, where handed-back lines are put.
...
@@ -182,26 +183,32 @@ def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain
...
@@ -182,26 +183,32 @@ def complete_xls(md, report, to_hand_back, vname_changes, cur_fasta_file, remain
if
"
FASTA
"
not
in
line
[
"
fn
"
]:
if
"
FASTA
"
not
in
line
[
"
fn
"
]:
line
[
"
fn
"
]
=
os
.
path
.
basename
(
cur_fasta_file
)
line
[
"
fn
"
]
=
os
.
path
.
basename
(
cur_fasta_file
)
# For each sequence to hand back, remove its line from md, and copy it to hand-back dataframe
# Check if vname (new vname) in remain_vnames_list -> if yes, remove line
for
vname
in
to_hand_back
:
# Remove lines which do not have a corresponding sequence in fasta file
for
vname
in
remain_vnames_list
:
# Remove line if it exists (not already removed by hand back for example)
line
=
md
.
loc
[
md
.
covv_virus_name
==
vname
]
line
=
md
.
loc
[
md
.
covv_virus_name
==
vname
]
# add line to handback dataframe
# remove line from initial xls file if it exists
if
not
line
.
empty
:
if
not
line
.
empty
:
hb
=
hb
.
append
(
line
)
nb_removed
+=
1
nb_removed
+=
1
out_md
=
out_md
.
drop
(
md
[
md
.
covv_virus_name
==
vname
].
index
)
out_md
=
out_md
.
drop
(
md
[
md
.
covv_virus_name
==
vname
].
index
)
else
:
to_hand_back
[
vname
]
+=
"
; no corresponding fasta sequence.
"
# Remove lines which do not have a corresponding sequence in fasta file
for
vname
in
remain_vnames_list
:
# For each sequence to hand back, remove its line from md, and copy it to hand-back dataframe
# Remove line if it exists (not already removed by hand back for example)
# hb_vname = vname in to_hand_back list, = original name
for
hb_vname
in
to_hand_back
:
# if hb_name has been changed, remove xls entry of corresponding new name
vname
=
hb_vname
if
hb_vname
in
old_new
:
vname
=
old_new
[
hb_vname
]
line
=
md
.
loc
[
md
.
covv_virus_name
==
vname
]
line
=
md
.
loc
[
md
.
covv_virus_name
==
vname
]
# add line to handback dataframe
# remove line from initial xls file if it exists
if
not
line
.
empty
:
if
not
line
.
empty
:
hb
=
hb
.
append
(
line
)
nb_removed
+=
1
nb_removed
+=
1
out_md
=
out_md
.
drop
(
md
[
md
.
covv_virus_name
==
vname
].
index
)
out_md
=
out_md
.
drop
(
md
[
md
.
covv_virus_name
==
vname
].
index
)
return
out_md
,
hb
,
init_nb
,
nb_removed
return
out_md
,
hb
,
init_nb
,
nb_removed
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment