Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
panacota
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Amandine PERRIN
panacota
Commits
5b411ee1
Commit
5b411ee1
authored
5 years ago
by
Amandine PERRIN
Browse files
Options
Downloads
Patches
Plain Diff
Add column to lstinfo
Before, only gembase name and original name. Now, name of sequence annotated
parent
66d300bc
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
PanACoTA/annotate_module/genome_seq_functions.py
+26
-19
26 additions, 19 deletions
PanACoTA/annotate_module/genome_seq_functions.py
PanACoTA/utils.py
+25
-11
25 additions, 11 deletions
PanACoTA/utils.py
with
51 additions
and
30 deletions
PanACoTA/annotate_module/genome_seq_functions.py
+
26
−
19
View file @
5b411ee1
...
...
@@ -46,7 +46,7 @@ def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, prodigal_only, logger, q
Returns
-------
dict
{genome: [spegenus.date,
path
, size, nbcont, l90]}
{genome: [spegenus.date,
orig_name, path_to_seq_to_annotate
, size, nbcont, l90]}
"""
cut
=
nbn
>
0
...
...
@@ -113,7 +113,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l
pattern on which contigs must be cut. ex:
"
NNNNN
"
genomes : dict
{genome: [spegenus.date]} as input, and will be changed to
\
-> {genome: [spegenus.date, path, gsize, nbcont, L90]}
-> {genome: [spegenus.date, path,
path_annotate,
gsize, nbcont, L90]}
prodigal_only : bool
True if we annotate with prodigal, False if we annotate with prokka
logger : logging.Logger
...
...
@@ -143,7 +143,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l
#### NEW CONTIG
# Line corresponding to a new contig
if
line
.
startswith
(
"
>
"
):
# If not first contig, write info to output file (
o
f needed)
# If not first contig, write info to output file (
i
f needed)
if
cur_seq
!=
""
:
num
=
format_contig
(
cut
,
pat
,
cur_seq
,
cur_contig_name
,
contig_sizes
,
gresf
,
num
,
logger
)
...
...
@@ -179,11 +179,13 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l
if
not
l90
:
logger
.
error
(
"
Problem with genome {}. Not possible to get L90
"
.
format
(
genome
))
return
False
# Everything ok for this genome -> complete its list of information in genomes dict
# [spegenus.date] -> [spegenus.date, gpath, gpath_to_annotate, gsize, nbcont, L90]}
else
:
if
grespath
:
genomes
[
genome
]
+=
[
grespath
,
gsize
,
nbcont
,
l90
]
genomes
[
genome
]
+=
[
gpath
,
grespath
,
gsize
,
nbcont
,
l90
]
else
:
genomes
[
genome
]
+=
[
gpath
,
gsize
,
nbcont
,
l90
]
genomes
[
genome
]
+=
[
gpath
,
gpath
,
gsize
,
nbcont
,
l90
]
# If we wrote a new sequence file, close it
if
grespath
:
gresf
.
close
()
...
...
@@ -333,7 +335,7 @@ def split_contig(pat, whole_seq, cur_contig_name, contig_sizes, gresf, num):
if
len
(
seq
)
==
0
:
continue
new_contig_name
=
"
{}_{}
\n
"
.
format
(
cur_contig_name
,
num
)
contig_sizes
[
cur
_name
]
=
len
(
seq
)
contig_sizes
[
new_contig
_name
]
=
len
(
seq
)
gresf
.
write
(
new_contig_name
)
gresf
.
write
(
seq
+
"
\n
"
)
num
+=
1
...
...
@@ -371,20 +373,32 @@ def rename_all_genomes(genomes):
Parameters
----------
genomes : dict
{genome: [name, path, gsize, nbcont, L90]} as input, and will become
\
{genome: [gembase_name, path, gsize, nbcont, L90]} at the end
{genome: [name, path, path_to_seq, gsize, nbcont, L90]} as input, and will become
\
{genome: [gembase_name, path, path_to_seq, gsize, nbcont, L90]} at the end
Return
------
change 1st field of genomes dict. name -> gembase_name (with strain number)
"""
logger
.
info
(
"
Renaming kept genomes according to their quality.
"
)
# Keep previous genome name (ESCO.0109 -> ESCO)
last_name
=
""
# Keep last strain number
last_strain
=
0
# "SAEN.1015.{}".format(str(last_strain).zfill(5))
for
genome
,
[
name
,
_
,
_
,
_
,
_
]
in
sorted
(
genomes
.
items
(),
key
=
sort_genomes
):
# Sort genomes by species, L90 and nb_contigs
for
genome
,
[
name
,
_
,
_
,
_
,
_
,
_
]
in
sorted
(
genomes
.
items
(),
key
=
sort_genomes
):
# first genome, or new strain name (ex: ESCO vs EXPL)
# -> keep this new name, and add 1 to next strain number
if
last_name
!=
name
.
split
(
"
.
"
)[
0
]:
last_strain
=
1
last_name
=
name
.
split
(
"
.
"
)[
0
]
# same strain name
# -> write this new sequence, and go to next one (strain += 1)
else
:
last_strain
+=
1
# Write information to "genomes" dict.
gembase_name
=
"
.
"
.
join
([
name
,
str
(
last_strain
).
zfill
(
5
)])
genomes
[
genome
][
0
]
=
gembase_name
...
...
@@ -417,7 +431,7 @@ def plot_distributions(genomes, res_path, listfile_base, l90, nbconts):
Parameters
----------
genomes : dict
{genome: [name, path, size, nbcont, l90]}
{genome: [name,
orig_path, to_annotate_
path, size, nbcont, l90]}
res_path : str
path to put all output files
listfile_base : str
...
...
@@ -436,18 +450,11 @@ def plot_distributions(genomes, res_path, listfile_base, l90, nbconts):
- dist1 : matplotlib figure of distribution of L90 values
- dist2 : matplotlib figure of distribution of nb contigs values
"""
"""
genomes:
res_path:
listfile_base:
l90: max value of l90 allowed
nbconts: max value of nb contigs allowed
"""
logger
.
info
(
"
Generating distribution of L90 and #contigs graphs.
"
)
l90_vals
=
[
val
for
_
,
(
_
,
_
,
_
,
_
,
val
)
in
genomes
.
items
()]
l90_vals
=
[
val
for
_
,
(
_
,
_
,
_
,
_
,
_
,
val
)
in
genomes
.
items
()]
outl90
=
os
.
path
.
join
(
res_path
,
"
QC_L90-
"
+
listfile_base
+
"
.png
"
)
nbcont_vals
=
[
val
for
_
,
(
_
,
_
,
_
,
val
,
_
)
in
genomes
.
items
()]
nbcont_vals
=
[
val
for
_
,
(
_
,
_
,
_
,
_
,
val
,
_
)
in
genomes
.
items
()]
outnbcont
=
os
.
path
.
join
(
res_path
,
"
QC_nb-contigs-
"
+
listfile_base
+
"
.png
"
)
dist1
=
utils
.
plot_distr
(
l90_vals
,
l90
,
"
L90 distribution for all genomes
"
,
"
max L90 =
"
)
...
...
This diff is collapsed.
Click to expand it.
PanACoTA/utils.py
+
25
−
11
View file @
5b411ee1
...
...
@@ -395,11 +395,12 @@ def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False):
Parameters
----------
genomes : dict
{genome: [gembase_start_name, seq_file, genome_size, nb_contigs, L90]}
{genome: [gembase_start_name, orig_seq_file, to_annotate_seq_file,
genome_size, nb_contigs, L90]}
kept_genomes : list
list of genomes kept
list_file : str
input file containing the list of genomes
path to
input file containing the list of genomes
res_path : str
folder where results must be saved
qc : bool
...
...
@@ -408,40 +409,46 @@ def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False):
"""
logger
=
logging
.
getLogger
(
"
utils
"
)
# number of genomes discarded
nb_disc
=
len
(
genomes
)
-
len
(
kept_genomes
)
# Log number of genomes discarded.
if
not
qc
and
nb_disc
<
2
:
logger
.
info
(
"
{} genome was discarded.
"
.
format
(
nb_disc
))
elif
not
qc
:
logger
.
info
(
"
{} genomes were discarded.
"
.
format
(
nb_disc
))
# Get input list file name (without path)
_
,
name_lst
=
os
.
path
.
split
(
list_file
)
# if not QC, write discarded genomes to a file "discarded-[list_file].lst"
if
not
qc
:
outdisc
=
os
.
path
.
join
(
res_path
,
"
discarded-
"
+
"
.
"
.
join
(
name_lst
.
split
(
"
.
"
)[:
-
1
])
+
"
.lst
"
)
logger
.
info
(
"
Writing discarded genomes to {}
"
.
format
(
outdisc
))
# if QC, there is no 'discarded genome', just write information on all analyzed genomes
else
:
outdisc
=
os
.
path
.
join
(
res_path
,
"
info-genomes-
"
+
"
.
"
.
join
(
name_lst
.
split
(
"
.
"
)[:
-
1
])
+
"
.lst
"
)
logger
.
info
(
"
Writ
t
ing information on genomes in {}
"
.
format
(
outdisc
))
logger
.
info
(
"
Writing information on genomes in {}
"
.
format
(
outdisc
))
with
open
(
outdisc
,
"
w
"
)
as
outdf
:
outdf
.
write
(
"
\t
"
.
join
([
"
orig_name
"
,
"
gsize
"
,
"
nb_conts
"
,
"
L90
"
])
+
"
\n
"
)
for
genome
,
values
in
genomes
.
items
():
if
genome
in
kept_genomes
:
continue
_
,
_
,
gsize
,
nbcont
,
l90
=
[
str
(
x
)
for
x
in
values
]
_
,
_
,
_
,
gsize
,
nbcont
,
l90
=
[
str
(
x
)
for
x
in
values
]
outdf
.
write
(
"
\t
"
.
join
([
genome
,
gsize
,
nbcont
,
l90
])
+
"
\n
"
)
def
write_lstinfo
(
list_file
,
genomes
,
outdir
):
"""
Write lstinfo file, with following columns:
gembase_name, orig_name, size, nbcontigs, l90
gembase_name, orig_name,
to_annotate_name,
size, nbcontigs, l90
Parameters
----------
list_file : str
input file containing the list of genomes
genomes : dict
{genome: [gembase_start_name, seq_file, genome_size, nb_contigs, L90]}
{genome: [gembase_start_name, seq_file,
seq_to_annotate,
genome_size, nb_contigs, L90]}
outdir : str
folder where results must be saved
...
...
@@ -450,10 +457,12 @@ def write_lstinfo(list_file, genomes, outdir):
outlst
=
os
.
path
.
join
(
outdir
,
"
LSTINFO-
"
+
"
.
"
.
join
(
name_lst
.
split
(
"
.
"
)[:
-
1
])
+
"
.lst
"
)
with
open
(
outlst
,
"
w
"
)
as
outf
:
outf
.
write
(
"
\t
"
.
join
([
"
gembase_name
"
,
"
orig_name
"
,
"
gsize
"
,
"
nb_conts
"
,
"
L90
"
])
+
"
\n
"
)
outf
.
write
(
"
\t
"
.
join
([
"
gembase_name
"
,
"
orig_name
"
,
"
to_annotate
"
,
"
gsize
"
,
"
nb_conts
"
,
"
L90
"
])
+
"
\n
"
)
for
genome
,
values
in
sorted
(
genomes
.
items
(),
key
=
sort_genomes
):
gembase
,
_
,
gsize
,
nbcont
,
l90
=
[
str
(
x
)
for
x
in
values
]
outf
.
write
(
"
\t
"
.
join
([
gembase
,
genome
,
gsize
,
nbcont
,
l90
])
+
"
\n
"
)
gembase
,
_
,
to_annote
,
gsize
,
nbcont
,
l90
=
[
str
(
x
)
for
x
in
values
]
to_annote_file
=
os
.
path
.
basename
(
to_annote
)
outf
.
write
(
"
\t
"
.
join
([
gembase
,
genome
,
to_annote_file
,
gsize
,
nbcont
,
l90
])
+
"
\n
"
)
def
sort_genomes
(
x
):
...
...
@@ -571,6 +580,7 @@ def read_genomes(list_file, name, date, dbpath, tmp_path):
genomes_inf
=
genomes_inf
.
strip
()
cur_name
,
cur_date
=
read_info
(
name_inf
,
name
,
date
,
genomes_inf
)
# If no separator '::', no information on name and date of genome: use default ones
# Line only contains genome name (filename in given db_path)
else
:
genomes_inf
=
line
.
strip
()
cur_name
=
name
...
...
@@ -640,7 +650,9 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path):
Returns
-------
dict
genomes = {genome: [spegenus.date, path_to_splitSequence, size, nbcont, l90]}
genomes = {genome:
[spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90]
}
"""
logger
=
logging
.
getLogger
(
"
utils
"
)
logger
.
info
(
"
Reading given information on your genomes
"
)
...
...
@@ -690,7 +702,9 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path):
logger
.
warning
((
"
{} genome file does not exist in the given database.
"
"
It will be ignored.
"
.
format
(
gpath
)))
continue
genomes
[
gname
]
=
[
spegenus
,
gpath
,
gsize
,
gcont
,
gl90
]
# cur genome information to save:
# [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90]
genomes
[
gname
]
=
[
spegenus
,
gpath
,
gpath
,
gsize
,
gcont
,
gl90
]
return
genomes
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment