Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
panacota
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Amandine PERRIN
panacota
Commits
d50d556e
Commit
d50d556e
authored
6 years ago
by
Amandine PERRIN
Browse files
Options
Downloads
Patches
Plain Diff
generates gff file with gembase names
parent
98b81ec5
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
PanACoTA/annotate_module/format_prodigal.py
+36
-18
36 additions, 18 deletions
PanACoTA/annotate_module/format_prodigal.py
PanACoTA/utils.py
+2
-2
2 additions, 2 deletions
PanACoTA/utils.py
with
38 additions
and
20 deletions
PanACoTA/annotate_module/format_prodigal.py
+
36
−
18
View file @
d50d556e
...
@@ -269,9 +269,9 @@ def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, na
...
@@ -269,9 +269,9 @@ def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, na
"""
"""
contigs
=
create_replicons
(
gpath
,
res_rep_file
,
name
)
contigs
=
create_replicons
(
gpath
,
res_rep_file
,
name
)
#
if contigs:
if
contigs
:
#
return create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logger)
return
create_gff
(
gpath
,
gff_file
,
res_gff_file
,
res_lst_file
,
name
,
contigs
,
logger
)
return
contigs
return
False
def
create_replicons
(
gpath
,
res_rep_file
,
name
):
def
create_replicons
(
gpath
,
res_rep_file
,
name
):
...
@@ -348,19 +348,27 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
...
@@ -348,19 +348,27 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
# open file to write new gff file
# open file to write new gff file
# open lst file to read all information saved from prodigal results
# open lst file to read all information saved from prodigal results
with
open
(
gff_file
,
'
r
'
)
as
gf
,
open
(
res_gff_file
,
"
w
"
)
as
rgf
,
open
(
res_lst_file
,
"
r
"
)
as
rlf
:
with
open
(
gff_file
,
'
r
'
)
as
gf
,
open
(
res_gff_file
,
"
w
"
)
as
rgf
,
open
(
res_lst_file
,
"
r
"
)
as
rlf
:
# Write header of gff3 file
# Write header
s
of gff3 file
rgf
.
write
(
"
##gff-version 3
\n
"
)
rgf
.
write
(
"
##gff-version 3
\n
"
)
for
contig
in
contigs
:
for
contig
in
contigs
:
# Write the list of contigs, with their size
# Write the list of contigs, with their size
cont_name
,
end
=
contig
.
split
(
"
\t
"
)
cont_name
,
end
=
contig
.
split
(
"
\t
"
)
rgf
.
write
(
"
##sequence-region
\t
{}
\t
{}
\t
{}
"
.
format
(
cont_name
[
1
:],
"
1
"
,
end
))
rgf
.
write
(
"
##sequence-region
\t
{}
\t
{}
\t
{}
"
.
format
(
cont_name
[
1
:],
"
1
"
,
end
))
# Create an iterator of contig names, that will be used to get contig name
# corresponding to a given gene name (in lst, we only have gene name,
# but in gff we need corresponding contig name)
iter_contig
=
iter
(
contigs
)
contig
=
next
(
iter_contig
)
cname
=
contig
.
split
(
"
\t
"
)[
0
][
1
:]
# Now, convert each line of prodigal gff to gembase formatted gff line
for
linegff
in
gf
:
for
linegff
in
gf
:
iter_contig
=
iter
(
contigs
)
contig
=
next
(
iter_contig
)
# Ignore gff lines starting by #. For the new gff, these are already all written at the
# Ignore gff lines starting by #. For the new gff, these are already all written at the
# beginning of the file.
# beginning of the file.
if
linegff
.
startswith
(
"
#
"
):
if
linegff
.
startswith
(
"
#
"
):
continue
continue
# Get all information from prodigal gff line. Strip each of them as trailing whitespace
# Get all information from prodigal gff line. Strip each of them as trailing whitespace
# can be hidden (leading to information from gff considered as different from
# can be hidden (leading to information from gff considered as different from
# information from lst)
# information from lst)
...
@@ -375,8 +383,16 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
...
@@ -375,8 +383,16 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
fields_l
=
linelst
.
strip
().
split
(
"
\t
"
)
fields_l
=
linelst
.
strip
().
split
(
"
\t
"
)
fields_l
=
[
info
.
strip
()
for
info
in
fields_l
]
fields_l
=
[
info
.
strip
()
for
info
in
fields_l
]
start_l
,
end_l
,
strand_l
,
type_l
,
locus_l
,
_
,
_
=
fields_l
start_l
,
end_l
,
strand_l
,
type_l
,
locus_l
,
_
,
_
=
fields_l
cname
=
contig
.
split
(
"
\t
"
)[
0
][
1
:]
# Get contig name 'cname'.
# -> if same contig as contig of previous gene -> cname does not change
# -> if not same contig, cname not in contig, so get next contig name
# (iterator of contigs created while writing replicon file)
# We use while, instead of just taking the next contig, because
# there could be a contig existing in replicon but not having any gene.
while
cname
not
in
locus_l
:
contig
=
next
(
iter_contig
)
cname
=
contig
.
split
(
"
\t
"
)[
0
][
1
:]
# Get gff and ffn filenames to give information to user if error message
# Get gff and ffn filenames to give information to user if error message
gff
=
os
.
path
.
basename
(
gff_file
)
gff
=
os
.
path
.
basename
(
gff_file
)
...
@@ -394,18 +410,20 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
...
@@ -394,18 +410,20 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
# If 1 element is different (start or end position, or type), print error
# If 1 element is different (start or end position, or type), print error
# message and return False: this genome could not be converted to gff
# message and return False: this genome could not be converted to gff
if
elemg
!=
eleml
:
if
elemg
!=
eleml
:
logger
.
error
(
"
Files {} and {} (in prodigal tmp_files: {}) do not have
the same {} value for gene {}
"
logger
.
error
(
"
Files {} and {} (in prodigal tmp_files: {}) do not have
"
"
({} in gff, {} in ffn))
"
.
format
(
ffn
,
gff
,
tmp
,
label
,
"
the same {} value for gene {} ({} in gff, {} in
"
gname
,
elemg
,
eleml
))
"
ffn))
"
.
format
(
ffn
,
gff
,
tmp
,
label
,
gname
,
elemg
,
eleml
))
return
False
return
False
# Replace prodigal ID with the new gene name (in gembase format), found in the
# corresponding lst line
# Replace prodigal ID with the new gene name (in gembase format), found in the
at_split
=
attributes
.
split
(
"
;
"
)
# corresponding lst line
new
=
[
atr
if
"
ID
"
not
in
atr
else
'
ID={}
'
.
format
(
locus_l
)
for
atr
in
at_split
]
at_split
=
attributes
.
split
(
"
;
"
)
# Get information to put to the new gff line
new
=
[
atr
if
"
ID
"
not
in
atr
else
'
ID={}
'
.
format
(
locus_l
)
for
atr
in
at_split
]
# Get contig name without '>'
info
=
"
\t
"
.
join
([
cname
,
source
,
type_g
,
start_g
,
end_g
,
score
,
strand_g
,
phase
,
"
;
"
.
join
(
new
)])
# Write new line of gff file
rgf
.
write
(
info
+
"
\n
"
)
info
=
"
\t
"
.
join
([
cname
,
source
,
type_g
,
start_g
,
end_g
,
score
,
strand_g
,
phase
,
"
;
"
.
join
(
new
)])
rgf
.
write
(
info
+
"
\n
"
)
return
True
return
True
...
...
This diff is collapsed.
Click to expand it.
PanACoTA/utils.py
+
2
−
2
View file @
d50d556e
...
@@ -896,7 +896,7 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
...
@@ -896,7 +896,7 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
if
prev_cont
:
if
prev_cont
:
cont
=
"
\t
"
.
join
([
prev_cont
,
str
(
cont_size
)])
+
"
\n
"
cont
=
"
\t
"
.
join
([
prev_cont
,
str
(
cont_size
)])
+
"
\n
"
contigs
.
append
(
cont
)
contigs
.
append
(
cont
)
grf
.
write
(
"
\t
"
.
join
([
prev_cont
,
str
(
cont_size
)])
+
"
\n
"
)
grf
.
write
(
cont
)
grf
.
write
(
seq
)
grf
.
write
(
seq
)
prev_cont
=
"
>
"
+
gembase_name
+
"
.
"
+
str
(
contig_num
).
zfill
(
4
)
prev_cont
=
"
>
"
+
gembase_name
+
"
.
"
+
str
(
contig_num
).
zfill
(
4
)
contig_num
+=
1
contig_num
+=
1
...
@@ -909,7 +909,7 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
...
@@ -909,7 +909,7 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
# Write last contig
# Write last contig
cont
=
"
\t
"
.
join
([
prev_cont
,
str
(
cont_size
)])
+
"
\n
"
cont
=
"
\t
"
.
join
([
prev_cont
,
str
(
cont_size
)])
+
"
\n
"
contigs
.
append
(
cont
)
contigs
.
append
(
cont
)
grf
.
write
(
"
\t
"
.
join
([
prev_cont
,
str
(
cont_size
)])
+
"
\n
"
)
grf
.
write
(
cont
)
grf
.
write
(
seq
)
grf
.
write
(
seq
)
return
contigs
return
contigs
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment