Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
panacota
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Amandine PERRIN
panacota
Commits
7ca4d503
Commit
7ca4d503
authored
6 years ago
by
Amandine PERRIN
Browse files
Options
Downloads
Patches
Plain Diff
little improvements for lst/gen generation
parent
44208646
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
PanACoTA/annotate_module/format_prodigal.py
+25
-15
25 additions, 15 deletions
PanACoTA/annotate_module/format_prodigal.py
PanACoTA/utils.py
+14
-3
14 additions, 3 deletions
PanACoTA/utils.py
with
39 additions
and
18 deletions
PanACoTA/annotate_module/format_prodigal.py
+
25
−
15
View file @
7ca4d503
...
...
@@ -5,7 +5,7 @@
Functions to convert prodigal result files to gembase format.
* Proteins: multifasta with all CDS in aa
* Replicons: multifasta of genome
* Replicons:
(
multi
)
fasta of genome
sequences
* Genes: multifasta of all genes in nuc
* gff3: gff files without sequence
* LSTINFO: information on annotation. Columns are:
"
start end strand type locus
...
...
@@ -42,7 +42,8 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
name : str
gembase name of the genome
prod_path : str
directory where prodigal folders are saved
directory where all tmp_files for all sequences are saved (sequences cut at each set
of 5N, prodigal results and logs)
name : str
gembase name of the genome
lst_dir : srt
...
...
@@ -63,11 +64,15 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
bool :
True if genome was correctly formatted, False otherwise
"""
# Get directory where prodigal results for the current genome are saved
prodigal_dir
=
os
.
path
.
join
(
prod_path
,
os
.
path
.
basename
(
gpath
)
+
"
-prodigalRes
"
)
# Get prodigal result files
prot_file
=
os
.
path
.
join
(
prodigal_dir
,
name
+
"
.faa
"
)
gen_file
=
os
.
path
.
join
(
prodigal_dir
,
name
+
"
.ffn
"
)
gff_file
=
os
.
path
.
join
(
prodigal_dir
,
name
+
"
.gff
"
)
# Define names for generated gembase files
res_prot_file
=
os
.
path
.
join
(
prot_dir
,
name
+
"
.prt
"
)
res_gene_file
=
os
.
path
.
join
(
gene_dir
,
name
+
"
.gen
"
)
res_lst_file
=
os
.
path
.
join
(
lst_dir
,
name
+
"
.lst
"
)
...
...
@@ -75,15 +80,16 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
res_gff_file
=
os
.
path
.
join
(
gff_dir
,
name
+
"
.gff
"
)
# First, create gen and lst files,
and get genome contig names
# First, create
.
gen and
.
lst files,
ok
=
create_gene_lst
(
gen_file
,
res_gene_file
,
res_lst_file
,
name
,
logger
)
if
not
ok
:
os
.
remove
(
res_gene_file
)
os
.
remove
(
res_lst_file
)
logger
.
error
(
"
Problems while generating gen
e
and lst files for {}
"
.
format
(
name
))
logger
.
error
(
"
Problems while generating
.
gen and
.
lst files for {}
"
.
format
(
name
))
return
False
# Create replicon file. Same as input sequence but with gembase formatted headers
# From Replicon file, get contig names, and use them to generate gff file
ok
=
create_rep_gff
(
gpath
,
res_rep_file
,
res_gff_file
,
res_lst_file
,
gff_file
,
name
,
logger
)
# ok = create_prt(prot_file, res_prot_file, res_lst_file, logger)
...
...
@@ -94,14 +100,14 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
"""
Generate .gen file, from sequences contained in .ffn, but changing the
headers to match with gembase format.
At the same time, generate lst file
At the same time, generate
.
lst file
, from the information given in prodigal ffn headers
Parameters
----------
gen_file : str
.ffn file generated by prodigal
res_gene_file : str
output file, to write in Genes directory
output
.gen
file, to write in Genes directory
res_lst_file : str
.lst file to write in LSTINFO directory
name : str
...
...
@@ -115,7 +121,7 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
True if conversion went well, False otherwise
"""
# Variable which will contain the current sequence
# Variable which will contain the current
gene
sequence
seq
=
""
# number of the current gene (first gene is 1, 2nd is 2 etc. each number is unique: do not
# re-start at 1 for each new contig)
...
...
@@ -167,6 +173,7 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
prev_end
=
end
prev_cont_num
=
contig_num
prev_annot
=
annot
# Convert strand 1/-1 to D/C
if
int
(
strand
)
==
1
:
strand
=
"
D
"
else
:
...
...
@@ -261,7 +268,7 @@ def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, na
path to the genome sequence
res_rep_file : str
path to the new file, containing
'
gpath
'
sequence, but with
'
gembase_name
'
in headers
res
-
gff_file : str
res
_
gff_file : str
path to the gff file that must be created in result database
res_lst_file : str
path to the lst file that was created in result database in the previous step
...
...
@@ -274,13 +281,14 @@ def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, na
Returns
-------
list
:
bool
:
True if everything went well, False otherwise
"""
contigs
=
create_replicons
(
gpath
,
res_rep_file
,
name
)
if
contigs
:
return
create_gff
(
gpath
,
gff_file
,
res_gff_file
,
res_lst_file
,
name
,
contigs
,
logger
)
logger
.
info
(
contigs
)
# if contigs:
# return create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logger)
def
create_replicons
(
gpath
,
res_rep_file
,
name
):
...
...
@@ -294,18 +302,20 @@ def create_replicons(gpath, res_rep_file, name):
----------
gpath : str
path to the genome sequence
res_rep_file : str
path to the new file, containing
'
gpath
'
sequence, but with
'
gembase_named
'
headers
name : str
genome name to use (species.date.strain)
res_rep_file : str
path to the new file, containing
'
gpath
'
sequence, but with
'
gembase_name
'
in headers
Returns
-------
list :
List of all contigs with their size [
"
contig1;size1
"
,
"
contig2;size2
"
...]
"""
# Change headers to put into gembase format
# Change headers
of replicon file
to put into gembase format
, and save it in res_rep_file
contigs
=
utils
.
get_genome_contigs_and_rename
(
name
,
gpath
,
res_rep_file
)
# Return each contig name with its size.
return
contigs
...
...
@@ -349,7 +359,7 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
Returns
-------
bool
True if everythin went well, False if any problem
True if everythin
g
went well, False if any problem
"""
# open gff generated by prodigal to read it
...
...
This diff is collapsed.
Click to expand it.
PanACoTA/utils.py
+
14
−
3
View file @
7ca4d503
...
...
@@ -850,7 +850,10 @@ def check_out_dirs(resdir):
def
get_genome_contigs_and_rename
(
gembase_name
,
gpath
,
outfile
):
"""
For the given genome (sequence in gpath), rename all its contigs
with the new name:
'
gembase_name
'
, and save the output sequence in outfile
with the new name:
'
gembase_name
'
, and save the output sequence in outfile.
For each contig renamed, save its new name as well as its size. This will be used to generate
gff files
Parameters
----------
...
...
@@ -861,22 +864,30 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
outfile : str
path to the new file, containing
'
gpath
'
sequence, but with
'
gembase_name
'
in headers
Returns
-------
list
List of all contigs with their size [
"
contig1;size1
"
,
"
contig2;size2
"
...]
"""
# Initialize variables
contig_num
=
1
cont_size
=
0
contigs
=
[]
# Read input sequence given to prodigal, and open file where sequences with new
# headers must be written.
with
open
(
gpath
,
"
r
"
)
as
gpf
,
open
(
outfile
,
"
w
"
)
as
grf
:
for
line
in
gpf
:
# When we find a new header line, convert its name to gembase format, and write it
# to
the list of contigs, as well as its size (calculated before)
# to
output replicon file
if
line
.
startswith
(
"
>
"
)
:
new_cont
=
"
>
"
+
gembase_name
+
"
.
"
+
str
(
contig_num
).
zfill
(
4
)
grf
.
write
(
new_cont
+
"
\n
"
)
# If first contig
, just convert its name, but no size ye
t
# If
not
first contig
(size != 0), add its name as well as its size to contigs lis
t
if
cont_size
!=
0
:
contigs
.
append
(
"
{};{}
"
.
format
(
new_cont
,
cont_size
))
contig_num
+=
1
cont_size
=
0
# Sequence line: write it as is in replicon file, and add its size to cont_size.
else
:
grf
.
write
(
line
)
cont_size
+=
len
(
line
.
strip
())
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment