Select Git revision
format_prodigal.py
Amandine PERRIN authored
format_prodigal.py 22.40 KiB
#!/usr/bin/env python3
# coding: utf-8
# ###############################################################################
# This file is part of PanACOTA. #
# #
# Authors: Amandine Perrin #
# Copyright © 2018-2020 Institut Pasteur (Paris). #
# See the COPYRIGHT file for details. #
# #
# PanACOTA is a software providing tools for large scale bacterial comparative #
# genomics. From a set of complete and/or draft genomes, you can: #
# - Do a quality control of your strains, to eliminate poor quality #
# genomes, which would not give any information for the comparative study #
# - Uniformly annotate all genomes #
# - Do a Pan-genome #
# - Do a Core or Persistent genome #
# - Align all Core/Persistent families #
# - Infer a phylogenetic tree from the Core/Persistent families #
# #
# PanACOTA is free software: you can redistribute it and/or modify it under the #
# terms of the Affero GNU General Public License as published by the Free #
# Software Foundation, either version 3 of the License, or (at your option) #
# any later version. #
# #
# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
# for more details. #
# #
# You should have received a copy of the Affero GNU General Public License #
# along with PanACOTA (COPYING file). #
# If not, see <https://www.gnu.org/licenses/>. #
# ###############################################################################
"""
Functions to convert prodigal result files to gembase format.
* Proteins: multifasta with all CDS in aa
* Replicons: (multi)fasta of genome sequences
* Genes: multifasta of all genes in nuc
* gff3: gff files without sequence
* LSTINFO: information on annotation. Columns are: "start end strand type locus
gene_name | product | EC_number | inference2" and strain is C (complement) or D (direct).
Locus is: `<genome_name>.<contig_num><i or b>_<protein_num>`
For prodigal: "start end strand type locus NA | NA | NA | NA", as there is no
functional annotation.
@author gem
July 2019
"""
import os
import shutil
import logging
import PanACoTA.utils as utils
import PanACoTA.annotate_module.general_format_functions as gfunc
logger = logging.getLogger("annotate.prodigal_format")
def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
rep_dir, gff_dir):
"""
Format the given genome, and create its corresponding files in the following folders:
- Proteins
- Genes
- Replicons
- LSTINFO
- gff
Parameters
----------
gpath : str
path to the genome sequence which was given to prodigal for annotation
name : str
gembase name of the genome
prod_path : str
directory where all tmp_files for all sequences are saved (sequences cut at each set
of 5N, prodigal results and logs)
lst_dir : str
path to LSTINFO folder
prot_dir : str
path to Proteins folder
gene_dir : str
path to Genes folder
rep_dir : str
path to Replicons folder
gff_dir : str
path to gff3 folder
Returns
-------
bool :
True if genome was correctly formatted, False otherwise
"""
# Get directory where prodigal results for the current genome are saved
prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes")
# Get prodigal result files
prot_file = os.path.join(prodigal_dir, name + ".faa")
gen_file = os.path.join(prodigal_dir, name + ".ffn")
gff_file = os.path.join(prodigal_dir, name + ".gff")
# Define names for generated gembase files
res_prot_file = os.path.join(prot_dir, name + ".prt")
res_gene_file = os.path.join(gene_dir, name + ".gen")
res_lst_file = os.path.join(lst_dir, name + ".lst")
res_rep_file = os.path.join(rep_dir, name + ".fna")
res_gff_file = os.path.join(gff_dir, name + ".gff")
# Generate replicon file (same as input sequence but with gembase formatted headers). From
# this file, get contig names, to be used to generate gff file
contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file)
if not contigs:
try:
os.remove(res_rep_file)
os.remove(res_lst_file)
os.remove(res_gff_file)
os.remove(res_gene_file)
os.remove(res_prot_file)
except OSError:
pass
logger.error("Problems while generating Replicon file for {}".format(name))
return False
# First, create .gen and .lst files. If they could not be formatted,
# remove those files, and return False with error message
ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, gpath, name)
if not ok:
try:
os.remove(res_rep_file)
os.remove(res_gene_file)
os.remove(res_lst_file)
os.remove(res_gff_file)
os.remove(res_prot_file)
except OSError:
pass
logger.error(f"Problems while generating .gen and .lst files for {name}")
return False
# Create gff files.
ok = create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes)
# If problem while formatting the genome (rep or gff file), remove all
# already created files, and return False (genome not formatted) with error message.
if not ok:
try:
os.remove(res_gene_file)
os.remove(res_lst_file)
os.remove(res_rep_file)
os.remove(res_gff_file)
os.remove(res_prot_file)
except OSError:
pass
logger.error("Problems while generating .gff (gff3 folder) "
"file for {}".format(name))
return False
# Generate .prt files (in Proteins directory)
ok = create_prt(prot_file, res_prot_file, res_lst_file)
# If problem while formatting prt file, return False, delete all generated
# formatted files, and write an error message to user.
if not ok:
try:
os.remove(res_gene_file)
os.remove(res_lst_file)
os.remove(res_rep_file)
os.remove(res_gff_file)
os.remove(res_prot_file)
except OSError:
pass
logger.error("Problems while generating .prt file (Proteins folder) "
"for {}".format(name))
return False
return ok
def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):
"""
Generate .gen file, from sequences contained in .ffn, but changing the
headers to match with gembase format.
At the same time, generate .lst file, from the information given in prodigal ffn headers
Parameters
----------
contigs : dict
{original_contig_name: gembase_contig_name}
gen_file : str
.ffn file generated by prodigal
res_gen_file : str
generated .gen file, to write in Genes directory
res_lst_file : str
generated .lst file to write in LSTINFO directory
gpath : str
path to the genome given to prodigal. Only used for error message
name : str
gembase name of the genome to format
logger : logging.Logger
logger object to put information
Returns
-------
bool :
True if conversion went well, False otherwise
"""
# Variable which will contain the current gene sequence
seq = ""
# number of the current gene (first gene is 1, 2nd is 2 etc. each number is unique: do not
# re-start at 1 for each new contig)
locus_num = 0
# contig name of the last gene. To check if we are now in a new contig (-> loc = b) or not
prev_cont_name = ""
# Previous ontig number: contig number to use in gembase format
prev_cont_num = 0
contig_num = 0
# Keep start, end, strand and informations (prodigal gives information on start_type,
# gc_cont etc.) from the previous gene, before overwriting it with information
# on the new one
prev_start = ""
prev_end = ""
prev_strand = ""
prev_info = ""
# Update loc when contig changes ('b' if gene at the border of a contig, 'i' if it is inside)
prev_loc = "b"
# To start, the first gene is, by definition, at the border of the contig
loc = "b"
# Open files: .ffn prodigal to read, .gen and .lst gembase to create
with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\
open(res_lst_file, "w") as r_lst:
# Read all lines in ffn file (sequences in nuc. for each gene)
for lineffn in ffn:
# If it is a sequence, save it and go to next line
if not lineffn.startswith(">"):
seq += lineffn
continue
# Otherwise:
# - write header of previous sequence to .gen
# - write previous sequence (in 'seq') to .gen
# - write LSTINFO information to .lst
# - update information (new start, end, contig number etc.) for next gene
else:
# Get information given for the new gene (by .ffn file from prodigal)
(gname, start, end, strand, info) = lineffn.strip().split(">")[-1].split("#")
# Get contig number from prodigal gene header: prodigal first part of header is:
# <original genome name contig name>_<protein number>
contig_name = gname.strip().split("_")
if len(contig_name) > 1:
contig_name = "_".join(contig_name[:-1])
else:
contig_name = contig_name[0]
# If new contig:
# - previous gene was the last of its contig -> prev_loc = "b" ;
# - the current gene is the first of its contig (loc = "b")
# - we must increment the contig number
if contig_name != prev_cont_name:
# Check that this contig name is in the list, and get its gembase contig number
if contig_name in contigs:
contig_num = contigs[contig_name].split(".")[-1]
# if not in the list, problem, return false
else:
logger.error(f"'{contig_name}' found in {gen_file} does not exist in "
f"{gpath}.")
return False
prev_loc = 'b'
loc = 'b'
# If not new contig. If prev_loc == 'b', previous gene is the first protein
# of this contig.
# Current gene will be inside the contig (except if new contig for the next gene,
# meaning only 1 gene in the contig)
else:
loc = 'i'
# If it is not the first gene of the genome, write previous gene information
if prev_start != "":
# Write line in LSTINFO file, + header and sequence to the gene file
_, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0,
prev_loc, name, prev_cont_num, "NA", prev_info,
"NA", prev_strand, prev_start, prev_end, r_lst)
gfunc.write_header(lstline, r_gen)
r_gen.write(seq)
# -> get new information, save it for the next gene, and go to next line
# Strands are 1/-1 in prodigal, while we use D,C -> convert, so that next time
# we find a new gene, it writes this before updating for this new gene
if int(strand) == 1:
strand = "D"
else:
strand = "C"
# Prepare variables for next gene
locus_num += 1
seq = ""
prev_cont_num = contig_num
prev_cont_name = contig_name
prev_start = start
prev_end = end
prev_strand = strand
prev_loc = loc
prev_info = info
# Write last gene of the genome (-> loc = 'b'),
# Just check that there was at least 1 gene found (prev_start != "").
# Otherwise, nothing to write
if prev_start != "":
prev_loc = "b"
_, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0,
prev_loc, name, prev_cont_num, "NA", prev_info, "NA",
prev_strand, prev_start, prev_end, r_lst)
gfunc.write_header(lstline, r_gen)
r_gen.write(seq)
return True
def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes):
"""
Create .gff3 file.
Format:
##gff-version 3
##sequence-region contig1 start end
##sequence-region contig2 start end
...
seqid(=contig) source type start end score strand phase attributes
All fields tab separated.
Empty fields contain '.'
For example:
ESCO.1017.00200.00001 Prodigal:2.6 CDS start end . + . ID=ESCO.1017.00200.b0001_00001;locus_tag=ESCO.1017.00200.b0001_00001;product=hypothetical protein
genome1_1 Prodigal_v2.6.3 CDS 213 1880 260.0 + 0 ID=1_1;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.534;conf=99.99;score=259.99;cscore=268.89;sscore=-8.89;rscore=-8.50;uscore=-4.34;tscore=3.95;
Parameters
----------
gpath : str
path to the genome sequence given to prodigal. Only used for the error message
gff_file : str
path to gff file generated by prodigal
res-gff_file : str
path to the gff file that must be created in result database
res-lst_file : str
path to the lst file that was created in result database in the previous step
contigs : dict
dict of contig names with their size. ["original_name": "gembase_name"]
sizes : dict
dict of contig gembase names with their sizes {"gembase_name": size}
Returns
-------
bool
True if everything went well, False if any problem
"""
# open gff generated by prodigal to read it
# open file to write new gff file
# open lst file to read all information saved from prodigal results
with open(gff_file, 'r') as gf, open(res_gff_file, "w") as rgf, open(res_lst_file, "r") as rlf:
# Write headers of gff3 file
rgf.write("##gff-version 3\n")
for ori_name, new_name in contigs.items():
# Write the list of contigs, with their size
end = sizes[new_name]
rgf.write(f"##sequence-region\t{new_name}\t1\t{end}\n")
# Now, convert each line of prodigal gff to gembase formatted gff line
for linegff in gf:
# Ignore gff lines starting by #. For the new gff, these are already all written at the
# beginning of the file.
if linegff.startswith("#"):
continue
# We do not write the sequences
if linegff.startswith(">"):
break
# Get all information from prodigal gff line. Strip each of them as trailing whitespace
# can be hidden (leading to information from gff considered as different from
# information from lst)
fields_g = linegff.split("\t")
fields_g = [info.strip() for info in fields_g]
(contig_name, source, type_g, start_g, end_g,
score, strand_g, phase, attributes) = fields_g
# Get information given to this same sequence from the lst file
# (next lst line corresponds to next gff line without #), as, for each format,
# there is 1 line per gene)
linelst = rlf.readline()
fields_l = linelst.split("\t")
fields_l = [info.strip() for info in fields_l]
start_l, end_l, strand_l, type_l, locus_l, _, _ = fields_l
# Get gff and ffn filenames to give information to user if error message
gff = os.path.basename(gff_file)
ffn = ".".join(gff.split(".")[:-1]) + ".ffn"
# Path where gff and ffn generated by prodigal are
tmp = gpath + "-prodigalRes"
# Get gene name given by prodigal to current gene
gname = attributes.split("ID=")[1].split(";")[0]
# Compare information from lst and information from prodigal gff (start,
# end and type of feature). They should correspond
for (elemg, eleml, label) in zip([start_g, end_g, type_g],
[start_l, end_l, type_l],
["start", "end", "type"]):
# If 1 element is different (start or end position, or type), print error
# message and return False: this genome could not be converted to gff
if elemg != eleml:
logger.error(f"Files {ffn} and {gff} (in prodigal tmp_files: {tmp}) do not have "
f"the same {label} value for gene {gname} ({elemg} in gff, {eleml} in "
"ffn))")
return False
# Replace prodigal ID with the new gene name (in gembase format), found in the
# corresponding lst line
at_split = attributes.split(";")
new = [atr if "ID" not in atr else f'ID={locus_l}' for atr in at_split]
# Write new line of gff file
# Write new contig name
cname = contigs[contig_name]
info = "\t".join([cname, source, type_g, start_g, end_g, score, strand_g,
phase, ";".join(new)])
rgf.write(info + "\n")
return True
def create_prt(prot_file, res_prot_file, res_lst_file):
"""
Generate .prt file (gembase formatted gene names), from features contained in .lst file generated just before.
Parameters
----------
prot_file : str
.faa file generated by prodigal
res_prot_file : str
output file, to write in Proteins directory
res_lst_file : str
.lst file to get all gene names in gembase format instead of re-generating them
Returns
-------
bool :
True if conversion went well, False otherwise
"""
# Open:
# - prot file to read gene sequences from prodigal results
# - res_prot file to write sequences with gembase headers
# - res_lst_file to get gene gembase names and other infos (strand, size...)
with open(prot_file, "r") as faa, open(res_prot_file, "w") as r_prt,\
open(res_lst_file, "r") as r_lst:
# Read prt file generated by prodigal
for lineprot in faa:
# If protein sequence, write it
if not lineprot.startswith(">"):
r_prt.write(lineprot)
continue
# If header, replace by gembase header
# For that, get next lst line (corresponding to next protein,
# as there is 1 protein per line in .lst -> 1 protein per header in .prt)
linelst = r_lst.readline().strip()
# Try to get info from lstline.
# If lstline empty, it means that the current protein
# is missing from lst file. We already read the last protein of lst file.
if linelst != '':
# If not empty, check lst format, return False if not right format
try:
# If ok, gembase name is in the fifth column of lst file
start, end, _, _, gem_name, _, _ = linelst.split("\t")
except ValueError:
logger.error("Problem in format of lstline ({})".format(linelst))
return False
else:
logger.error("No more protein in lst file. We cannot get information on this "
"protein ({})! Check that you do not have more proteins than genes "
"in prodigal results".format(lineprot.strip()))
return False
# Write this gembase name as a new header
# Size of protein sequence is the third of gene sequence. Check that it is an int.
try :
size_gen = (int(end) - int(start) + 1)
except ValueError:
logger.error("Start and/or end of protein {} position is not a number (start "
"= {}; end = {})".format(gem_name, start, end))
return False
# Find size of protein in number of aa
# If number of nucleotides in gene cannot be divided by 3 to get number of amino-acids, there is a problem with this protein: return False to ignore this genome
size_prot = size_gen/3
if int(size_prot) != size_prot:
logger.error("Gene {} has a number of nucleotides ({}) that is not divisible "
"by 3.".format(gem_name, size_gen))
return False
gfunc.write_header(linelst, r_prt)
# new_header = "\t".join([gem_name, str(int(size_prot)), product, info])
# r_prt.write(">" + new_header + "\n")
# Check that there are no more proteins in lst than in this prt file
linelst = r_lst.readline()
if linelst.strip() != '':
gem_name = linelst.strip().split("\t")[4]
logger.error("Protein {} is in .lst file but its sequence is not in the protein "
"file generated by prodigal.".format(gem_name))
return False
return True