Select Git revision
general_format_functions.py 11.90 KiB
#!/usr/bin/env python3
# coding: utf-8
# ###############################################################################
# This file is part of PanACOTA. #
# #
# Authors: Amandine Perrin #
# Copyright © 2018-2020 Institut Pasteur (Paris). #
# See the COPYRIGHT file for details. #
# #
# PanACOTA is a software providing tools for large scale bacterial comparative #
# genomics. From a set of complete and/or draft genomes, you can: #
# - Do a quality control of your strains, to eliminate poor quality #
# genomes, which would not give any information for the comparative study #
# - Uniformly annotate all genomes #
# - Do a Pan-genome #
# - Do a Core or Persistent genome #
# - Align all Core/Persistent families #
# - Infer a phylogenetic tree from the Core/Persistent families #
# #
# PanACOTA is free software: you can redistribute it and/or modify it under the #
# terms of the Affero GNU General Public License as published by the Free #
# Software Foundation, either version 3 of the License, or (at your option) #
# any later version. #
# #
# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY #
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS #
# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License #
# for more details. #
# #
# You should have received a copy of the Affero GNU General Public License #
# along with PanACOTA (COPYING file). #
# If not, see <https://www.gnu.org/licenses/>. #
# ###############################################################################
"""
Functions to convert prokka or prodigal results to gembase format:
* Proteins: multifasta with all CDS in aa
* Replicons: multifasta of genome
* Genes: multifasta of all genes in nuc
* gff3: gff files without sequence
* LSTINFO:
- if annotated by prokka: information on annotation. Columns are:
"start end strand type locus gene_name | product | EC_number | inference2"
with the same types as prokka file, and strain is C (complement) or D (direct).
Locus is: `<genome_name>.<contig_num><i or b>_<protein_num>`
- if annotated by prodigal
@author gem
May 2019
"""
import os
import logging
import logging.handlers
import progressbar
import multiprocessing
import threading
import PanACoTA.utils as utils
from PanACoTA.annotate_module import format_prokka as fprokka
from PanACoTA.annotate_module import format_prodigal as fprodigal
main_logger = logging.getLogger("annotate.geneffunc")
def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False):
"""
For all genomes which were annotated (by prokka or prodigal), reformat them
in order to have, in 'res_path', the following folders:
* LSTINFO: containing a .lst file for each genome, with all genes
* Replicons: containing all multifasta files
* Genes: containing 1 multi-fasta per genome, with all its genes in nuc
* Proteins: containing 1 multi-fasta per genome, with all its proteins in aa
* gff: containing all gff files
Parameters
----------
genomes_ok : dict
genomes to format (annotation was OK) ->
{genome: [name, gpath, to_annot, size, nbcont, l90]}
res_path : str
path to folder where the 4 directories must be created
annot_path : str
path to folder containing "<genome_name>-[prokka, prodigal]Res" where all prokka/prodigal
results are saved.
prodigal_only: True if it was annotated by prodigal, False if annotated by prokka
threads : int
number of threads to use to while formatting genomes
quiet : bool
True if nothing must be sent to stderr/stdout, False otherwise
Returns
-------
skipped_format : list
list of genomes skipped because they had a problem in format step
"""
main_logger.info("Formatting all genomes")
lst_dir = os.path.join(res_path, "LSTINFO")
prot_dir = os.path.join(res_path, "Proteins")
gene_dir = os.path.join(res_path, "Genes")
rep_dir = os.path.join(res_path, "Replicons")
gff_dir = os.path.join(res_path, "gff3")
os.makedirs(lst_dir, exist_ok=True)
os.makedirs(prot_dir, exist_ok=True)
os.makedirs(gene_dir, exist_ok=True)
os.makedirs(rep_dir, exist_ok=True)
os.makedirs(gff_dir, exist_ok=True)
# If this function goes until here, it means that there is at least 1 genome to annotate
nbgen = len(genomes_ok)
bar = None
if not quiet:
# Create progressbar
widgets = ['Formatting genomes: ',
progressbar.Bar(marker='█', left='', right=''),
' ', progressbar.Counter(), "/{}".format(nbgen), ' (',
progressbar.Percentage(), ") - ", progressbar.Timer()]
bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=100).start()
# Create a Queue to put logs from processes, and handle them after from a single thread
m = multiprocessing.Manager()
q = m.Queue()
# if at least 1 genome ok, try to format it
# arguments for 'handle_genome' function:
# (genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir, rep_dir,
# gff_dir, results, prodigal_only, q)
params = [(genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir,
rep_dir, gff_dir, prodigal_only, q)
for genome, (name, _, gpath, _, _, _) in genomes_ok.items()]
# Create pool and launch parallel formating steps
pool = multiprocessing.Pool(threads)
final = pool.map_async(handle_genome, params, chunksize=1)
pool.close()
lp = threading.Thread(target=utils.logger_thread, args=(q,))
lp.start()
if not quiet:
while True:
if final.ready():
break
remaining = final._number_left
bar.update(nbgen - remaining)
bar.finish()
pool.join()
q.put(None)
lp.join()
res = final.get()
# List of genomes for which format step had problems: will be filled after
skipped_format = []
for output in res:
if not output[0]:
skipped_format.append(output[1])
return skipped_format
def handle_genome(args):
"""
For a given genome, check if it has been annotated (in results), if annotation
(by prokka or prodigal) ran without problems (result = True). In that case,
format the genome and get the output to see if everything went ok.
Parameters
----------
args : tuple
(genome, name, gpath, prok_path, lst_dir, prot_dir,\
gene_dir, rep_dir, gff_dir, results, q)\
with:
* genome : original genome name
* name : gembase name of the genome
* gpath : path to the genome sequence which was given to prokka/prodigal for annotation
* annot_path : directory where prokka/prodigal folders are saved
* lst_dir : path to 'LSTINFO' folder
* prot_dir : path to 'Proteins' folder
* gene_dit : path to 'Genes' folder
* rep_dir : path to 'Replicons' folder
* gff_dir : path to 'gff3' folder
* prodigal_only : True if annotated by prodigal, False if annotated by prokka
* q : multiprocessing.managers.AutoProxy[Queue] queue to put logs during subprocess
Returns
-------
(bool, str) :
* True if genome was annotated as expected, False otherwise
* genome name (used to get info from the pool.map_async)
"""
(genome, name, gpath, annot_path, lst_dir, prot_dir,
gene_dir, rep_dir, gff_dir, prodigal_only, q) = args
# Define which formatting must be used, given the annotation software
if prodigal_only:
format_one_genome = fprodigal.format_one_genome
else:
format_one_genome = fprokka.format_one_genome
# Set logger for this process
qh = logging.handlers.QueueHandler(q)
root = logging.getLogger()
root.setLevel(logging.DEBUG)
root.handlers = []
logging.addLevelName(utils.detail_lvl(), "DETAIL")
root.addHandler(qh)
logger = logging.getLogger('format.handle_genome')
# Handle genome
ok_format = format_one_genome(gpath, name, annot_path, lst_dir,
prot_dir, gene_dir, rep_dir, gff_dir)
return ok_format, genome
def write_gene(gtype, locus_num, gene_name, product, cont_loc,
genome, cont_num, ecnum, inf2, db_xref, strand, start, end, lstopenfile):
"""
Write given gene to output file
Parameters
----------
gtype : str
type of feature (CDS, tRNA, etc.)
locus_num : str
number of the locus given by prokka/prodigal
gene_name : str
gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal)
product : str
found by prokka/Prodigal, "NA" if no product (always the case for prodigal)
cont_loc : str
'i' if the gene is inside a contig, 'b' if its on the border (first or last gene
of the contig)
genome : str
genome name (spegenus.date.strain_num)
cont_num : int
contig number
ecnum : str
EC number, found by prokka, or "NA" if no EC number (-> always for prodigal)
inf2 : str
more information found by prokka/prodigal, or "NA" if no more information
db_xref : str
db_xref given by Prokka ("NA" for prodigal)
strand : str
C (complement) or D (direct)
start : str
start of gene in the contig
end : str
end of gene in the contig
lstopenfile : _io.TextIOWrapper
open file where lstinfo must be written
Returns
-------
str :
lstline
"""
locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc,
str(locus_num).zfill(5))
# If '|' character found in those fields, replace by '_' to avoid problems while parsing
more_info = "| {} | {} | {} | {}".format(product.replace("|", "_"),
ecnum.replace("|", "_"),
inf2.replace("|", "_"),
db_xref.replace("|", "_"))
lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info])
lstopenfile.write(lst_line + "\n")
return lst_line
def write_header(lstline, outfile):
"""
write header to output file. Header is generated from the lst line.
Parameters
----------
lstline : str
current line of lst file
outfile : _io.TextIOWrapper
open file where header must be written
"""
start, end, _, _, name, gene_name, info = lstline.split("\t")
size = int(end) - int(start) + 1
towrite = " ".join([name, str(size), gene_name, info])
outfile.write(">" + towrite + "\n")
def get_contig_name(genome, cont_num):
"""
From a genome name and a contig number, write the header corresponding to this contig
(for 'Replicons' files)
Parameters
----------
genome : str
genome name
cont_num : int
contig number
"""
return ">{}.{}".format(genome, str(cont_num).zfill(4))