Select Git revision
protein_seq_functions.py
protein_seq_functions.py 1.99 KiB
#!/usr/bin/env python3
# coding: utf-8
"""
Functions to build a bank of all proteins to include in the pangenome
@author gem
April 2017
"""
from genomeAPCAT import utils
import logging
import os
logger = logging.getLogger('pangenome.bank')
def build_prt_bank(lstinfo, dbpath, name, spedir, quiet):
    """
    Build a file containing all proteins of all genomes listed in lstinfo.

    If the output databank already exists, it is not rebuilt: a warning is
    logged and its path is returned as is.

    Parameters
    ----------
    lstinfo : str
        Path to a file with 1 line per genome. Only the 1st
        whitespace-separated column is used here, as the genome name without
        extension. Any line containing "_name" is treated as a header and
        skipped; blank lines are ignored.
    dbpath : str
        Proteins folder, containing all proteins for each genome. Each genome
        has its own protein file, called `<genome_name>.prt`.
    name : str
        dataset name, used to name the output databank:
        `<outdir>/<name>.All.prt`
    spedir : str or None
        By default (None or empty), the output file is saved in the dbpath
        directory. If it must be saved somewhere else, that directory is given
        here; it is created if it does not exist yet.
    quiet : bool
        True if nothing must be written in stdout/stderr, False otherwise.
        When False, a `title` is passed to `utils.cat` (presumably used for a
        progress display -- confirm against `utils.cat`).

    Returns
    -------
    str
        name (with path) of the protein databank generated (or already
        existing)
    """
    # Choose the output directory, creating the specific one when requested.
    if spedir:
        os.makedirs(spedir, exist_ok=True)
        outdir = spedir
    else:
        outdir = dbpath

    bank_name = name + ".All.prt"
    outfile = os.path.join(outdir, bank_name)

    # An existing bank is reused rather than overwritten.
    if os.path.isfile(outfile):
        logger.warning(("Protein bank {} already exists. "
                        "It will be used by mmseqs.").format(outfile))
        return outfile

    logger.info("Building bank with all proteins to {}".format(bank_name))

    genomes = []
    with open(lstinfo) as lstf:
        for line in lstf:
            # skip header
            if "_name" in line:
                continue
            fields = line.split()
            # A blank (or whitespace-only) line has no genome name: ignore it
            # instead of failing with an IndexError on fields[0].
            if not fields:
                continue
            genomes.append(fields[0])

    # One protein file per genome, in the same order as in lstinfo.
    all_names = [os.path.join(dbpath, gen + ".prt") for gen in genomes]

    # The title is only given when output is allowed (not quiet).
    if quiet:
        utils.cat(all_names, outfile)
    else:
        utils.cat(all_names, outfile, title="Building bank")
    return outfile