Skip to content
Snippets Groups Projects
Select Git revision
  • d0fc0ff523636427fe4db486560201d96f926b9c
  • master default protected
  • dev
  • install
  • new_master
  • protein_ortho
  • documentation
  • pr18
  • dev-licence
  • docker
  • prodigal_train
  • containers
  • module_all
  • functional_tests
  • opti
  • helpers
  • v1.4.1
  • v1.4.0
  • v1.3.1
  • v1.3.0
  • v1.2.0
  • v1.1.0
  • v1.0.1
  • v1.0
24 results

protein_seq_functions.py

Blame
  • protein_seq_functions.py 1.99 KiB
    #!/usr/bin/env python3
    # coding: utf-8
    
    """
    Functions to build a bank of all proteins to include in the pangenome
    
    @author gem
    April 2017
    """
    from genomeAPCAT import utils
    import logging
    import os
    
    logger = logging.getLogger('pangenome.bank')
    
    
    def build_prt_bank(lstinfo, dbpath, name, spedir, quiet):
        """
        Build a file containing all proteins of all genomes listed in lstinfo.

        The per-genome protein files ``<dbpath>/<genome_name>.prt`` are
        concatenated, in the order the genomes appear in ``lstinfo``, into
        ``<outdir>/<name>.All.prt``. If that file already exists it is not
        rebuilt: a warning is logged and its path is returned as is.

        Parameters
        ----------
        lstinfo : str
            Path to a text file with 1 line per genome. Only the 1st
            whitespace-separated column is used here, as the genome name
            without extension. Lines containing ``_name`` are treated as
            header lines and skipped; blank lines are skipped as well.
        dbpath : str
            Proteins folder, containing all proteins for each genome. Each
            genome has its own protein file, called `<genome_name>.prt`.
        name : str
            dataset name, used to name the output databank:
            <outdir>/<name>.All.prt
        spedir : str or None
            By default, output file is saved in dbpath directory. If it must
            be saved somewhere else, it is specified here (the directory is
            created if it does not exist yet).
        quiet : bool
            True if nothing must be written in stdout/stderr, False otherwise

        Returns
        -------
        str
            name (with path) of the protein databank generated
        """
        if spedir:
            # A specific output directory was requested: make sure it exists.
            os.makedirs(spedir, exist_ok=True)
            outdir = spedir
        else:
            outdir = dbpath
        bank_name = name + ".All.prt"
        outfile = os.path.join(outdir, bank_name)
        if os.path.isfile(outfile):
            # Never overwrite an existing bank: reuse it as is.
            logger.warning(("Protein bank {} already exists. "
                            "It will be used by mmseqs.").format(outfile))
            return outfile
        logger.info("Building bank with all proteins to {}".format(bank_name))
        genomes = []
        with open(lstinfo) as lstf:
            for line in lstf:
                fields = line.split()
                # Blank or whitespace-only lines (e.g. a trailing empty line)
                # carry no genome name: skip them instead of failing with an
                # IndexError when reading the first column.
                if not fields:
                    continue
                # skip header
                # NOTE(review): this matches '_name' anywhere in the line, so a
                # genome line containing that substring is skipped too.
                if "_name" in line:
                    continue
                genomes.append(fields[0])
        all_names = [os.path.join(dbpath, gen + ".prt") for gen in genomes]
        if quiet:
            utils.cat(all_names, outfile)
        else:
            # 'title' is only given when output is allowed; presumably it labels
            # a progress display in utils.cat -- confirm against utils.
            utils.cat(all_names, outfile, title="Building bank")
        return outfile