Skip to content
Snippets Groups Projects
Select Git revision
  • d0fc0ff523636427fe4db486560201d96f926b9c
  • master default protected
  • dev
  • install
  • new_master
  • protein_ortho
  • documentation
  • pr18
  • dev-licence
  • docker
  • prodigal_train
  • containers
  • module_all
  • functional_tests
  • opti
  • helpers
  • v1.4.1
  • v1.4.0
  • v1.3.1
  • v1.3.0
  • v1.2.0
  • v1.1.0
  • v1.0.1
  • v1.0
24 results

mmseqs_functions.py

Blame
  • mmseqs_functions.py 12.01 KiB
    #!/usr/bin/env python3
    # coding: utf-8
    
    """
    Functions to use mmseqs to create a pangenome
    
    @author gem
    April 2017
    """
    import logging
    import os
    import time
    import multiprocessing
    import progressbar
    from genomeAPCAT import utils
    
    logger = logging.getLogger("pangnome.mmseqs")
    
    
    def run_all_pangenome(min_id, clust_mode, outdir, prt_path, threads, panfile=None, quiet=False):
        """
        Run all steps to build a pangenome:
    
        - create mmseqs database from protein bank
        - cluster proteins
        - convert to pangenome
    
    
        Parameters
        ----------
        min_id : float
            minimum percentage of identity to be in the same family
        clust_mode : [0, 1, 2]
            0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
        outdir : str
            directory where output cluster file must be saved
        prt_path : str
            path to file containing all proteins to cluster.
        threads : int
            number of threads which can be used
        panfile : str or None
            name for output pangenome file. Otherwise, will use default name
        quiet : bool
            True if nothing must be written on stdout, False otherwise.
    
        Returns
        -------
        (families, outfile) : tuple
    
            - families : {fam_num: [all members]}
            - outfile : pangenome filename
        """
        # Get general information and file/directory names
        prt_bank = os.path.basename(prt_path)
        start = time.strftime('%Y-%m-%d_%H-%M-%S')
        mmseqdb = os.path.join(outdir, prt_bank + "-msDB")
        information = ("Will run MMseqs2 with:\n"
                       "\t- minimum sequence identity = {}\n"
                       "\t- cluster mode {}").format(min_id, clust_mode)
        if threads > 1:
            information += "\n\t- {} threads".format(threads)
        logger.info(information)
    
        infoname = get_info(threads, min_id, clust_mode, start)
        logmmseq = get_logmmseq(outdir, prt_bank, infoname)
        # Create ffindex of DB if not already done
        create_mmseqs_db(mmseqdb, prt_path, logmmseq)
    
        # Cluster with mmseqs
        if panfile:
            panfile = os.path.join(outdir, panfile)
        families, outfile = do_pangenome(outdir, prt_bank, mmseqdb, min_id,
                                         clust_mode, threads, start, panfile, quiet)
        return families, outfile
    
    
    def get_info(threads, min_id, clust_mode, start):
        """
        Get string containing all information on future run
    
        Parameters
        ----------
        threads : int
            max number of threads to use
        min_id : float
            min percentage of identity to consider 2 proteins in hte same family
        clust_mode : [0, 1, 2]
            0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
        start : str
            string containing starting date and time
    
        Returns
        -------
        str
            string containing info on current run, to put in filenames
        """
        if threads != 1:
            threadinfo = "-th" + str(threads)
        else:
            threadinfo = ""
        infoname = str(min_id) + "-mode" + str(clust_mode) + threadinfo + "_" + start
        return infoname
    
    
    def get_logmmseq(outdir, prt_bank, infoname):
        """
        Get filename of logfile, given information
    
        Parameters
        ----------
        outdir : str
            output directory
        prt_bank : str
            name of file (without path) containing all proteins to cluster
        infoname : str
            string containing information on the current run
    
        Returns
        -------
        str
            path to mmseq logfile
        """
        return os.path.join(outdir, "mmseq_" + prt_bank + "_" + infoname + ".log")
    
    
    def do_pangenome(outdir, prt_bank, mmseqdb, min_id, clust_mode, threads, start, panfile=None,
                     quiet=False):
        """
        Use mmseqs to cluster proteins
    
        Parameters
        ----------
        outdir : str
            directory where output files are saved
        prt_bank : str
            name of the file containing all proteins to cluster, without path
        mmseqdb : str
            path to base filename for output mmseq db
        min_id : float
            min percentage of identity to be considered in the same family (between 0 and 1)
        clust_mode : [0, 1, 2]
            0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
        threads : int
            max number of threads to use
        start : str
            string containing trat date and time
        panfile : str
            if a pangenome file is specified. Otherwise, default pangenome name will be used
        quiet : bool
            true if nothing must be print on stdout/stderr, false otherwise (show progress bar)
    
        Returns
        -------
        (families, outfile) : tuple
    
            - families : {fam_num: [all members]}
            - outfile : pangenome filename
        """
        infoname = get_info(threads, min_id, clust_mode, start)
        logmmseq = get_logmmseq(outdir, prt_bank, infoname)
        mmseqclust = os.path.join(outdir, prt_bank + "-clust-" + infoname)
        tmpdir = os.path.join(outdir, "tmp_" + prt_bank + "_" + infoname)
        os.makedirs(tmpdir, exist_ok=True)
        bar = None
        logger.debug(mmseqclust)
        if os.path.isfile(mmseqclust):
            logger.warning(("mmseqs clustering {} already exists. The program will now convert "
                            "it to a pangenome file.").format(mmseqclust))
        else:
            logger.info("Clustering proteins...")
            if not quiet:
                widgets = [progressbar.BouncingBar(marker=progressbar.RotatingMarker(markers="◐◓◑◒")),
                           "  -  ", progressbar.Timer()]
                bar = progressbar.ProgressBar(widgets=widgets, max_value=20, term_width=50)
            pool = multiprocessing.Pool(1)
            args = [mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode]
            final = pool.map_async(run_mmseqs_clust, [args], chunksize=1)
            pool.close()
            if not quiet:
                while True:
                    if final.ready():
                        break
                    bar.update()
                bar.finish()
            pool.join()
        # Convert output to tsv file (one line per comparison done)
        #  # Convert output to tsv file (one line per comparison done)
        # -> returns (families, outfile)
        return mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, start, panfile)
    
    
    def run_mmseqs_clust(args):
        """
        Run mmseqs clustering
    
        Parameters
        ----------
        args : tuple
             (mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode), with:
    
                * mmseqdb: path to base filename for output mmseq db
                * mmseqclust: path to base filename for output of mmseq clustering
                * tmpdir : path to folder which will contain mmseq temporary files
                * logmmseq : path to file where logs must be written
                * min_id : min percentage of identity to be considered in the same family
                *         (between 0 and 1)
                * threads : max number of threads to use
                * clust_mode : [0, 1, 2], 0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    
        """
        mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode = args
        cmd = ("mmseqs cluster {} {} {} --min-seq-id {} --threads {} --cluster-mode "
               "{}").format(mmseqdb, mmseqclust, tmpdir, min_id, threads, clust_mode)
        msg = "Problem while clustering proteins with mmseqs. See log in {}".format(logmmseq)
        with open(logmmseq, "a") as logm:
            utils.run_cmd(cmd, msg, eof=False, stdout=logm, stderr=logm)
    
    
    def mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, start, outfile=None):
        """
        Convert mmseqs clustering to a pangenome file:
    
        - convert mmseqs results to tsv file
        - convert tsv file to pangenome
    
        Parameters
        ----------
        mmseqdb : str
             path to base filename for output mmseq db
        mmseqclust : str
            path to base filename for output of mmseq clustering
        logmmseq : str
             path to file where logs must be written
        start : str
            string containing start date and time
        outfile : str
            pangenome filename, or None if default one must be used
    
        Returns
        -------
        (families, outfile) : tuple
    
            - families : {fam_num: [all members]}
            - outfile : pangenome filename
        """
        cmd = "mmseqs createtsv {0} {0} {1} {1}.tsv".format(mmseqdb, mmseqclust)
        msg = "Problem while trying to convert mmseq result file to tsv file"
        with open(logmmseq, "a") as logf:
            utils.run_cmd(cmd, msg, eof=True, stdout=logf, stderr=logf)
        print("log file create tsv mmseqs:")
        with open(logmmseq, "r") as logf:
            for line in logf:
                print(line)
        # Convert the tsv file to a 'pangenome' file: one line per family
        families, outfile = mmseqs_tsv_to_pangenome(mmseqclust, logmmseq, start, outfile)
        return families, outfile
    
    
    def mmseqs_tsv_to_pangenome(mmseqclust, logmmseq, start, outfile=None):
        """
        Convert the tsv output file of mmseqs to the pangenome file
    
        Parameters
        ----------
        mmseqclust : str
            path to base filename for output of mmseq clustering
        logmmseq : str
            path to file where logs must be written
        start : str
            string containing start date and time
        outfile : str
            pangenome filename, or None if default one must be used
    
        Returns
        -------
        (families, outfile) : tuple
    
            - families : {fam_num: [all members]}
            - outfile : pangenome filename
        """
        logger.info("Converting mmseqs results to pangenome file")
        tsvfile = mmseqclust + ".tsv"
        if not outfile:
            outpath = os.path.dirname(tsvfile)
            base = os.path.basename(tsvfile)
            outfile = os.path.join(outpath, "PanGenome-" + base + ".lst")
        clusters = mmseq_tsv_to_clusters(tsvfile)
        families = clusters_to_file(clusters, outfile)
        end = time.strftime('%Y-%m-%d_%H-%M-%S')
        with open(logmmseq, "a") as logm:
            logm.write("\n------------\n\nStart: {}".format(start) + "\n")
            logm.write("End: {}".format(end) + "\n")
        return families, outfile
    
    
    def mmseq_tsv_to_clusters(mmseq):
        """
        Reads the output of mmseq as a tsv file, and converts it to a python dict
    
        Parameters
        ----------
        mmseq : str
            filename of mmseq clustering output in tsv format
    
        Returns
        -------
        dict
            {representative_of_cluster: [list of members]}
    
        """
        clusters = {}  # {representative: [all members]}
        with open(mmseq) as mmsf:
            for line in mmsf:
                repres, other = line.strip().split()
                if repres in clusters:
                    clusters[repres].append(other)
                else:
                    clusters[repres] = [repres]
        logger.info("Pangenome has {} families.".format(len(clusters)))
        return clusters
    
    
    def clusters_to_file(clust, fileout):
        """
        Write all clusters to a file
    
        Parameters
        ----------
        clust : {first_member: [all members = protein names]}
        fileout : filename of pangenome where families must be written
    
        Returns
        -------
        dict
            families : {famnum: [members]}
        """
        families = {}  # {famnum: [members]}
        with open(fileout, "w") as fout:
            num = 1
            for _, fam in clust.items():
                families[num] = []
                fout.write(str(num))
                for mem in sorted(fam, key=utils.sort_proteins):
                    families[num].append(mem)
                    fout.write(" " + mem)
                fout.write("\n")
                num += 1
        return families
    
    
    def create_mmseqs_db(mmseqdb, prt_path, logmmseq):
        """
        Create ffindex of protein bank if not already done. If done, just write a message
        to tell the user that the current existing file will be used.
    
        Parameters
        ----------
        mmseqdb : str
             path to base filename for output mmseq db
        prt_path : str
            path to the file containing all proteins to cluster
        logmmseq : str
             path to file where logs must be written
        """
        if not os.path.isfile(mmseqdb):
            logger.info("Creating database")
            cmd = "mmseqs createdb {} {}".format(prt_path, mmseqdb)
            msg = ("Problem while trying to convert database {} to mmseqs "
                   "database format.").format(prt_path)
            with open(logmmseq, "w") as logf:
                utils.run_cmd(cmd, msg, eof=True, stdout=logf, stderr=logf)
        else:
            logger.warning(("mmseq database {} already exists. The program will "
                            "use it.").format(mmseqdb))