diff --git a/MANIFEST.in b/MANIFEST.in index 8c86eba637cb4ccb534a6f0f93c4e6717b1e6726..ab6b4aab9bc4cbeafde31abff774589cec1e6666 100755 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include genomeAPCAT/align_module/prt2codon.awk \ No newline at end of file +include PanACoTA/align_module/prt2codon.awk \ No newline at end of file diff --git a/genomeAPCAT/__init__.py b/PanACoTA/__init__.py similarity index 100% rename from genomeAPCAT/__init__.py rename to PanACoTA/__init__.py diff --git a/genomeAPCAT/align_module/__init__.py b/PanACoTA/align_module/__init__.py similarity index 100% rename from genomeAPCAT/align_module/__init__.py rename to PanACoTA/align_module/__init__.py diff --git a/genomeAPCAT/align_module/alignment.py b/PanACoTA/align_module/alignment.py similarity index 100% rename from genomeAPCAT/align_module/alignment.py rename to PanACoTA/align_module/alignment.py diff --git a/genomeAPCAT/align_module/get_seqs.py b/PanACoTA/align_module/get_seqs.py similarity index 100% rename from genomeAPCAT/align_module/get_seqs.py rename to PanACoTA/align_module/get_seqs.py diff --git a/genomeAPCAT/align_module/pan_to_pergenome.py b/PanACoTA/align_module/pan_to_pergenome.py similarity index 100% rename from genomeAPCAT/align_module/pan_to_pergenome.py rename to PanACoTA/align_module/pan_to_pergenome.py diff --git a/genomeAPCAT/align_module/post_align.py b/PanACoTA/align_module/post_align.py similarity index 100% rename from genomeAPCAT/align_module/post_align.py rename to PanACoTA/align_module/post_align.py diff --git a/genomeAPCAT/align_module/prt2codon.awk b/PanACoTA/align_module/prt2codon.awk similarity index 100% rename from genomeAPCAT/align_module/prt2codon.awk rename to PanACoTA/align_module/prt2codon.awk diff --git a/genomeAPCAT/annote_module/__init__.py b/PanACoTA/annote_module/__init__.py similarity index 100% rename from genomeAPCAT/annote_module/__init__.py rename to PanACoTA/annote_module/__init__.py diff --git a/genomeAPCAT/annote_module/format_functions.py b/PanACoTA/annote_module/format_functions.py similarity index 100% rename from genomeAPCAT/annote_module/format_functions.py rename to PanACoTA/annote_module/format_functions.py diff --git a/genomeAPCAT/annote_module/genome_seq_functions.py b/PanACoTA/annote_module/genome_seq_functions.py similarity index 100% rename from genomeAPCAT/annote_module/genome_seq_functions.py rename to PanACoTA/annote_module/genome_seq_functions.py diff --git a/genomeAPCAT/annote_module/prokka_functions.py b/PanACoTA/annote_module/prokka_functions.py similarity index 100% rename from genomeAPCAT/annote_module/prokka_functions.py rename to PanACoTA/annote_module/prokka_functions.py diff --git a/genomeAPCAT/corepers_module/__init__.py b/PanACoTA/corepers_module/__init__.py similarity index 100% rename from genomeAPCAT/corepers_module/__init__.py rename to PanACoTA/corepers_module/__init__.py diff --git a/genomeAPCAT/corepers_module/persistent_functions.py b/PanACoTA/corepers_module/persistent_functions.py similarity index 100% rename from genomeAPCAT/corepers_module/persistent_functions.py rename to PanACoTA/corepers_module/persistent_functions.py diff --git a/genomeAPCAT/pangenome_module/__init__.py b/PanACoTA/pangenome_module/__init__.py similarity index 100% rename from genomeAPCAT/pangenome_module/__init__.py rename to PanACoTA/pangenome_module/__init__.py diff --git a/genomeAPCAT/pangenome_module/mmseqs_functions.py b/PanACoTA/pangenome_module/mmseqs_functions.py similarity index 97% rename from genomeAPCAT/pangenome_module/mmseqs_functions.py rename to PanACoTA/pangenome_module/mmseqs_functions.py index 3d3a98b4dbf2cbd05797f75e9dbd801f4c3ddd99..d8822f37100ba5d2be99e9ab250b4df101b335c9 100755 --- a/genomeAPCAT/pangenome_module/mmseqs_functions.py +++ b/PanACoTA/pangenome_module/mmseqs_functions.py @@ -162,6 +162,7 @@ def do_pangenome(outdir, prt_bank, mmseqdb, min_id, clust_mode, threads, start, tmpdir = os.path.join(outdir, "tmp_" + prt_bank + "_" + infoname) os.makedirs(tmpdir, exist_ok=True) bar = None + logger.debug(mmseqclust) if os.path.isfile(mmseqclust): logger.warning(("mmseqs clustering {} already exists. The program will now convert " "it to a pangenome file.").format(mmseqclust)) @@ -183,8 +184,9 @@ def do_pangenome(outdir, prt_bank, mmseqdb, min_id, clust_mode, threads, start, bar.finish() pool.join() # Convert output to tsv file (one line per comparison done) - families, outfile = mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, start, panfile) - return families, outfile + # # Convert output to tsv file (one line per comparison done) + # -> returns (families, outfile) + return mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, start, panfile) def run_mmseqs_clust(args): @@ -200,15 +202,15 @@ def run_mmseqs_clust(args): * mmseqclust: path to base filename for output of mmseq clustering * tmpdir : path to folder which will contain mmseq temporary files * logmmseq : path to file where logs must be written - * min_id : min percentage of identity to be considered in the same family\ - (between 0 and 1) + * min_id : min percentage of identity to be considered in the same family + * (between 0 and 1) * threads : max number of threads to use * clust_mode : [0, 1, 2], 0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit' """ mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode = args cmd = ("mmseqs cluster {} {} {} --min-seq-id {} --threads {} --cluster-mode " - "{} --kmer-per-seq 80 --max-seqs 300").format(mmseqdb, mmseqclust, tmpdir, min_id, threads, clust_mode) + "{}").format(mmseqdb, mmseqclust, tmpdir, min_id, threads, clust_mode) msg = "Problem while clustering proteins with mmseqs. See log in {}".format(logmmseq) with open(logmmseq, "a") as logm: utils.run_cmd(cmd, msg, eof=False, stdout=logm, stderr=logm) diff --git a/genomeAPCAT/pangenome_module/post_treatment.py b/PanACoTA/pangenome_module/post_treatment.py similarity index 100% rename from genomeAPCAT/pangenome_module/post_treatment.py rename to PanACoTA/pangenome_module/post_treatment.py diff --git a/genomeAPCAT/pangenome_module/protein_seq_functions.py b/PanACoTA/pangenome_module/protein_seq_functions.py similarity index 100% rename from genomeAPCAT/pangenome_module/protein_seq_functions.py rename to PanACoTA/pangenome_module/protein_seq_functions.py diff --git a/genomeAPCAT/subcommands/__init__.py b/PanACoTA/subcommands/__init__.py similarity index 100% rename from genomeAPCAT/subcommands/__init__.py rename to PanACoTA/subcommands/__init__.py diff --git a/genomeAPCAT/subcommands/align.py b/PanACoTA/subcommands/align.py similarity index 100% rename from genomeAPCAT/subcommands/align.py rename to PanACoTA/subcommands/align.py diff --git a/genomeAPCAT/subcommands/annote.py b/PanACoTA/subcommands/annote.py similarity index 100% rename from genomeAPCAT/subcommands/annote.py rename to PanACoTA/subcommands/annote.py diff --git a/genomeAPCAT/subcommands/corepers.py b/PanACoTA/subcommands/corepers.py similarity index 100% rename from genomeAPCAT/subcommands/corepers.py rename to PanACoTA/subcommands/corepers.py diff --git a/genomeAPCAT/subcommands/pangenome.py b/PanACoTA/subcommands/pangenome.py similarity index 100% rename from genomeAPCAT/subcommands/pangenome.py rename to PanACoTA/subcommands/pangenome.py diff --git a/genomeAPCAT/subcommands/tree.py b/PanACoTA/subcommands/tree.py similarity index 100% rename from genomeAPCAT/subcommands/tree.py rename to PanACoTA/subcommands/tree.py diff --git a/genomeAPCAT/tree_module/__init__.py b/PanACoTA/tree_module/__init__.py similarity index 100% rename from genomeAPCAT/tree_module/__init__.py rename to PanACoTA/tree_module/__init__.py diff --git a/genomeAPCAT/tree_module/fastme_func.py b/PanACoTA/tree_module/fastme_func.py similarity index 100% rename from genomeAPCAT/tree_module/fastme_func.py rename to PanACoTA/tree_module/fastme_func.py diff --git a/genomeAPCAT/tree_module/fasttree_func.py b/PanACoTA/tree_module/fasttree_func.py similarity index 100% rename from genomeAPCAT/tree_module/fasttree_func.py rename to PanACoTA/tree_module/fasttree_func.py diff --git a/genomeAPCAT/tree_module/quicktree_func.py b/PanACoTA/tree_module/quicktree_func.py similarity index 100% rename from genomeAPCAT/tree_module/quicktree_func.py rename to PanACoTA/tree_module/quicktree_func.py diff --git a/genomeAPCAT/utils.py b/PanACoTA/utils.py similarity index 100% rename from genomeAPCAT/utils.py rename to PanACoTA/utils.py diff --git a/genomeAPCAT/utils_pangenome.py b/PanACoTA/utils_pangenome.py similarity index 100% rename from genomeAPCAT/utils_pangenome.py rename to PanACoTA/utils_pangenome.py diff --git a/bin/genomeAPCAT b/bin/genomeAPCAT deleted file mode 100755 index 3bf2de3f8ce2b99ee9fbff27af0f04e0d38f4e66..0000000000000000000000000000000000000000 --- a/bin/genomeAPCAT +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -import sys -from textwrap import dedent - -from genomeAPCAT import __version__ as version - -from genomeAPCAT.subcommands import annote -from genomeAPCAT.subcommands import pangenome -from genomeAPCAT.subcommands import corepers -from genomeAPCAT.subcommands import align -from genomeAPCAT.subcommands import tree - - -def main(): - """ - Start program according to arguments given by user. - """ - action, args = parse_arguments(sys.argv[1:]) - action(args) - - -def parse_arguments(argv): - """ - Extract command-line arguments for different actions. - """ - import argparse - - # Create main parser - - parser = argparse.ArgumentParser( - epilog="For more details, visit the MacSyFinder website and see " - "the MacSyFinder documentation.", - formatter_class=argparse.RawDescriptionHelpFormatter, - description=dedent(''' - - - - ___ _____ ___ _____ _____ -( _`\ ( _ )( _`\ (_ _)( _ ) -| |_) ) _ _ ___ | (_) || ( (_) _ | | | (_) | -| ,__/'/'_` )/' _ `\| _ || | _ /'_`\ | | | _ | -| | ( (_| || ( ) || | | || (_( )( (_) )| | | | | | -(_) `\__,_)(_) (_)(_) (_)(____/'`\___/'(_) (_) (_) - - - Large scale comparative genomics tools - - ------------------------------------------- - ''') ) - - - parser.add_argument('-V', '--version', action='version', - version='genomeAPCAT - v. ' + str(version), - help="Print the version number and exit") - - # Create subparsers, for all submodules - subparsers = parser.add_subparsers(dest='subparser_called') - # dest: to be able to get the subparser called with args.subparser_called - actions = {} # to add the action to do according to the subparser called - checks = {} # to add the function to call to check the subparser arguments - - # QC and annotation part - parser_annote = subparsers.add_parser('annotate', - help="Quality control and annotation of genomes", - add_help=False) - annote.build_parser(parser_annote) - actions["annotate"] = annote.main_from_parse - checks["annotate"] = annote.check_args - - # Pan genome part - parser_pan = subparsers.add_parser('pangenome', help="Generate a pan-genome of your dataset", - add_help=False) - pangenome.build_parser(parser_pan) - actions["pangenome"] = pangenome.main_from_parse - - # Persistent genome part - parser_corepers = subparsers.add_parser('corepers', - help="Compute a Core or Persistent genome of your " - "dataset", - add_help=False) - corepers.build_parser(parser_corepers) - actions["corepers"] = corepers.main_from_parse - checks["corepers"] = corepers.check_args - - # Alignment part - parser_align = subparsers.add_parser('align', - help="Align Core/Persistent familiest", - add_help=False) - align.build_parser(parser_align) - actions["align"] = align.main_from_parse - - # tree part - parser_tree = subparsers.add_parser('tree', - help=("Infer phylogenetic tree based on " - "core/persistent genome"), - add_help=False) - tree.build_parser(parser_tree) - actions["tree"] = tree.main_from_parse - checks["tree"] = tree.check_args - - # Parse arguments and execute corresponding action - arguments = parser.parse_args(argv) - arguments.argv = argv - action_called = arguments.subparser_called - # If checks are needed, do it (if some arguments are not compatible etc.) - if action_called in checks: - checks[action_called](parser, arguments) - - # If subparser called does not exist, error - if action_called not in actions: - parser.error("too few arguments. Use '-h' to get help.") - return actions[action_called], arguments - - -if __name__ == '__main__': - main() diff --git a/make b/make index c257ae17d4105e3ff0d5a7baab7d550248ee0e11..f9bd8fd40452bbc9c57ebb73bc85fdc9eb1341df 100755 --- a/make +++ b/make @@ -46,11 +46,11 @@ def uninstall(): Uninstall PanACoTA python package """ logger.info("Uninstalling PanACoTA...") - cmd = "pip3 uninstall -y genomeAPCAT" + cmd = "pip3 uninstall -y PanACoTA" error = ("A problem occurred while trying to uninstall PanACoTA. If you have " "permission errors, try to add 'sudo' before your command line.") run_cmd(cmd, error) - link_dest = os.path.join(os.sep + "usr", "local", "bin", "genomeAPCAT") + link_dest = os.path.join(os.sep + "usr", "local", "bin", "PanACoTA") if os.path.exists(link_dest): os.remove(link_dest) @@ -99,7 +99,7 @@ def install_all(install_dir, target, dev=False, user=False): "you do not have root access, install with the '--user' option") run_cmd(cmd, error, eof=True) if user: - gapcat_bin = os.path.join(os.getcwd(), "bin", "genomeAPCAT") + gapcat_bin = os.path.join(os.getcwd(), "bin", "PanACoTA") os.symlink(gapcat_bin, os.path.join(install_dir, os.path.basename(gapcat_bin))) if to_install_user: msg = ("Some dependencies needed for some subcommands of PanACoTA are not installed. " diff --git a/setup.py b/setup.py index 7e6dedde6b4c5dc82027331485eb33465c2c822c..b71649549536c85b0939f0ce32917b559e04254c 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ Setup script """ -import genomeAPCAT +import PanACoTA try: from setuptools import setup from setuptools.command.test import test as TestCommand @@ -38,16 +38,16 @@ def parse_requirements(requirements): and not l.startswith('#')] -packages = ['genomeAPCAT', 'genomeAPCAT.annote_module', - 'genomeAPCAT.pangenome_module', 'genomeAPCAT.corepers_module', - 'genomeAPCAT.align_module', 'genomeAPCAT.tree_module', 'genomeAPCAT.subcommands'] +packages = ['PanACoTA', 'PanACoTA.annote_module', + 'PanACoTA.pangenome_module', 'PanACoTA.corepers_module', + 'PanACoTA.align_module', 'PanACoTA.tree_module', 'PanACoTA.subcommands'] requires = parse_requirements("requirements.txt") -scripts = ['bin/genomeAPCAT'] +scripts = ['bin/PanACoTA'] classifiers = [ "Environment :: Console", "Intended Audience :: Science/Research", - "License :: ???", + "License :: AGPL v3", "Programming Language :: Python :: 3", "Operating System :: OS Independent", "Topic :: Scientific/Engineering :: Bio-Informatics", @@ -57,9 +57,9 @@ with open('README.md') as f: long_description = f.read() setup( - name='genomeAPCAT', + name='PanACoTA', packages=packages, - version=genomeAPCAT.__version__, + version=PanACoTA.__version__, description="Large scale comparative genomics tools: annotate genomes, do pangenome, " "core/persistent genome, align core/persistent families, infer phylogenetic tree.", long_description=long_description, diff --git a/test/test_unit/test_pangenome/test_mmseq_func.py b/test/test_unit/test_pangenome/test_mmseq_func.py index 5bca1d950be4b9b1dad0a59ebae3cf237c363278..56fe9a7403cb46a569148ba9d0a3680edb3b5775 100755 --- a/test/test_unit/test_pangenome/test_mmseq_func.py +++ b/test/test_unit/test_pangenome/test_mmseq_func.py @@ -98,7 +98,7 @@ FAMILIES4G = [["GEN2.1017.00001.i0002_00004", "GEN4.1111.00001.i0001_00002", def test_create_mmseqdb(caplog): """ Test that mmseq DB is created. We do not check its content as it could change - according to mmseq versions, and we are here testing genomeAPCAT, not mmseqs + according to mmseq versions, and we are here testing PanACoTA, not mmseqs """ caplog.set_level(logging.DEBUG) filename = "test_create_mmseqsdb.msdb" @@ -285,317 +285,331 @@ def test_tsv2pangenome_default(): os.remove(logmmseq) -# def test_mmseq2pan_givenout(): -# """ -# From mmseq clust output, convert to pangenome (with steps inside, already tested by the other -# functions called).+ write pangenome to ouput file -# """ -# outfile1 = "test_mmseq2pan.lst" -# mmseqclust = os.path.join(PATH_TEST_FILES, "mmseq_clust-out") -# mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") -# start = time.strftime('%Y-%m-%d_%H-%M-%S') -# logmmseq = "test_mmseq2pan-out.log" -# fams, outf = mmseqs.mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, start, outfile1) -# assert outfile1 == outf -# for num, fam in fams.items(): -# assert num in list(range(1, 17)) -# found = False -# for expfam in list(EXP_CLUSTERS.values()): -# if fam == expfam: -# found = True -# break -# assert found -# exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") -# with open(exp_pan, "r") as ep, open(outf, "r") as pan: -# lines_exp = [] -# lines_out = [] -# for line_exp, line in zip(ep, pan): -# lines_exp.append(tuple(line_exp.split()[1:])) -# lines_out.append(tuple(line.split()[1:])) -# assert set(lines_exp) == set(lines_out) -# os.remove(outf) -# os.remove(logmmseq) +def test_mmseq2pan_givenout(): + """ + From mmseq clust output, convert to pangenome (with steps inside, already tested by the other + functions called).+ write pangenome to ouput file + """ + outfile1 = "test_mmseq2pan.lst" + mmseqclust = os.path.join(PATH_TEST_FILES, "mmseq_clust-out") + mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") + start = time.strftime('%Y-%m-%d_%H-%M-%S') + logmmseq = "test_mmseq2pan-out.log" + fams, outf = mmseqs.mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, start, outfile1) + # assert output filename was not changed + assert outfile1 == outf + for num, fam in fams.items(): + # Check that the number of families return by the function is as expected + assert num in list(range(1, 17)) + found = False + # Check that all expected families are found in fams + for expfam in list(EXP_CLUSTERS.values()): + if fam == expfam: + found = True + break + assert found + exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") + #Check that families written in output file are as expected + with open(exp_pan, "r") as ep, open(outf, "r") as pan: + lines_exp = [] + lines_out = [] + for line_exp, line in zip(ep, pan): + lines_exp.append(tuple(line_exp.split()[1:])) + lines_out.append(tuple(line.split()[1:])) + assert set(lines_exp) == set(lines_out) + os.remove(outf) + os.remove(logmmseq) -# def test_run_clust(): -# """ -# Checks that, when we run mmseq clust, it creates all files needed for after to do -# the pangenome. We do not check the content of the mmseq output files, as it could -# depend on its version, and we are here testing genomeAPCAT. -# """ -# mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") -# mmseqclust = "test_mmseq_cluster-out" -# tmpdir = "test_mmseq_tmp" -# os.makedirs(tmpdir) -# logmmseq = "test_mmseq_cluster.log" -# min_id = 0.8 -# threads = 1 -# clust_mode = 1 -# args = (mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode) -# assert not os.path.isfile(mmseqclust) -# mmseqs.run_mmseqs_clust(args) -# assert os.path.isfile(mmseqclust) -# assert os.path.isfile(mmseqclust + ".index") -# assert os.path.isfile(logmmseq) -# assert os.path.isdir(tmpdir) -# shutil.rmtree(tmpdir) -# os.remove(mmseqclust) -# os.remove(mmseqclust + ".index") -# os.remove(logmmseq) - - -# def test_get_logmmseq(): -# """ -# Check that the given log filename is as expected according to given information -# """ -# outdir = "toto" -# prt_bank = "bank_prt" -# infoname = "GENO115" -# log = mmseqs.get_logmmseq(outdir, prt_bank, infoname) -# assert log == "toto/mmseq_bank_prt_GENO115.log" +def test_run_clust(): + """ + Checks that, when we run mmseq clust, it creates all files needed for after to do + the pangenome. We do not check the content of the mmseq output files, as it could + depend on its version, and we are here testing PanACoTA. + """ + mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") + mmseqclust = "test_mmseq_cluster-out" + tmpdir = "test_mmseq_tmp" + os.makedirs(tmpdir) + logmmseq = "test_mmseq_cluster.log" + min_id = 0.8 + threads = 1 + clust_mode = 1 + args = (mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode) + # Check that output of mmseq does not already exist + assert not os.path.isfile(mmseqclust) + # Run run_mmseqs_clust on previous arguments + mmseqs.run_mmseqs_clust(args) + # Check that all expected files and temporary directory are created + # and check that no more outfile is create, in order to remove all of them !! !! + generated_outfiles = glob.glob(mmseqclust + "*") + assert len(generated_outfiles) == 3 + assert set(generated_outfiles) == set([mmseqclust, mmseqclust + ".index", + mmseqclust + ".dbtype"]) + assert os.path.isfile(logmmseq) + assert os.path.isdir(tmpdir) + # Remove files created by this test + shutil.rmtree(tmpdir) + for file in generated_outfiles: + os.remove(file) + os.remove(logmmseq) -# def test_get_info(): -# """ -# Check that string given by get_info is as expected according to info given in input -# """ -# threads = 1 -# min_id = 0.8 -# clust_mode = 1 -# start = "STARTTIME" -# info = mmseqs.get_info(threads, min_id, clust_mode, start) -# assert info == "0.8-mode1_STARTTIME" +def test_get_logmmseq(): + """ + Check that the given log filename is as expected according to given information + """ + outdir = "toto" + prt_bank = "bank_prt" + infoname = "GENO115" + log = mmseqs.get_logmmseq(outdir, prt_bank, infoname) + assert log == "toto/mmseq_bank_prt_GENO115.log" -# def test_get_info_parallel(): -# """ -# Check that string given by get_info is as expected according to info given in input -# """ -# threads = 12 -# min_id = 0.8 -# clust_mode = 1 -# start = "STARTTIME" -# info = mmseqs.get_info(threads, min_id, clust_mode, start) -# assert info == "0.8-mode1-th12_STARTTIME" +def test_get_info(): + """ + Check that string given by get_info is as expected according to info given in input + """ + threads = 1 + min_id = 0.8 + clust_mode = 1 + start = "STARTTIME" + info = mmseqs.get_info(threads, min_id, clust_mode, start) + assert info == "0.8-mode1_STARTTIME" -# def test_do_pangenome(caplog): -# """ -# Check that expected output files are created, -# and compare output pangenome to the expected one. -# """ -# caplog.set_level(logging.DEBUG) -# outdir = "test_do_pangenome_outdir" -# prt_bank = "exp_EXEM.All.prt" -# mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") -# min_id = 0.8 -# clust_mode = 1 -# threads = 1 -# start = "STARTTIME" -# quiet = False -# assert not os.path.isdir(outdir) -# fams, outfile = mmseqs.do_pangenome(outdir, prt_bank, mmseqdb, min_id, -# clust_mode, threads, start, quiet=quiet) -# # Check creation of output directory -# assert os.path.isdir(outdir) -# # Check creation of tmp directory -# tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_STARTTIME") -# assert os.path.isdir(tmp_dir) -# # Check presence of pangenome file -# exp_out = os.path.join(outdir, "PanGenome-exp_EXEM.All.prt-clust-0.8-mode1_STARTTIME.tsv.lst") -# assert exp_out == outfile -# assert os.path.isfile(outfile) -# # Check families returned -# for num, fam in fams.items(): -# assert num in list(range(1, 17)) -# found = False -# for expfam in list(EXP_CLUSTERS.values()): -# if fam == expfam: -# found = True -# break -# assert found -# # Check content of output pangenome file -# exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") -# with open(exp_pan, "r") as ep, open(outfile, "r") as pan: -# lines_exp = [] -# lines_out = [] -# for line_exp, line in zip(ep, pan): -# lines_exp.append(tuple(line_exp.split()[1:])) -# lines_out.append(tuple(line.split()[1:])) -# assert set(lines_exp) == set(lines_out) -# assert "Clustering proteins..." in caplog.text -# shutil.rmtree(outdir) +def test_get_info_parallel(): + """ + Check that string given by get_info is as expected according to info given in input + """ + threads = 12 + min_id = 0.8 + clust_mode = 1 + start = "STARTTIME" + info = mmseqs.get_info(threads, min_id, clust_mode, start) + assert info == "0.8-mode1-th12_STARTTIME" -# def test_do_pangenome_given_panfile(caplog): -# """ -# Check that expected output files are created, -# and compare output pangenome to the expected one. -# """ -# caplog.set_level(logging.DEBUG) -# outdir = "test_do_pangenome_outdir" -# prt_bank = "exp_EXEM.All.prt" -# mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") -# min_id = 0.8 -# clust_mode = 1 -# threads = 1 -# start = "STARTTIME" -# quiet = False -# panfile = "test_res_pangenome" -# assert not os.path.isdir(outdir) -# fams, outfile = mmseqs.do_pangenome(outdir, prt_bank, mmseqdb, min_id, -# clust_mode, threads, start, quiet=quiet, panfile=panfile) -# print(outfile) -# # Check creation of output directory -# assert os.path.isdir(outdir) -# # Check creation of tmp directory -# tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_STARTTIME") -# assert os.path.isdir(tmp_dir) -# # Check presence of pangenome file -# assert panfile == outfile -# assert os.path.isfile(outfile) -# # Check families returned -# for num, fam in fams.items(): -# assert num in list(range(1, 17)) -# found = False -# for expfam in list(EXP_CLUSTERS.values()): -# if fam == expfam: -# found = True -# break -# assert found -# # Check content of output pangenome file -# exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") -# with open(exp_pan, "r") as ep, open(outfile, "r") as pan: -# lines_exp = [] -# lines_out = [] -# for line_exp, line in zip(ep, pan): -# lines_exp.append(tuple(line_exp.split()[1:])) -# lines_out.append(tuple(line.split()[1:])) -# assert set(lines_exp) == set(lines_out) -# assert "Clustering proteins..." in caplog.text -# shutil.rmtree(outdir) -# os.remove(panfile) +def test_do_pangenome_defaultname(caplog): + """ + Check that expected output files are created, + and compare output pangenome to the expected one. + """ + caplog.set_level(logging.DEBUG) + outdir = "test_do_pangenome_outdir" + prt_bank = "exp_EXEM.All.prt" + mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") + min_id = 0.8 + clust_mode = 1 + threads = 1 + start = "STARTTIME" + quiet = False + assert not os.path.isdir(outdir) + fams, outfile = mmseqs.do_pangenome(outdir, prt_bank, mmseqdb, min_id, + clust_mode, threads, start, quiet=quiet) + # Check creation of output directory + assert os.path.isdir(outdir) + # Check creation of tmp directory + tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_STARTTIME") + assert os.path.isdir(tmp_dir) + # Check presence of pangenome file + exp_out = os.path.join(outdir, "PanGenome-exp_EXEM.All.prt-clust-0.8-mode1_STARTTIME.tsv.lst") + assert exp_out == outfile + assert os.path.isfile(outfile) + # Check families returned + for num, fam in fams.items(): + assert num in list(range(1, 17)) + found = False + for expfam in list(EXP_CLUSTERS.values()): + if fam == expfam: + found = True + break + assert found + # Check content of output pangenome file + exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") + with open(exp_pan, "r") as ep, open(outfile, "r") as pan: + lines_exp = [] + lines_out = [] + for line_exp, line in zip(ep, pan): + lines_exp.append(tuple(line_exp.split()[1:])) + lines_out.append(tuple(line.split()[1:])) + assert set(lines_exp) == set(lines_out) + assert "Clustering proteins..." in caplog.text + shutil.rmtree(outdir) -# def test_do_pangenome_quiet(caplog): -# """ -# Check that expected output files are created, -# and compare output pangenome to the expected one. -# Check that no error appears when choosing quiet option. -# """ -# caplog.set_level(logging.DEBUG) -# outdir = "test_do_pangenome_outdir" -# prt_bank = "exp_EXEM.All.prt" -# mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") -# min_id = 0.8 -# clust_mode = 1 -# threads = 1 -# start = "STARTTIME" -# quiet = True -# assert not os.path.isdir(outdir) -# fams, outfile = mmseqs.do_pangenome(outdir, prt_bank, mmseqdb, min_id, -# clust_mode, threads, start, quiet=quiet) -# # Check creation of output directory -# assert os.path.isdir(outdir) -# # Check creation of tmp directory -# tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_STARTTIME") -# assert os.path.isdir(tmp_dir) -# # Check presence of pangenome file -# exp_out = os.path.join(outdir, "PanGenome-exp_EXEM.All.prt-clust-0.8-mode1_STARTTIME.tsv.lst") -# assert exp_out == outfile -# assert os.path.isfile(outfile) -# # Check families returned -# for num, fam in fams.items(): -# assert num in list(range(1, 17)) -# found = False -# for expfam in list(EXP_CLUSTERS.values()): -# if fam == expfam: -# found = True -# break -# assert found -# # Check content of output pangenome file -# exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") -# with open(exp_pan, "r") as ep, open(outfile, "r") as pan: -# lines_exp = [] -# lines_out = [] -# for line_exp, line in zip(ep, pan): -# lines_exp.append(tuple(line_exp.split()[1:])) -# lines_out.append(tuple(line.split()[1:])) -# assert set(lines_exp) == set(lines_out) -# assert "Clustering proteins..." in caplog.text -# shutil.rmtree(outdir) +def test_do_pangenome_given_panfile(caplog): + """ + Check that expected output files are created, + and compare output pangenome to the expected one. + No possibility to check if quiet was not applied... + """ + caplog.set_level(logging.DEBUG) + outdir = "test_do_pangenome_outdir_pan-name" + prt_bank = "exp_EXEM.All.prt" + mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") + min_id = 0.8 + clust_mode = 1 + threads = 1 + start = "STARTTIME" + quiet = False + panfile = "test_res_pangenome_given-name" + # Check outdir does not exit before + assert not os.path.isdir(outdir) + fams, outfile = mmseqs.do_pangenome(outdir, prt_bank, mmseqdb, min_id, + clust_mode, threads, start, quiet=quiet, panfile=panfile) + # Check creation of output directory + assert os.path.isdir(outdir) + # Check creation of tmp directory + tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_STARTTIME") + assert os.path.isdir(tmp_dir) + # Check presence of pangenome file + assert panfile == outfile + assert os.path.isfile(outfile) + # Check families returned + for num, fam in fams.items(): + assert num in list(range(1, 17)) + found = False + for expfam in list(EXP_CLUSTERS.values()): + if fam == expfam: + found = True + break + assert found + # Check content of output pangenome file + exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") + with open(exp_pan, "r") as ep, open(outfile, "r") as pan: + lines_exp = [] + lines_out = [] + for line_exp, line in zip(ep, pan): + lines_exp.append(tuple(line_exp.split()[1:])) + lines_out.append(tuple(line.split()[1:])) + assert set(lines_exp) == set(lines_out) + assert "Clustering proteins..." in caplog.text + shutil.rmtree(outdir) + os.remove(panfile) -# def test_do_pangenome_exist(caplog): -# """ -# Check that if the mmseq output file of clustering already exists, it does not -# run mmseq again, but just converts it to pangenome. -# """ -# caplog.set_level(logging.DEBUG) -# outdir = "test_do_pangenome_outdir_exist" -# prt_bank = "exp_EXEM.All.prt" -# mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") -# min_id = 0.8 -# clust_mode = 1 -# threads = 1 -# start = "STARTTIME" -# # Create clustering results in outdir -# os.makedirs(outdir) -# orig_clust = os.path.join(PATH_TEST_FILES, "mmseq_clust-out") -# out_clust = os.path.join(outdir, "exp_EXEM.All.prt-clust-0.8-mode1_STARTTIME") -# shutil.copyfile(orig_clust, out_clust) -# shutil.copyfile(orig_clust + ".index", out_clust + ".index") -# fams, outfile = mmseqs.do_pangenome(outdir, prt_bank, mmseqdb, min_id, -# clust_mode, threads, start) -# assert ("mmseqs clustering test_do_pangenome_outdir_exist/exp_EXEM.All.prt-clust-0.8-" -# "mode1_STARTTIME already exists. The program will now convert it to a " -# "pangenome file.") in caplog.text -# # Check creation of empty tmp directory -# tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_STARTTIME") -# assert os.path.isdir(tmp_dir) -# assert glob.glob(os.path.join(tmp_dir, "*")) == [] -# # Check presence of pangenome file -# exp_out = os.path.join(outdir, "PanGenome-exp_EXEM.All.prt-clust-0.8-mode1_STARTTIME.tsv.lst") -# assert exp_out == outfile -# assert os.path.isfile(outfile) -# # Check families returned -# for num, fam in fams.items(): -# assert num in list(range(1, 17)) -# found = False -# for expfam in list(EXP_CLUSTERS.values()): -# if fam == expfam: -# found = True -# break -# assert found -# # Check content of output pangenome file -# exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") -# with open(exp_pan, "r") as ep, open(outfile, "r") as pan: -# lines_exp = [] -# lines_out = [] -# for line_exp, line in zip(ep, pan): -# lines_exp.append(tuple(line_exp.split()[1:])) -# lines_out.append(tuple(line.split()[1:])) -# assert set(lines_exp) == set(lines_out) -# assert "Clustering proteins..." not in caplog.text -# shutil.rmtree(outdir) +def test_do_pangenome_quiet(caplog): + """ + Check that expected output files are created, + and compare output pangenome to the expected one. + Check that no error appears when choosing quiet option. + No possibility to check if quiet was applied... + """ + caplog.set_level(logging.DEBUG) + outdir = "test_do_pangenome_outdir_quiet" + prt_bank = "exp_EXEM.All.prt" + mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") + min_id = 0.8 + clust_mode = 1 + threads = 1 + start = "STARTTIME" + quiet = True + assert not os.path.isdir(outdir) + fams, outfile = mmseqs.do_pangenome(outdir, prt_bank, mmseqdb, min_id, + clust_mode, threads, start, quiet=quiet) + # Check creation of output directory + assert os.path.isdir(outdir) + # Check creation of tmp directory + tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_STARTTIME") + assert os.path.isdir(tmp_dir) + # Check presence of pangenome file + exp_out = os.path.join(outdir, "PanGenome-exp_EXEM.All.prt-clust-0.8-mode1_STARTTIME.tsv.lst") + assert exp_out == outfile + assert os.path.isfile(outfile) + # Check families returned + for num, fam in fams.items(): + assert num in list(range(1, 17)) + found = False + for expfam in list(EXP_CLUSTERS.values()): + if fam == expfam: + found = True + break + assert found + # Check content of output pangenome file + exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") + with open(exp_pan, "r") as ep, open(outfile, "r") as pan: + lines_exp = [] + lines_out = [] + for line_exp, line in zip(ep, pan): + lines_exp.append(tuple(line_exp.split()[1:])) + lines_out.append(tuple(line.split()[1:])) + assert set(lines_exp) == set(lines_out) + assert "Clustering proteins..." in caplog.text + shutil.rmtree(outdir) -# def test_run_all_pangenome(caplog): -# """ -# Check that, given a prt bank, it creates mmseq db, mmseq clustering, and -# outputs the expected pangenome file. -# """ -# caplog.set_level(logging.DEBUG) -# min_id = 0.8 -# clust_mode = 1 -# outdir = "test_run_allpangenome" -# os.makedirs(outdir) -# prt_path = os.path.join(PATH_EXP_FILES, "exp_EXEM.All.prt") -# threads = 1 -# panfile = None -# quiet = False -# fams, outfile = mmseqs.run_all_pangenome(min_id, clust_mode, outdir, prt_path, -# threads, panfile=panfile, quiet=quiet) +def test_do_pangenome_exist(caplog): + """ + Check that if the mmseq output file of clustering already exists, it does not + run mmseq again, but just converts it to pangenome. + """ + caplog.set_level(logging.DEBUG) + outdir = "test_do_pangenome_outdir_exist" + prt_bank = "exp_EXEM.All.prt" + mmseqdb = os.path.join(PATH_TEST_FILES, "mmseq_db") + min_id = 0.8 + clust_mode = 1 + threads = 1 + start = "STARTTIME" + # Create clustering results in outdir + os.makedirs(outdir) + orig_clust = os.path.join(PATH_TEST_FILES, "mmseq_clust-out") + out_clust = os.path.join(outdir, "exp_EXEM.All.prt-clust-0.8-mode1_STARTTIME") + shutil.copyfile(orig_clust, out_clust) + shutil.copyfile(orig_clust + ".index", out_clust + ".index") + fams, outfile = mmseqs.do_pangenome(outdir, prt_bank, mmseqdb, min_id, + clust_mode, threads, start) + assert ("mmseqs clustering test_do_pangenome_outdir_exist/exp_EXEM.All.prt-clust-0.8-" + "mode1_STARTTIME already exists. The program will now convert it to a " + "pangenome file.") in caplog.text + # Check creation of empty tmp directory + tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_STARTTIME") + assert os.path.isdir(tmp_dir) + assert glob.glob(os.path.join(tmp_dir, "*")) == [] + # Check presence of pangenome file + exp_out = os.path.join(outdir, "PanGenome-exp_EXEM.All.prt-clust-0.8-mode1_STARTTIME.tsv.lst") + assert exp_out == outfile + assert os.path.isfile(outfile) + # Check families returned + for num, fam in fams.items(): + assert num in list(range(1, 17)) + found = False + for expfam in list(EXP_CLUSTERS.values()): + if fam == expfam: + found = True + break + assert found + # Check content of output pangenome file + exp_pan = os.path.join(PATH_EXP_FILES, "exp_pangenome-4genomes.lst") + with open(exp_pan, "r") as ep, open(outfile, "r") as pan: + lines_exp = [] + lines_out = [] + for line_exp, line in zip(ep, pan): + lines_exp.append(tuple(line_exp.split()[1:])) + lines_out.append(tuple(line.split()[1:])) + assert set(lines_exp) == set(lines_out) + assert "Clustering proteins..." not in caplog.text + shutil.rmtree(outdir) + + +def test_run_all_pangenome(caplog): + """ + Check that, given a prt bank, it creates mmseq db, mmseq clustering, and + outputs the expected pangenome file. + """ + caplog.set_level(logging.DEBUG) + min_id = 0.8 + clust_mode = 1 + outdir = "test_run_allpangenome" + os.makedirs(outdir) + prt_path = os.path.join(PATH_EXP_FILES, "exp_EXEM.All.prt") + threads = 1 + panfile = None + quiet = False + fams, outfile = mmseqs.run_all_pangenome(min_id, clust_mode, outdir, prt_path, + threads, panfile=panfile, quiet=quiet) + assert "toto" not in caplog.text # # check that tmp dir was created and not empty # tmp_dir = os.path.join(outdir, "tmp_exp_EXEM.All.prt_0.8-mode1_*") # assert glob.glob(os.path.join(tmp_dir, "*")) != []