Skip to content
Snippets Groups Projects
Commit 3cf4faa5 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

adapt func test to new pangenome filename

parent 035865d8
No related branches found
No related tags found
No related merge requests found
......@@ -40,6 +40,7 @@ Functions to build a bank of all proteins to include in the pangenome
April 2017
"""
from PanACoTA import utils
from PanACoTA import utils_pangenome as utilsp
import logging
import os
......@@ -82,14 +83,7 @@ def build_prt_bank(lstinfo, dbpath, name, spedir, quiet):
"It will be used by mmseqs."))
return outfile
logger.info(f"Building bank with all proteins to {outfile}")
genomes = []
with open(lstinfo) as lstf:
for line in lstf:
# skip header
if "_name" in line:
continue
genome = line.strip().split()[0].strip()
genomes.append(genome)
genomes = utilsp.read_lstinfo(lstinfo, logger)
all_names = [os.path.join(dbpath, gen + ".prt") for gen in genomes]
if quiet:
utils.cat(all_names, outfile)
......
......@@ -188,3 +188,35 @@ def read_gene(gene, num, fams_by_strain, all_strains):
fams_by_strain[num][strain] = [gene]
if strain not in all_strains:
all_strains.add(strain)
def read_lstinfo(lstinfo, logger):
    """
    Read lstinfo file and return the list of genomes it contains.

    Parameters
    ----------
    lstinfo : str
        File containing the list of all genomes to include in the pan-genome,
        1 genome per line. Here, only the first column will be used.
    logger : logging.Logger
        Logger whose ``error`` method is used to report a missing file or an
        lstinfo file in which no genome was found.

    Returns
    -------
    list
        Genome names (first whitespace-separated field of each kept line),
        in the order they appear in the file.

    Notes
    -----
    Calls ``sys.exit(1)`` after logging an error if `lstinfo` is not an
    existing file, or if no genome could be read from it.
    """
    if not os.path.isfile(lstinfo):
        logger.error(f"{lstinfo} file not found.")
        sys.exit(1)
    genomes = []
    with open(lstinfo) as lstf:
        for line in lstf:
            # skip header (any line containing "_name" is treated as a header)
            if "_name" in line:
                continue
            fields = line.split()
            # Blank or whitespace-only lines have no first column: skip them
            # instead of failing with an IndexError on fields[0].
            if not fields:
                continue
            # str.split() already removed surrounding whitespace.
            genomes.append(fields[0])
    if not genomes:
        logger.error(f"No genome found in {lstinfo} file.")
        sys.exit(1)
    return genomes
......@@ -68,28 +68,26 @@ def test_main_from_parse():
assert os.path.isfile(prtbank)
# Check presence of tmp folder
tmp_base = os.path.join(GENEPATH, "tmp_testFromParsePAN4.All.prt_0.8-mode1_*")
assert len(glob.glob(tmp_base)) == 1
tmp_base = os.path.join(GENEPATH, "tmp_testFromParsePAN4.All.prt_0.8-mode1")
assert os.path.isdir(tmp_base)
# check presence of mmseq cluster files
cluster = os.path.join(GENEPATH, name + ".All.prt-clust-0.8-mode1_*")
cluster = os.path.join(GENEPATH, name + ".All.prt-clust-0.8-mode1*")
clust_files = glob.glob(cluster)
assert len(clust_files) == 4
# Check presence of pangenome files (pangenome, matrices, summary)
pan_files = glob.glob(os.path.join(GENEPATH, "PanGenome-testFromParsePAN4*"))
to_check = [".tsv.lst", ".tsv.lst.quali.txt", ".tsv.lst.quanti.txt", ".tsv.lst.summary.txt"]
to_check = [".lst", ".lst.quali.txt", ".lst.quanti.txt", ".lst.summary.txt"]
found = []
pangenome_file = ""
for f in pan_files:
for c in to_check:
if f.endswith(c):
found.append(c)
if f.endswith(".tsv.lst"):
pangenome_file = f
panfile = os.path.join(GENEPATH, "PanGenome-testFromParsePAN4.All.prt-clust-0.8-mode1.lst")
assert set(found) == set(to_check)
# Check content of pangenome
exp_pan = os.path.join(EXP_FILES, "exp_pangenome-4genomes.lst")
# Check that all families are as expected. Compare lines without the family number
with open(exp_pan, "r") as ep, open(pangenome_file, "r") as panf:
with open(exp_pan, "r") as ep, open(panfile, "r") as panf:
lines_exp = []
lines_out = []
for line_exp in ep:
......@@ -116,30 +114,27 @@ def test_main(caplog):
used_dbpath = os.path.join(GENEPATH, "database")
# copy db_path folder to output folder, as it will modify it
shutil.copytree(DBPATH, used_dbpath)
out_panfile = os.path.join(outdir, "PanGenome-testPAN4.All.prt-clust-0.8-mode1_")
out_panfile = os.path.join(outdir, "PanGenome-testPAN4.All.prt-clust-0.8-mode1.lst")
assert pan.main(cmd, lstinfo, name, used_dbpath, min_id, outdir, clust_mode,
spe_dir, threads, verbose=2).startswith(out_panfile)
spe_dir, threads, verbose=2) == out_panfile
# Check creation of prt bank
prtbank = os.path.join(used_dbpath, "testPAN4.All.prt")
assert os.path.isfile(prtbank)
# Check presence of tmp folder
tmp_base = os.path.join(outdir, "tmp_testPAN4.All.prt_0.8-mode1_*")
assert len(glob.glob(tmp_base)) == 1
tmp_base = os.path.join(outdir, "tmp_testPAN4.All.prt_0.8-mode1")
assert os.path.isdir(tmp_base)
# Check presence of pangenome files (pangenome, matrices, summary)
pan_files = glob.glob(os.path.join(GENEPATH, "PanGenome-testPAN4*"))
to_check = [".tsv.lst", ".tsv.lst.quali.txt", ".tsv.lst.quanti.txt", ".tsv.lst.summary.txt"]
to_check = [".lst", ".lst.quali.txt", ".lst.quanti.txt", ".lst.summary.txt"]
found = []
pangenome_file = ""
for f in pan_files:
for c in to_check:
if f.endswith(c):
found.append(c)
if f.endswith(".tsv.lst"):
pangenome_file = f
assert set(found) == set(to_check)
# Check content of pangenome
exp_pan = os.path.join(EXP_FILES, "exp_pangenome-4genomes.lst")
with open(exp_pan, "r") as ep, open(pangenome_file, "r") as panf:
with open(exp_pan, "r") as ep, open(out_panfile, "r") as panf:
lines_exp = []
lines_out = []
for line_exp in ep:
......@@ -182,9 +177,9 @@ def test_main_prt_exist(caplog):
dest_prt_bank = os.path.join(used_dbpath, "test2PAN4.All.prt")
shutil.copyfile(src_prt_bank, dest_prt_bank)
out_panfile = os.path.join(outdir, "PanGenome-test2PAN4.All.prt-clust-0.8-mode1_")
out_panfile = os.path.join(outdir, "PanGenome-test2PAN4.All.prt-clust-0.8-mode1.lst")
assert pan.main(cmd, lstinfo, name, used_dbpath, min_id, outdir, clust_mode, spe_dir,
threads, verbose=15).startswith(out_panfile)
threads, verbose=15) == out_panfile
# Check presence of mmseq DB files
msdb = os.path.join(GENEPATH, "test2PAN4.All.prt-msDB")
......@@ -194,27 +189,24 @@ def test_main_prt_exist(caplog):
assert os.path.isfile(msdb + "_h")
assert os.path.isfile(msdb + "_h.index")
# Check presence of mmseq cluster files
cluster = os.path.join(outdir, "test2PAN4.All.prt-clust-0.8-mode1_*")
cluster = os.path.join(outdir, "test2PAN4.All.prt-clust-0.8-mode1*")
clust_files = glob.glob(cluster)
assert len(clust_files) == 4
# Check presence of pangenome files (pangenome, matrices, summary)
pan_files = glob.glob(os.path.join(GENEPATH, "PanGenome-test2PAN4*"))
to_check = [".tsv.lst", ".tsv.lst.quali.txt", ".tsv.lst.quanti.txt", ".tsv.lst.summary.txt"]
to_check = [".lst", ".lst.quali.txt", ".lst.quanti.txt", ".lst.summary.txt"]
found = []
pangenome_file = ""
for f in pan_files:
for c in to_check:
if f.endswith(c):
found.append(c)
if f.endswith(".tsv.lst"):
pangenome_file = f
assert set(found) == set(to_check)
# Check presence of tmp folder
tmp_base = os.path.join(outdir, "tmp_test2PAN4.All.prt_0.8-mode1_*")
assert len(glob.glob(tmp_base)) == 1
tmp_base = os.path.join(outdir, "tmp_test2PAN4.All.prt_0.8-mode1")
assert os.path.isdir(tmp_base)
# Check content of pangenome
exp_pan = os.path.join(EXP_FILES, "exp_pangenome-4genomes.lst")
with open(exp_pan, "r") as ep, open(pangenome_file, "r") as panf:
with open(exp_pan, "r") as ep, open(out_panfile, "r") as panf:
lines_exp = []
lines_out = []
for line_exp in ep:
......@@ -253,9 +245,9 @@ def test_main_spedir(caplog):
# copy db_path folder to output folder, as it will modify it
shutil.copytree(DBPATH, used_dbpath)
out_panfile = os.path.join(outdir, "PanGenome-test3PAN4.All.prt-clust-0.8-mode1_")
out_panfile = os.path.join(outdir, "PanGenome-test3PAN4.All.prt-clust-0.8-mode1.lst")
assert pan.main(cmd, lstinfo, name, used_dbpath, min_id, outdir, clust_mode, spe_dir,
threads, verbose=15).startswith(out_panfile)
threads, verbose=15) == out_panfile
# Check creation of prt bank
prtbank = os.path.join(spe_dir, "test3PAN4.All.prt")
assert os.path.isfile(prtbank)
......@@ -267,27 +259,24 @@ def test_main_spedir(caplog):
assert os.path.isfile(msdb + "_h")
assert os.path.isfile(msdb + "_h.index")
# Check presence of mmseq cluster files
cluster = os.path.join(outdir, "test3PAN4.All.prt-clust-0.8-mode1_*")
cluster = os.path.join(outdir, "test3PAN4.All.prt-clust-0.8-mode1*")
clust_files = glob.glob(cluster)
assert len(clust_files) == 4
# Check presence of tmp folder
tmp_base = os.path.join(outdir, "tmp_test3PAN4.All.prt_0.8-mode1_*")
assert len(glob.glob(tmp_base)) == 1
tmp_base = os.path.join(outdir, "tmp_test3PAN4.All.prt_0.8-mode1")
assert os.path.isdir(tmp_base)
# Check presence of pangenome files (pangenome, matrices, summary)
pan_files = glob.glob(os.path.join(GENEPATH, "PanGenome-test3PAN4*"))
to_check = [".tsv.lst", ".tsv.lst.quali.txt", ".tsv.lst.quanti.txt", ".tsv.lst.summary.txt"]
to_check = [".lst", ".lst.quali.txt", ".lst.quanti.txt", ".lst.summary.txt"]
found = []
pangenome_file = ""
for f in pan_files:
for c in to_check:
if f.endswith(c):
found.append(c)
if f.endswith(".tsv.lst"):
pangenome_file = f
assert set(found) == set(to_check)
# Check content of pangenome
exp_pan = os.path.join(EXP_FILES, "exp_pangenome-4genomes.lst")
with open(exp_pan, "r") as ep, open(pangenome_file, "r") as panf:
with open(exp_pan, "r") as ep, open(out_panfile, "r") as panf:
lines_exp = []
lines_out = []
for line_exp in ep:
......@@ -338,12 +327,12 @@ def test_main_outfile(caplog):
assert os.path.isfile(msdb + "_h")
assert os.path.isfile(msdb + "_h.index")
# Check presence of mmseq cluster files
cluster = os.path.join(outdir, "test4PAN4.All.prt-clust-0.8-mode1_*")
cluster = os.path.join(outdir, "test4PAN4.All.prt-clust-0.8-mode1*")
clust_files = glob.glob(cluster)
assert len(clust_files) == 4
# Check presence of tmp folder
tmp_base = os.path.join(outdir, "tmp_test4PAN4.All.prt_0.8-mode1_*")
assert len(glob.glob(tmp_base)) == 1
tmp_base = os.path.join(outdir, "tmp_test4PAN4.All.prt_0.8-mode1")
assert os.path.isdir(tmp_base)
# Check presence of pangenome files (pangenome, matrices, summary)
outf = os.path.join(outdir, outfile)
assert os.path.isfile(outf)
......@@ -402,23 +391,21 @@ def test_pangenome_all():
assert os.path.isfile(msdb + "_h")
assert os.path.isfile(msdb + "_h.index")
# Check presence of mmseq cluster files
cluster = os.path.join(outdir, "testAllPAN4.All.prt-clust-0.8-mode1_*")
cluster = os.path.join(outdir, "testAllPAN4.All.prt-clust-0.8-mode1*")
clust_files = glob.glob(cluster)
assert len(clust_files) == 4
# Check presence of tmp folder
tmp_base = os.path.join(outdir, "tmp_testAllPAN4.All.prt_0.8-mode1_*")
assert len(glob.glob(tmp_base)) == 1
tmp_base = os.path.join(outdir, "tmp_testAllPAN4.All.prt_0.8-mode1")
assert os.path.isdir(tmp_base)
# Check presence of pangenome files (pangenome, matrices, summary)
pan_files = glob.glob(os.path.join(GENEPATH, "PanGenome-testAllPAN4*"))
to_check = [".tsv.lst", ".tsv.lst.quali.txt", ".tsv.lst.quanti.txt", ".tsv.lst.summary.txt"]
to_check = [".lst", ".lst.quali.txt", ".lst.quanti.txt", ".lst.summary.txt"]
found = []
pangenome_file = ""
pangenome_file = os.path.join(outdir, "PanGenome-testAllPAN4.All.prt-clust-0.8-mode1.lst")
for f in pan_files:
for c in to_check:
if f.endswith(c):
found.append(c)
if f.endswith(".tsv.lst"):
pangenome_file = f
assert set(found) == set(to_check)
# Check content of pangenome
exp_pan = os.path.join(EXP_FILES, "exp_pangenome-4genomes.lst")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment