diff --git a/PanACoTA/align_module/pan_to_pergenome.py b/PanACoTA/align_module/pan_to_pergenome.py index 86d20b50603783ef2e532204283d936c9c8edf8b..1975c9cacaf3119a2472b57a0cbe2b6cec1de2a3 100755 --- a/PanACoTA/align_module/pan_to_pergenome.py +++ b/PanACoTA/align_module/pan_to_pergenome.py @@ -108,7 +108,7 @@ def get_per_genome(persgen, list_gen, dname, outdir): # Sort proteins by strain logger.info("Reading PersGenome and constructing lists of missing genomes in each family.") - all_prots, fam_genomes, several = proteins_per_strain(persgen) + all_prots, fam_genomes, several = proteins_per_strain(persgen, all_genomes) # Write output files write_getentry_files(all_prots, several, listdir, aldir, dname, all_genomes) write_missing_genomes(fam_genomes, several, all_genomes, aldir, dname) @@ -139,7 +139,7 @@ def get_all_genomes(list_gen): return all_genomes -def proteins_per_strain(persgen): +def proteins_per_strain(persgen, all_genomes): """ From the persistentGenome file, get all persistent proteins, and classify them according to the strain from which they are. @@ -154,33 +154,36 @@ def proteins_per_strain(persgen): (all_prots, fam_genomes, several) : tuple * all_prots: dict, {strain: {member: fam_num}} - * fam_genomes: dict, {fam_num: [genomes having a member in fam]} - * several: dict, {fam_num: [genomes having several members in fam]} + * fam_genomes: dict, {fam_num: set(genomes having a member in fam)} + * several: dict, {fam_num: set(genomes having several members in fam)} """ logger.info("Getting all persistent proteins and classify by strain.") + all_genomes = frozenset(all_genomes) all_prots = {} # {strain: {member: fam_num}} - fam_genomes = {} # {fam_num: [genomes having a member in fam]} - several = {} # {fam_num: [genomes having several members in fam]} + fam_genomes = {} # {fam_num: set(genomes having a member in fam)} + several = {} # {fam_num: set(genomes having several members in fam)} with open(persgen, "r") as pgf: for line in pgf: fam_num = line.split()[0] members = line.strip().split()[1:] - fam_genomes[fam_num] = [] - several[fam_num] = [] + fam_genomes[fam_num] = set() + several[fam_num] = set() for mem in members: # if format is ESCO.1512.00001.i0002_12124, strain is 3 first fields # separated by '.' if "." in mem and len(mem.split(".")) >= 3: strain = ".".join(mem.split(".")[:3]) + if strain not in all_genomes: + strain = "_".join(mem.split("_")[:-1]) # if format is not like this, it must be something_00001: else: strain = "_".join(mem.split("_")[:-1]) # if strain not already in fam_genomes, add it if strain not in fam_genomes[fam_num]: - fam_genomes[fam_num].append(strain) + fam_genomes[fam_num].add(strain) # If strain already in fam_genomes, it has several members: add it to several elif strain not in several[fam_num]: - several[fam_num].append(strain) + several[fam_num].add(strain) if strain not in all_prots: all_prots[strain] = {} if mem in all_prots[strain]: @@ -221,9 +224,9 @@ def write_getentry_files(all_prots, several, listdir, aldir, dname, all_genomes) error.append(strain) if error: for gen in error: - logger.error(("There is not any protein for genome {} in any family! " - "The program will close, please fix this problem to be able to " - "run the alignments").format(gen)) + logger.error(f"There is not any protein for genome {gen} in any family! " + "The program will close, please fix this problem to be able to " + "run the alignments") sys.exit(1) diff --git a/test/test_unit/test_align/test_pan2pergenome.py b/test/test_unit/test_align/test_pan2pergenome.py index 58e60c5c309e1904b6b8564310ea0d50549c9d6e..12b59dfd0dce9749ccc532c0fcda361224ae2352 100755 --- a/test/test_unit/test_align/test_pan2pergenome.py +++ b/test/test_unit/test_align/test_pan2pergenome.py @@ -100,27 +100,29 @@ def test_get_per_genome(caplog): assert set(fams) == set(exp_fams) -def test_prot_per_strain(): +def test_prot_per_strain_gembase(): """ - Test parser of persistent genome file + Test parser of persistent genome file when genome names are in gembase format + -> genome files = ESCO.0421.00001.prt and prot names = ESCO.0421.00001.0001i_00001 """ pers = os.path.join("test", "data", "persgenome", "exp_files", "exp_pers-floor-mixed.txt") - all_prots, fams_genomes, several = p2p.proteins_per_strain(pers) - exp_several = {'1': ["GENO.1216.00002"], - '3': [], - '5': [], - '8': [], - '10': [], - '11': [], - '12': ["GENO.1216.00003"]} + all_genomes = ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"] + all_prots, fams_genomes, several = p2p.proteins_per_strain(pers, all_genomes) + exp_several = {'1': {"GENO.1216.00002"}, + '3': set(), + '5': set(), + '8': set(), + '10': set(), + '11': set(), + '12': {"GENO.1216.00003"}} assert several == exp_several - exp_fams = {'1': ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"], - '3': ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"], - '5': ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"], - '8': ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002"], - '10': ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002"], - '11': ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002"], - '12': ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"]} + exp_fams = {'1': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"}, + '3': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"}, + '5': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"}, + '8': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002"}, + '10': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002"}, + '11': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002"}, + '12': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"}} assert fams_genomes == exp_fams exp_prots = {"GEN4.1111.00001": {"GEN4.1111.00001.b0001_00001": '1', "GEN4.1111.00001.b0001_00009": '3', @@ -157,18 +159,118 @@ def test_prot_per_strain(): assert all_prots == exp_prots +def test_prot_per_strain_completeg(): + """ + Test parser of persistent genome file when genome names are in gembase complete genomes format + -> genome files = ESCO.0421.00001.C001.prt and prot names = ESCO.0421.00001.C001_00001 + """ + pers = os.path.join(TESTPATH, "pers_genome_complete-gembase.txt") + all_genomes = ["GEN4.1111.00001.C001", "GEN4.1111.00001.P001", "GENO.1216.00002.C002", "GENO.1216.00003.C001"] + all_prots, fams_genomes, several = p2p.proteins_per_strain(pers, all_genomes) + exp_several = {'1': {"GENO.1216.00002.C002"}, + '3': set(), + '5': set(), + '8': set(), + '10': set(), + '11': set(), + '12': {"GENO.1216.00003.C001"}} + assert several == exp_several + exp_fams = {'1': {"GEN4.1111.00001.C001", "GEN4.1111.00001.P001", "GENO.1216.00002.C002", "GENO.1216.00003.C001"}, + '3': {"GEN4.1111.00001.C001", "GEN4.1111.00001.P001", "GENO.1216.00002.C002", "GENO.1216.00003.C001"}, + '5': {"GEN4.1111.00001.C001", "GEN4.1111.00001.P001", "GENO.1216.00002.C002", "GENO.1216.00003.C001"}, + '8': {"GEN4.1111.00001.C001", "GEN4.1111.00001.P001", "GENO.1216.00002.C002"}, + '10': {"GEN4.1111.00001.C001", "GEN4.1111.00001.P001", "GENO.1216.00002.C002"}, + '11': {"GEN4.1111.00001.C001", "GEN4.1111.00001.P001", "GENO.1216.00002.C002"}, + '12': {"GEN4.1111.00001.C001", "GEN4.1111.00001.P001", "GENO.1216.00002.C002", "GENO.1216.00003.C001"}} + assert fams_genomes == exp_fams + exp_prots = {"GEN4.1111.00001.C001": {"GEN4.1111.00001.C001_00001": '1', + "GEN4.1111.00001.C001_00009": '3', + "GEN4.1111.00001.C001_00002": '5', + "GEN4.1111.00001.C001_00007": '8', + "GEN4.1111.00001.C001_00004": '10', + "GEN4.1111.00001.C001_00005": '11', + "GEN4.1111.00001.C001_00008": '12' + }, + "GEN4.1111.00001.P001": {"GEN4.1111.00001.P001_00002": '1', + "GEN4.1111.00001.P001_00011": '3', + "GEN4.1111.00001.P001_00003": '5', + "GEN4.1111.00001.P001_00009": '8', + "GEN4.1111.00001.P001_00004": '10', + "GEN4.1111.00001.P001_00005": '11', + "GEN4.1111.00001.P001_00010": '12' + }, + "GENO.1216.00002.C002": {"GENO.1216.00002.C002_00001": '1', + "GENO.1216.00002.C002_00002": '1', + "GENO.1216.00002.C002_00010": '3', + "GENO.1216.00002.C002_00003": '5', + "GENO.1216.00002.C002_00008": '8', + "GENO.1216.00002.C002_00005": '10', + "GENO.1216.00002.C002_00006": '11', + "GENO.1216.00002.C002_00009": '12' + }, + "GENO.1216.00003.C001": {"GENO.1216.00003.C001_00003": '1', + "GENO.1216.00003.C001_01010": '3', + "GENO.1216.00003.C001_00010": '5', + "GENO.1216.00003.C001_00004": '12', + "GENO.1216.00003.C001_01000": '12' + } + } + assert all_prots == exp_prots + + +def test_prot_per_strain_noformat(): + """ + Test parser of persistent genome file when genome names are anything + -> genome files = my.genome-name.prt and prot names = my.genome-name_00001 + """ + pers = os.path.join(TESTPATH, "pers_genome_noformat.txt") + all_genomes = ["my.genome-name", "my-genome", "genome1", "genome3"] + all_prots, fams_genomes, several = p2p.proteins_per_strain(pers, all_genomes) + exp_several = {'1': {"genome1"}, + '5': set(), + '12':set(), + '32': set()} + assert several == exp_several + exp_fams = {'1': {"my.genome-name", "my-genome", "genome1", "genome3"}, + '5': {"my.genome-name", "my-genome", "genome3"}, + '12': {"my.genome-name", "my-genome", "genome1"}, + '32': {"my.genome-name", "my-genome", "genome1", "genome3"}} + assert fams_genomes == exp_fams + exp_prots = {"my.genome-name": {"my.genome-name_1": '1', + "my.genome-name_2": '5', + "my.genome-name_3": '12', + "my.genome-name_4": '32' + }, + "my-genome": {"my-genome_1": '1', + "my-genome_2": '5', + "my-genome_3": '12', + "my-genome_50": '32' + }, + "genome1": {"genome1_1": '1', "genome1_10": "1", + "genome1_3": '12', + "genome1_4": '32', + }, + "genome3": {"genome3_1": '1', + "genome3_3": '5', + "genome3_4": '32', + } + } + assert all_prots == exp_prots + + def test_prot_per_strain_member_bis(caplog): """ Test parser of persistent genome file when a same member is in 2 different families """ caplog.set_level(logging.DEBUG) pers = os.path.join(TESTPATH, "pers_genome_member-bis.txt") - all_prots, fams_genomes, several = p2p.proteins_per_strain(pers) + all_genomes = ["t", "j"] + all_prots, fams_genomes, several = p2p.proteins_per_strain(pers, all_genomes) assert "problem: ESCO2_2 already exists, in family 5. Conflict with family 32" in caplog.text - exp_several = {'1': [], '5': [], '12': [], '32': []} + exp_several = {'1': set(), '5': set(), '12': set(), '32': set()} assert several == exp_several - exp_fams = {'1': ["ESCO_1", "ESCO2", "ESCO3", "ESCO4"], '5': ["ESCO_1", "ESCO2", "ESCO4"], - '12': ["ESCO_1", "ESCO2", "ESCO3"], '32': ["ESCO_1", "ESCO2", "ESCO3", "ESCO4"]} + exp_fams = {'1': {"ESCO_1", "ESCO2", "ESCO3", "ESCO4"}, '5': {"ESCO_1", "ESCO2", "ESCO4"}, + '12': {"ESCO_1", "ESCO2", "ESCO3"}, '32': {"ESCO_1", "ESCO2", "ESCO3", "ESCO4"}} assert fams_genomes == exp_fams exp_prots = {"ESCO_1": {"ESCO_1_1": '1', "ESCO_1_2": '5', "ESCO_1_3": '12', "ESCO_1_4": '32'}, "ESCO2": {"ESCO2_1": '1', "ESCO2_2": '32', "ESCO2_3": '12'}, @@ -177,6 +279,34 @@ def test_prot_per_strain_member_bis(caplog): assert all_prots == exp_prots +def test_prot_per_strain_gembase_member_bis(caplog): + """ + Test parser of persistent genome file when a same member is in 2 different families, with gembase format + """ + caplog.set_level(logging.DEBUG) + pers = os.path.join(TESTPATH, "pers_genome_gembase_member-bis.txt") + all_genomes = ["GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"] + all_prots, fams_genomes, several = p2p.proteins_per_strain(pers, all_genomes) + assert "problem: GEN4.1111.00001.0001i_00005 already exists, in family 12. Conflict with family 32" in caplog.text + exp_several = {'1': {"GENO.1216.00002"}, '5': set(), '12': set(), '32': {"GENO.1216.00003"}} + assert several == exp_several + exp_fams = {'1': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"}, + '5': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"}, + '12': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002"}, + '32': {"GEN4.1111.00001", "GENO.0817.00001", "GENO.1216.00002", "GENO.1216.00003"}} + assert fams_genomes == exp_fams + exp_prots = {"GEN4.1111.00001": {"GEN4.1111.00001.0001b_00001": '1', "GEN4.1111.00001.0001b_00009": '5', + "GEN4.1111.00001.0001i_00005": '12', "GEN4.1111.00001.0001i_00005": '32'}, + "GENO.0817.00001": {"GENO.0817.00001.0001b_00002": '1', "GENO.0817.00001.0002b_00011": '5', + "GENO.0817.00001.0002i_00005": '12', "GENO.0817.00001.i0002_00010": '32'}, + "GENO.1216.00002": {"GENO.1216.00002.0001b_00001": '1', "GENO.1216.00002.i0001_00002": "1", + "GENO.1216.00002.0002b_00010": '5', "GENO.1216.00002.0001i_00006": '12', + "GENO.1216.00002.b0002_00009": '32'}, + "GENO.1216.00003": {"GENO.1216.00003.i0001_00003": '1', "GENO.1216.00003.i0001_01010": '5', + "GENO.1216.00003.i0001_00004": '32', "GENO.1216.00003.i0001_01000": '32'}} + assert all_prots == exp_prots + + def test_get_genomes(): """ Test parser of list of genomes