diff --git a/PanACoTA/annotate_module/format_prodigal.py b/PanACoTA/annotate_module/format_prodigal.py index d1005c142086ec6e4f26210a9299c25558930431..512f32d9a60298394d5eae07181862db4bf17119 100644 --- a/PanACoTA/annotate_module/format_prodigal.py +++ b/PanACoTA/annotate_module/format_prodigal.py @@ -29,7 +29,7 @@ logger = logging.getLogger("annotate.prodigal_format") def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, - rep_dir, gff_dir): + rep_dir, gff_dir, logger, changed_name=False): """ Format the given genome, and create its corresponding files in the following folders: @@ -79,9 +79,24 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, res_rep_file = os.path.join(rep_dir, name + ".fna") res_gff_file = os.path.join(gff_dir, name + ".gff") + # Generate replicon file (same as input sequence but with gembase formatted headers). From + # this file, get contig names, to be used to generate gff file + contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file) + if not contigs: + try: + os.remove(res_lst_file) + os.remove(res_gff_file) + os.remove(res_gene_file) + os.remove(res_prt_file) + os.remove(res_rep_file) + except OSError: + pass + logger.error("Problems while generating Replicon file for {}".format(name)) + return False + # First, create .gen and .lst files. If they could not be formatted, # remove those files, and return False with error message - ok = create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger) + ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, name, logger) if not ok: try: os.remove(res_gen_file) @@ -91,10 +106,8 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, logger.error(f"Problems while generating .gen and .lst files for {name}") return False - # Create replicon and gff files. Replicon is the same as input sequence but with - # gembase formatted headers - # From Replicon file, get contig names, and use them to generate gff file - ok = create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name) + # Create gff files. + ok = create_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name) # If problem while formatting the genome (rep or gff file), remove all # already created files, and return False (genome not formatted) with error message. if not ok: @@ -128,7 +141,7 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, return ok -def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): +def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name, logger): """ Generate .gen file, from sequences contained in .ffn, but changing the headers to match with gembase format. @@ -136,12 +149,16 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): Parameters ---------- + contigs : dict + {original_contig_name: gembase_contig_name} gen_file : str .ffn file generated by prodigal res_gen_file : str generated .gen file, to write in Genes directory res_lst_file : str generated .lst file to write in LSTINFO directory + gpath : str + path to the genome given to prodigal name : str gembase name of the genome to format logger : logging.Logger @@ -173,7 +190,6 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): prev_loc = "b" # To start, the first gene is, by definition, at the border of the contig loc = "b" - # Open files: .ffn prodigal to read, .gen and .lst gembase to create with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\ open(res_lst_file, "w") as r_lst: @@ -194,19 +210,26 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): # Get contig number from prodigal gene header: prodigal first part of header is: # <original genome name contig name>_<protein number> contig_name = "_".join(gname.strip().split("_")[:-1]) + # If new contig: # - previous gene was the last of its contig -> prev_loc = "b" ; # - the current gene is the first of its contig (loc = "b") # - we must increment the contig number if contig_name != prev_cont_name: - contig_num += 1 + # Check that this contig name is in the list, and get its gembase contig number + if contig_name in contigs: + contig_num = contigs[contig_name].split(".")[-1] + # if not in the list, problem, return false + else: + logger.error(f"{contig_name} found in {gen_file} does not exist in {gpath}.") + return False prev_loc = 'b' loc = 'b' # If not new contig. If prev_loc == 'b', previous gene is the first protein # of this contig. # Current gene will be inside the contig (except if new contig for the next gene, # meaning only 1 gene in the contig) - if contig_name == prev_cont_name and prev_loc == "b": + else: loc = 'i' # If it is the first gene of the genome, we do not have any "previous gene" @@ -259,69 +282,7 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): return True -def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name): - """ - Create replicon file by changing the headers in the input sequence. - While changing the headers, get their name and size to put in gff file. - - Then, create gff file with the gene names and contig sizes - - Parameters - ---------- - gpath : str - path to the genome sequence - res_rep_file : str - path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers - res_gff_file : str - path to the gff file that must be created in result database - res_lst_file : str - path to the lst file that was created in result database in the previous step - gff_file : str - path to gff file generated by prodigal - name : str - genome name to use (species.date.strain) - - Returns - ------- - bool : - True if everything went well, False otherwise - - """ - contigs = create_replicons(gpath, res_rep_file, name) - if contigs: - return create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs) - return False - - -def create_replicons(gpath, res_rep_file, name): - """ - Create Replicons file: - - Read input sequence, and replace headers with the new contig headers (found while creating - lst and gen files) - - Parameters - ---------- - gpath : str - path to the genome sequence - res_rep_file : str - path to the new file, containing 'gpath' sequence, but with 'gembase_named' headers - name : str - genome name to use (species.date.strain) - - Returns - ------- - list : - List of all contigs with their size ["contig1;size1", "contig2;size2" ...] - - """ - # Change headers of replicon file to put into gembase format, and save it in res_rep_file - contigs, _ = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file) - # Return each contig name with its size. - return contigs - - -def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs): +def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes): """ Create .gff3 file. @@ -350,9 +311,10 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs): path to the gff file that must be created in result database res-lst_file : str path to the lst file that was created in result database in the previous step - contigs : list of str - list of contig names with their size. ["contig1"\t"size1", "contig2"\t"size2" ...] - + contigs : dict + dict of contig names with their size. ["original_name": "gembase_name"] + sizes : dict + dict of contig gembase names with their sizes {"gembase_name": size} Returns ------- bool diff --git a/test/data/annotate/exp_files/res_create_gene_lst_prodigal.lst b/test/data/annotate/exp_files/res_create_gene_lst_prodigal.lst new file mode 100644 index 0000000000000000000000000000000000000000..69198e4186d33f84fc8449e853fceeace73cce2c --- /dev/null +++ b/test/data/annotate/exp_files/res_create_gene_lst_prodigal.lst @@ -0,0 +1,14 @@ +287 787 D CDS test.0417.00002.0001b_00001 NA | NA | NA | i | NA +4416 6068 D CDS test.0417.00002.0001i_00002 NA | NA | NA | i | NA +9000 12000 C CDS test.0417.00002.0001b_00003 NA | NA | NA | i | NA +77 1237 D CDS test.0417.00002.0002b_00004 NA | NA | NA | info | NA +1279 2346 D CDS test.0417.00002.0002i_00005 NA | NA | NA | fefer | NA +2419 3000 D CDS test.0417.00002.0002i_00006 NA | NA | NA | a | NA +3500 4000 D CDS test.0417.00002.0002i_00007 NA | NA | NA | vfvfdv | NA +4632 5000 C CDS test.0417.00002.0002b_00008 NA | NA | NA | fdf | NA +3399 4538 D CDS test.0417.00002.0003b_00009 NA | NA | NA | fdfd | NA +4535 7888 D CDS test.0417.00002.0004b_00010 NA | NA | NA | info | NA +7854 9491 D CDS test.0417.00002.0007b_00011 NA | NA | NA | ffff | NA +9525 11285 D CDS test.0417.00002.0007i_00012 NA | NA | NA | sdfdfd | NA +11249 12328 D CDS test.0417.00002.0007b_00013 NA | NA | NA | b | NA + diff --git a/test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa b/test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa new file mode 100644 index 0000000000000000000000000000000000000000..53043bbcf60aaa1b5778f049488964c2ee5d228e --- /dev/null +++ b/test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa @@ -0,0 +1,33 @@ +>JGIKIPgffgIJ_00001 # 287 # 787 # 1 # i +aacgcgcatccagcgctagagacgctctcgacgagc +cagatctgca +cacaacaggtcgcgctcg +>JGIKIPgffgIJ_00005 # 4416 # 6068 # 1 # i +cagagatgcccc +ccgcgttt +>JGIKIPgffgIJ_00008 # 9000 # 12000 # -1 #i +aaccggcctcgcgcatcgcatcagcag +>toto_00009 # 77 # 1237 # 1 # info +acgctagagagctcgcgctaagagatc +>toto_00010 # 1279 # 2346 # 1 # fefer +CCGAATAGCGCGCTCAGAGAGAGAGGA +CGATAGCTCTCGCA +CCGCATAGC +>toto_00011 # 2419 # 3000 # 1 # a +AAGCTCGCTAGACNNCTCGATACNNCTCGGATAGC +>toto_00011 # 3500 # 4000 # 1 # vfvfdv +CAGATAGCNNCGCTAGAGGAGAGCTCTCGGAGA +>toto_00011 # 4632 # 5000 # -1 # fdf +CCGAGATCGCGCGCGCTCTTCTCGAGA +>other_header_00013 # 3399 # 4538 # 1 # fdfd +AGCTCTCGAGAGGAGCGCTCGA +CGCTCGCGCA +CCTATAGGAACCACCGGGG +>my contig_00014 # 4535 # 7888 # 1 # info +CAGGATAGCGCGCTCAGAG +>contname_03015 # 7854 # 9491 # 1 # ffff +AGATCCGCGCGCTATAGAGC +>contname_03016 # 9525 # 11285 # 1 # sdfdfd +AAGGATCTCTCGCGAGAGGA +>contname_03017 # 11249 # 12328 # 1 # b +CCAGGATAGCGCGCGC diff --git a/test/test_unit/test_annotate/test_format_prodigal.py b/test/test_unit/test_annotate/test_format_prodigal.py new file mode 100644 index 0000000000000000000000000000000000000000..43f8bb2d9216f03868e5ea65b543584c47be3017 --- /dev/null +++ b/test/test_unit/test_annotate/test_format_prodigal.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +""" +Unit tests for general_format_functions.py +""" + +import os +import logging +import shutil +from io import StringIO +import pytest + +import PanACoTA.annotate_module.format_prodigal as prodigalfunc +import PanACoTA.utils as utils +import test.test_unit.utilities_for_tests as tutil + +ANNOTEDIR = os.path.join("test", "data", "annotate") +GENOMES_DIR = os.path.join(ANNOTEDIR, "genomes") +EXP_ANNOTE = os.path.join(ANNOTEDIR, "exp_files") +TEST_ANNOTE = os.path.join(ANNOTEDIR, "test_files") +GENEPATH = os.path.join(ANNOTEDIR, "generated_by_unit-tests") + +LOGFILE_BASE = os.path.join(GENEPATH, "logs", "logfile") +LOGFILES = [LOGFILE_BASE + ext for ext in [".log", ".log.debug", ".log.details", ".log.err"]] + +@pytest.fixture(autouse=True) +def setup_teardown_module(): + """ + Remove log files at the end of this test module + + Before each test: + - init logger + - create directory to put generated files + + After: + - remove all log files + - remove directory with generated results + """ + os.mkdir(GENEPATH) + # utils.init_logger(LOGFILE_BASE, 0, 'test_fastme', verbose=1) + print("setup") + + yield + # shutil.rmtree(GENEPATH) + # for f in LOGFILES: + # if os.path.exists(f): + # os.remove(f) + print("teardown") + + +def test_create_gen_lst(caplog): + """ + Check that generated gen and lst files are as expected. + In the test file, all genomes have names different from gembase name + This test file contains the following aspects: + - gene in D strand (start < end) + - gene in C strand (start > end) + - CDS features + - contigs with more than 2 genes + - contig with only 2 genes (both 'b' loc) + - contig with 1 gene ('b' loc) + - contig without gene (should be skipped) + """ + caplog.set_level(logging.DEBUG) + logger = logging.getLogger("test_prodigal") + genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.faa") + # contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50", + # "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50", + # "test.0417.00002.0007\t50"] + contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001", + "toto": "test.0417.00002.0002", + "other_header": "test.0417.00002.0003", + "my contig": "test.0417.00002.0004", + "bis": "test.0417.00002.0005", + "ter": "test.0417.00002.0006", + "contname": "test.0417.00002.0007" + } + name = "test.0417.00002" + res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen") + res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst") + gpath = "original_genome_name" + assert prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger) + exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst") + assert tutil.compare_order_content(exp_lst, res_lst_file) + + +def test_create_gen_lst_cont_unknown(caplog): + """ + A contig name in the gen file does not exist -> error message, and all result files must be removed for this genome + """ + caplog.set_level(logging.DEBUG) + logger = logging.getLogger("test_prodigal") + genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.faa") + contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001", + "toto": "test.0417.00002.0002", + "other_header": "test.0417.00002.0003", + "my_contig": "test.0417.00002.0004", + "bis": "test.0417.00002.0005", + "ter": "test.0417.00002.0006", + "contname": "test.0417.00002.0007" + } + name = "test.0417.00002" + res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen") + res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst") + gpath = "original_genome_name" + assert not prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger) + assert ("my contig fount in test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa " + "does not exist in original_genome_name") + + +def test_create_gen_lst(caplog): + """ + Check that generated gen and lst files are as expected. + In the test file, all genomes have names different from gembase name + This test file contains the following aspects: + - gene in D strand (start < end) + - gene in C strand (start > end) + - CDS features + - contigs with more than 2 genes + - contig with only 2 genes (both 'b' loc) + - contig with 1 gene ('b' loc) + - contig without gene (should be skipped) + """ + caplog.set_level(logging.DEBUG) + logger = logging.getLogger("test_prodigal") + genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.gff") + # contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50", + # "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50", + # "test.0417.00002.0007\t50"] + contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001", + "toto": "test.0417.00002.0002", + "other_header": "test.0417.00002.0003", + "my contig": "test.0417.00002.0004", + "bis": "test.0417.00002.0005", + "ter": "test.0417.00002.0006", + "contname": "test.0417.00002.0007" + } + name = "test.0417.00002" + res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen") + res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst") + gpath = "original_genome_name" + assert prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger) + exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst") + assert tutil.compare_order_content(exp_lst, res_lst_file) + +# # def test_tbl_to_lst_not_changed_names(caplog): +# """ +# Check that generated lstinfo file is as expected, when the genome name is the same as +# it already was in the genome given to prokka. +# The test tblfile contains the following aspects: +# - gene in D strand (start < end) +# - gene in C strand (start > end) +# - CDS features (some with all info = ECnumber, gene name, product etc. ; +# some with missing info) +# - tRNA type +# - repeat_region type (*2) +# - contigs with more than 2 genes +# - contig with only 2 genes (both 'b' loc) +# - contig with 1 gene ('b' loc) +# - contig without gene (should be skipped) +# """ +# caplog.set_level(logging.DEBUG) +# logger = logging.getLogger("test_prokka") +# tblfile = os.path.join(TEST_ANNOTE, "original_name.fna-prokkaRes", "prokka_out_for_test.tbl") +# lstfile = os.path.join(GENEPATH, "res_test_tbl2lst.lst") +# contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50", +# "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50", +# "test.0417.00002.0007\t50"] +# name = "test.0417.00002" +# assert prokkafunc.tbl2lst(tblfile, lstfile, contigs, name, logger, changed_name=False) +# exp_lst = os.path.join(EXP_ANNOTE, "res_tbl2lst.lst") +# assert tutil.compare_order_content(exp_lst, lstfile) +