Skip to content
Snippets Groups Projects
Commit a515235a authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Safer way to generate rep, gen and lst files from prodigal results

parent 017bad59
No related tags found
No related merge requests found
...@@ -29,7 +29,7 @@ logger = logging.getLogger("annotate.prodigal_format") ...@@ -29,7 +29,7 @@ logger = logging.getLogger("annotate.prodigal_format")
def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
rep_dir, gff_dir): rep_dir, gff_dir, logger, changed_name=False):
""" """
Format the given genome, and create its corresponding files in the following folders: Format the given genome, and create its corresponding files in the following folders:
...@@ -79,9 +79,24 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, ...@@ -79,9 +79,24 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
res_rep_file = os.path.join(rep_dir, name + ".fna") res_rep_file = os.path.join(rep_dir, name + ".fna")
res_gff_file = os.path.join(gff_dir, name + ".gff") res_gff_file = os.path.join(gff_dir, name + ".gff")
# Generate replicon file (same as input sequence but with gembase formatted headers). From
# this file, get contig names, to be used to generate gff file
contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file)
if not contigs:
try:
os.remove(res_lst_file)
os.remove(res_gff_file)
os.remove(res_gene_file)
os.remove(res_prt_file)
os.remove(res_rep_file)
except OSError:
pass
logger.error("Problems while generating Replicon file for {}".format(name))
return False
# First, create .gen and .lst files. If they could not be formatted, # First, create .gen and .lst files. If they could not be formatted,
# remove those files, and return False with error message # remove those files, and return False with error message
ok = create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger) ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, name, logger)
if not ok: if not ok:
try: try:
os.remove(res_gen_file) os.remove(res_gen_file)
...@@ -91,10 +106,8 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, ...@@ -91,10 +106,8 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
logger.error(f"Problems while generating .gen and .lst files for {name}") logger.error(f"Problems while generating .gen and .lst files for {name}")
return False return False
# Create replicon and gff files. Replicon is the same as input sequence but with # Create gff files.
# gembase formatted headers ok = create_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name)
# From Replicon file, get contig names, and use them to generate gff file
ok = create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name)
# If problem while formatting the genome (rep or gff file), remove all # If problem while formatting the genome (rep or gff file), remove all
# already created files, and return False (genome not formatted) with error message. # already created files, and return False (genome not formatted) with error message.
if not ok: if not ok:
...@@ -128,7 +141,7 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, ...@@ -128,7 +141,7 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
return ok return ok
def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name, logger):
""" """
Generate .gen file, from sequences contained in .ffn, but changing the Generate .gen file, from sequences contained in .ffn, but changing the
headers to match with gembase format. headers to match with gembase format.
...@@ -136,12 +149,16 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): ...@@ -136,12 +149,16 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
Parameters Parameters
---------- ----------
contigs : dict
{original_contig_name: gembase_contig_name}
gen_file : str gen_file : str
.ffn file generated by prodigal .ffn file generated by prodigal
res_gen_file : str res_gen_file : str
generated .gen file, to write in Genes directory generated .gen file, to write in Genes directory
res_lst_file : str res_lst_file : str
generated .lst file to write in LSTINFO directory generated .lst file to write in LSTINFO directory
gpath : str
path to the genome given to prodigal
name : str name : str
gembase name of the genome to format gembase name of the genome to format
logger : logging.Logger logger : logging.Logger
...@@ -173,7 +190,6 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): ...@@ -173,7 +190,6 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
prev_loc = "b" prev_loc = "b"
# To start, the first gene is, by definition, at the border of the contig # To start, the first gene is, by definition, at the border of the contig
loc = "b" loc = "b"
# Open files: .ffn prodigal to read, .gen and .lst gembase to create # Open files: .ffn prodigal to read, .gen and .lst gembase to create
with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\ with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\
open(res_lst_file, "w") as r_lst: open(res_lst_file, "w") as r_lst:
...@@ -194,19 +210,26 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): ...@@ -194,19 +210,26 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
# Get contig number from prodigal gene header: prodigal first part of header is: # Get contig number from prodigal gene header: prodigal first part of header is:
# <original genome name contig name>_<protein number> # <original genome name contig name>_<protein number>
contig_name = "_".join(gname.strip().split("_")[:-1]) contig_name = "_".join(gname.strip().split("_")[:-1])
# If new contig: # If new contig:
# - previous gene was the last of its contig -> prev_loc = "b" ; # - previous gene was the last of its contig -> prev_loc = "b" ;
# - the current gene is the first of its contig (loc = "b") # - the current gene is the first of its contig (loc = "b")
# - we must increment the contig number # - we must increment the contig number
if contig_name != prev_cont_name: if contig_name != prev_cont_name:
contig_num += 1 # Check that this contig name is in the list, and get its gembase contig number
if contig_name in contigs:
contig_num = contigs[contig_name].split(".")[-1]
# if not in the list, problem, return false
else:
logger.error(f"{contig_name} found in {gen_file} does not exist in {gpath}.")
return False
prev_loc = 'b' prev_loc = 'b'
loc = 'b' loc = 'b'
# If not new contig. If prev_loc == 'b', previous gene is the first protein # If not new contig. If prev_loc == 'b', previous gene is the first protein
# of this contig. # of this contig.
# Current gene will be inside the contig (except if new contig for the next gene, # Current gene will be inside the contig (except if new contig for the next gene,
# meaning only 1 gene in the contig) # meaning only 1 gene in the contig)
if contig_name == prev_cont_name and prev_loc == "b": else:
loc = 'i' loc = 'i'
# If it is the first gene of the genome, we do not have any "previous gene" # If it is the first gene of the genome, we do not have any "previous gene"
...@@ -259,69 +282,7 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): ...@@ -259,69 +282,7 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
return True return True
def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name): def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes):
"""
Create replicon file by changing the headers in the input sequence.
While changing the headers, get their name and size to put in gff file.
Then, create gff file with the gene names and contig sizes
Parameters
----------
gpath : str
path to the genome sequence
res_rep_file : str
path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers
res_gff_file : str
path to the gff file that must be created in result database
res_lst_file : str
path to the lst file that was created in result database in the previous step
gff_file : str
path to gff file generated by prodigal
name : str
genome name to use (species.date.strain)
Returns
-------
bool :
True if everything went well, False otherwise
"""
contigs = create_replicons(gpath, res_rep_file, name)
if contigs:
return create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs)
return False
def create_replicons(gpath, res_rep_file, name):
"""
Create Replicons file:
Read input sequence, and replace headers with the new contig headers (found while creating
lst and gen files)
Parameters
----------
gpath : str
path to the genome sequence
res_rep_file : str
path to the new file, containing 'gpath' sequence, but with 'gembase_named' headers
name : str
genome name to use (species.date.strain)
Returns
-------
list :
List of all contigs with their size ["contig1;size1", "contig2;size2" ...]
"""
# Change headers of replicon file to put into gembase format, and save it in res_rep_file
contigs, _ = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file)
# Return each contig name with its size.
return contigs
def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs):
""" """
Create .gff3 file. Create .gff3 file.
...@@ -350,9 +311,10 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs): ...@@ -350,9 +311,10 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs):
path to the gff file that must be created in result database path to the gff file that must be created in result database
res-lst_file : str res-lst_file : str
path to the lst file that was created in result database in the previous step path to the lst file that was created in result database in the previous step
contigs : list of str contigs : dict
list of contig names with their size. ["contig1"\t"size1", "contig2"\t"size2" ...] dict of contig names with their size. ["original_name": "gembase_name"]
sizes : dict
dict of contig gembase names with their sizes {"gembase_name": size}
Returns Returns
------- -------
bool bool
......
287 787 D CDS test.0417.00002.0001b_00001 NA | NA | NA | i | NA
4416 6068 D CDS test.0417.00002.0001i_00002 NA | NA | NA | i | NA
9000 12000 C CDS test.0417.00002.0001b_00003 NA | NA | NA | i | NA
77 1237 D CDS test.0417.00002.0002b_00004 NA | NA | NA | info | NA
1279 2346 D CDS test.0417.00002.0002i_00005 NA | NA | NA | fefer | NA
2419 3000 D CDS test.0417.00002.0002i_00006 NA | NA | NA | a | NA
3500 4000 D CDS test.0417.00002.0002i_00007 NA | NA | NA | vfvfdv | NA
4632 5000 C CDS test.0417.00002.0002b_00008 NA | NA | NA | fdf | NA
3399 4538 D CDS test.0417.00002.0003b_00009 NA | NA | NA | fdfd | NA
4535 7888 D CDS test.0417.00002.0004b_00010 NA | NA | NA | info | NA
7854 9491 D CDS test.0417.00002.0007b_00011 NA | NA | NA | ffff | NA
9525 11285 D CDS test.0417.00002.0007i_00012 NA | NA | NA | sdfdfd | NA
11249 12328 D CDS test.0417.00002.0007b_00013 NA | NA | NA | b | NA
>JGIKIPgffgIJ_00001 # 287 # 787 # 1 # i
aacgcgcatccagcgctagagacgctctcgacgagc
cagatctgca
cacaacaggtcgcgctcg
>JGIKIPgffgIJ_00005 # 4416 # 6068 # 1 # i
cagagatgcccc
ccgcgttt
>JGIKIPgffgIJ_00008 # 9000 # 12000 # -1 #i
aaccggcctcgcgcatcgcatcagcag
>toto_00009 # 77 # 1237 # 1 # info
acgctagagagctcgcgctaagagatc
>toto_00010 # 1279 # 2346 # 1 # fefer
CCGAATAGCGCGCTCAGAGAGAGAGGA
CGATAGCTCTCGCA
CCGCATAGC
>toto_00011 # 2419 # 3000 # 1 # a
AAGCTCGCTAGACNNCTCGATACNNCTCGGATAGC
>toto_00011 # 3500 # 4000 # 1 # vfvfdv
CAGATAGCNNCGCTAGAGGAGAGCTCTCGGAGA
>toto_00011 # 4632 # 5000 # -1 # fdf
CCGAGATCGCGCGCGCTCTTCTCGAGA
>other_header_00013 # 3399 # 4538 # 1 # fdfd
AGCTCTCGAGAGGAGCGCTCGA
CGCTCGCGCA
CCTATAGGAACCACCGGGG
>my contig_00014 # 4535 # 7888 # 1 # info
CAGGATAGCGCGCTCAGAG
>contname_03015 # 7854 # 9491 # 1 # ffff
AGATCCGCGCGCTATAGAGC
>contname_03016 # 9525 # 11285 # 1 # sdfdfd
AAGGATCTCTCGCGAGAGGA
>contname_03017 # 11249 # 12328 # 1 # b
CCAGGATAGCGCGCGC
#!/usr/bin/env python3
# coding: utf-8
"""
Unit tests for general_format_functions.py
"""
import os
import logging
import shutil
from io import StringIO
import pytest
import PanACoTA.annotate_module.format_prodigal as prodigalfunc
import PanACoTA.utils as utils
import test.test_unit.utilities_for_tests as tutil
ANNOTEDIR = os.path.join("test", "data", "annotate")
GENOMES_DIR = os.path.join(ANNOTEDIR, "genomes")
EXP_ANNOTE = os.path.join(ANNOTEDIR, "exp_files")
TEST_ANNOTE = os.path.join(ANNOTEDIR, "test_files")
GENEPATH = os.path.join(ANNOTEDIR, "generated_by_unit-tests")
LOGFILE_BASE = os.path.join(GENEPATH, "logs", "logfile")
LOGFILES = [LOGFILE_BASE + ext for ext in [".log", ".log.debug", ".log.details", ".log.err"]]
@pytest.fixture(autouse=True)
def setup_teardown_module():
"""
Remove log files at the end of this test module
Before each test:
- init logger
- create directory to put generated files
After:
- remove all log files
- remove directory with generated results
"""
os.mkdir(GENEPATH)
# utils.init_logger(LOGFILE_BASE, 0, 'test_fastme', verbose=1)
print("setup")
yield
# shutil.rmtree(GENEPATH)
# for f in LOGFILES:
# if os.path.exists(f):
# os.remove(f)
print("teardown")
def test_create_gen_lst(caplog):
"""
Check that generated gen and lst files are as expected.
In the test file, all genomes have names different from gembase name
This test file contains the following aspects:
- gene in D strand (start < end)
- gene in C strand (start > end)
- CDS features
- contigs with more than 2 genes
- contig with only 2 genes (both 'b' loc)
- contig with 1 gene ('b' loc)
- contig without gene (should be skipped)
"""
caplog.set_level(logging.DEBUG)
logger = logging.getLogger("test_prodigal")
genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.faa")
# contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50",
# "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50",
# "test.0417.00002.0007\t50"]
contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001",
"toto": "test.0417.00002.0002",
"other_header": "test.0417.00002.0003",
"my contig": "test.0417.00002.0004",
"bis": "test.0417.00002.0005",
"ter": "test.0417.00002.0006",
"contname": "test.0417.00002.0007"
}
name = "test.0417.00002"
res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen")
res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst")
gpath = "original_genome_name"
assert prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger)
exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst")
assert tutil.compare_order_content(exp_lst, res_lst_file)
def test_create_gen_lst_cont_unknown(caplog):
"""
A contig name in the gen file does not exist -> error message, and all result files must be removed for this genome
"""
caplog.set_level(logging.DEBUG)
logger = logging.getLogger("test_prodigal")
genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.faa")
contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001",
"toto": "test.0417.00002.0002",
"other_header": "test.0417.00002.0003",
"my_contig": "test.0417.00002.0004",
"bis": "test.0417.00002.0005",
"ter": "test.0417.00002.0006",
"contname": "test.0417.00002.0007"
}
name = "test.0417.00002"
res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen")
res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst")
gpath = "original_genome_name"
assert not prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger)
assert ("my contig fount in test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa "
"does not exist in original_genome_name")
def test_create_gen_lst(caplog):
"""
Check that generated gen and lst files are as expected.
In the test file, all genomes have names different from gembase name
This test file contains the following aspects:
- gene in D strand (start < end)
- gene in C strand (start > end)
- CDS features
- contigs with more than 2 genes
- contig with only 2 genes (both 'b' loc)
- contig with 1 gene ('b' loc)
- contig without gene (should be skipped)
"""
caplog.set_level(logging.DEBUG)
logger = logging.getLogger("test_prodigal")
genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.gff")
# contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50",
# "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50",
# "test.0417.00002.0007\t50"]
contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001",
"toto": "test.0417.00002.0002",
"other_header": "test.0417.00002.0003",
"my contig": "test.0417.00002.0004",
"bis": "test.0417.00002.0005",
"ter": "test.0417.00002.0006",
"contname": "test.0417.00002.0007"
}
name = "test.0417.00002"
res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen")
res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst")
gpath = "original_genome_name"
assert prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger)
exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst")
assert tutil.compare_order_content(exp_lst, res_lst_file)
# # def test_tbl_to_lst_not_changed_names(caplog):
# """
# Check that generated lstinfo file is as expected, when the genome name is the same as
# it already was in the genome given to prokka.
# The test tblfile contains the following aspects:
# - gene in D strand (start < end)
# - gene in C strand (start > end)
# - CDS features (some with all info = ECnumber, gene name, product etc. ;
# some with missing info)
# - tRNA type
# - repeat_region type (*2)
# - contigs with more than 2 genes
# - contig with only 2 genes (both 'b' loc)
# - contig with 1 gene ('b' loc)
# - contig without gene (should be skipped)
# """
# caplog.set_level(logging.DEBUG)
# logger = logging.getLogger("test_prokka")
# tblfile = os.path.join(TEST_ANNOTE, "original_name.fna-prokkaRes", "prokka_out_for_test.tbl")
# lstfile = os.path.join(GENEPATH, "res_test_tbl2lst.lst")
# contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50",
# "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50",
# "test.0417.00002.0007\t50"]
# name = "test.0417.00002"
# assert prokkafunc.tbl2lst(tblfile, lstfile, contigs, name, logger, changed_name=False)
# exp_lst = os.path.join(EXP_ANNOTE, "res_tbl2lst.lst")
# assert tutil.compare_order_content(exp_lst, lstfile)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment