diff --git a/PanACoTA/annotate_module/format_prodigal.py b/PanACoTA/annotate_module/format_prodigal.py index 374fe545e8f01fbc46655ae748cb27499494e976..30679831ce87064a8216cb72a7f5d8c5a9f96514 100644 --- a/PanACoTA/annotate_module/format_prodigal.py +++ b/PanACoTA/annotate_module/format_prodigal.py @@ -52,6 +52,7 @@ July 2019 import os import shutil +import glob import logging import PanACoTA.utils as utils @@ -100,9 +101,9 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes") # Get prodigal result files - prot_file = os.path.join(prodigal_dir, name + ".faa") - gen_file = os.path.join(prodigal_dir, name + ".ffn") - gff_file = os.path.join(prodigal_dir, name + ".gff") + prot_file = glob.glob(os.path.join(prodigal_dir, "*.faa"))[0] + gen_file = glob.glob(os.path.join(prodigal_dir, "*.ffn"))[0] + gff_file = glob.glob(os.path.join(prodigal_dir, "*.gff"))[0] # Define names for generated gembase files res_prot_file = os.path.join(prot_dir, name + ".prt") @@ -275,9 +276,9 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name): # If it is not the first gene of the genome, write previous gene information if prev_start != "": # Write line in LSTINFO file, + header and sequence to the gene file - _, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0, - prev_loc, name, prev_cont_num, "NA", prev_info, - "NA", prev_strand, prev_start, prev_end, r_lst) + lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", + prev_loc, name, prev_cont_num, "NA", prev_info, + "NA", prev_strand, prev_start, prev_end, r_lst) gfunc.write_header(lstline, r_gen) r_gen.write(seq) # -> get new information, save it for the next gene, and go to next line @@ -302,9 +303,9 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name): # Otherwise, nothing to write if prev_start != "": prev_loc = "b" - _, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0, - prev_loc, name, prev_cont_num, "NA", prev_info, "NA", - prev_strand, prev_start, prev_end, r_lst) + lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", + prev_loc, name, prev_cont_num, "NA", prev_info, "NA", + prev_strand, prev_start, prev_end, r_lst) gfunc.write_header(lstline, r_gen) r_gen.write(seq) return True diff --git a/PanACoTA/annotate_module/format_prokka.py b/PanACoTA/annotate_module/format_prokka.py index fc3cc146601996da30e2c763a9b10c8c76ee523a..33ec320449b8e1f50af4a12ddfb1cf3b525d50b1 100644 --- a/PanACoTA/annotate_module/format_prokka.py +++ b/PanACoTA/annotate_module/format_prokka.py @@ -238,9 +238,6 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath): bool : True if genome name used in lstfile and prokka tblfile are the same, False otherwise """ - # Number CRISPRs. By default, 0 CRISPR -> next one will be CRISPR1 - crispr_num = 1 - crispr = False # Protein localisation in contig (b = border ; i = inside) cont_loc = "b" prev_cont_loc = "b" @@ -325,11 +322,11 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath): # If not first gene of the contig, write the previous gene to .lst file # The first gene will be written while reading the 2nd gene if start != "-1" and end != "-1" and not crispr: - crispr_num, lstline = general.write_gene(feature_type, locus_num, - gene_name, product, crispr_num, - prev_cont_loc, genome, - prev_cont_num, ecnum, inf2, - db_xref, strand, start, end, lstf) + lstline = general.write_gene(feature_type, locus_num, + gene_name, product, + prev_cont_loc, genome, + prev_cont_num, ecnum, inf2, + db_xref, strand, start, end, lstf) # Get new values for the next gene: start, end, strand and feature type start, end, feature_type = elems @@ -359,9 +356,9 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath): # Write last feature if start != -1 and end != -1: prev_cont_loc = "b" - crispr_num, _ = general.write_gene(feature_type, locus_num, gene_name, product, - crispr_num, prev_cont_loc, genome, prev_cont_num, - ecnum, inf2, db_xref, strand, start, end, lstf) + general.write_gene(feature_type, locus_num, gene_name, product, + prev_cont_loc, genome, prev_cont_num, + ecnum, inf2, db_xref, strand, start, end, lstf) return True @@ -516,7 +513,6 @@ def create_gen(ffnseq, lstfile, genseq): """ problem = False write = True # Write next sequence - crispr_id = 1 with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen: for line_ffn in ffn: # Ignore gene that we do not want to write (should be a crispr) diff --git a/PanACoTA/annotate_module/general_format_functions.py b/PanACoTA/annotate_module/general_format_functions.py index c11c0560605f2fc4d06e44c849897776d76697ff..68dd0c4cee9dab4202677b0b995493005b9720ff 100644 --- a/PanACoTA/annotate_module/general_format_functions.py +++ b/PanACoTA/annotate_module/general_format_functions.py @@ -65,8 +65,7 @@ from PanACoTA.annotate_module import format_prodigal as fprodigal main_logger = logging.getLogger("annotate.geneffunc") -def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False, - changed_name=False): +def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False): """ For all genomes which were annotated (by prokka or prodigal), reformat them in order to have, in 'res_path', the following folders: @@ -80,7 +79,8 @@ def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, q Parameters ---------- genomes_ok : dict - genomes to format (annotation was OK) -> {genome: [name, gpath, size, nbcont, l90]} + genomes to format (annotation was OK) -> + {genome: [name, gpath, to_annot, size, nbcont, l90]} res_path : str path to folder where the 4 directories must be created annot_path : str @@ -91,9 +91,6 @@ def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, q number of threads to use to while formatting genomes quiet : bool True if nothing must be sent to stderr/stdout, False otherwise - changed_name : bool - True if contig names have been changed (cutn != 0) -> contig names end by '_num', - False otherwise. Returns ------- @@ -216,7 +213,7 @@ def handle_genome(args): return ok_format, genome -def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, +def write_gene(gtype, locus_num, gene_name, product, cont_loc, genome, cont_num, ecnum, inf2, db_xref, strand, start, end, lstopenfile): """ Write given gene to output file @@ -231,11 +228,6 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal) product : str found by prokka/Prodigal, "NA" if no product (always the case for prodigal) - crispr_num : int - current crispr number. In prokka tbl, CRISPRs are not numbered, they all - have the same name. We name them by adding a unique number to each CRISPR. If the current - gene to add is a CRISPR, this number will be incremented and returned. If not, this same - number will be returned. cont_loc : str 'i' if the gene is inside a contig, 'b' if its on the border (first or last gene of the contig) @@ -260,16 +252,9 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, Returns ------- - tuple : - Current crispr number, lstline + str : + lstline """ - # if last gene was a crispr - if gtype == "repeat_region": - gtype = "CRISPR" - locus_num = "CRISPR" + str(crispr_num) - gene_name = "crispr" - product = "crispr-array" - crispr_num += 1 locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc, str(locus_num).zfill(5)) # If '|' character found in those fields, replace by '_' to avoid problems while parsing @@ -279,7 +264,7 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, db_xref.replace("|", "_")) lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info]) lstopenfile.write(lst_line + "\n") - return crispr_num, lst_line + return lst_line def write_header(lstline, outfile): diff --git a/test/data/annotate/exp_files/res_create_gff_prodigal.gff b/test/data/annotate/exp_files/res_create_gff_prodigal.gff index ebf97f8c9e8f22f15fe2eac1b8320a30c51d8ba7..5ec00adf5f8ab81d91ef00803ba931a6c04dc26d 100644 --- a/test/data/annotate/exp_files/res_create_gff_prodigal.gff +++ b/test/data/annotate/exp_files/res_create_gff_prodigal.gff @@ -1,11 +1,11 @@ ##gff-version 3 -##sequence-region test.0417.00002.0001 1 14000 -##sequence-region test.0417.00002.0002 1 5000 -##sequence-region test.0417.00002.0003 1 4600 -##sequence-region test.0417.00002.0004 1 8000 -##sequence-region test.0417.00002.0005 1 1 -##sequence-region test.0417.00002.0006 1 10 -##sequence-region test.0417.00002.0007 1 15000 +##sequence-region test.0417.00002.0001 1 84 +##sequence-region test.0417.00002.0002 1 103 +##sequence-region test.0417.00002.0003 1 122 +##sequence-region test.0417.00002.0004 1 35 +##sequence-region test.0417.00002.0005 1 198 +##sequence-region test.0417.00002.0006 1 128 +##sequence-region test.0417.00002.0007 1 85 test.0417.00002.0001 Prodigal:2.6 CDS 287 787 . + 0 ID=test.0417.00002.0001b_00001;inference=ab initio prediction:Prodigal:2.6;locus_tag=EPKOMDHM_00001;product=hypothetical protein test.0417.00002.0001 Prodigal:2.6 CDS 4416 6068 . + 0 ID=test.0417.00002.0001i_00002;Name=yiaD;gene=yiaD;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P37665;locus_tag=EPKOMDHM_00005;product=putative lipoprotein YiaD test.0417.00002.0001 Prodigal:2.6 CDS 9000 12002 . - 0 ID=test.0417.00002.0001b_00003;Name=vgrG1;gene=vgrG1;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:Q9HI36;locus_tag=EPKOMDHM_00006;product=Major exported protein diff --git a/test/test_unit/test_annotate/test_format_prodigal.py b/test/test_unit/test_annotate/test_format_prodigal.py index 09b89e85de0b5d35a5f8e9e63a51a3506ea46466..9644df5e6ffd5acfe5d23e46fc1148161740820e 100644 --- a/test/test_unit/test_annotate/test_format_prodigal.py +++ b/test/test_unit/test_annotate/test_format_prodigal.py @@ -129,13 +129,13 @@ def test_create_gff(caplog): "ter": "test.0417.00002.0006", "contname": "test.0417.00002.0007" } - sizes = {"test.0417.00002.0001": 14000, - "test.0417.00002.0002": 5000, - "test.0417.00002.0003": 4600, - "test.0417.00002.0004": 8000, - "test.0417.00002.0005": 1, - "test.0417.00002.0006": 10, - "test.0417.00002.0007": 15000, + sizes = {"test.0417.00002.0001": 84, + "test.0417.00002.0002": 103, + "test.0417.00002.0003": 122, + "test.0417.00002.0004": 35, + "test.0417.00002.0005": 198, + "test.0417.00002.0006": 128, + "test.0417.00002.0007": 85, } res_gff_file = os.path.join(GENEPATH, "prodigal_res.gff") exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst") @@ -383,7 +383,15 @@ def test_format_1genome_emptygpath(caplog): # Create empty file, that we give to prodigal for formatting step gpath = os.path.join(GENEPATH, "original_name-empty.fna") open(gpath, "w").close() - prod_path = TEST_ANNOTE + prod_path = GENEPATH + # Create prodigal result files (empty, then won't be read) + prodigal_dir = gpath + "-prodigalRes" + os.makedirs(prodigal_dir) + prodigal_faa = os.path.join(gpath + "-prodigalRes", "notread.faa") + prodigal_ffn = os.path.join(gpath + "-prodigalRes", "notread.ffn") + prodigal_gff = os.path.join(gpath + "-prodigalRes", "notread.gff") + for file in [prodigal_faa, prodigal_gff, prodigal_ffn]: + open(file, "w").close() # Generate result folders prot_dir = os.path.join(GENEPATH, "Proteins") lst_dir = os.path.join(GENEPATH, "LSTINFO") @@ -428,12 +436,14 @@ def test_format_1genome_wrongffn(caplog): name = "prodigal.outtest.ok" # path to original genome, given to prodigal for annotation orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") - # In GENEPATH folder, create the original genome given to prodigal - # (copy from test_file) + orig_prodpath = orig_gpath + "-prodigalRes" used_gpath = os.path.join(GENEPATH, "original_name.fna") used_respath = used_gpath + "-prodigalRes" - os.makedirs(used_respath) + # Add original genome, and prodigal results to result folder shutil.copyfile(orig_gpath, used_gpath) + shutil.copytree(orig_prodpath, used_respath) + # In GENEPATH folder, create the original genome given to prodigal + # (copy from test_file) # Create gen_file with a header not existing with open(os.path.join(used_respath, "prodigal.outtest.ok.ffn"), "w") as ori: ori.write(">wrongheader # 1 # 2 # 1 # toto") @@ -481,16 +491,14 @@ def test_format_1genome_wronggff(caplog): # path to original genome, given to prodigal for annotation orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") + orig_prodpath = orig_gpath + "-prodigalRes" # In generated_by_tests folder, create the original genome given to prodigal # (copy from test_file) used_gpath = os.path.join(GENEPATH, "original_name.fna") used_respath = used_gpath + "-prodigalRes" - os.makedirs(used_respath) + # Add original genome, and prodigal results to result folder shutil.copyfile(orig_gpath, used_gpath) - # Copy ffn file generated by prodigal: - orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn") - used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn") - shutil.copyfile(orig_ffn, used_ffn) + shutil.copytree(orig_prodpath, used_respath) # Copy gff file, but modify to get wrong start position orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff") used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff") @@ -548,21 +556,14 @@ def test_format_1genome_wrongprt(caplog): # path to original genome, given to prodigal for annotation orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") + orig_prodpath = orig_gpath + "-prodigalRes" # In generated_by_tests folder, create the original genome given to prodigal # (copy from test_file) used_gpath = os.path.join(GENEPATH, "original_name.fna") used_respath = used_gpath + "-prodigalRes" - os.makedirs(used_respath) + # Add original genome, and prodigal results to result folder shutil.copyfile(orig_gpath, used_gpath) - # Copy ffn file generated by prodigal: - orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn") - used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn") - shutil.copyfile(orig_ffn, used_ffn) - # Copy gff file generated by prodigal: - orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff") - used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff") - shutil.copyfile(orig_gff, used_gff) - # Copy prt file, but removing first proteins + shutil.copytree(orig_prodpath, used_respath) orig_faa = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.faa") used_faa = os.path.join(used_respath, "prodigal.outtest.ok.faa") with open(orig_faa, "r") as faa, open(used_faa, "w") as faar: