Skip to content
Snippets Groups Projects
Commit ae649ae1 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Adapt tests with removal of crispr consideration

parent b972323d
Branches
No related tags found
No related merge requests found
...@@ -52,6 +52,7 @@ July 2019 ...@@ -52,6 +52,7 @@ July 2019
import os import os
import shutil import shutil
import glob
import logging import logging
import PanACoTA.utils as utils import PanACoTA.utils as utils
...@@ -100,9 +101,9 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, ...@@ -100,9 +101,9 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes") prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes")
# Get prodigal result files # Get prodigal result files
prot_file = os.path.join(prodigal_dir, name + ".faa") prot_file = glob.glob(os.path.join(prodigal_dir, "*.faa"))[0]
gen_file = os.path.join(prodigal_dir, name + ".ffn") gen_file = glob.glob(os.path.join(prodigal_dir, "*.ffn"))[0]
gff_file = os.path.join(prodigal_dir, name + ".gff") gff_file = glob.glob(os.path.join(prodigal_dir, "*.gff"))[0]
# Define names for generated gembase files # Define names for generated gembase files
res_prot_file = os.path.join(prot_dir, name + ".prt") res_prot_file = os.path.join(prot_dir, name + ".prt")
...@@ -275,7 +276,7 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name): ...@@ -275,7 +276,7 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):
# If it is not the first gene of the genome, write previous gene information # If it is not the first gene of the genome, write previous gene information
if prev_start != "": if prev_start != "":
# Write line in LSTINFO file, + header and sequence to the gene file # Write line in LSTINFO file, + header and sequence to the gene file
_, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA",
prev_loc, name, prev_cont_num, "NA", prev_info, prev_loc, name, prev_cont_num, "NA", prev_info,
"NA", prev_strand, prev_start, prev_end, r_lst) "NA", prev_strand, prev_start, prev_end, r_lst)
gfunc.write_header(lstline, r_gen) gfunc.write_header(lstline, r_gen)
...@@ -302,7 +303,7 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name): ...@@ -302,7 +303,7 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):
# Otherwise, nothing to write # Otherwise, nothing to write
if prev_start != "": if prev_start != "":
prev_loc = "b" prev_loc = "b"
_, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA",
prev_loc, name, prev_cont_num, "NA", prev_info, "NA", prev_loc, name, prev_cont_num, "NA", prev_info, "NA",
prev_strand, prev_start, prev_end, r_lst) prev_strand, prev_start, prev_end, r_lst)
gfunc.write_header(lstline, r_gen) gfunc.write_header(lstline, r_gen)
......
...@@ -238,9 +238,6 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath): ...@@ -238,9 +238,6 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath):
bool : bool :
True if genome name used in lstfile and prokka tblfile are the same, False otherwise True if genome name used in lstfile and prokka tblfile are the same, False otherwise
""" """
# Number CRISPRs. By default, 0 CRISPR -> next one will be CRISPR1
crispr_num = 1
crispr = False
# Protein localisation in contig (b = border ; i = inside) # Protein localisation in contig (b = border ; i = inside)
cont_loc = "b" cont_loc = "b"
prev_cont_loc = "b" prev_cont_loc = "b"
...@@ -325,8 +322,8 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath): ...@@ -325,8 +322,8 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath):
# If not first gene of the contig, write the previous gene to .lst file # If not first gene of the contig, write the previous gene to .lst file
# The first gene will be written while reading the 2nd gene # The first gene will be written while reading the 2nd gene
if start != "-1" and end != "-1" and not crispr: if start != "-1" and end != "-1" and not crispr:
crispr_num, lstline = general.write_gene(feature_type, locus_num, lstline = general.write_gene(feature_type, locus_num,
gene_name, product, crispr_num, gene_name, product,
prev_cont_loc, genome, prev_cont_loc, genome,
prev_cont_num, ecnum, inf2, prev_cont_num, ecnum, inf2,
db_xref, strand, start, end, lstf) db_xref, strand, start, end, lstf)
...@@ -359,8 +356,8 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath): ...@@ -359,8 +356,8 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath):
# Write last feature # Write last feature
if start != -1 and end != -1: if start != -1 and end != -1:
prev_cont_loc = "b" prev_cont_loc = "b"
crispr_num, _ = general.write_gene(feature_type, locus_num, gene_name, product, general.write_gene(feature_type, locus_num, gene_name, product,
crispr_num, prev_cont_loc, genome, prev_cont_num, prev_cont_loc, genome, prev_cont_num,
ecnum, inf2, db_xref, strand, start, end, lstf) ecnum, inf2, db_xref, strand, start, end, lstf)
return True return True
...@@ -516,7 +513,6 @@ def create_gen(ffnseq, lstfile, genseq): ...@@ -516,7 +513,6 @@ def create_gen(ffnseq, lstfile, genseq):
""" """
problem = False problem = False
write = True # Write next sequence write = True # Write next sequence
crispr_id = 1
with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen: with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen:
for line_ffn in ffn: for line_ffn in ffn:
# Ignore gene that we do not want to write (should be a crispr) # Ignore gene that we do not want to write (should be a crispr)
......
...@@ -65,8 +65,7 @@ from PanACoTA.annotate_module import format_prodigal as fprodigal ...@@ -65,8 +65,7 @@ from PanACoTA.annotate_module import format_prodigal as fprodigal
main_logger = logging.getLogger("annotate.geneffunc") main_logger = logging.getLogger("annotate.geneffunc")
def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False, def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False):
changed_name=False):
""" """
For all genomes which were annotated (by prokka or prodigal), reformat them For all genomes which were annotated (by prokka or prodigal), reformat them
in order to have, in 'res_path', the following folders: in order to have, in 'res_path', the following folders:
...@@ -80,7 +79,8 @@ def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, q ...@@ -80,7 +79,8 @@ def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, q
Parameters Parameters
---------- ----------
genomes_ok : dict genomes_ok : dict
genomes to format (annotation was OK) -> {genome: [name, gpath, size, nbcont, l90]} genomes to format (annotation was OK) ->
{genome: [name, gpath, to_annot, size, nbcont, l90]}
res_path : str res_path : str
path to folder where the 4 directories must be created path to folder where the 4 directories must be created
annot_path : str annot_path : str
...@@ -91,9 +91,6 @@ def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, q ...@@ -91,9 +91,6 @@ def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, q
number of threads to use to while formatting genomes number of threads to use to while formatting genomes
quiet : bool quiet : bool
True if nothing must be sent to stderr/stdout, False otherwise True if nothing must be sent to stderr/stdout, False otherwise
changed_name : bool
True if contig names have been changed (cutn != 0) -> contig names end by '_num',
False otherwise.
Returns Returns
------- -------
...@@ -216,7 +213,7 @@ def handle_genome(args): ...@@ -216,7 +213,7 @@ def handle_genome(args):
return ok_format, genome return ok_format, genome
def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, def write_gene(gtype, locus_num, gene_name, product, cont_loc,
genome, cont_num, ecnum, inf2, db_xref, strand, start, end, lstopenfile): genome, cont_num, ecnum, inf2, db_xref, strand, start, end, lstopenfile):
""" """
Write given gene to output file Write given gene to output file
...@@ -231,11 +228,6 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, ...@@ -231,11 +228,6 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal) gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal)
product : str product : str
found by prokka/Prodigal, "NA" if no product (always the case for prodigal) found by prokka/Prodigal, "NA" if no product (always the case for prodigal)
crispr_num : int
current crispr number. In prokka tbl, CRISPRs are not numbered, they all
have the same name. We name them by adding a unique number to each CRISPR. If the current
gene to add is a CRISPR, this number will be incremented and returned. If not, this same
number will be returned.
cont_loc : str cont_loc : str
'i' if the gene is inside a contig, 'b' if its on the border (first or last gene 'i' if the gene is inside a contig, 'b' if its on the border (first or last gene
of the contig) of the contig)
...@@ -260,16 +252,9 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, ...@@ -260,16 +252,9 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
Returns Returns
------- -------
tuple : str :
Current crispr number, lstline lstline
""" """
# if last gene was a crispr
if gtype == "repeat_region":
gtype = "CRISPR"
locus_num = "CRISPR" + str(crispr_num)
gene_name = "crispr"
product = "crispr-array"
crispr_num += 1
locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc, locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc,
str(locus_num).zfill(5)) str(locus_num).zfill(5))
# If '|' character found in those fields, replace by '_' to avoid problems while parsing # If '|' character found in those fields, replace by '_' to avoid problems while parsing
...@@ -279,7 +264,7 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, ...@@ -279,7 +264,7 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
db_xref.replace("|", "_")) db_xref.replace("|", "_"))
lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info]) lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info])
lstopenfile.write(lst_line + "\n") lstopenfile.write(lst_line + "\n")
return crispr_num, lst_line return lst_line
def write_header(lstline, outfile): def write_header(lstline, outfile):
......
##gff-version 3 ##gff-version 3
##sequence-region test.0417.00002.0001 1 14000 ##sequence-region test.0417.00002.0001 1 84
##sequence-region test.0417.00002.0002 1 5000 ##sequence-region test.0417.00002.0002 1 103
##sequence-region test.0417.00002.0003 1 4600 ##sequence-region test.0417.00002.0003 1 122
##sequence-region test.0417.00002.0004 1 8000 ##sequence-region test.0417.00002.0004 1 35
##sequence-region test.0417.00002.0005 1 1 ##sequence-region test.0417.00002.0005 1 198
##sequence-region test.0417.00002.0006 1 10 ##sequence-region test.0417.00002.0006 1 128
##sequence-region test.0417.00002.0007 1 15000 ##sequence-region test.0417.00002.0007 1 85
test.0417.00002.0001 Prodigal:2.6 CDS 287 787 . + 0 ID=test.0417.00002.0001b_00001;inference=ab initio prediction:Prodigal:2.6;locus_tag=EPKOMDHM_00001;product=hypothetical protein test.0417.00002.0001 Prodigal:2.6 CDS 287 787 . + 0 ID=test.0417.00002.0001b_00001;inference=ab initio prediction:Prodigal:2.6;locus_tag=EPKOMDHM_00001;product=hypothetical protein
test.0417.00002.0001 Prodigal:2.6 CDS 4416 6068 . + 0 ID=test.0417.00002.0001i_00002;Name=yiaD;gene=yiaD;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P37665;locus_tag=EPKOMDHM_00005;product=putative lipoprotein YiaD test.0417.00002.0001 Prodigal:2.6 CDS 4416 6068 . + 0 ID=test.0417.00002.0001i_00002;Name=yiaD;gene=yiaD;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P37665;locus_tag=EPKOMDHM_00005;product=putative lipoprotein YiaD
test.0417.00002.0001 Prodigal:2.6 CDS 9000 12002 . - 0 ID=test.0417.00002.0001b_00003;Name=vgrG1;gene=vgrG1;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:Q9HI36;locus_tag=EPKOMDHM_00006;product=Major exported protein test.0417.00002.0001 Prodigal:2.6 CDS 9000 12002 . - 0 ID=test.0417.00002.0001b_00003;Name=vgrG1;gene=vgrG1;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:Q9HI36;locus_tag=EPKOMDHM_00006;product=Major exported protein
......
...@@ -129,13 +129,13 @@ def test_create_gff(caplog): ...@@ -129,13 +129,13 @@ def test_create_gff(caplog):
"ter": "test.0417.00002.0006", "ter": "test.0417.00002.0006",
"contname": "test.0417.00002.0007" "contname": "test.0417.00002.0007"
} }
sizes = {"test.0417.00002.0001": 14000, sizes = {"test.0417.00002.0001": 84,
"test.0417.00002.0002": 5000, "test.0417.00002.0002": 103,
"test.0417.00002.0003": 4600, "test.0417.00002.0003": 122,
"test.0417.00002.0004": 8000, "test.0417.00002.0004": 35,
"test.0417.00002.0005": 1, "test.0417.00002.0005": 198,
"test.0417.00002.0006": 10, "test.0417.00002.0006": 128,
"test.0417.00002.0007": 15000, "test.0417.00002.0007": 85,
} }
res_gff_file = os.path.join(GENEPATH, "prodigal_res.gff") res_gff_file = os.path.join(GENEPATH, "prodigal_res.gff")
exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst") exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst")
...@@ -383,7 +383,15 @@ def test_format_1genome_emptygpath(caplog): ...@@ -383,7 +383,15 @@ def test_format_1genome_emptygpath(caplog):
# Create empty file, that we give to prodigal for formatting step # Create empty file, that we give to prodigal for formatting step
gpath = os.path.join(GENEPATH, "original_name-empty.fna") gpath = os.path.join(GENEPATH, "original_name-empty.fna")
open(gpath, "w").close() open(gpath, "w").close()
prod_path = TEST_ANNOTE prod_path = GENEPATH
# Create prodigal result files (empty, then won't be read)
prodigal_dir = gpath + "-prodigalRes"
os.makedirs(prodigal_dir)
prodigal_faa = os.path.join(gpath + "-prodigalRes", "notread.faa")
prodigal_ffn = os.path.join(gpath + "-prodigalRes", "notread.ffn")
prodigal_gff = os.path.join(gpath + "-prodigalRes", "notread.gff")
for file in [prodigal_faa, prodigal_gff, prodigal_ffn]:
open(file, "w").close()
# Generate result folders # Generate result folders
prot_dir = os.path.join(GENEPATH, "Proteins") prot_dir = os.path.join(GENEPATH, "Proteins")
lst_dir = os.path.join(GENEPATH, "LSTINFO") lst_dir = os.path.join(GENEPATH, "LSTINFO")
...@@ -428,12 +436,14 @@ def test_format_1genome_wrongffn(caplog): ...@@ -428,12 +436,14 @@ def test_format_1genome_wrongffn(caplog):
name = "prodigal.outtest.ok" name = "prodigal.outtest.ok"
# path to original genome, given to prodigal for annotation # path to original genome, given to prodigal for annotation
orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna")
# In GENEPATH folder, create the original genome given to prodigal orig_prodpath = orig_gpath + "-prodigalRes"
# (copy from test_file)
used_gpath = os.path.join(GENEPATH, "original_name.fna") used_gpath = os.path.join(GENEPATH, "original_name.fna")
used_respath = used_gpath + "-prodigalRes" used_respath = used_gpath + "-prodigalRes"
os.makedirs(used_respath) # Add original genome, and prodigal results to result folder
shutil.copyfile(orig_gpath, used_gpath) shutil.copyfile(orig_gpath, used_gpath)
shutil.copytree(orig_prodpath, used_respath)
# In GENEPATH folder, create the original genome given to prodigal
# (copy from test_file)
# Create gen_file with a header not existing # Create gen_file with a header not existing
with open(os.path.join(used_respath, "prodigal.outtest.ok.ffn"), "w") as ori: with open(os.path.join(used_respath, "prodigal.outtest.ok.ffn"), "w") as ori:
ori.write(">wrongheader # 1 # 2 # 1 # toto") ori.write(">wrongheader # 1 # 2 # 1 # toto")
...@@ -481,16 +491,14 @@ def test_format_1genome_wronggff(caplog): ...@@ -481,16 +491,14 @@ def test_format_1genome_wronggff(caplog):
# path to original genome, given to prodigal for annotation # path to original genome, given to prodigal for annotation
orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna")
orig_prodpath = orig_gpath + "-prodigalRes"
# In generated_by_tests folder, create the original genome given to prodigal # In generated_by_tests folder, create the original genome given to prodigal
# (copy from test_file) # (copy from test_file)
used_gpath = os.path.join(GENEPATH, "original_name.fna") used_gpath = os.path.join(GENEPATH, "original_name.fna")
used_respath = used_gpath + "-prodigalRes" used_respath = used_gpath + "-prodigalRes"
os.makedirs(used_respath) # Add original genome, and prodigal results to result folder
shutil.copyfile(orig_gpath, used_gpath) shutil.copyfile(orig_gpath, used_gpath)
# Copy ffn file generated by prodigal: shutil.copytree(orig_prodpath, used_respath)
orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn")
used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn")
shutil.copyfile(orig_ffn, used_ffn)
# Copy gff file, but modify to get wrong start position # Copy gff file, but modify to get wrong start position
orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff") orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff")
used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff") used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff")
...@@ -548,21 +556,14 @@ def test_format_1genome_wrongprt(caplog): ...@@ -548,21 +556,14 @@ def test_format_1genome_wrongprt(caplog):
# path to original genome, given to prodigal for annotation # path to original genome, given to prodigal for annotation
orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna")
orig_prodpath = orig_gpath + "-prodigalRes"
# In generated_by_tests folder, create the original genome given to prodigal # In generated_by_tests folder, create the original genome given to prodigal
# (copy from test_file) # (copy from test_file)
used_gpath = os.path.join(GENEPATH, "original_name.fna") used_gpath = os.path.join(GENEPATH, "original_name.fna")
used_respath = used_gpath + "-prodigalRes" used_respath = used_gpath + "-prodigalRes"
os.makedirs(used_respath) # Add original genome, and prodigal results to result folder
shutil.copyfile(orig_gpath, used_gpath) shutil.copyfile(orig_gpath, used_gpath)
# Copy ffn file generated by prodigal: shutil.copytree(orig_prodpath, used_respath)
orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn")
used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn")
shutil.copyfile(orig_ffn, used_ffn)
# Copy gff file generated by prodigal:
orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff")
used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff")
shutil.copyfile(orig_gff, used_gff)
# Copy prt file, but removing first proteins
orig_faa = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.faa") orig_faa = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.faa")
used_faa = os.path.join(used_respath, "prodigal.outtest.ok.faa") used_faa = os.path.join(used_respath, "prodigal.outtest.ok.faa")
with open(orig_faa, "r") as faa, open(used_faa, "w") as faar: with open(orig_faa, "r") as faa, open(used_faa, "w") as faar:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment