Skip to content
Snippets Groups Projects
Commit 98b81ec5 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Format ok for Replicon, LSTINFO, Genes.

gff3 started, but problems with contig names
parent 89fc00f9
No related branches found
No related tags found
No related merge requests found
......@@ -79,11 +79,15 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
res_gff_file = os.path.join(gff_dir, name + ".gff")
# First, create .gen and .lst files,
# First, create .gen and .lst files. If they could not be formatted,
# remove those files, and return False with error message
ok = create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger)
if not ok:
os.remove(res_gene_file)
os.remove(res_lst_file)
try:
os.remove(res_gen_file)
os.remove(res_lst_file)
except OSError:
pass
logger.error("Problems while generating .gen and .lst files for {}".format(name))
return False
......@@ -91,12 +95,24 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
# gembase formatted headers
# From Replicon file, get contig names, and use them to generate gff file
ok = create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name, logger)
# If problem while formatting the genome (rep or gff file), remove all already created files, and return False with error message.
if not ok:
try:
os.remove(res_gene_file)
os.remove(res_lst_file)
os.remove(res_rep_file)
os.remove(res_gff_file)
except OSError:
pass
logger.error("Problems while generating .fna (Replicons folder) and .gff (gff3 folder) "
"files for {}".format(name))
return False
# ok = create_prt(prot_file, res_prot_file, res_lst_file, logger)
# utils.rename_genome_contigs(gembase_name, gpath, outfile):\
def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
"""
Generate .gen file, from sequences contained in .ffn, but changing the
headers to match with gembase format.
......@@ -106,7 +122,7 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
----------
gen_file : str
.ffn file generated by prodigal
res_gene_file : str
res_gen_file : str
generated .gen file, to write in Genes directory
res_lst_file : str
generated .lst file to write in LSTINFO directory
......@@ -128,18 +144,20 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
locus_num = 1
# contig number of the last gene. To check if we are now in a new contig (-> loc = b) or not
prev_cont_num = 0
# Keep start, strand and end from the previous gene, before overwriting it with the new one
# Keep start, end, strand and informations (prodigal gives information on start_type,
# gc_cont etc.) from the previous gene, before overwriting it with information
# on the new one
prev_start = ""
prev_end = ""
prev_strand = ""
prev_annot = ""
prev_info = ""
# Update loc when contig changes ('b' if gene at the border of a contig, 'i' if it is inside)
prev_loc = "b"
# To start, the first gene is, by definition, at the border of the contig
loc = "b"
# Open files: ffn prodigal to read, .gen and .lst gembase to create
with open(gen_file, "r") as ffn, open(res_gene_file, "w") as r_gen,\
# Open files: .ffn prodigal to read, .gen and .lst gembase to create
with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\
open(res_lst_file, "w") as r_lst:
# Read all lines in ffn file (sequences in nuc. for each gene)
for lineffn in ffn:
......@@ -149,18 +167,19 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
continue
# Otherwise:
# - write header of previous sequence to .gen
# - write previous sequence to .gen
# - write previous sequence (in 'seq') to .gen
# - write LSTINFO information to .lst
# - update information (new start, end, contig number etc.) for next gene
else:
# Get information given for the new gene (by ffn file from prodigal)
(gname, start, end, strand, annot) = lineffn.strip().split(">")[-1].split("#")
# Get information given for the new gene (by .ffn file from prodigal)
(gname, start, end, strand, info) = lineffn.strip().split(">")[-1].split("#")
# Get contig number from prodigal gene header: prodigal first part of header is:
# <original genome name>_<contig number>_protein number
# <original genome name>_<contig number>_<protein number>
contig_num = int(gname.strip().split("_")[-2])
# If first gene of a new contig, previous gene was the last of
# its contig -> prev_loc = "b"
# If new contig:
# - previous gene was the last of its contig -> prev_loc = "b" ;
# - the current gene is the first of its contig (loc = "b")
if contig_num != prev_cont_num:
prev_loc = 'b'
loc = 'b'
......@@ -171,13 +190,14 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
if contig_num == prev_cont_num and prev_loc == "b":
loc = 'i'
# If it is the first gene, we do not have any "previous gene" to write .
# If it is the first gene of the genome, we do not have any "previous gene"
# to write .
# -> get new information, save it for the next gene, and go to next line
if prev_start == "":
prev_start = start
prev_end = end
prev_cont_num = contig_num
prev_annot = annot
prev_info = info
# Convert strand 1/-1 to D/C
if int(strand) == 1:
strand = "D"
......@@ -186,9 +206,9 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
prev_strand = strand
continue
# Write line in lstinfo, + header and sequence to the gene file
# Write line in LSTINFO file, + header and sequence to the gene file
_, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0,
prev_loc, name, prev_cont_num, "NA", prev_annot,
prev_loc, name, prev_cont_num, "NA", prev_info,
prev_strand, prev_start, prev_end, r_lst)
gfunc.write_header(lstline, r_gen)
r_gen.write(seq)
......@@ -207,11 +227,11 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
prev_end = end
prev_strand = strand
prev_loc = loc
prev_annot = annot
# Write last sequence (-> loc = 'b')
prev_info = info
# Write last gene of the genome (-> loc = 'b')
prev_loc = "b"
_, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0,
prev_loc, name, prev_cont_num, "NA", prev_annot, prev_strand,
prev_loc, name, prev_cont_num, "NA", prev_info, prev_strand,
prev_start, prev_end, r_lst)
gfunc.write_header(lstline, r_gen)
r_gen.write(seq)
......@@ -249,9 +269,9 @@ def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, na
"""
contigs = create_replicons(gpath, res_rep_file, name)
logger.info(contigs)
# if contigs:
# return create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logger)
return contigs
def create_replicons(gpath, res_rep_file, name):
......@@ -292,7 +312,7 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
##sequence-region contig1 start end
##sequence-region contig2 start end
...
seqid (contig) source type start end score strand phase attributes
seqid(=contig) source type start end score strand phase attributes
All fields tab separated.
Empty fields contain '.'
......@@ -307,7 +327,6 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
----------
gpath : str
path to the genome sequence given to prodigal
v
res-gff_file : str
path to the gff file that must be created in result database
res-lst_file : str
......@@ -315,7 +334,7 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
name : str
genome name to use (species.date.strain)
contigs : list
list of contig names with their size. ["contig1;size1", "contig2;size2" ...]
list of contig names with their size. ["contig1"\t"size1", "contig2"\t"size2" ...]
logger : logging.Logger
logger object to put information
......@@ -333,9 +352,11 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
rgf.write("##gff-version 3\n")
for contig in contigs:
# Write the list of contigs, with their size
cont_name, end = contig.split(";")
rgf.write("##sequence-region\t{}\t{}\t{}\n".format(cont_name[1:], "1", end))
cont_name, end = contig.split("\t")
rgf.write("##sequence-region\t{}\t{}\t{}".format(cont_name[1:], "1", end))
for linegff in gf:
iter_contig = iter(contigs)
contig = next(iter_contig)
# Ignore gff lines starting by #. For the new gff, these are already all written at the
# beginning of the file.
if linegff.startswith("#"):
......@@ -354,9 +375,15 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
fields_l = linelst.strip().split("\t")
fields_l = [info.strip() for info in fields_l]
start_l, end_l, strand_l, type_l, locus_l, _, _ = fields_l
cname = contig.split("\t")[0][1:]
# Get gff and ffn filenames to give information to user if error message
gff = os.path.basename(gff_file)
ffn = ".".join(gff.split(".")[:-1]) + ".ffn"
# Path where gff and ffn generated by prodigal are
tmp = gpath + "-prodigalRes"
# Get gene name given by prodigal to current gene
gname = attributes.split("ID=")[1].split(";")[0]
# Compare information from lst and information from prodigal gff (start,
......@@ -364,6 +391,8 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
for (elemg, eleml, label) in zip([start_g, end_g, type_g],
[start_l, end_l, type_l],
["start", "end", "type"]):
# If 1 element is different (start or end position, or type), print error
# message and return False: this genome could not be converted to gff
if elemg != eleml:
logger.error("Files {} and {} (in prodigal tmp_files: {}) do not have the same {} value for gene {}"
" ({} in gff, {} in ffn))".format(ffn, gff, tmp, label,
......@@ -375,10 +404,9 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
new = [atr if "ID" not in atr else 'ID={}'.format(locus_l) for atr in at_split]
# Get information to put to the new gff line
# Get contig name without '>'
cname = "".join(cont_name.split('>')[1:])
info = "\t".join([cname, source, type_g, start_g, end_g, score, strand_g, phase, ";".join(new)])
rgf.write(info + "\n")
return True
return True
def create_prt(prot_file, res_prot_file, res_lst_file, logger):
......
......@@ -194,14 +194,14 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
locus_num : str
number of the locus given by prokka/prodigal
gene_name : str
gene name found by prokka/prodigal ("NA" if no gene name)
gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal)
product : str
found by prokka/Prodigal, "NA" if no product (always the case for prodigal)
crispr_num : int
current crispr number. In prokka tbl, CRISPRs are not numbered, they all
have the same name. We name them by adding a unique number to each CRISPR. If the current
gene to add is a CRISPR, this number will be incremented and returned. If not, this same
name will be returned.
number will be returned.
cont_loc : str
'i' if the gene is inside a contig, 'b' if its on the border (first or last gene
of the contig)
......@@ -210,14 +210,14 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
cont_num : int
contig number
ecnum : str
EC number, found by prokka, or "NA" if no EC number
EC number, found by prokka, or "NA" if no EC number (-> always for prodigal)
inf2 : str
more information found by prokka, or "NA" if no more information
more information found by prokka/prodigal, or "NA" if no more information
strand : str
C (complement) or D (direct)
start : int
start : str
start of gene in the contig
end : int
end : str
end of gene in the contig
lstopenfile : _io.TextIOWrapper
open file where lstinfo must be written
......@@ -240,8 +240,7 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
more_info = "| {} | {} | {}".format(product.replace("|", "_"),
ecnum.replace("|", "_"),
inf2.replace("|", "_"))
lst_line = "\t".join([str(start), str(end), strand, gtype,
locus_name, gene_name, more_info])
lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info])
lstopenfile.write(lst_line + "\n")
return crispr_num, lst_line
......@@ -255,8 +254,7 @@ def write_header(lstline, outfile):
lstline : str
current line of lst file
outfile : _io.TextIOWrapper
open file where header must be written-
open file where header must be written
"""
start, end, _, _, name, gene_name, info = lstline.split("\t")
size = int(end) - int(start) + 1
......
......@@ -867,12 +867,22 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
Returns
-------
list
List of all contigs with their size ["contig1;size1", "contig2;size2" ...]
List of all contigs with their size ["contig1'\t'size1", "contig2'\t'size2" ...]
"""
# Initialize variables
# Contig number
contig_num = 1
# contig size
cont_size = 0
# List of contigs [<name>\t<size>]
contigs = []
# Name of previous contig (to put to contigs, as we need to wait for the next
# contig to know the size of the previous one)
prev_cont = ""
# sequence of previous contig
seq = ""
# Read input sequence given to prodigal, and open file where sequences with new
# headers must be written.
with open(gpath, "r") as gpf, open(outfile, "w") as grf:
......@@ -880,18 +890,27 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
# When we find a new header line, convert its name to gembase format, and write it
# to output replicon file
if line.startswith(">") :
new_cont = ">" + gembase_name + "." + str(contig_num).zfill(4)
grf.write(new_cont + "\n")
# If not first contig (size != 0), add its name as well as its size to contigs list
if cont_size != 0:
contigs.append("{};{}".format(new_cont, cont_size))
# If not first contig (contigs not empty):
# - add its name as well as its size to contigs list
# - write header ("<contig name> <size>") to replicon file
if prev_cont:
cont = "\t".join([prev_cont, str(cont_size)]) + "\n"
contigs.append(cont)
grf.write("\t".join([prev_cont, str(cont_size)]) + "\n")
grf.write(seq)
prev_cont = ">" + gembase_name + "." + str(contig_num).zfill(4)
contig_num += 1
cont_size = 0
seq = ""
# Sequence line: write it as is in replicon file, and add its size to cont_size.
else:
grf.write(line)
seq += line
cont_size += len(line.strip())
contigs.append("{};{}".format(new_cont, cont_size))
# Write last contig
cont = "\t".join([prev_cont, str(cont_size)]) + "\n"
contigs.append(cont)
grf.write("\t".join([prev_cont, str(cont_size)]) + "\n")
grf.write(seq)
return contigs
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment