Skip to content
Snippets Groups Projects
Commit 7ca4d503 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

little improvements for lst/gen generation

parent 44208646
Branches
Tags
No related merge requests found
......@@ -5,7 +5,7 @@
Functions to convert prodigal result files to gembase format.
* Proteins: multifasta with all CDS in aa
* Replicons: multifasta of genome
* Replicons: (multi)fasta of genome sequences
* Genes: multifasta of all genes in nuc
* gff3: gff files without sequence
* LSTINFO: information on annotation. Columns are: "start end strand type locus
......@@ -42,7 +42,8 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
name : str
gembase name of the genome
prod_path : str
directory where prodigal folders are saved
directory where all tmp_files for all sequences are saved (sequences cut at each set
of 5N, prodigal results and logs)
name : str
gembase name of the genome
lst_dir : srt
......@@ -63,11 +64,15 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
bool :
True if genome was correctly formatted, False otherwise
"""
# Get directory where prodigal results for the current genome are saved
prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes")
# Get prodigal result files
prot_file = os.path.join(prodigal_dir, name + ".faa")
gen_file = os.path.join(prodigal_dir, name + ".ffn")
gff_file = os.path.join(prodigal_dir, name + ".gff")
# Define names for generated gembase files
res_prot_file = os.path.join(prot_dir, name + ".prt")
res_gene_file = os.path.join(gene_dir, name + ".gen")
res_lst_file = os.path.join(lst_dir, name + ".lst")
......@@ -75,15 +80,16 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
res_gff_file = os.path.join(gff_dir, name + ".gff")
# First, create gen and lst files, and get genome contig names
# First, create .gen and .lst files,
ok = create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger)
if not ok:
os.remove(res_gene_file)
os.remove(res_lst_file)
logger.error("Problems while generating gene and lst files for {}".format(name))
logger.error("Problems while generating .gen and .lst files for {}".format(name))
return False
# Create replicon file. Same as input sequence but with gembase formatted headers
# From Replicon file, get contig names, and use them to generate gff file
ok = create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name, logger)
# ok = create_prt(prot_file, res_prot_file, res_lst_file, logger)
......@@ -94,14 +100,14 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
"""
Generate .gen file, from sequences contained in .ffn, but changing the
headers to match with gembase format.
At the same time, generate lst file
At the same time, generate .lst file, from the information given in prodigal ffn headers
Parameters
----------
gen_file : str
.ffn file generated by prodigal
res_gene_file : str
output file, to write in Genes directory
output .gen file, to write in Genes directory
res_lst_file : str
.lst file to write in LSTINFO directory
name : str
......@@ -115,7 +121,7 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
True if conversion went well, False otherwise
"""
# Variable which will contain the current sequence
# Variable which will contain the current gene sequence
seq = ""
# number of the current gene (first gene is 1, 2nd is 2 etc. each number is unique: do not
# re-start at 1 for each new contig)
......@@ -167,6 +173,7 @@ def create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger):
prev_end = end
prev_cont_num = contig_num
prev_annot = annot
# Convert strand 1/-1 to D/C
if int(strand) == 1:
strand = "D"
else:
......@@ -261,7 +268,7 @@ def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, na
path to the genome sequence
res_rep_file : str
path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers
res-gff_file : str
res_gff_file : str
path to the gff file that must be created in result database
res_lst_file : str
path to the lst file that was created in result database in the previous step
......@@ -274,13 +281,14 @@ def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, na
Returns
-------
list :
bool :
True if everything went well, False otherwise
"""
contigs = create_replicons(gpath, res_rep_file, name)
if contigs:
return create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logger)
logger.info(contigs)
# if contigs:
# return create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logger)
def create_replicons(gpath, res_rep_file, name):
......@@ -294,18 +302,20 @@ def create_replicons(gpath, res_rep_file, name):
----------
gpath : str
path to the genome sequence
res_rep_file : str
path to the new file, containing 'gpath' sequence, but with 'gembase_named' headers
name : str
genome name to use (species.date.strain)
res_rep_file : str
path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers
Returns
-------
list :
List of all contigs with their size ["contig1;size1", "contig2;size2" ...]
"""
# Change headers to put into gembase format
# Change headers of replicon file to put into gembase format, and save it in res_rep_file
contigs = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file)
# Return each contig name with its size.
return contigs
......@@ -349,7 +359,7 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, name, contigs, logge
Returns
-------
bool
True if everythin went well, False if any problem
True if everything went well, False if any problem
"""
# open gff generated by prodigal to read it
......
......@@ -850,7 +850,10 @@ def check_out_dirs(resdir):
def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
"""
For the given genome (sequence in gpath), rename all its contigs
with the new name: 'gembase_name', and save the output sequence in outfile
with the new name: 'gembase_name', and save the output sequence in outfile.
For each contig renamed, save its new name as well as its size. This will be used to generate
gff files
Parameters
----------
......@@ -861,22 +864,30 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
outfile : str
path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers
Returns
-------
list
List of all contigs with their size ["contig1;size1", "contig2;size2" ...]
"""
# Initialize variables
contig_num = 1
cont_size = 0
contigs = []
# Read input sequence given to prodigal, and open file where sequences with new
# headers must be written.
with open(gpath, "r") as gpf, open(outfile, "w") as grf:
for line in gpf:
# When we find a new header line, convert its name to gembase format, and write it
# to the list of contigs, as well as its size (calculated before)
# to output replicon file
if line.startswith(">") :
new_cont = ">" + gembase_name + "." + str(contig_num).zfill(4)
grf.write(new_cont + "\n")
# If first contig, just convert its name, but no size yet
# If not first contig (size != 0), add its name as well as its size to contigs list
if cont_size != 0:
contigs.append("{};{}".format(new_cont, cont_size))
contig_num += 1
cont_size = 0
# Sequence line: write it as is in replicon file, and add its size to cont_size.
else:
grf.write(line)
cont_size += len(line.strip())
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment