__main__.py 19.19 KiB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
###############################################################################
# #
# Copyright (C) 2020 Melanie HENNART #
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <https://www.gnu.org/licenses/>. #
# #
# #
# Contact: #
# #
# Melanie HENNART, PhD Student #
# melanie.hennart@pasteur.fr #
# Biodiversity and Epidemiology of Bacterial Pathogens #
# Institut Pasteur #
# 25-28, Rue du Docteur Roux #
# 75015 Paris Cedex 15 #
# France #
# #
###############################################################################
"""
diphtOscan is a tool to screen genome assemblies of the diphtheriae species
complex (DiphSC) for:
- Species (e.g. C. diphtheriae, C. belfantii, C. rouxii, C. ulcerans,
C. ramonii and C. pseudotuberculosis)
- Biovar-associated genes
- MLST sequence type
- Virulence factors
- Antimicrobial resistance: acquired genes, SNPs & genomic context
- Detection of tox gene (Presence/Absence & Allele)
- Detection of integrons (Integron_Finder)
- Tree building (JolyTree)
Usage:
======
python diphtOscan.py argument1 argument2
argument1: un entier signifiant un truc
argument2: une chaîne de caractères décrivant un bidule
"""
__authors__ = ("Melanie HENNART; Martin RETHORET-PASTY")
__contact__ = ("martin.rethoret-pasty@pasteur.fr")
__version__ = "1.6.1"
__copyright__ = "copyleft"
__date__ = "2024/03/04"
###############################################################################
# #
# ================ #
# = INSTALLATION = #
# ================ #
# #
# [1] REQUIREMENTS ========================================================= #
# #
# -- Mash: fast pairwise p-distance estimation ----------------------------- #
# VERSION >= 2.1 #
# src: github.com/marbl/Mash #
# Ondov BD, Treangen TJ, Melsted P, Mallonee AB, Bergman NH, Koren S, #
# Phillippy AM (2016) Mash: fast genome and metagenome distance #
# estimation using MinHash. Genome Biology, 17:132. #
# doi:10.1186/s13059-016-0997-x #
# #
#PATH_MASH="mash" #
# #
# #
# [2] NOTES ON THE USE OF JOLYTREE WITH SLURM (slurm.schedmd.com) ========== #
# #
#os.system("module purge") ;
#os.system("module load Mash/2.1") ; #
#PATH_MASH = "module load Mash/2.1" #
# #
# #
###############################################################################
###############################################################################
# ================ #
# = NOTE = #
# ================ #
# #
# mash sketch -o reference genome1.fna genome2.fna #
# mash info reference.msh #
# #
###############################################################################
import sys
import os
import pandas as pd
import datetime
import os.path
import argparse
import subprocess
import shutil
sys.path.append('/module')
from typing import List
from module.species import get_species_results, is_cd_complex
from module.template_iTOL import spuA, narG, toxin, amr_families
from module.updating_database import update_database
from module.jolytree_generation import generate_jolytree
from module.utils import (
get_chromosome_mlst_results,
get_tox_results,
get_chromosome_mlst_header,
get_tox_header,
delete_virulence_extended,
is_non_zero_file,
armfinder_to_table,
get_genomic_context,
find_resistance_db
)
def test_unique_dependency(name:str):
return shutil.which(name) is not None
def test_multiple_dependencies(dependencies:List[str]):
for dependency in dependencies:
presence = test_unique_dependency(dependency)
if presence is not True:
print(f'/!\\ Warning /!\\ : {dependency} missing in path!')
sys.exit(-1)
def test_required_dependency(args):
diphtoscan_dependencies = ["mash",'amrfinder','hmmsearch', 'makeblastdb','blastn', 'blastp']
joly_tree_path = "diphtoscan/script/JolyTree/JolyTree.sh"
joly_tree_dependencies = ["gawk",'fastme','REQ']
integron_fender_dependencies = ['hmmsearch', 'cmsearch', 'prodigal']
if args.assemblies == None: #TODO :Ensure that dependencies are not required to update the database
return args
subprocess.run(["echo", "Dependency testing"])
test_multiple_dependencies(diphtoscan_dependencies)
if args.integron:
rc = test_unique_dependency("integron_finder")
test_multiple_dependencies(integron_fender_dependencies)
if rc == 0:
args.integron = True
else:
print('/!\\ Warning /!\\ : integron_finder missing in path! Integron analysis not carried out.')
args.integron = False
if args.tree:
if os.path.isfile(joly_tree_path):
test_multiple_dependencies(joly_tree_dependencies)
args.tree = True
else:
print('/!\\ Warning /!\\ : JolyTree.sh missing in /diphtoscan/script/JolyTree/ ! Joly_tree representation not carried out.')
args.tree = False
print('\n')
return args
def redefine_output_file(args):
if not os.path.exists(args.outdir):
print(f"Directory {args.outdir} does not exist.")
try:
os.makedirs(args.outdir)
print("Directory '%s' created successfully \n" %args.outdir)
except OSError :
print("Directory '%s' can not be created \n" %args.outdir)
sys.exit(0)
final_output_path = args.outdir
args.outdir = f"{args.outdir}_temp_folder"
return args, final_output_path
def rename_temp_folder_file(directory):
path, file_name = os.path.split(directory)
expected_filename = f"{file_name}.txt"
file_path = os.path.join(directory, expected_filename)
if os.path.exists(file_path):
new_filename = expected_filename.replace("_temp_folder.txt", ".txt")
new_file_path = os.path.join(args.outdir, new_filename)
try:
os.rename(file_path, new_file_path)
except Exception as e:
print(f"Error renaming file : {e}")
else:
print(f"The {file_path} file does not exist.")
def move_file_to_outdir_folder(temporary_folder, outdir_folder):
for fichier in os.listdir(outdir_folder):
chemin_fichier = os.path.join(outdir_folder, fichier)
try:
if os.path.isfile(chemin_fichier):
os.unlink(chemin_fichier)
elif os.path.isdir(chemin_fichier):
shutil.rmtree(chemin_fichier)
except Exception as e:
print(f"Error when deleting {chemin_fichier}: {e}")
rename_temp_folder_file(temporary_folder)
for fichier in os.listdir(temporary_folder):
source_path = os.path.join(temporary_folder, fichier)
destination_path = os.path.join(outdir_folder, fichier)
try:
if os.path.isfile(source_path):
shutil.move(source_path, destination_path)
elif os.path.isdir(source_path):
shutil.move(source_path, destination_path)
except Exception as e:
print(f"Error when transferring {source_path} to {destination_path}:: {e}")
try:
shutil.rmtree(temporary_folder)
except Exception as e:
print(f"Error when deleting the temporary folder {temporary_folder}: {e}")
def parse_arguments():
parser = argparse.ArgumentParser(description='diphtOscan is a tool to screen genome assemblies '
'of the diphtheriae species complex (CdSC)',
add_help=False)
updating_args = parser.add_argument_group('Updating option')
updating_args.add_argument('-u', '--update', action='store_true',
help='Update database MLST, Tox Allele & AMR (default: no).'
'The database update can be executed on its own without the -a option.')
required_args = parser.add_argument_group('Required option')
required_args.add_argument('-a', '--assemblies', nargs='+', type=str, required=('-u' not in sys.argv and '--update' not in sys.argv),
help='FASTA file(s) for assemblies. ') #-a is required only if -u is not present. It allows the user to update the database easily
screening_args = parser.add_argument_group('Screening options')
screening_args.add_argument('-st', '--mlst', action='store_true',
help='Turn on species Corynebacterium diphtheriae species complex (CdSC)'
' and MLST sequence type (default: no)')
screening_args.add_argument('-t', '--tox', action='store_true',
help='Turn on tox allele (default: no)')
screening_args.add_argument('-res_vir', '--resistance_virulence', action='store_true',
help='Turn on resistance and main virulence genes screening (default: no resistance '
'and virulence gene screening)')
screening_args.add_argument('-plus', '--extend_genotyping', action='store_true',
help='Turn on all virulence genes screening (default: no all virulence '
'gene screening)')
screening_args.add_argument('-integron', '--integron', action='store_true',
help='Screening the intregon(default: no)')
output_args = parser.add_argument_group('Output options')
output_args.add_argument('-o', '--outdir', type=str, default="results_"+ datetime.datetime.today().strftime("%Y-%m-%d_%I-%M-%S_%p"),
help='Folder for detailed output (default: results_YYYY-MM-DD_II-MM-SS_PP)')
setting_args = parser.add_argument_group('Settings')
setting_args.add_argument('--min_identity', type=float, default=80.0,
help='Minimum alignment identity for main results (default: 80)')
setting_args.add_argument('--min_coverage', type=float, default=50.0,
help='Minimum alignment coverage for main results (default: 50)')
setting_args.add_argument('--threads', type=int, default=4,
help='The number of threads to use for processing. (default: 4)')
setting_args.add_argument('--overwrite', action='store_true',
help='Allows the output directory to be overwritten if it already exists')
tree_args = parser.add_argument_group('Phylogenetic tree')
tree_args.add_argument('-tree', '--tree', action='store_true',
help='Generates a phylogenetic tree from JolyTree')
help_args = parser.add_argument_group('Help')
help_args.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS,
help='Show this help message and exit')
help_args.add_argument('--version', action='version', version='diphtOscan v' + __version__,
help="Show program's version number and exit")
# If no arguments were used, print the entire help (argparse default is to just give an error
# like '-a is required').
if len(sys.argv) == 1:
parser.print_help(file=sys.stderr)
sys.exit(1)
args = parser.parse_args()
args.extract = False
args.path = os.path.dirname(os.path.abspath(__file__))
args = test_required_dependency(args)
return args
if __name__ == "__main__":
args = parse_arguments()
get_path = os.getcwd()
MLST_db = (get_chromosome_mlst_header(), args.path + '/data/mlst/pubmlst_diphtheria_seqdef_scheme_3.fas', args.path + '/data/mlst/st_profiles.txt')
TOX_db = (get_tox_header(), args.path + '/data/tox/pubmlst_diphtheria_seqdef_scheme_4.fas', args.path + '/data/tox/tox_profiles.txt')
update_database(args,MLST_db,TOX_db)
if args.assemblies == None:
sys.exit(0)
resistance_db = find_resistance_db(args)
if args.overwrite :
args, final_output_path = redefine_output_file(args)
try:
os.makedirs(args.outdir)
print("Directory '%s' created successfully \n" %args.outdir)
except OSError :
print("Directory '%s' can not be created \n" %args.outdir)
sys.exit(0)
dict_results = {}
data_resistance = pd.DataFrame()
for genome in args.assemblies :
basename = os.path.basename(genome)
strain = os.path.splitext(basename)[0]
dict_genome = get_species_results(genome, args.path + '/data/species', str(args.threads))
if args.mlst :
cd_complex = is_cd_complex(dict_genome)
dict_genome.update(get_chromosome_mlst_results(MLST_db, genome, cd_complex, args))
if args.tox :
dict_genome.update(get_tox_results(TOX_db, genome, args))
if args.resistance_virulence :
min_identity = "-1" # Defaut amrfinder
os.system('amrfinder --nucleotide ' + genome +
' --name '+strain+
' --nucleotide_output ' + args.outdir + "/" + strain + ".prot.fa" +
' --output '+ args.outdir + "/" + strain + ".blast.out" +
' --ident_min '+ min_identity +
' --coverage_min ' + str(args.min_coverage/100) +
' --organism Corynebacterium_diphtheriae' +
' --database ' + resistance_db +
' --threads ' + str(args.threads)+
#' --blast_bin /opt/gensoft/exe/blast+/2.12.0/bin/' +
' --translation_table 11 --plus --quiet ')
if is_non_zero_file(args.outdir +'/' +strain + ".prot.fa"):
data = pd.read_csv(args.outdir +'/' + strain + ".blast.out",sep="\t", dtype='str')
data['File'] = genome
data_resistance = pd.concat([data_resistance, data], axis = 0, ignore_index=True)
dict_genome.update({"GENOMIC_CONTEXT" : get_genomic_context (args.outdir, data)})
else :
os.system('rm '+ args.outdir +'/' + strain + ".prot.fa")
os.system('rm '+ args.outdir +'/' + strain + ".blast.out")
if args.integron :
os.system('integron_finder --cpu ' + str(args.threads)+
' --outdir '+ args.outdir + "/" +
' --gbk --func-annot --mute '+ genome)
os.system('find '+ args.outdir + "/Results_Integron_Finder_*/ " + '-empty -type d -delete')
files = pd.read_csv(args.outdir + "/Results_Integron_Finder_"+strain + "/" + strain+".summary",sep="\t", index_col=0, skiprows = 1)
dict_genome.update(files[['CALIN','complete','In0']].sum().to_dict())
dict_results[strain] = dict_genome
table_results = pd.DataFrame(dict_results)
table_results = table_results.T
if len(data_resistance.index) != 0 :
table_resistance = armfinder_to_table(data_resistance)
for family in table_resistance.columns:
table_resistance[family] = table_resistance[family].apply(lambda x : ";".join(sorted(x.split(';'))))
if not args.extend_genotyping :
header = [x for x in table_resistance.columns if x not in delete_virulence_extended()]
table_resistance = table_resistance[sorted(header)]
table_resistance = table_resistance.replace('','-')
results = pd.concat([table_results, table_resistance], axis=1, join='outer')
else :
results = table_results
results = results.infer_objects().fillna("-")
spuA(results, args)
narG(results, args)
toxin(results, args)
amr_families(results, args)
results.to_csv(args.outdir+"/"+args.outdir.split("/")[-1]+".txt", sep='\t')
if args.tree and len(args.assemblies) >= 4 :
generate_jolytree(args)
if args.overwrite :
move_file_to_outdir_folder(temporary_folder = args.outdir,
outdir_folder = final_output_path
)