diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py index e09697e5d1def7dd205ecb44634059a30369bdc8..73747b175d81c19f707e7188a4b3a407c790cb04 100755 --- a/PanACoTA/annotate_module/genome_seq_functions.py +++ b/PanACoTA/annotate_module/genome_seq_functions.py @@ -111,7 +111,11 @@ def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, soft, logger, quiet=Fals bar.update(curnum) curnum += 1 # analyse genome, and check everything went well - res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger) + try: + res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger) + except UnicodeDecodeError: + logger.warning(f"'{genome}' does not seem to be a fasta file. It will be ignored.") + res = False # Problem while analysing genome -> genome ignored if not res: toremove.append(genome) diff --git a/test/data/annotate/genomes/genome.fna.bin b/test/data/annotate/genomes/genome.fna.bin new file mode 100644 index 0000000000000000000000000000000000000000..dface9661457a8a1fc0337368dafba54ebeb1309 Binary files /dev/null and b/test/data/annotate/genomes/genome.fna.bin differ diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py index e362b732e76f9449d68baf14829e23084b2769b8..a01db9b4c2255b5b36cb1d1ca7641457b0789848 100755 --- a/test/test_unit/test_annotate/test_genome_func.py +++ b/test/test_unit/test_annotate/test_genome_func.py @@ -574,6 +574,32 @@ def test_analyse_all_genomes_nocut(caplog): assert ("Calculating genome size, number of contigs, L90") in caplog.text +def test_analyse_all_genomes_binary(caplog): + """ + Analyze all given genomes: don't cut at stretches of N, but look at their sequence + file, to calculate L90, genome size and nb contigs. Add this information, as well as the + path to the genomic sequence, to the genomes dict. + 1 file is a binary file: write warning message and remove it from analysis. + """ + caplog.set_level(logging.DEBUG) + gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "genome.fna.bin"] + genomes = {gs[0]: ["SAEN.1113"], + gs[1]: ["SAEN.1114"], + gs[2]: ["ESCO.0416"], + gs[3]: ["BIN.1234"]} + nbn = 0 + # Run analysis + gfunc.analyse_all_genomes(genomes, GEN_PATH, GENEPATH, nbn, "prokka", logger, quiet=False) + # construct expected results + gpaths = [os.path.join(GEN_PATH, gname) for gname in gs] + exp_genomes = {gs[0]: ["SAEN.1113", gpaths[0], gpaths[0], 51, 4, 2], + gs[1]: ["SAEN.1114", gpaths[1], gpaths[1], 67, 3, 3], + gs[2]: ["ESCO.0416", gpaths[2], gpaths[2], 70, 4, 1]} + assert exp_genomes == genomes + assert ("Calculating genome size, number of contigs, L90") in caplog.text + assert ("'genome.fna.bin' does not seem to be a fasta file. It will be ignored.") in caplog.text + + def test_analyse_all_genomes_cut(caplog): """ Analyze all given genomes: cut at stretches of 3N, and look at their sequence