From bd8884701ecc68332036b239ac2ab8bca1245562 Mon Sep 17 00:00:00 2001
From: asetGem <amandine.perrin@pasteur.fr>
Date: Mon, 9 Nov 2020 14:37:50 +0100
Subject: [PATCH] check that genomes to analyse are not binary files

---
 .../annotate_module/genome_seq_functions.py   |   6 +++-
 test/data/annotate/genomes/genome.fna.bin     | Bin 0 -> 893 bytes
 .../test_annotate/test_genome_func.py         |  26 ++++++++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 test/data/annotate/genomes/genome.fna.bin

diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py
index e09697e5..73747b17 100755
--- a/PanACoTA/annotate_module/genome_seq_functions.py
+++ b/PanACoTA/annotate_module/genome_seq_functions.py
@@ -111,7 +111,11 @@ def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, soft, logger, quiet=Fals
             bar.update(curnum)
             curnum += 1
         # analyse genome, and check everything went well
-        res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger)
+        try:
+            res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger)
+        except UnicodeDecodeError:
+            logger.warning(f"'{genome}' does not seem to be a fasta file. It will be ignored.")
+            res = False
         # Problem while analysing genome -> genome ignored
         if not res:
             toremove.append(genome)
diff --git a/test/data/annotate/genomes/genome.fna.bin b/test/data/annotate/genomes/genome.fna.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dface9661457a8a1fc0337368dafba54ebeb1309
GIT binary patch
literal 893
zcmWIWW@Zs#U|`??Vnv3StY<HLfUG4z%nrmw`Q>_f1(mbkh`*8+IC&!ASistOlcMIt
zEXiBCAa8Q!qIHWG%$pV;5;SFo`26@OlemP+z0YX+v37~4tB5UO`YAQdYL*q(RxS|+
zJvNgA9W65#NH7GTSQwx6pgai4!sPrMDp}~p#>S?mkj9oIW@6UEhGwn$CsrX>pta3F
z%mu`0`9-;jB{*%46V6#f$g2>KGOl~_rv}ZkNjqPddjc()0K}X?T%3_ukc!hTV6ZME
zWS1K}Si{&j8ZM&Q_xquQRxZ$<6F>}#)|AAOL_F4|5ws2-i|bBo|5&BJf7PvQ>3QF~
zepQ^@_k2<Aq-T|?7QDjd2DTAe-=DY3_Ox{#&Hy@zkx7IZcgg~~3k(_<K@>Edp=&}<
z9S|)H3=NEhKqfS;pld>pTZE<;Kqexl(e<H66GC4o(2ej&L)V5L+6Zk&f!bhUAK=Z(
Q29jX~LVlol1QUn{0RPAQt^fc4

literal 0
HcmV?d00001

diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py
index e362b732..a01db9b4 100755
--- a/test/test_unit/test_annotate/test_genome_func.py
+++ b/test/test_unit/test_annotate/test_genome_func.py
@@ -574,6 +574,32 @@ def test_analyse_all_genomes_nocut(caplog):
     assert ("Calculating genome size, number of contigs, L90") in caplog.text
 
 
+def test_analyse_all_genomes_binary(caplog):
+    """
+    Analyse all given genomes without cutting at stretches of N (nbn = 0): read each sequence
+    file to calculate L90, genome size and nb contigs, and add this information, as well as
+    the path to the genomic sequence, to the genomes dict.
+    1 of the 4 files is binary: a warning must be logged and its entry removed from 'genomes'.
+    """
+    caplog.set_level(logging.DEBUG)
+    gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "genome.fna.bin"]
+    genomes = {gs[0]: ["SAEN.1113"],
+               gs[1]: ["SAEN.1114"],
+               gs[2]: ["ESCO.0416"],
+               gs[3]: ["BIN.1234"]}  # binary file: expected to be dropped by the analysis
+    nbn = 0
+    # Run analysis: the return value is not used, 'genomes' is checked after the call
+    gfunc.analyse_all_genomes(genomes, GEN_PATH, GENEPATH, nbn, "prokka", logger, quiet=False)
+    # Expected results: no gs[3] entry. Last 3 fields presumably size, nb contigs, L90 (confirm)
+    gpaths = [os.path.join(GEN_PATH, gname) for gname in gs]
+    exp_genomes = {gs[0]: ["SAEN.1113", gpaths[0], gpaths[0], 51, 4, 2],
+                   gs[1]: ["SAEN.1114", gpaths[1], gpaths[1], 67, 3, 3],
+                   gs[2]: ["ESCO.0416", gpaths[2], gpaths[2], 70, 4, 1]}
+    assert exp_genomes == genomes
+    assert ("Calculating genome size, number of contigs, L90") in caplog.text
+    assert ("'genome.fna.bin' does not seem to be a fasta file. It will be ignored.") in caplog.text
+
+
 def test_analyse_all_genomes_cut(caplog):
     """
     Analyze all given genomes: cut at stretches of 3N, and look at their sequence
-- 
GitLab