From bd8884701ecc68332036b239ac2ab8bca1245562 Mon Sep 17 00:00:00 2001 From: asetGem <amandine.perrin@pasteur.fr> Date: Mon, 9 Nov 2020 14:37:50 +0100 Subject: [PATCH] check that genomes to analyse are not binary files --- .../annotate_module/genome_seq_functions.py | 6 +++- test/data/annotate/genomes/genome.fna.bin | Bin 0 -> 893 bytes .../test_annotate/test_genome_func.py | 26 ++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 test/data/annotate/genomes/genome.fna.bin diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py index e09697e5..73747b17 100755 --- a/PanACoTA/annotate_module/genome_seq_functions.py +++ b/PanACoTA/annotate_module/genome_seq_functions.py @@ -111,7 +111,11 @@ def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, soft, logger, quiet=Fals bar.update(curnum) curnum += 1 # analyse genome, and check everything went well - res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger) + try: + res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger) + except UnicodeDecodeError: + logger.warning(f"'{genome}' does not seem to be a fasta file. It will be ignored.") + res = False # Problem while analysing genome -> genome ignored if not res: toremove.append(genome) diff --git a/test/data/annotate/genomes/genome.fna.bin b/test/data/annotate/genomes/genome.fna.bin new file mode 100644 index 0000000000000000000000000000000000000000..dface9661457a8a1fc0337368dafba54ebeb1309 GIT binary patch literal 893 zcmWIWW@Zs#U|`??Vnv3StY<HLfUG4z%nrmw`Q>_f1(mbkh`*8+IC&!ASistOlcMIt zEXiBCAa8Q!qIHWG%$pV;5;SFo`26@OlemP+z0YX+v37~4tB5UO`YAQdYL*q(RxS|+ zJvNgA9W65#NH7GTSQwx6pgai4!sPrMDp}~p#>S?mkj9oIW@6UEhGwn$CsrX>pta3F z%mu`0`9-;jB{*%46V6#f$g2>KGOl~_rv}ZkNjqPddjc()0K}X?T%3_ukc!hTV6ZME zWS1K}Si{&j8ZM&Q_xquQRxZ$<6F>}#)|AAOL_F4|5ws2-i|bBo|5&BJf7PvQ>3QF~ zepQ^@_k2<Aq-T|?7QDjd2DTAe-=DY3_Ox{#&Hy@zkx7IZcgg~~3k(_<K@>Edp=&}< z9S|)H3=NEhKqfS;pld>pTZE<;Kqexl(e<H66GC4o(2ej&L)V5L+6Zk&f!bhUAK=Z( Q29jX~LVlol1QUn{0RPAQt^fc4 literal 0 HcmV?d00001 diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py index e362b732..a01db9b4 100755 --- a/test/test_unit/test_annotate/test_genome_func.py +++ b/test/test_unit/test_annotate/test_genome_func.py @@ -574,6 +574,32 @@ def test_analyse_all_genomes_nocut(caplog): assert ("Calculating genome size, number of contigs, L90") in caplog.text +def test_analyse_all_genomes_binary(caplog): + """ + Analyze all given genomes: don't cut at stretches of N, but look at their sequence + file, to calculate L90, genome size and nb contigs. Add this information, as well as the + path to the genomic sequence, to the genomes dict. + 1 file is a binary file: write warning message and remove it from analysis. + """ + caplog.set_level(logging.DEBUG) + gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta", "genome.fna.bin"] + genomes = {gs[0]: ["SAEN.1113"], + gs[1]: ["SAEN.1114"], + gs[2]: ["ESCO.0416"], + gs[3]: ["BIN.1234"]} + nbn = 0 + # Run analysis + gfunc.analyse_all_genomes(genomes, GEN_PATH, GENEPATH, nbn, "prokka", logger, quiet=False) + # construct expected results + gpaths = [os.path.join(GEN_PATH, gname) for gname in gs] + exp_genomes = {gs[0]: ["SAEN.1113", gpaths[0], gpaths[0], 51, 4, 2], + gs[1]: ["SAEN.1114", gpaths[1], gpaths[1], 67, 3, 3], + gs[2]: ["ESCO.0416", gpaths[2], gpaths[2], 70, 4, 1]} + assert exp_genomes == genomes + assert ("Calculating genome size, number of contigs, L90") in caplog.text + assert ("'genome.fna.bin' does not seem to be a fasta file. It will be ignored.") in caplog.text + + def test_analyse_all_genomes_cut(caplog): """ Analyze all given genomes: cut at stretches of 3N, and look at their sequence -- GitLab