From fb1b7b698d44426191ece4c406c322bd32c41480 Mon Sep 17 00:00:00 2001
From: Amandine PERRIN <amandine.perrin@pasteur.fr>
Date: Mon, 12 Oct 2020 10:13:58 +0200
Subject: [PATCH] When contig is split, put unique ID at the begining of the
 header

---
 .../annotate_module/genome_seq_functions.py   |  5 ++--
 .../exp_files/exp_split_contig_cut3N.fna      |  4 +--
 .../exp_files/exp_split_contig_nocut.fna      |  2 +-
 .../exp_files/exp_split_empty_contig.fna      |  2 +-
 .../test_annotate/test_genome_func.py         | 30 +++++++++----------
 5 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py
index 639a4f9f..6ec2bb81 100755
--- a/PanACoTA/annotate_module/genome_seq_functions.py
+++ b/PanACoTA/annotate_module/genome_seq_functions.py
@@ -334,7 +334,8 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num,
 def split_contig(pat, whole_seq, cur_contig_name, contig_sizes, gresf, num):
     """
     Save the contig read just before into dicts and write it to sequence file.
-    Contig name must be at most 20 characters (required by prokka)
+    Unique ID of contig must be in the first field of header, before the first space
+    (required by prokka)
 
     Parameters
     ----------
@@ -368,7 +369,7 @@ def split_contig(pat, whole_seq, cur_contig_name, contig_sizes, gresf, num):
         # we get empty contigs, if 2 occurrences of the pattern are side by side).
         if len(seq) == 0:
             continue
-        new_contig_name = "{}_{}\n".format(cur_contig_name, num)
+        new_contig_name = ">{}_{}\n".format(num, cur_contig_name.split(">")[1])
         contig_sizes[new_contig_name] = len(seq)
         gresf.write(new_contig_name)
         gresf.write(seq + "\n")
diff --git a/test/data/annotate/exp_files/exp_split_contig_cut3N.fna b/test/data/annotate/exp_files/exp_split_contig_cut3N.fna
index d221ee3d..79c95494 100644
--- a/test/data/annotate/exp_files/exp_split_contig_cut3N.fna
+++ b/test/data/annotate/exp_files/exp_split_contig_cut3N.fna
@@ -1,4 +1,4 @@
->my_contig_name_for_my_sequence_2
+>2_my_contig_name for_my_sequence
 AACTGCTTTTTAAGCGCGCTCCTGCG
->my_contig_name_for_my_sequence_3
+>3_my_contig_name for_my_sequence
 GGTTGTGTGGGCCCAGAGCGAGNCG
diff --git a/test/data/annotate/exp_files/exp_split_contig_nocut.fna b/test/data/annotate/exp_files/exp_split_contig_nocut.fna
index 8cdf3340..f6d01ad9 100644
--- a/test/data/annotate/exp_files/exp_split_contig_nocut.fna
+++ b/test/data/annotate/exp_files/exp_split_contig_nocut.fna
@@ -1,2 +1,2 @@
->my_contig_name_for_my_sequence_2
+>2_my_contig name for my_sequence
 AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG
diff --git a/test/data/annotate/exp_files/exp_split_empty_contig.fna b/test/data/annotate/exp_files/exp_split_empty_contig.fna
index 0c5be8ca..d123f540 100644
--- a/test/data/annotate/exp_files/exp_split_empty_contig.fna
+++ b/test/data/annotate/exp_files/exp_split_empty_contig.fna
@@ -1,2 +1,2 @@
->my_contig_name_for_my_sequence_2
+>2_my_contig_name_for_my_sequence
 AACTGCTTTTTAAGCGCGCTCCTGCGNGGTTGTGTGGGCCCAGAGCGAGNCG
diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py
index 779c7d01..2e782af1 100755
--- a/test/test_unit/test_annotate/test_genome_func.py
+++ b/test/test_unit/test_annotate/test_genome_func.py
@@ -125,7 +125,7 @@ def test_split_contig_nocut():
     """
     pat = None
     whole_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
-    cur_contig_name = ">my_contig_name_for_my_sequence"
+    cur_contig_name = ">my_contig name for my_sequence"
     contig_sizes = {"contig_1": 10}
     resfile = os.path.join(GENEPATH, "test_split_contig_nocut.fna")
     gresf = open(resfile, "w")
@@ -149,7 +149,7 @@ def test_split_contig_cut():
     """
     pat = "NNN+"
     whole_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
-    cur_contig_name = ">my_contig_name_for_my_sequence"
+    cur_contig_name = ">my_contig_name for_my_sequence"
     contig_sizes = {">contig_1": 10}
     resfile = os.path.join(GENEPATH, "test_split_contig_nocut.fna")
     gresf = open(resfile, "w")
@@ -214,7 +214,7 @@ def test_format_contig_cut():
 
 def test_format_contig_nocut():
     """
-    For a given contig, if we want to annotate it with prokka, and do not cut at each stretch of 
+    For a given contig, if we want to annotate it with prokka, and do not cut at each stretch of
     5 'N'check that it writes this contig as given
     """
     cut = False
@@ -238,8 +238,8 @@ def test_format_contig_nocut():
 
 def test_format_contig_nocut_prodigal_notSameName():
     """
-    For a given contig, if we want to annotate it with prodigal, and do not cut, 
-    then we keep the same file (no need to split at 20 characters) 
+    For a given contig, if we want to annotate it with prodigal, and do not cut,
+    then we keep the same file (no need to split at 20 characters)
     However, we must check that contig names are all different.
     Add 2 contigs, to be sure the 'num' parameter is not increased.
     """
@@ -400,7 +400,7 @@ def test_analyse1genome_nocut_prodigal():
 def test_analyse1genome_cut_prodigal():
     '''
     Analyse the given genome, cutting at stretches of 5N, in order to annotate it
-    Create new genome file in outdir, calculate genome size, nb contigs and L90, and add it 
+    Create new genome file in outdir, calculate genome size, nb contigs and L90, and add it
     to the genomes dict, as well as the path to the genome file.
     '''
     gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
@@ -429,8 +429,8 @@ def test_analyse1genome_cut_prodigal():
 def test_analyse1genome_cut_prokka():
     '''
     Analyse the given genome, cutting at stretches of 5N, in order to annotate it with prokka
-    Create new genome file in outdir, with shortened contig names, calculate genome size, 
-    nb contigs and L90, and add it 
+    Create new genome file in outdir, with shortened contig names, calculate genome size,
+    nb contigs and L90, and add it
     to the genomes dict, as well as the path to the genome file.
     '''
     gs = ["genome1.fasta", "genome2.fasta", "genome3.fasta"]
@@ -485,7 +485,7 @@ def test_analyse1genome_same_names_nocut(caplog):
     genomes = {genome: ["SAEN.1015.0117"]}
     cut = False
     pat = None
-    assert not gfunc.analyse_genome(genome, GEN_PATH, GENEPATH, cut, pat, genomes, 
+    assert not gfunc.analyse_genome(genome, GEN_PATH, GENEPATH, cut, pat, genomes,
                                     "prodigal", logger)
     assert ("myheader contig name is used for several contigs. Please put different names for "
             "each contig. This genome will be ignored") in caplog.text
@@ -501,7 +501,7 @@ def test_analyse1genome_same_last_name_nocut(caplog):
     genomes = {genome: ["SAEN.1015.0117"]}
     cut = False
     pat = None
-    assert not gfunc.analyse_genome(genome, GEN_PATH, GENEPATH, cut, pat, genomes, 
+    assert not gfunc.analyse_genome(genome, GEN_PATH, GENEPATH, cut, pat, genomes,
                                     "prodigal", logger)
     assert ("myheader contig name is used for several contigs. Please put different names for "
             "each contig. This genome will be ignored") in caplog.text
@@ -509,7 +509,7 @@ def test_analyse1genome_same_last_name_nocut(caplog):
 
 def test_analyse1genome_nofile(caplog):
     '''
-    Test that when we ask to analyse a genome whose sequence file does not exist, it returns false 
+    Test that when we ask to analyse a genome whose sequence file does not exist, it returns false
     with corresponding error message.
     '''
     caplog.set_level(logging.DEBUG)
@@ -527,7 +527,7 @@ def test_analyse1genome_nofile(caplog):
 
 def test_analyse1genome_empty(caplog):
     '''
-    Test that when we ask to analyse a genome whose sequence file does not exist, it returns false 
+    Test that when we ask to analyse a genome whose sequence file does not exist, it returns false
     with corresponding error message.
     '''
     caplog.set_level(logging.DEBUG)
@@ -592,7 +592,7 @@ def test_analyse_all_genomes_cut(caplog):
                    gs[1]: ["SAEN.1114", gpaths[1], opaths[1], 51, 6, 5],
                    gs[2]: ["ESCO.0416", gpaths[2], opaths[2], 70, 4, 1]}
     assert exp_genomes == genomes
-    assert ("Cutting genomes at each time there are at least 3 'N' in a row, " 
+    assert ("Cutting genomes at each time there are at least 3 'N' in a row, "
             "and then, calculating genome size, number of contigs and L90.") in caplog.text
 
 
@@ -654,7 +654,7 @@ def test_analyse_all_genomes_cut_empty(caplog):
                    gs[1]: ["SAEN.1114", gpaths[1], opaths[1], 51, 6, 5],
                    gs[3]: ["ESCO.0123", gpaths[3], opaths[3], 70, 4, 1]}
     assert exp_genomes == genomes
-    assert ("Cutting genomes at each time there are at least 3 'N' in a row, " 
+    assert ("Cutting genomes at each time there are at least 3 'N' in a row, "
             "and then, calculating genome size, number of contigs and L90.") in caplog.text
     assert ("Your file test/data/annotate/genomes/empty.fasta "
             "does not contain any gene. Please check that you really gave a "
@@ -679,7 +679,7 @@ def test_analyse_all_genomes_noseq(caplog):
     # Run analysis
     with pytest.raises(SystemExit):
         gfunc.analyse_all_genomes(genomes, "toto", GENEPATH, nbn, "prokka", logger, quiet=True)
-    assert ("No genome was found in the database folder toto. See logfile " 
+    assert ("No genome was found in the database folder toto. See logfile "
             "for more information.") in caplog.text
 
 
-- 
GitLab