diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 6f7ce6d521dc88c2788e37706ab7076362d3adf6..cd647f677639d3445593f1a7b2c94bb9d6aff8ba 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = 0.5 +__version__ = 0.6 from .libcodonusage import ( aa2colour, codon2aa, @@ -11,6 +11,7 @@ from .libcodonusage import ( make_aa_codon_columns, make_counts_only, render_md, + save_counts_table, sort_counts_by_aa, violin_usage, violin_usage_vertical, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 9bbda1f55f6b0806d7bae62a8b051bab28b86c91..e8c8eea123933142d4fc98b7d9b6246a4a174804 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -95,15 +95,15 @@ def load_counts_table(table_path, index_col="old_locus_tag"): there are other columns containing various pieces of information regarding those genes. """ - render_md(f"Loading data from {table_path}...\n") + render_md(f"Loading data from [{table_path}]({table_path})...\n") codon_counts = pd.read_table(table_path, index_col=index_col) nb_genes = len(codon_counts) render_md( f""" - There are {nb_genes} genes in the table. + There are {nb_genes} genes in the table. - The table looks as follows (first 3 lines): - """) + The table looks as follows (first 3 lines): +""") display(codon_counts.head(3)) return codon_counts @@ -176,33 +176,41 @@ def detect_fishy_genes(codon_counts): A table of boolean criteria is returned, with one line per gene. """ + + def display_gene_set(gene_set, max_size=10): + """ + Print out genes in a gene set, depending on their number. + """ + if gene_set and len(gene_set) <= max_size: + print(" ", ", ".join(gene_set)) + render_md("### Searching for mis-annotated genes") (criteria, gene_sets) = compute_criteria(codon_counts) render_md( """ - We should avoid genes that are not in-frame. We can likely exclude - those that do not start with a valid start codon. - Here, this will be seen by looking at the translation of the first - codon. - If it is "*" (`start_stop`) or "-" (`wrong_start`), - the codon is not a valid start codon. - """) +We should avoid genes that are not in-frame. We can likely exclude +those that do not start with a valid start codon. +Here, this will be seen by looking at the translation of the first +codon. +If it is "*" (`start_stop`) or "-" (`wrong_start`), +the codon is not a valid start codon. +""") wrong_start = gene_sets["wrong_start"] - render_md(f"There are {len(wrong_start)} genes that start with a `-`:") - display(wrong_start) + render_md(f"There are {len(wrong_start)} genes that start with a `-`.") + display_gene_set(wrong_start) start_stop = gene_sets["start_stop"] render_md( - f"There are {len(start_stop)} genes that start with a stop codon:") - display(start_stop) + f"There are {len(start_stop)} genes that start with a stop codon.") + display_gene_set(start_stop) no_met_start = gene_sets["no_met_start"] if no_met_start == wrong_start | start_stop: render_md( """ - Genes not belonging to the `start_stop` or `wrong_start` - category all start with a methionine. - """) +Genes not belonging to the `start_stop` or `wrong_start` +category all start with a methionine. +""") else: raise NotImplementedError( "There are genes that start with something else than 'M', " @@ -211,45 +219,45 @@ def detect_fishy_genes(codon_counts): start_upstream = gene_sets["start_upstream"] render_md( f"{len(start_upstream)} genes have a possibly ill-defined" - " start position:") - display(start_upstream) + " start position.") + display_gene_set(start_upstream) start_upstream_met_start = gene_sets["start_upstream_met_start"] if start_upstream_met_start: render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'" - " start:") - display(start_upstream_met_start) + " start.") + display_gene_set(start_upstream_met_start) start_upstream_met_start_nostops = gene_sets[ "start_upstream_met_start_nostops"] if start_upstream_met_start_nostops: render_md(f"{len(start_upstream_met_start_nostops)} of them" - " also do not contain any stop:") - display(start_upstream_met_start_nostops) + " also do not contain any stop.") + display_gene_set(start_upstream_met_start_nostops) render_md("Those genes could probably be kept.") has_stops_good_met_start = gene_sets["has_stops_good_met_start"] render_md( f"{len(has_stops_good_met_start)} genes contain stops" " but have a well defined start position with 'M'.") - if len(has_stops_good_met_start) <= 10: - display(has_stops_good_met_start) + display_gene_set(has_stops_good_met_start) good_met_start = gene_sets["good_met_start"] render_md( f"{len(good_met_start)} genes have a well defined " "start position with 'M'.") render_md( - """If genes that have stop readthrough are a known phenomenon, - only those among them that do not have a valid start codon - might be excluded.""") + """ +If genes that have stop readthrough are a known phenomenon, +only those among them that do not have a valid start codon +might be excluded. +""") no_stop_good_met_start = gene_sets["no_stop_good_met_start"] render_md( f"{len(no_stop_good_met_start)} genes have a well defined" " start position with 'M' and contain no stop codon.") - if len(no_stop_good_met_start) <= 10: - display(no_stop_good_met_start) + display_gene_set(no_stop_good_met_start) return criteria @@ -257,14 +265,17 @@ def make_counts_only(counts_table): """ Integrate "informative" columns of *counts_table* into the index. """ + # To ensure a stable order: + ref_info_cols = [ + "old_locus_tag", "locus_tag", "length", + "start_codon", "expected_start_aa", + "first_stop", "nb_stops", "start_upstream", "end_downstream"] + # To ensure no info columns are lost: info_cols = [ counts_table.index.name, *counts_table.columns.difference(codon2aa)] - assert set(info_cols) == { - "old_locus_tag", "locus_tag", "length", - "start_codon", "expected_start_aa", - "first_stop", "nb_stops", "start_upstream", "end_downstream"} - return counts_table.reset_index().set_index(info_cols) + assert set(info_cols) == set(ref_info_cols) + return counts_table.reset_index().set_index(ref_info_cols) def make_aa_codon_columns(counts_table): @@ -302,6 +313,15 @@ def sort_counts_by_aa(counts_table): return sorted_counts +def save_counts_table(counts_table, table_path): + """ + Save table *counts_table* at *table_path*. + """ + table_path.parent.mkdir(parents=True, exist_ok=True) + counts_table.to_csv(table_path, sep="\t") + render_md(f"The table was saved at [{table_path}]({table_path}).") + + def load_bias_table(table_path, nb_cluster_series=2): """ Load a table containing by-amino-acid codon usage biases.