diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index ee0978219a0ac3661090b96e25ee1ff5b3c1711d..93cf75e2c6d182fd906c785221e537aba3da4472 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.18" +__version__ = "0.19" from .libcodonusage import ( aa2colour, aa_usage, @@ -8,6 +8,7 @@ from .libcodonusage import ( centroid_usage, codon2aa, columns_by_aa, + compare_clusterings, detect_fishy_genes, exclude_all_nan_cols, extract_top_genes_from_cluster, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 6516fef989c95a54ebbd1a82ff5294df57ffa19e..8f1e0a68fb36e54b7efbde2f5c36703d664fcb57 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -14,6 +14,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. """Functions used in Jupyter notebooks.""" +from itertools import combinations import json from operator import itemgetter from pathlib import Path @@ -694,6 +695,57 @@ def make_cluster_table(scub_table, centroids_scub_table): index=scub_table.index) +def compare_clusterings( + usage_table, + clust1_template="cluster_{aa}_full_bias", + clust2_template="cluster_{aa}_highest_bias"): + """ + Compare lists of genes in *usage_table* for the clusters + found in the columns whose name conform to + *clust1_template* and *clust2_template*. + """ + crosstabs = {} + for aa in aa2colour.keys(): + if aa == "M" or aa == "W": + continue + clust1_level = clust1_template.format(aa=aa) + clust2_level = clust2_template.format(aa=aa) + crosstabs[aa] = pd.crosstab( + usage_table.reset_index()[clust1_level], + usage_table.reset_index()[clust2_level]) + cols_for_aa = columns_by_aa[aa] + cluster_names = [ + f"{aa}_{codon}" + for (aa, codon) in usage_table[cols_for_aa].columns] + # Checking that the clusters are the same + same_clusters = True + for (clust_1, clust_2) in combinations(cluster_names, 2): + # assert crosstabs[aa][clust_1][clust_2] == 0 + try: + if crosstabs[aa][clust_1][clust_2] != 0: + same_clusters = False + except KeyError: + # At least one cluster is absent. + # Is the absence symmetric? + miss_1_in_1 = clust_1 in usage_table.reset_index()[ + clust1_level] + miss_2_in_2 = clust_2 not in usage_table.reset_index()[ + clust2_level] + miss_1_in_2 = clust_1 in usage_table.reset_index()[ + clust2_level] + miss_2_in_1 = clust_2 not in usage_table.reset_index()[ + clust1_level] + if ( + (miss_1_in_1 != miss_1_in_2) + or (miss_2_in_1 != miss_2_in_2)): + same_clusters = False + if not same_clusters: + render_md(f"\nClusters for {aa} differ:\n") + else: + render_md(f"\nClusters for {aa} are identical:\n") + display(crosstabs[aa]) + + def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2): """ Load a table containing by-amino-acid codon usage biases.