diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 33f7a903f73a78220944c0f9b53a216606df2c32..b2e521012a4e1cfa98e46fd0f05ae8e74fda8e61 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,11 +1,13 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = 0.7 +__version__ = "0.10" from .libcodonusage import ( aa2colour, + by_aa_codon_usage, codon2aa, columns_by_aa, detect_fishy_genes, + gene_wide_codon_usage, load_bias_table, load_counts_table, make_aa_codon_columns, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index fe0429c07282f85dc8ab0fe7d1d6cd634d5263bb..f9ab307c2140a1489f4d10b04a317189087385c4 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -45,6 +45,10 @@ from cytoolz import groupby # Python module that facilitates the exploration of tbular data # python3 -m pip install plydata # from plydata import define, pull, query +# Python library with useful data-processing features +# python3 -m pip install scikit-learn +# https://scikit-learn.org/stable/install.html +from sklearn.preprocessing import normalize # Python module to vizualize set intersections # python3 -m pip install upsetplot from upsetplot import from_indicators, UpSet @@ -334,36 +338,171 @@ def save_counts_table(counts_table, table_path): render_md(f"The table was saved at [{table_path}]({table_path}).") -def load_bias_table(table_path, nb_cluster_series=2): +def gene_wide_codon_usage(codon_counts, verbose=False): + """ + """ + render_md(""" +We will compute codon usage "gene-wide" (i.e. not by amino-acid), +by looking at the proportion of codons within a gene's CDS. +""") + render_md(""" +To compute codon proportions, we can divide each line by its sum, +or, equivalently (and hopefully more efficiently), normalize the data +using the "l1" norm (which, for positive-only values amounts to the sum). +""") + codon_proportions = pd.DataFrame( + normalize(codon_counts, norm="l1"), + index=codon_counts.index, + columns=codon_counts.columns) + # Doesn't seem to be working: + # codon_proportions.style.hide(axis="index") + if verbose: + display(codon_proportions.head(3)) + # Check that the sum of proportions (columns) for a gene is 1 + colsums = codon_proportions.sum(axis=1).values + # Due to imprecision in float arithmetics, + # we can only check that the sums are close to 1 + assert np.allclose(colsums, np.full(len(colsums), 1)) + render_md(""" +To compute the usage biases, we also need to compute codon proportions +in the global usage. +""") + render_md("We compute the global usage, as the sum on columns.") + global_usage = codon_counts.sum(axis=0) + render_md("Then we normalize it the same way as for the individual genes.") + # The values.reshape(1, -1) turns the Series data into a 2D array + # with one line. flatten() turns the data back to 1D + global_proportions = pd.Series( + normalize(global_usage.values.reshape(1, -1), norm="l1").flatten(), + index=global_usage.index) + assert global_proportions.sum() == 1. + render_md(""" +Codon usage biases in genes can then be computed by subtracting +the corresponding global proportion to a codon's proportion. +""") + codon_usage_biases = codon_proportions.sub(global_proportions) + render_md(""" +We can standardize these values (by dividing by their standard deviation +across genes) so that they are more comparable between codons. +""") + standardized_codon_usage_biases = codon_usage_biases.div( + codon_usage_biases.std()) + # Doesn't seem to be working: + # standardized_codon_usage_biases.style.hide(axis="index") + if verbose: + display(standardized_codon_usage_biases.head(3)) + return standardized_codon_usage_biases + + +def by_aa_codon_usage(codon_counts, verbose=False): + """ + """ + render_md(""" +We will compute codon usage "by amino-acid", by looking at the +proportion of codons for each amino-acid within a gene's CDS. +""") + render_md(""" +We first need to compute, for a given gene, the total number of codons +corresponding to each amino-acid. +""") + summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() + render_md(""" +To compute codon proportions, we can then divide each codon counts by +the total for the corresponding amino-acid. +""") + codon_proportions = codon_counts.div( + summed_by_aa) + # Doesn't seem to be working: + # codon_proportions.style.hide(axis="index") + if verbose: + display(codon_proportions.head(3)) + # There are some (genes, aa) pairs where the sum is zero. + # This is normal if that gene does not have that amino-acid. + sums = codon_proportions.groupby(level=0, axis=1).sum() + # Values in sums should be either zero or one. + all_ones = np.full(sums.shape, 1.) + all_zeroes = np.full(sums.shape, 0.) + # Due to imprecision in float arithmetics, + # we can only check that the sums are close to 1 or to 0 + assert np.array_equal( + np.isclose( + sums.values, all_ones), + np.logical_not( + np.isclose( + sums.values, all_zeroes))) + render_md(""" +To compute the usage biases, we also need to compute codon proportions +in the global usage. +""") + render_md(""" +We compute the global usage, as the sum of the counts for a given codon, +across genes. +""") + global_usage = codon_counts.sum(axis=0) + render_md("Then we sum over codons corresponding to the same amino-acid.") + global_summed_by_aa = global_usage.groupby(level=0).sum() + render_md(""" +Dividing the global usage for each codon by the total for the corresponding +amino-acid, we can now compute the global proportions the same way as for +the individual genes. +""") + global_proportions = global_usage.div( + global_summed_by_aa) + # The proportions of codons for a given amino-acid should sum to 1 + assert all( + val == 1. + for val in global_proportions.groupby(level=0).sum()) + render_md(""" +Codon usage biases in genes can then be computed by subtracting +the corresponding global proportion to a codon's proportion. +""") + codon_usage_biases = codon_proportions.sub(global_proportions) + render_md(""" +We can standardize these values (by dividing by their standard deviation +across genes) so that they are more comparable between codons. +""") + standardized_codon_usage_biases = codon_usage_biases.div( + codon_usage_biases.std()) + # Doesn't seem to be working: + # standardized_codon_usage_biases.style.hide(axis="index") + if verbose: + display(standardized_codon_usage_biases.head(3)) + return standardized_codon_usage_biases + + +def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2): """ Load a table containing by-amino-acid codon usage biases. The table, located at *table_path*, should have tab-separated columns. It is expected to contain a series of informative columns preceding the columns containing the codon usage bias values. - The first nine columns contain information about gene identifiers - and features extracted from CDS annotation data. + + The first *nb_info_cols* columns contain information about gene + identifiers and features extracted from CDS annotation data. + Then, there can be series of gene-clustering information, where each - column of the series corresponds to clusters defined based on codon-usage - biases among the codons coding an amino-acid. There are no columns for - amino-acids coded by only one codon (M and W). + column of the series corresponds to clusters defined based on + codon-usage biases among the codons coding an amino-acid. + There are no columns for amino-acids coded by only one codon (M and W). *nb_cluster_series* specifies the number of such series. The result is a pandas DataFrame object, where all the above information - is encoded as a MultiIndex, the codon usage biases being in the remaining - columns. + is encoded as a MultiIndex, the codon usage biases being in the + remaining columns. """ return pd.read_csv( table_path, sep="\t", - # 9 starting levels from the initial table, + # *nb_info_cols* starting levels from the initial table, # plus levels corresponging to clustering information. # By default, from 2 methods (nb_cluster_series=2): # * `cluster_{aa}_kmeans` for each amino-acid # having more than one codon # * `cluster_{aa}_full_bias` for each amino-acid # having more than one codon - index_col=list(range(9 + nb_cluster_series * (len(aa2colour) - 2))), + index_col=list( + range(nb_info_cols + nb_cluster_series * (len(aa2colour) - 2))), header=[0, 1])