diff --git a/notebooks/Solutions/seaborn_TP_solutions.ipynb b/notebooks/Solutions/seaborn_TP_solutions.ipynb index 5ebf0fa0c645800b6604ebd3ecceb4f1bd7764a9..d96cdf7286b6acb1f0bd577db8472e48b27bc839 100644 --- a/notebooks/Solutions/seaborn_TP_solutions.ipynb +++ b/notebooks/Solutions/seaborn_TP_solutions.ipynb @@ -10,11 +10,11 @@ "<div style=\"text-align:center\">\n", " <img src=\"../images/seaborn.png\" width=\"600px\">\n", " <div>\n", - " Bertrand Néron, François Laurent, Etienne Kornobis\n", + " Bertrand Néron, François Laurent, Etienne Kornobis, Vincent Guillemot\n", " <br />\n", " <a src=\" https://research.pasteur.fr/en/team/bioinformatics-and-biostatistics-hub/\">Bioinformatics and Biostatistiqucs HUB</a>\n", " <br />\n", - " © Institut Pasteur, 2021\n", + " © Institut Pasteur, 2024\n", " </div> \n", "</div>" ] @@ -24,12 +24,22 @@ "id": "compliant-basis", "metadata": {}, "source": [ - "Practice your graphing skills using data from milieu intérieur in `data/mi.csv`:" + "Practice your graphing skills through the data of [happiness 2016](https://www.kaggle.com/datasets/unsdsn/world-happiness?select=2016.csv)\n", + "\n", + "(The data are already in data directory as `happiness_2016.csv`)" + ] + }, + { + "cell_type": "markdown", + "id": "3778963b-3bae-486d-8db7-30f23eb239ac", + "metadata": {}, + "source": [ + "## Import the data and have a look on them" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "minor-doctrine", "metadata": {}, "outputs": [], @@ -40,34 +50,199 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "d4500895-2a2c-4ebb-a1fc-153e97769b9d", + "execution_count": 3, + "id": "skilled-daniel", "metadata": {}, "outputs": [], "source": [ - "import warnings\n", - "warnings.simplefilter(action='ignore', category=FutureWarning, lineno=1498)\n", - "warnings.simplefilter(action='ignore', category=UserWarning, lineno=118)" + "ha_df = pd.read_csv(\"../data/happiness_2016.csv\")" ] }, { "cell_type": "code", - 
"execution_count": null, - "id": "skilled-daniel", + "execution_count": 4, + "id": "f8729a5b-314d-42fc-b130-783ca5e2076a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(157, 13)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "mi_df = pd.read_csv(\"../data/mi.csv\")" + "ha_df.shape" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "brutal-manufacturer", "metadata": {}, - "outputs": [], - "source": [ - "mi_df.head()" + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Region</th>\n", + " <th>Happiness Rank</th>\n", + " <th>Happiness Score</th>\n", + " <th>Lower Confidence Interval</th>\n", + " <th>Upper Confidence Interval</th>\n", + " <th>Economy (GDP per Capita)</th>\n", + " <th>Family</th>\n", + " <th>Health (Life Expectancy)</th>\n", + " <th>Freedom</th>\n", + " <th>Trust (Government Corruption)</th>\n", + " <th>Generosity</th>\n", + " <th>Dystopia Residual</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Denmark</td>\n", + " <td>Western Europe</td>\n", + " <td>1</td>\n", + " <td>7.526</td>\n", + " <td>7.460</td>\n", + " <td>7.592</td>\n", + " <td>1.44178</td>\n", + " <td>1.16374</td>\n", + " <td>0.79504</td>\n", + " <td>0.57941</td>\n", + " <td>0.44453</td>\n", + " <td>0.36171</td>\n", + " <td>2.73939</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Switzerland</td>\n", + " <td>Western Europe</td>\n", + " <td>2</td>\n", + " 
<td>7.509</td>\n", + " <td>7.428</td>\n", + " <td>7.590</td>\n", + " <td>1.52733</td>\n", + " <td>1.14524</td>\n", + " <td>0.86303</td>\n", + " <td>0.58557</td>\n", + " <td>0.41203</td>\n", + " <td>0.28083</td>\n", + " <td>2.69463</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Iceland</td>\n", + " <td>Western Europe</td>\n", + " <td>3</td>\n", + " <td>7.501</td>\n", + " <td>7.333</td>\n", + " <td>7.669</td>\n", + " <td>1.42666</td>\n", + " <td>1.18326</td>\n", + " <td>0.86733</td>\n", + " <td>0.56624</td>\n", + " <td>0.14975</td>\n", + " <td>0.47678</td>\n", + " <td>2.83137</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Norway</td>\n", + " <td>Western Europe</td>\n", + " <td>4</td>\n", + " <td>7.498</td>\n", + " <td>7.421</td>\n", + " <td>7.575</td>\n", + " <td>1.57744</td>\n", + " <td>1.12690</td>\n", + " <td>0.79579</td>\n", + " <td>0.59609</td>\n", + " <td>0.35776</td>\n", + " <td>0.37895</td>\n", + " <td>2.66465</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Finland</td>\n", + " <td>Western Europe</td>\n", + " <td>5</td>\n", + " <td>7.413</td>\n", + " <td>7.351</td>\n", + " <td>7.475</td>\n", + " <td>1.40598</td>\n", + " <td>1.13464</td>\n", + " <td>0.81091</td>\n", + " <td>0.57104</td>\n", + " <td>0.41004</td>\n", + " <td>0.25492</td>\n", + " <td>2.82596</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Country Region Happiness Rank Happiness Score \\\n", + "0 Denmark Western Europe 1 7.526 \n", + "1 Switzerland Western Europe 2 7.509 \n", + "2 Iceland Western Europe 3 7.501 \n", + "3 Norway Western Europe 4 7.498 \n", + "4 Finland Western Europe 5 7.413 \n", + "\n", + " Lower Confidence Interval Upper Confidence Interval \\\n", + "0 7.460 7.592 \n", + "1 7.428 7.590 \n", + "2 7.333 7.669 \n", + "3 7.421 7.575 \n", + "4 7.351 7.475 \n", + "\n", + " Economy (GDP per Capita) Family Health (Life Expectancy) Freedom \\\n", + "0 1.44178 1.16374 0.79504 0.57941 \n", + "1 
1.52733 1.14524 0.86303 0.58557 \n", + "2 1.42666 1.18326 0.86733 0.56624 \n", + "3 1.57744 1.12690 0.79579 0.59609 \n", + "4 1.40598 1.13464 0.81091 0.57104 \n", + "\n", + " Trust (Government Corruption) Generosity Dystopia Residual \n", + "0 0.44453 0.36171 2.73939 \n", + "1 0.41203 0.28083 2.69463 \n", + "2 0.14975 0.47678 2.83137 \n", + "3 0.35776 0.37895 2.66465 \n", + "4 0.41004 0.25492 2.82596 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ha_df.head()" ] }, { @@ -75,7 +250,7 @@ "id": "departmental-exhibition", "metadata": {}, "source": [ - "- Do a boxplot showing the differences in temperature between females and males:" + "## Do a boxplot showing the differences in `happiness` between `Region`:" ] }, { @@ -85,7 +260,7 @@ "metadata": {}, "outputs": [], "source": [ - "sns.boxplot(data=mi_df, x=\"Sex\", y=\"Temperature\")" + "sns.boxplot(data=ha_df, x=\"Happiness Score\", y=\"Region\")" ] }, { @@ -93,7 +268,7 @@ "id": "portuguese-worse", "metadata": {}, "source": [ - "- Using a histogram and continuous probability density curve, display the distribution of age in the dataset" + "## Using a histogram and continuous probability density curve, display the distribution of `Freedom` in the dataset" ] }, { @@ -103,7 +278,7 @@ "metadata": {}, "outputs": [], "source": [ - "sns.histplot(data=mi_df, x=\"Age\")" + "sns.histplot(data=ha_df, x=\"Freedom\")" ] }, { @@ -113,7 +288,7 @@ "metadata": {}, "outputs": [], "source": [ - "sns.histplot(data=mi_df, x=\"Age\", kde=True)" + "sns.histplot(data=ha_df, x=\"Freedom\", kde=True)" ] }, { @@ -121,7 +296,7 @@ "id": "prepared-stephen", "metadata": {}, "source": [ - "- Use a barplot to show the count of vaccinated for yellow fever (see the documentation for a countplot)" + "- Use a barplot to show the count of country per Region (see the documentation for a countplot)" ] }, { @@ -131,527 +306,501 @@ "metadata": {}, "outputs": [], "source": [ - 
"sns.countplot(data=mi_df, x=\"VaccineYellowFever\")" + "sns.countplot(data=ha_df, x=\"Region\")" ] }, { "cell_type": "markdown", - "id": "immediate-method", + "id": "4e1d16f2-57d8-4f0e-9a69-e9ab193a3ebc", "metadata": {}, "source": [ - "- Plot the distribution of age for the people vaccinated for the flu" + "As you can see the labels overlaps each ohers and are not readable\n", + "\n", + "One possibility is to rotate the X-labels. In this case is better to provide the labels." ] }, { "cell_type": "code", "execution_count": null, - "id": "academic-measure", + "id": "64ee74bb-5f37-485f-8c03-d06ac14d3010", "metadata": {}, "outputs": [], "source": [ - "sns.histplot(data=mi_df.query(\"VaccineFlu == 'Yes'\"), x=\"Age\", kde=True)" + "# extract the Region from the data, I will use them as labels for figures below\n", + "regions = ha_df.loc[:, 'Region'].drop_duplicates()" ] }, { - "cell_type": "markdown", - "id": "temporal-synthesis", - "metadata": {}, - "source": [ - "- Feel free to explore more of [seaborn](https://seaborn.pydata.org/examples/index.html) !" 
- ] - }, - { - "cell_type": "markdown", - "id": "db56d49a-4770-4f9e-af6b-78960574d338", - "metadata": {}, - "source": [ - "# Exploring count matrices from RNA-seq data" - ] - }, - { - "cell_type": "markdown", - "id": "5377668b-dea5-4c20-8249-5266f98774eb", + "cell_type": "code", + "execution_count": null, + "id": "eb7c96ac-585a-4787-a2ee-dec3d04790ca", "metadata": {}, + "outputs": [], "source": [ - "<img src=\"../images/rnaseq.png\" style=\"margin:0 auto;width:800px\">" + "ax = sns.countplot(data=ha_df, x=\"Region\")\n", + "ax.set_xticks(regions)\n", + "ax.set_xticklabels(regions, rotation=45, ha='right', rotation_mode='anchor')" ] }, { "cell_type": "markdown", - "id": "ebf1606b-0b21-4821-a899-551ec33c977e", + "id": "e2eac27c-2de9-4942-82b9-294318ec5fd4", "metadata": {}, "source": [ - "- Import the count_matrix tsv file from the data folder" + "## On the same data `Happiness` and `Region` do a boxplot and a swarmplot to display the structure of the data" ] }, { "cell_type": "code", "execution_count": null, - "id": "3e0fe80a-175f-4cec-96f9-28bd7005097d", + "id": "b0fd6058-65b0-4f9b-bc59-dce0193f1580", "metadata": {}, "outputs": [], "source": [ - "counts_df = pd.read_csv(\"../data/count_matrix.tsv\", sep=\"\\t\")" + "ax = sns.swarmplot(data=ha_df, x=\"Region\", y=\"Happiness Score\")\n", + "ax.set_xticks(regions)\n", + "ax.set_xticklabels(regions,rotation=45, ha='right', rotation_mode='anchor')" ] }, { "cell_type": "code", "execution_count": null, - "id": "671736af-d00e-475a-9670-86374402a741", + "id": "1fe079b4-013a-4d48-ac31-9daad4b4673e", "metadata": {}, "outputs": [], "source": [ - "counts_df" + "ax = sns.boxplot(data=ha_df, x=\"Region\", y=\"Happiness Score\", hue='Region') # see the result of the option hue\n", + "ax.set_xticks(regions)\n", + "ax.set_xticklabels(regions,rotation=45, ha='right', rotation_mode='anchor')" ] }, { "cell_type": "markdown", - "id": "c80d9947-9ccf-4499-a1c2-9194377cd054", - "metadata": {}, - "source": [ - "- Simplify the dataframe to 
only have the \"Geneid\", \"WTx\" and \"Cx\" columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01a3b73d-46cb-48e7-8f18-2c8a6a612724", + "id": "immediate-method", "metadata": {}, - "outputs": [], "source": [ - "counts_df.drop([\"Chr\", \"Start\", \"End\", \"Strand\", \"Length\"], axis=1, inplace=True)" + "## Plot the distribution of `happiness` for the countries in `Western Europe`" ] }, { "cell_type": "code", "execution_count": null, - "id": "997274c0-63ed-492f-94ea-f6bdd8552455", + "id": "academic-measure", "metadata": {}, "outputs": [], "source": [ - "counts_df" + "sns.histplot(data=ha_df.query(\"Region == 'Western Europe'\"), x=\"Happiness Score\", kde=True)" ] }, { "cell_type": "markdown", - "id": "eb65b51f-f689-4a66-b47c-e79f0e9eba52", + "id": "fd9789db-9bab-478a-bcec-1c8b6775cf20", "metadata": {}, "source": [ - "- Format properly your DataFrame to be able to use https://seaborn.pydata.org/generated/seaborn.clustermap.html to realize a heatmap." + "## Plot the `Health (Life Expectancy)` vs `Happiness Score` and color the dots according to the region " ] }, { "cell_type": "code", "execution_count": null, - "id": "49f81fab-0e9e-4add-a0b6-249b735c7ab8", + "id": "d42f72f1-1d01-496e-89a1-68391ffa4281", "metadata": {}, "outputs": [], "source": [ - "counts_df.set_index(\"Geneid\", inplace=True)" + "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, - "id": "467a2115-6735-4a7f-aaf6-bbc1029a57a2", + "id": "52ae8376-3c66-4ca6-86a9-f9ae9f56076f", "metadata": {}, "outputs": [], "source": [ - "counts_df" + "fig, ax = plt.subplots(figsize=(9, 7))\n", + "sns.scatterplot(data=ha_df, x=\"Health (Life Expectancy)\", y=\"Happiness Score\", hue=\"Region\", ax=ax)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "79ae3c32-ce30-49d0-882b-e5681a50fef8", + "cell_type": "markdown", + "id": "temporal-synthesis", "metadata": {}, - "outputs": [], "source": [ - "sns.clustermap(data=counts_df)" + "- Feel 
free to explore more of [seaborn](https://seaborn.pydata.org/examples/index.html) !" ] }, { "cell_type": "markdown", - "id": "f8d6188e-3a37-4ba5-b377-a11696054e9c", + "id": "3063abf7-2251-48eb-b371-6c5b70b45fe7", "metadata": {}, "source": [ - "- Explore the clustermap documentation to have a more visual heatmap by standardizing the data within genes." + "## Do a barplot of the Happiness Score for each Region" ] }, { "cell_type": "code", "execution_count": null, - "id": "d1c055d5-0608-4bc3-ac33-738848946639", + "id": "85dd0df6-74e7-43be-9a7c-eb922a06601b", "metadata": {}, "outputs": [], "source": [ - "sns.clustermap(data=counts_df, z_score=0)" + "sns.barplot(data=ha_df, y=\"Region\", x=\"Happiness Score\", hue='Region', orient='h')" ] }, { "cell_type": "markdown", - "id": "2e61a207-223a-4c01-88ea-76b1b8c3a0b9", - "metadata": {}, - "source": [ - "- Reformat the counts_df dataframe to have genes in columns and samples in rows.\n", - "- Add a \"group\" column defining the grouping of the samples:\n", - " - \"WTx\" samples will be from the \"WT\" group.\n", - " - \"Cx\" samples will be from the \"C\" group." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59d107fa-7246-4c58-98d2-573313499034", + "id": "3ee5741a-64f4-4690-963b-1f7e729398bf", "metadata": {}, - "outputs": [], "source": [ - "counts_df = counts_df.T" + "## from this point we will focus on the Regions\n", + "\n", + "### clean our dataset. 
Remove not relevant columns" ] }, { "cell_type": "code", "execution_count": null, - "id": "6744918a-098d-40b0-a6a6-0774c6c3df16", + "id": "0344b730-1535-47fb-82f5-07003fd223f9", "metadata": {}, "outputs": [], "source": [ - "counts_df.sort_index(inplace=True)" + "ha_df.columns" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "7100ab14-e759-4b12-9e39-f4e422fdb070", + "cell_type": "markdown", + "id": "36e449d1-0add-4ebc-8903-d535219ce423", "metadata": {}, - "outputs": [], "source": [ - "counts_df.loc[:,\"groups\"] = [\"C\", \"C\", \"C\", \"WT\", \"WT\", \"WT\"]" + "1. keep only columns 'Region', 'Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Freedom','Trust (Government Corruption)', 'Generosity'\n", + "2. set the index to the Region\n", + "3. have a look on your new data" ] }, { "cell_type": "code", "execution_count": null, - "id": "4a42b339-3ed8-4b7f-b5f9-f8a4eed8dd6c", + "id": "bdf897c4-b8f3-4dff-b9c0-0ad47b25ecc0", "metadata": {}, "outputs": [], "source": [ - "counts_df" + "region_df = ha_df.loc[:, ['Region', 'Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Freedom','Trust (Government Corruption)', 'Generosity']]\n", + "region_df.set_index('Region', inplace=True)\n", + "region_df.head()" ] }, { "cell_type": "markdown", - "id": "9a88ecb1-9ed3-4160-91ee-24a30e994b71", + "id": "e1ae03ac-ac7c-436d-987d-113e9cca3eec", "metadata": {}, "source": [ - "- Display a barplot showing the mean expression for each group for a particular gene (for example \"gene-LEPBI_RS00065\")." + "## Aggregate the new data region by region. 
Compute the mean of each country as value for the corresponding Region" ] }, { "cell_type": "code", "execution_count": null, - "id": "924a1a92-5cba-4796-b6c0-bbf849112434", + "id": "3fc3ea89-a448-4e7b-abfb-3fa92cffc5f7", "metadata": {}, "outputs": [], "source": [ - "sns.barplot(data=counts_df, x=\"groups\", y=\"gene-LEPBI_RS00065\")" + "reg_agg = region_df.groupby('Region').agg('mean')\n", + "reg_agg" ] }, { "cell_type": "markdown", - "id": "99e2455a-cb7d-44d5-a4a0-2cf272c814ab", + "id": "97cb188c-3e50-4492-961f-cadea3611aaa", "metadata": {}, "source": [ - "- Try plotting a swarmplot on top of the previous barplot:" + "## Do a hierarchically-clustered heatmap " ] }, { "cell_type": "code", "execution_count": null, - "id": "7102bd62-f2d1-4573-a741-45903b9dee1d", + "id": "9aa21ed4-e9b2-4eb3-a693-c59ceb513552", "metadata": {}, "outputs": [], "source": [ - "sns.barplot(data=counts_df, x=\"groups\", y=\"gene-LEPBI_RS00065\")\n", - "sns.swarmplot(data=counts_df, x=\"groups\", y=\"gene-LEPBI_RS00065\")" + "sns.clustermap(data=reg_agg)" ] }, { "cell_type": "markdown", - "id": "d200d375-362e-4c1d-a88e-130b094e6feb", + "id": "88d27d29-e3b8-43d7-8324-25e50c247872", "metadata": {}, "source": [ - "- Now plot the same data using a boxplot. Can you see the problem of displaying boxplots for this kind of data ?" + "Check the data." 
] }, { "cell_type": "code", "execution_count": null, - "id": "6cb35e0f-eb0f-4d66-8808-996b1c89c894", + "id": "0128f575-0b2a-4cbc-8f6e-8b7e22d81254", "metadata": {}, "outputs": [], "source": [ - "sns.boxplot(data=counts_df, x=\"groups\", y=\"gene-LEPBI_RS00065\")" + "reg_agg.describe()" ] }, { "cell_type": "markdown", - "id": "2e1cabe0-aab7-4f0e-888e-81aae7d5df8d", + "id": "f9b39ab8-0051-4840-9e87-fe2bcb8ca07a", "metadata": {}, "source": [ - "- Compute the median of each genes by groups:" + "The data are not in the same range, so it could be better to standardize the data before to do the clustering" ] }, { "cell_type": "code", "execution_count": null, - "id": "a5a896e2-2133-4365-bc78-43bee7f253a7", + "id": "ff4beb57-b357-47a3-b7bd-877e05229b6b", "metadata": {}, "outputs": [], "source": [ - "med_counts_df = counts_df.groupby(\"groups\").median().T" + "normalized_reg=(reg_agg - reg_agg.mean()) / reg_agg.std()\n", + "normalized_reg" ] }, { "cell_type": "code", "execution_count": null, - "id": "572d74c5-f612-4494-9ece-b6f80fd1cd6d", + "id": "e19f9472-cb9b-434b-8689-2bf09d49b902", "metadata": {}, "outputs": [], "source": [ - "med_counts_df" + "sns.clustermap(data=normalized_reg, annot=True) # see the results of the annot option " ] }, { "cell_type": "markdown", - "id": "308cc10b-6727-4bc5-b05d-4777037e252e", + "id": "d64a0377-339b-4fe7-beb4-a32e4a4e0113", "metadata": {}, "source": [ - "We are going now to add extra annotations to this median table in order to identify genes of interest.\n", - "- Import the annotation.csv table from the data folder: " + "It's possible to do that directly in seaborn. 
with the option z_score (https://seaborn.pydata.org/generated/seaborn.clustermap.html)" ] }, { "cell_type": "code", "execution_count": null, - "id": "a52c17a1-59bd-4fd7-9b7b-1ba98b194399", + "id": "3b439517-5007-4fbb-828d-265f9835594f", "metadata": {}, "outputs": [], "source": [ - "annot_df = pd.read_csv(\"../data/annotations.csv\", index_col=0)" + "sns.clustermap(data=reg_agg, z_score=1, annot=True)" ] }, { "cell_type": "markdown", - "id": "50fa81a7-3f34-4160-ad2d-f77d21be9ac0", + "id": "f8d6188e-3a37-4ba5-b377-a11696054e9c", "metadata": {}, "source": [ - "Annotations in this table are available for many types of loci (the \"genetic_type\" column), but here we will focus on the \"gene\" genetic_type. \n", - "- Filter the annotation dataframe to have only \"gene\" as \"genetic_type\"." + "- Explore the clustermap documentation to have a more visual heatmap by standardizing the data within genes." ] }, { - "cell_type": "code", - "execution_count": null, - "id": "2606146d-daf9-4e99-a4f3-722b16d6ecba", + "cell_type": "markdown", + "id": "a2627322-e6a5-422f-8a69-b89dbd4b777e", "metadata": {}, - "outputs": [], "source": [ - "annot_df = annot_df.query(\"genetic_type == 'gene'\")" + "## Create a function which produce a single image with four different plots of your choice and save it to pdf file.\n", + "\n", + "like the image below." 
] }, { - "cell_type": "code", - "execution_count": null, - "id": "74581155-e29d-48d1-8548-dc8186db35e2", + "cell_type": "markdown", + "id": "4121ff3d-6814-493e-a505-357ad81b0d28", "metadata": {}, - "outputs": [], "source": [ - "annot_df" + "<img src=\"../images/multiple_figure.png\" width=\"50%\" />" ] }, { - "cell_type": "markdown", - "id": "f8a4e744-e7e2-43b6-b3d4-e59feb40d3ff", + "cell_type": "code", + "execution_count": null, + "id": "a322c866-9232-4fae-bcee-9a635e3fd70b", "metadata": {}, + "outputs": [], "source": [ - "- Concatenate the dataframe with median by group and the annotation dataframe together:" + "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, - "id": "cfd41f82-0e31-41f9-8e1c-75cc934e89d2", + "id": "044022d1-741d-4a07-ba7f-c1f863cca138", "metadata": {}, "outputs": [], "source": [ - "total_df = pd.concat([med_counts_df, annot_df.set_index(\"ID\")], axis=1, join=\"inner\")" + "def expression_graph():\n", + " fig, axs = plt.subplots(2,2, figsize=(9,7), constrained_layout=True) # constrained_layout=True avoid overlapping between axis title and X-labels from the above figure\n", + " sns.boxplot(data=ha_df, x=\"Happiness Score\", y=\"Region\", hue='Region', ax=axs[0,0])\n", + " axs[0,0].set_title(\"happiness data structure\")\n", + " \n", + " sns.scatterplot(data=ha_df, x=\"Health (Life Expectancy)\", y=\"Happiness Score\", hue=\"Region\", legend=False, ax=axs[0,1])\n", + " axs[0,1].set_title(\"Happiness vs Health\")\n", + " \n", + " sns.barplot(data=ha_df, y=\"Region\", x=\"Happiness Score\", hue='Region', orient='h', ax=axs[1,0])\n", + " axs[1,0].set_title(\"happiness through the world\")\n", + " \n", + " sns.histplot(data=ha_df, x=\"Happiness Score\", kde=True, ax=axs[1,1])\n", + " axs[1,1].set_title(\"Happiness data distribution\")\n", + " \n", + " return fig\n", + " " ] }, { "cell_type": "code", "execution_count": null, - "id": "e2d779f7-c127-4f24-8e6c-34190bf8460d", - "metadata": { - "tags": [] - }, + "id": 
"c33bfc78-7480-4327-93a0-f8aaca0d3614", + "metadata": {}, "outputs": [], "source": [ - "total_df" + "my_fig = expression_graph()\n", + "my_fig.suptitle(\"Happiness Report\")\n", + "my_fig.savefig(\"happiness_visualization.pdf\", bbox_inches = \"tight\") # bbox_inches = \"tight\" avoid to truncate the Y-labels on left on pdf" ] }, { "cell_type": "markdown", - "id": "af9f8e1f-5f8b-4152-b08a-44e957f13cec", + "id": "0d05aba4-3c85-4cd9-85f3-5296b19308fb", "metadata": {}, "source": [ - "- Calculate an estimate of the gene expression fold change for each gene (by dividing the C median expressions by WT median expressions).\n", - "- Add it as a \"FoldChange\" column to the previous dataframe." + "# Extras" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "03e3e893-ae73-4fea-853c-bcdc6129f71c", + "cell_type": "markdown", + "id": "66d6668e-683f-462e-a72f-28bdda8736f2", "metadata": {}, - "outputs": [], "source": [ - "total_df.loc[:, \"FoldChange\"] = total_df.C / total_df.WT" + "- Using ipywidget, make a function to display barplot of `Happiness Score` by country but with region selected by the user (using a Dropdown widget)" ] }, { "cell_type": "markdown", - "id": "d70eb26b-0a26-4bbc-af03-ba8781b09fb5", + "id": "042bd87e-d2dc-4544-a771-51d80c565d0f", "metadata": {}, "source": [ - "- Use a barplot to display fold changes and using the new gene annotation (The \"Name\" column)" + "Imports the needed modules \n", + "- `widgets` and `interact` from the `ipywidgets` package\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "e11d36d2-6a41-463b-92e0-af4cdcec279f", + "id": "64ebeca1-1332-4585-9e5c-c1b66f82be71", "metadata": {}, "outputs": [], "source": [ - "sns.barplot(data=total_df, x=\"FoldChange\", y=\"Name\")" + "from ipywidgets import widgets\n", + "from ipywidgets import interact" ] }, { "cell_type": "markdown", - "id": "34a26492-7c6b-4a07-a4de-67ec8f693cdc", + "id": "277264e6-a173-40c5-b71e-4cd551a7fa99", "metadata": {}, "source": [ - "- By 
calculating the length of each gene and using a visualisation, does gene expression appears correlated with gene length ?" + "Create a Series containing the regions (without duplicate values)" ] }, { "cell_type": "code", "execution_count": null, - "id": "9e16ae65-8cd9-42cd-931c-8dd519715255", + "id": "ebf7fde9-b4a1-4e8a-86ab-86ad8b1b533a", "metadata": {}, "outputs": [], "source": [ - "total_df.loc[:,\"gene_length\"] = total_df.loc[:,\"stop\"] - total_df.loc[:,\"start\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "08854b8a-f35c-4ce5-a509-c94d37ceaa2a", - "metadata": {}, - "outputs": [], - "source": [ - "sns.relplot(data=total_df, x=\"C\", y=\"gene_length\")" + "regions = ha_df.loc[:, 'Region'].drop_duplicates()" ] }, { "cell_type": "markdown", - "id": "a2627322-e6a5-422f-8a69-b89dbd4b777e", + "id": "f34e5053-ccf5-4a67-96db-7457fe16bbd6", "metadata": {}, "source": [ - "- Create a function which produce a single image with four different plots of your choice and save it to pdf file." + "1. Use this Series to populate your dropdown list\n", + "2. Use the region selected in dropdown list as parameter of your function\n", + "3. Select from the whole data frame the data corresponding to this region\n", + "4. 
display the barplot" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "a322c866-9232-4fae-bcee-9a635e3fd70b", + "cell_type": "markdown", + "id": "feba608f-2ecb-41ae-b04a-12f075fd644b", "metadata": {}, - "outputs": [], "source": [ - "import matplotlib.pyplot as plt" + "below the code skeleton of your function\n", + "\n", + "```python\n", + "@interact(region=widgets.Dropdown(options=regions))\n", + "def plot_counts(region):\n", + " data = ha_df.loc[ha_df['Region'] == region]\n", + " ax = sns.barplot(data= ....\n", + "```" ] }, { "cell_type": "code", "execution_count": null, - "id": "044022d1-741d-4a07-ba7f-c1f863cca138", + "id": "fb746fda-36cc-4c35-92d8-257a489fb278", "metadata": {}, "outputs": [], "source": [ - "def expression_graph():\n", "\n", - " fig, axs = plt.subplots(2,2, figsize=(9,7)) # 2 rows, 2 columns\n", - " sns.barplot(data=total_df, x=\"FoldChange\", y=\"Name\", ax=axs[0,0])\n", - " sns.barplot(data=counts_df, x=\"groups\", y=\"gene-LEPBI_RS00065\", ax=axs[0,1])\n", - " sns.swarmplot(data=counts_df, x=\"groups\", y=\"gene-LEPBI_RS00065\", ax=axs[0,1])\n", - " sns.scatterplot(data=total_df, x=\"WT\", y=\"gene_length\", ax=axs[1,0])\n", - " sns.scatterplot(data=total_df, x=\"C\", y=\"gene_length\", ax=axs[1,1], color=\"red\")\n", - " \n", + "@interact(region=widgets.Dropdown(options=regions))\n", + "def plot_counts(region):\n", + " data = ha_df.loc[ha_df['Region'] == region]\n", + " ax = sns.barplot(data=data, y='Happiness Score', x='Country')\n", + " ax.set_xticks(data.Country)\n", + " ax.set_xticklabels(data.Country, rotation=45, ha='right', rotation_mode='anchor')\n", " " ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "c33bfc78-7480-4327-93a0-f8aaca0d3614", - "metadata": {}, - "outputs": [], - "source": [ - "expression_graph()\n", - "plt.savefig(\"expression_visualization.pdf\")" - ] - }, - { - "cell_type": "markdown", - "id": "0d05aba4-3c85-4cd9-85f3-5296b19308fb", - "metadata": {}, - "source": [ - "# Extras" - ] - 
}, { "cell_type": "markdown", - "id": "66d6668e-683f-462e-a72f-28bdda8736f2", + "id": "3f4bd68e-eb26-46f8-a00f-86f9d0570580", "metadata": {}, "source": [ - "- Using ipywidget, make a function to display barplot of gene expression by groups with the gene being selected by the user (using a Dropdown widget for example)." + "You can customize your figure like a classical seaborn/matplotlib figure\n", + "\n", + "for instance to display the value above each bar" ] }, { "cell_type": "code", "execution_count": null, - "id": "64ebeca1-1332-4585-9e5c-c1b66f82be71", + "id": "7bcee7c5-f1c2-4035-9b7c-e68e1d73a932", "metadata": {}, "outputs": [], "source": [ - "from ipywidgets import widgets\n", - "from ipywidgets import interact\n", - "import matplotlib.pyplot as plt" + "\n", + "@interact(region=widgets.Dropdown(options=regions))\n", + "def plot_counts(region):\n", + " data = ha_df.loc[ha_df['Region'] == region]\n", + " ax = sns.barplot(data=data, y='Happiness Score', x='Country')\n", + " for i in ax.containers:\n", + " # add label on each bar https://www.geeksforgeeks.org/how-to-show-values-on-seaborn-barplot/\n", + " ax.bar_label(i, fmt=\"{:.2f}\", rotation='vertical', padding=3)\n", + " \n", + " ax.set_xticks(data.Country)\n", + " ax.set_xticklabels(data.Country, rotation=45, ha='right', rotation_mode='anchor')\n", + " ax.margins(y=0.1) # add margin to avoid to have label outside the barplotboundaries, here add 10% white space vertically \n", + " # https://stackoverflow.com/questions/72662991/how-can-i-prevent-bar-labels-from-going-outside-the-barplot-boundaries-range\n", + " " ] }, { "cell_type": "code", "execution_count": null, - "id": "fb746fda-36cc-4c35-92d8-257a489fb278", + "id": "d78b7b86-ecaa-4d27-80ca-2d3e46c2aca3", "metadata": {}, "outputs": [], - "source": [ - "@interact(genes=widgets.Dropdown(options=counts_df.columns))\n", - "def plot_counts(genes):\n", - " return plt.show(sns.barplot(data=counts_df, x=\"groups\", y=genes))" - ] + "source": [] } ], "metadata": { diff 
--git a/notebooks/Solutions/seaborn_TP_solutions_happiness2016.ipynb b/notebooks/Solutions/seaborn_TP_solutions_happiness2016.ipynb deleted file mode 100644 index d96cdf7286b6acb1f0bd577db8472e48b27bc839..0000000000000000000000000000000000000000 --- a/notebooks/Solutions/seaborn_TP_solutions_happiness2016.ipynb +++ /dev/null @@ -1,827 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "instrumental-personal", - "metadata": {}, - "source": [ - "# <center><b>Hands-on</b></center>\n", - "\n", - "<div style=\"text-align:center\">\n", - " <img src=\"../images/seaborn.png\" width=\"600px\">\n", - " <div>\n", - " Bertrand Néron, François Laurent, Etienne Kornobis, Vincent Guillemot\n", - " <br />\n", - " <a src=\" https://research.pasteur.fr/en/team/bioinformatics-and-biostatistics-hub/\">Bioinformatics and Biostatistiqucs HUB</a>\n", - " <br />\n", - " © Institut Pasteur, 2024\n", - " </div> \n", - "</div>" - ] - }, - { - "cell_type": "markdown", - "id": "compliant-basis", - "metadata": {}, - "source": [ - "Practice your graphing skills through the data of [happiness 2016](https://www.kaggle.com/datasets/unsdsn/world-happiness?select=2016.csv)\n", - "\n", - "(The data are already in data directory as `happiness_2016.csv`)" - ] - }, - { - "cell_type": "markdown", - "id": "3778963b-3bae-486d-8db7-30f23eb239ac", - "metadata": {}, - "source": [ - "## Import the data and have a look on them" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "minor-doctrine", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import seaborn as sns" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "skilled-daniel", - "metadata": {}, - "outputs": [], - "source": [ - "ha_df = pd.read_csv(\"../data/happiness_2016.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f8729a5b-314d-42fc-b130-783ca5e2076a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(157, 13)" - ] - }, - 
"execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ha_df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "brutal-manufacturer", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Country</th>\n", - " <th>Region</th>\n", - " <th>Happiness Rank</th>\n", - " <th>Happiness Score</th>\n", - " <th>Lower Confidence Interval</th>\n", - " <th>Upper Confidence Interval</th>\n", - " <th>Economy (GDP per Capita)</th>\n", - " <th>Family</th>\n", - " <th>Health (Life Expectancy)</th>\n", - " <th>Freedom</th>\n", - " <th>Trust (Government Corruption)</th>\n", - " <th>Generosity</th>\n", - " <th>Dystopia Residual</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>Denmark</td>\n", - " <td>Western Europe</td>\n", - " <td>1</td>\n", - " <td>7.526</td>\n", - " <td>7.460</td>\n", - " <td>7.592</td>\n", - " <td>1.44178</td>\n", - " <td>1.16374</td>\n", - " <td>0.79504</td>\n", - " <td>0.57941</td>\n", - " <td>0.44453</td>\n", - " <td>0.36171</td>\n", - " <td>2.73939</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Switzerland</td>\n", - " <td>Western Europe</td>\n", - " <td>2</td>\n", - " <td>7.509</td>\n", - " <td>7.428</td>\n", - " <td>7.590</td>\n", - " <td>1.52733</td>\n", - " <td>1.14524</td>\n", - " <td>0.86303</td>\n", - " <td>0.58557</td>\n", - " <td>0.41203</td>\n", - " <td>0.28083</td>\n", - " <td>2.69463</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>Iceland</td>\n", - " <td>Western 
Europe</td>\n", - " <td>3</td>\n", - " <td>7.501</td>\n", - " <td>7.333</td>\n", - " <td>7.669</td>\n", - " <td>1.42666</td>\n", - " <td>1.18326</td>\n", - " <td>0.86733</td>\n", - " <td>0.56624</td>\n", - " <td>0.14975</td>\n", - " <td>0.47678</td>\n", - " <td>2.83137</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>Norway</td>\n", - " <td>Western Europe</td>\n", - " <td>4</td>\n", - " <td>7.498</td>\n", - " <td>7.421</td>\n", - " <td>7.575</td>\n", - " <td>1.57744</td>\n", - " <td>1.12690</td>\n", - " <td>0.79579</td>\n", - " <td>0.59609</td>\n", - " <td>0.35776</td>\n", - " <td>0.37895</td>\n", - " <td>2.66465</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>Finland</td>\n", - " <td>Western Europe</td>\n", - " <td>5</td>\n", - " <td>7.413</td>\n", - " <td>7.351</td>\n", - " <td>7.475</td>\n", - " <td>1.40598</td>\n", - " <td>1.13464</td>\n", - " <td>0.81091</td>\n", - " <td>0.57104</td>\n", - " <td>0.41004</td>\n", - " <td>0.25492</td>\n", - " <td>2.82596</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Country Region Happiness Rank Happiness Score \\\n", - "0 Denmark Western Europe 1 7.526 \n", - "1 Switzerland Western Europe 2 7.509 \n", - "2 Iceland Western Europe 3 7.501 \n", - "3 Norway Western Europe 4 7.498 \n", - "4 Finland Western Europe 5 7.413 \n", - "\n", - " Lower Confidence Interval Upper Confidence Interval \\\n", - "0 7.460 7.592 \n", - "1 7.428 7.590 \n", - "2 7.333 7.669 \n", - "3 7.421 7.575 \n", - "4 7.351 7.475 \n", - "\n", - " Economy (GDP per Capita) Family Health (Life Expectancy) Freedom \\\n", - "0 1.44178 1.16374 0.79504 0.57941 \n", - "1 1.52733 1.14524 0.86303 0.58557 \n", - "2 1.42666 1.18326 0.86733 0.56624 \n", - "3 1.57744 1.12690 0.79579 0.59609 \n", - "4 1.40598 1.13464 0.81091 0.57104 \n", - "\n", - " Trust (Government Corruption) Generosity Dystopia Residual \n", - "0 0.44453 0.36171 2.73939 \n", - "1 0.41203 0.28083 2.69463 \n", - "2 0.14975 0.47678 
2.83137 \n", - "3 0.35776 0.37895 2.66465 \n", - "4 0.41004 0.25492 2.82596 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ha_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "departmental-exhibition", - "metadata": {}, - "source": [ - "## Do a boxplot showing the differences in `happiness` between `Region`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "saved-identity", - "metadata": {}, - "outputs": [], - "source": [ - "sns.boxplot(data=ha_df, x=\"Happiness Score\", y=\"Region\")" - ] - }, - { - "cell_type": "markdown", - "id": "portuguese-worse", - "metadata": {}, - "source": [ - "## Using a histogram and continuous probability density curve, display the distribution of `Freedom` in the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "continuous-indian", - "metadata": {}, - "outputs": [], - "source": [ - "sns.histplot(data=ha_df, x=\"Freedom\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "understanding-vegetarian", - "metadata": {}, - "outputs": [], - "source": [ - "sns.histplot(data=ha_df, x=\"Freedom\", kde=True)" - ] - }, - { - "cell_type": "markdown", - "id": "prepared-stephen", - "metadata": {}, - "source": [ - "- Use a barplot to show the count of country per Region (see the documentation for a countplot)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "worldwide-communication", - "metadata": {}, - "outputs": [], - "source": [ - "sns.countplot(data=ha_df, x=\"Region\")" - ] - }, - { - "cell_type": "markdown", - "id": "4e1d16f2-57d8-4f0e-9a69-e9ab193a3ebc", - "metadata": {}, - "source": [ - "As you can see the labels overlaps each ohers and are not readable\n", - "\n", - "One possibility is to rotate the X-labels. In this case is better to provide the labels." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64ee74bb-5f37-485f-8c03-d06ac14d3010", - "metadata": {}, - "outputs": [], - "source": [ - "# extract the Region from the data, I will use them as labels for figures below\n", - "regions = ha_df.loc[:, 'Region'].drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb7c96ac-585a-4787-a2ee-dec3d04790ca", - "metadata": {}, - "outputs": [], - "source": [ - "ax = sns.countplot(data=ha_df, x=\"Region\")\n", - "ax.set_xticks(regions)\n", - "ax.set_xticklabels(regions, rotation=45, ha='right', rotation_mode='anchor')" - ] - }, - { - "cell_type": "markdown", - "id": "e2eac27c-2de9-4942-82b9-294318ec5fd4", - "metadata": {}, - "source": [ - "## On the same data `Happiness` and `Region` do a boxplot and a swarmplot to display the structure of the data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0fd6058-65b0-4f9b-bc59-dce0193f1580", - "metadata": {}, - "outputs": [], - "source": [ - "ax = sns.swarmplot(data=ha_df, x=\"Region\", y=\"Happiness Score\")\n", - "ax.set_xticks(regions)\n", - "ax.set_xticklabels(regions,rotation=45, ha='right', rotation_mode='anchor')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fe079b4-013a-4d48-ac31-9daad4b4673e", - "metadata": {}, - "outputs": [], - "source": [ - "ax = sns.boxplot(data=ha_df, x=\"Region\", y=\"Happiness Score\", hue='Region') # see the result of the option hue\n", - "ax.set_xticks(regions)\n", - "ax.set_xticklabels(regions,rotation=45, ha='right', rotation_mode='anchor')" - ] - }, - { - "cell_type": "markdown", - "id": "immediate-method", - "metadata": {}, - "source": [ - "## Plot the distribution of `happiness` for the people leaving `Western Europe`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "academic-measure", - "metadata": {}, - "outputs": [], - "source": [ - "sns.histplot(data=ha_df.query(\"Region == 'Western Europe'\"), 
x=\"Happiness Score\", kde=True)" - ] - }, - { - "cell_type": "markdown", - "id": "fd9789db-9bab-478a-bcec-1c8b6775cf20", - "metadata": {}, - "source": [ - "## Plot the `Health (Life Expectancy)` vs `Happiness Score` and color the dots according to the region " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d42f72f1-1d01-496e-89a1-68391ffa4281", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52ae8376-3c66-4ca6-86a9-f9ae9f56076f", - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(figsize=(9, 7))\n", - "sns.scatterplot(data=ha_df, x=\"Health (Life Expectancy)\", y=\"Happiness Score\", hue=\"Region\", ax=ax)" - ] - }, - { - "cell_type": "markdown", - "id": "temporal-synthesis", - "metadata": {}, - "source": [ - "- Feel free to explore more of [seaborn](https://seaborn.pydata.org/examples/index.html) !" - ] - }, - { - "cell_type": "markdown", - "id": "3063abf7-2251-48eb-b371-6c5b70b45fe7", - "metadata": {}, - "source": [ - "## Do a barplot of the Happiness Score for each Region" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85dd0df6-74e7-43be-9a7c-eb922a06601b", - "metadata": {}, - "outputs": [], - "source": [ - "sns.barplot(data=ha_df, y=\"Region\", x=\"Happiness Score\", hue='Region', orient='h')" - ] - }, - { - "cell_type": "markdown", - "id": "3ee5741a-64f4-4690-963b-1f7e729398bf", - "metadata": {}, - "source": [ - "## from this point we will focus on the Regions\n", - "\n", - "### clean our dataset. Remove not relevant columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0344b730-1535-47fb-82f5-07003fd223f9", - "metadata": {}, - "outputs": [], - "source": [ - "ha_df.columns" - ] - }, - { - "cell_type": "markdown", - "id": "36e449d1-0add-4ebc-8903-d535219ce423", - "metadata": {}, - "source": [ - "1. 
keep only columns 'Region', 'Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Freedom','Trust (Government Corruption)', 'Generosity'\n", - "2. set the index to the Region\n", - "3. have a look on your new data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdf897c4-b8f3-4dff-b9c0-0ad47b25ecc0", - "metadata": {}, - "outputs": [], - "source": [ - "region_df = ha_df.loc[:, ['Region', 'Happiness Score', 'Economy (GDP per Capita)', 'Family', 'Freedom','Trust (Government Corruption)', 'Generosity']]\n", - "region_df.set_index('Region', inplace=True)\n", - "region_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "e1ae03ac-ac7c-436d-987d-113e9cca3eec", - "metadata": {}, - "source": [ - "## Aggregate the new data region by region. Compute the mean of each country as value for the corresponding Region" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3fc3ea89-a448-4e7b-abfb-3fa92cffc5f7", - "metadata": {}, - "outputs": [], - "source": [ - "reg_agg = region_df.groupby('Region').agg('mean')\n", - "reg_agg" - ] - }, - { - "cell_type": "markdown", - "id": "97cb188c-3e50-4492-961f-cadea3611aaa", - "metadata": {}, - "source": [ - "## Do a hierarchically-clustered heatmap " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9aa21ed4-e9b2-4eb3-a693-c59ceb513552", - "metadata": {}, - "outputs": [], - "source": [ - "sns.clustermap(data=reg_agg)" - ] - }, - { - "cell_type": "markdown", - "id": "88d27d29-e3b8-43d7-8324-25e50c247872", - "metadata": {}, - "source": [ - "Check the data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0128f575-0b2a-4cbc-8f6e-8b7e22d81254", - "metadata": {}, - "outputs": [], - "source": [ - "reg_agg.describe()" - ] - }, - { - "cell_type": "markdown", - "id": "f9b39ab8-0051-4840-9e87-fe2bcb8ca07a", - "metadata": {}, - "source": [ - "The data are not in the same range, so it could be better to standardize the data before to do the clustering" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff4beb57-b357-47a3-b7bd-877e05229b6b", - "metadata": {}, - "outputs": [], - "source": [ - "normalized_reg=(reg_agg - reg_agg.mean()) / reg_agg.std()\n", - "normalized_reg" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e19f9472-cb9b-434b-8689-2bf09d49b902", - "metadata": {}, - "outputs": [], - "source": [ - "sns.clustermap(data=normalized_reg, annot=True) # see the results of the annot option " - ] - }, - { - "cell_type": "markdown", - "id": "d64a0377-339b-4fe7-beb4-a32e4a4e0113", - "metadata": {}, - "source": [ - "It's possible to do that directly in seaborn. with the option z_score (https://seaborn.pydata.org/generated/seaborn.clustermap.html)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b439517-5007-4fbb-828d-265f9835594f", - "metadata": {}, - "outputs": [], - "source": [ - "sns.clustermap(data=reg_agg, z_score=1, annot=True)" - ] - }, - { - "cell_type": "markdown", - "id": "f8d6188e-3a37-4ba5-b377-a11696054e9c", - "metadata": {}, - "source": [ - "- Explore the clustermap documentation to have a more visual heatmap by standardizing the data within genes." - ] - }, - { - "cell_type": "markdown", - "id": "a2627322-e6a5-422f-8a69-b89dbd4b777e", - "metadata": {}, - "source": [ - "## Create a function which produce a single image with four different plots of your choice and save it to pdf file.\n", - "\n", - "like the image below." 
- ] - }, - { - "cell_type": "markdown", - "id": "4121ff3d-6814-493e-a505-357ad81b0d28", - "metadata": {}, - "source": [ - "<img src=\"../images/multiple_figure.png\" width=\"50%\" />" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a322c866-9232-4fae-bcee-9a635e3fd70b", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "044022d1-741d-4a07-ba7f-c1f863cca138", - "metadata": {}, - "outputs": [], - "source": [ - "def expression_graph():\n", - " fig, axs = plt.subplots(2,2, figsize=(9,7), constrained_layout=True) # constrained_layout=True avoid overlapping between axis title and X-labels from the above figure\n", - " sns.boxplot(data=ha_df, x=\"Happiness Score\", y=\"Region\", hue='Region', ax=axs[0,0])\n", - " axs[0,0].set_title(\"happiness data structure\")\n", - " \n", - " sns.scatterplot(data=ha_df, x=\"Health (Life Expectancy)\", y=\"Happiness Score\", hue=\"Region\", legend=False, ax=axs[0,1])\n", - " axs[0,1].set_title(\"Happiness vs Health\")\n", - " \n", - " sns.barplot(data=ha_df, y=\"Region\", x=\"Happiness Score\", hue='Region', orient='h', ax=axs[1,0])\n", - " axs[1,0].set_title(\"happiness through the world\")\n", - " \n", - " sns.histplot(data=ha_df, x=\"Happiness Score\", kde=True, ax=axs[1,1])\n", - " axs[1,1].set_title(\"Happiness data distribution\")\n", - " \n", - " return fig\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c33bfc78-7480-4327-93a0-f8aaca0d3614", - "metadata": {}, - "outputs": [], - "source": [ - "my_fig = expression_graph()\n", - "my_fig.suptitle(\"Happiness Report\")\n", - "my_fig.savefig(\"happiness_visualization.pdf\", bbox_inches = \"tight\") # bbox_inches = \"tight\" avoid to truncate the Y-labels on left on pdf" - ] - }, - { - "cell_type": "markdown", - "id": "0d05aba4-3c85-4cd9-85f3-5296b19308fb", - "metadata": {}, - "source": [ - "# Extras" - ] - }, - { - 
"cell_type": "markdown", - "id": "66d6668e-683f-462e-a72f-28bdda8736f2", - "metadata": {}, - "source": [ - "- Using ipywidget, make a function to display barplot of `Happiness Score` by country but with region selected by the user (using a Dropdown widget)" - ] - }, - { - "cell_type": "markdown", - "id": "042bd87e-d2dc-4544-a771-51d80c565d0f", - "metadata": {}, - "source": [ - "Imports the needed modules \n", - "- `widgets` and `interact` from the `ipywidgets` package\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64ebeca1-1332-4585-9e5c-c1b66f82be71", - "metadata": {}, - "outputs": [], - "source": [ - "from ipywidgets import widgets\n", - "from ipywidgets import interact" - ] - }, - { - "cell_type": "markdown", - "id": "277264e6-a173-40c5-b71e-4cd551a7fa99", - "metadata": {}, - "source": [ - "create a dataframe containing regions (without duplicates values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebf7fde9-b4a1-4e8a-86ab-86ad8b1b533a", - "metadata": {}, - "outputs": [], - "source": [ - "regions = ha_df.loc[:, 'Region'].drop_duplicates()" - ] - }, - { - "cell_type": "markdown", - "id": "f34e5053-ccf5-4a67-96db-7457fe16bbd6", - "metadata": {}, - "source": [ - "1. Use this DataFarame to populate your dropdown list\n", - "2. Use the region selected in dropdown list as parameter of your function\n", - "3. select form the whole data frame the data corresponding to this region\n", - "4. 
display the barplot" - ] - }, - { - "cell_type": "markdown", - "id": "feba608f-2ecb-41ae-b04a-12f075fd644b", - "metadata": {}, - "source": [ - "below the code skeleton of your function\n", - "\n", - "```python\n", - "@interact(region=widgets.Dropdown(options=regions))\n", - "def plot_counts(region):\n", - " data = ha_df.loc[ha_df['Region'] == region]\n", - " ax = sns.barplot(data= ....\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb746fda-36cc-4c35-92d8-257a489fb278", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "@interact(region=widgets.Dropdown(options=regions))\n", - "def plot_counts(region):\n", - " data = ha_df.loc[ha_df['Region'] == region]\n", - " ax = sns.barplot(data=data, y='Happiness Score', x='Country')\n", - " ax.set_xticks(data.Country)\n", - " ax.set_xticklabels(data.Country, rotation=45, ha='right', rotation_mode='anchor')\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "3f4bd68e-eb26-46f8-a00f-86f9d0570580", - "metadata": {}, - "source": [ - "You can customize your figure as classical seaborn/matplotib figure\n", - "\n", - "for instance to display the value above each bar" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7bcee7c5-f1c2-4035-9b7c-e68e1d73a932", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "@interact(region=widgets.Dropdown(options=regions))\n", - "def plot_counts(region):\n", - " data = ha_df.loc[ha_df['Region'] == region]\n", - " ax = sns.barplot(data=data, y='Happiness Score', x='Country')\n", - " for i in ax.containers:\n", - " # add label on each bar https://www.geeksforgeeks.org/how-to-show-values-on-seaborn-barplot/\n", - " ax.bar_label(i, fmt=\"{:.2f}\", rotation='vertical', padding=3)\n", - " \n", - " ax.set_xticks(data.Country)\n", - " ax.set_xticklabels(data.Country, rotation=45, ha='right', rotation_mode='anchor')\n", - " ax.margins(y=0.1) # add margin to avoid to have label outside the barplotboundaries, here add 10% white 
space vertically \n", - " # https://stackoverflow.com/questions/72662991/how-can-i-prevent-bar-labels-from-going-outside-the-barplot-boundaries-range\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d78b7b86-ecaa-4d27-80ca-2d3e46c2aca3", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}