From 5ce2be2cf44c15f4733a01f9252af20abd90ec78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20=20MENAGER?= <herve.menager@pasteur.fr> Date: Sun, 6 May 2018 20:40:49 +0200 Subject: [PATCH] generate correlation circle for the PCA (#49), WIP Former-commit-id: 414f368d52e5d67dcbc3dbcb66cd2edf04ffeb4a --- ippisite/ippidb/management/commands/pca.py | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ippisite/ippidb/management/commands/pca.py b/ippisite/ippidb/management/commands/pca.py index f752538e..c64f4680 100644 --- a/ippisite/ippidb/management/commands/pca.py +++ b/ippisite/ippidb/management/commands/pca.py @@ -4,6 +4,8 @@ import json from django.core.management import BaseCommand, CommandError from django.forms.models import model_to_dict import pandas as pd +import matplotlib.pyplot as plt +import numpy as np from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler @@ -49,6 +51,28 @@ class Command(BaseCommand): x = StandardScaler().fit_transform(x) pca = PCA(n_components=2) principal_components = pca.fit_transform(x) + print(pca.explained_variance_) + print(pca.components_) + variance_ratio = pd.Series(pca.explained_variance_ratio_) + coef = np.transpose(pca.components_) + cols = ['PC-'+str(x) for x in range(len(variance_ratio))] + pc_infos = pd.DataFrame(coef, columns=cols, index=features) #pd.DataFrame(data=x).columns) + plt.Circle((0,0),radius=10, color='g', fill=False) + circle1 = plt.Circle((0,0),radius=1, color='g', fill=False) + fig = plt.gcf() + fig.gca().add_artist(circle1) + for idx in range(len(pc_infos["PC-0"])): + x = pc_infos["PC-0"][idx] + y = pc_infos["PC-1"][idx] + plt.plot([0.0,x],[0.0,y],'k-') + plt.plot(x, y, 'rx') + plt.annotate(pc_infos.index[idx], xy=(x,y)) + plt.xlabel("PC-0 (%s%%)" % str(variance_ratio[0])[:4].lstrip("0.")) + plt.ylabel("PC-1 (%s%%)" % str(variance_ratio[1])[:4].lstrip("0.")) + plt.xlim((-1,1)) + plt.ylim((-1,1)) + plt.title("Circle of Correlations") + plt.savefig('foo2.png') principal_df = pd.DataFrame(data = principal_components , columns = ['x', 'y']) final_df = pd.concat([principal_df, df[['family','id']]], axis = 1) for index, row in final_df.iterrows(): -- GitLab