# coding: utf-8

# In[1]:

get_ipython().magic(u'matplotlib inline')


# In[2]:

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np


# # Gaussian Mixture Models (GMMs)
# See the documentation at http://scikit-learn.org/stable/modules/mixture.html#gmm section 2.1.1.

# In[8]:

from sklearn import datasets
from sklearn.mixture import GaussianMixture


# For a full example with plots, see http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html
#
# Please note that while in this example the GMMs are used for classification, the basic usage is for clustering (or density estimation). The .fit() method does not need the labels...
#
# Here we just look at the main steps for fitting a model:

# In[5]:

#-get the data
iris = datasets.load_iris()
X = iris.data
y = iris.target

#-how many features are there?


# In[9]:

#-fit a model (example):
n_classes = 3
clst = GaussianMixture(n_components=n_classes, covariance_type='diag',
                       init_params='kmeans', max_iter=20)
clst.fit(X)


# In[11]:

#-inspect the fitted parameters:
print(clst.means_)    # means_ is the matrix of component centers, one per row
print(clst.weights_)  # weights_ holds the mixing coefficients


# In[14]:

#-you can assign the data points to a cluster:
y_pred = clst.predict(X)


# In[18]:

#-and "compare" with the true labels:
sum(y != y_pred)*1.0 / y.size


# Why is there such a mismatch between cluster labels and class labels? Inspect the two and try to find a way to put the class labels in correspondence with the cluster labels. (One way of doing this is sketched at the end of this notebook.)

# # K-means clustering
#
# See the documentation at
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
#
# Note that there is a parameter, n_jobs, which allows you to run the
# code on several CPUs. Great speed-up for large data sets.
#
# The basic steps are as before:

# In[19]:

from sklearn.cluster import KMeans


# In[20]:

#-fit a model (example):
n_classes = 3
clst = KMeans(n_clusters=n_classes)
clst.fit(X)


# In[21]:

#-inspect the fitted parameters:
clst.cluster_centers_  # the matrix of cluster centers, one per row


# In[22]:

#-you can assign the data points to a cluster:
y_pred = clst.predict(X)


# In[24]:

#-and compare with the true labels:
sum(y != y_pred)*1.0 / y.size

## Why does it not work as you would expect?

# **NOTE:** For large data sets there is a "batch" version of KMeans, which converges much faster (a short usage sketch is given at the end of this notebook). Look at:
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans
#
# And see the discussion:
# http://scikit-learn.org/stable/auto_examples/cluster/plot_mini_batch_kmeans.html

# ## Application: color quantization
# An important application of KMeans is in image processing, for example in re-quantizing the color levels. For this, execute the example at http://scikit-learn.org/stable/auto_examples/cluster/plot_color_quantization.html (a stripped-down sketch of the idea is given at the end of this notebook).
#
# ### TODO: (if time allows)
# Use a different clustering method - explore the options from
#
# - mean shift: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html#sklearn.cluster.MeanShift
# - spectral clustering: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering
# - hierarchical clustering (Ward): http://scikit-learn.org/0.16/modules/generated/sklearn.cluster.Ward.html
#
# (A starter sketch of the shared estimator interface follows below.)
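
# ## Sketches
#
# Below: one possible answer to the question above about putting cluster labels in correspondence with class labels. Cluster indices are arbitrary, so a simple fix is to relabel each cluster by the majority true class among its members. This is a minimal sketch of our own (the align_labels helper is not part of the exercise), and it applies to the GMM and the K-means predictions alike:

# In[ ]:

#-map each cluster index to the most frequent true class among its points
def align_labels(y_true, y_clust):
    mapping = {}
    for c in np.unique(y_clust):
        # majority vote: the most common true class inside cluster c
        mapping[c] = np.bincount(y_true[y_clust == c]).argmax()
    return np.array([mapping[c] for c in y_clust])

y_aligned = align_labels(y, y_pred)
sum(y != y_aligned)*1.0 / y.size  # error rate after aligning the labels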
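
# The MiniBatchKMeans interface mirrors KMeans, as a minimal sketch on the same data shows (the batch_size value here is an arbitrary choice, not a recommendation):

# In[ ]:

from sklearn.cluster import MiniBatchKMeans

#-same fit/predict pattern as KMeans; the centers are updated on small
# random batches instead of the full data set at each iteration
mb_clst = MiniBatchKMeans(n_clusters=n_classes, batch_size=50)
mb_clst.fit(X)
mb_clst.cluster_centers_  # one center per row, just as for KMeans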
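
# The color quantization example boils down to: treat the pixels as points in RGB space, cluster them with KMeans, and replace each pixel by its cluster center. A minimal sketch, using a random synthetic image instead of the sample photo from the linked example:

# In[ ]:

#-a random "image" stands in for the photo used in the linked example
img = np.random.rand(32, 32, 3)  # height x width x RGB, values in [0, 1]
pixels = img.reshape(-1, 3)      # one row per pixel

#-cluster the pixel colors into a small palette, then rebuild the image
# from the palette colors
n_colors = 8
km = KMeans(n_clusters=n_colors).fit(pixels)
quantized = km.cluster_centers_[km.predict(pixels)].reshape(img.shape)
plt.imshow(quantized)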
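
# As a starting point for the TODO above, the three listed methods share the usual estimator interface; the parameter values below are placeholders to tune, not recommendations. Note that in recent scikit-learn versions Ward clustering is available as AgglomerativeClustering with linkage='ward':

# In[ ]:

from sklearn.cluster import MeanShift, SpectralClustering, AgglomerativeClustering

#-mean shift estimates the number of clusters itself, so no n_clusters
ms = MeanShift().fit(X)
print(np.unique(ms.labels_))  # the cluster indices it found

#-spectral and Ward (agglomerative) clustering take n_clusters, like KMeans;
# both expose the assignments via fit_predict / labels_ rather than predict
sc_labels = SpectralClustering(n_clusters=n_classes).fit_predict(X)
ward_labels = AgglomerativeClustering(n_clusters=n_classes,
                                      linkage='ward').fit_predict(X)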