# coding: utf-8

# In[1]:

get_ipython().magic(u'matplotlib inline')


# In[2]:

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np


# # Gaussian Mixture Models (GMMs)
# See the documentation at http://scikit-learn.org/stable/modules/mixture.html#gmm section 2.1.1.

# In[8]:

from sklearn import datasets
from sklearn.mixture import GaussianMixture


# For a full example with plots, see http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_covariances.html
#
# Please note that while in this example the GMMs are used for classification, the basic usage is for clustering (or density estimation). The .fit() method does not need the labels...
#
# Here we just look at the main steps for fitting a model:

# In[5]:

#-get the data
iris = datasets.load_iris()
X = iris.data
y = iris.target

#-how many features are there?


# In[9]:

#-fit a model (example):
n_classes = 3
clst = GaussianMixture(n_components=n_classes, covariance_type='diag',
                       init_params='kmeans', max_iter=20)
clst.fit(X)


# In[11]:

#-inspect the fitted parameters:
print(clst.means_)    # means_ is the matrix of component centers, one per row
print(clst.weights_)  # weights_ holds the mixing coefficients


# In[14]:

#-you can assign the data points to a cluster:
y_pred = clst.predict(X)


# In[18]:

#-and "compare" with the true labels:
sum(y != y_pred)*1.0 / y.size


# Why is there such a mismatch between cluster labels and class labels? Inspect the two and try to find a way to put the class labels in correspondence with the cluster labels. (One way of doing this is sketched at the end of this notebook.)

# # K-means clustering
#
# See the documentation at
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
#
# Note that there is a parameter, n_jobs, which allows you to run the
# code on several CPUs. Great speed-up for large data sets.
#
# The basic steps are as before:

# In[19]:

from sklearn.cluster import KMeans


# In[20]:

#-fit a model (example):
n_classes = 3
clst = KMeans(n_clusters=n_classes)
clst.fit(X)


# In[21]:

#-inspect the fitted parameters:
clst.cluster_centers_  # the matrix of cluster centers, one per row


# In[22]:

#-you can assign the data points to a cluster:
y_pred = clst.predict(X)


# In[24]:

#-and compare with the true labels:
sum(y != y_pred)*1.0 / y.size

## Why does it not work as you would expect?

# **NOTE:** For large data sets there is a "batch" version of KMeans, which converges much faster (a short usage sketch is given at the end of this notebook). Look at:
# http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans
#
# And see the discussion:
# http://scikit-learn.org/stable/auto_examples/cluster/plot_mini_batch_kmeans.html

# ## Application: color quantization
# An important application of KMeans is in image processing, for example in re-quantizing the color levels. For this, execute the example at http://scikit-learn.org/stable/auto_examples/cluster/plot_color_quantization.html (a stripped-down sketch of the idea is given at the end of this notebook).
#
# ### TODO: (if time allows)
# Use a different clustering method - explore the options from
#
# - mean shift: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html#sklearn.cluster.MeanShift
# - spectral clustering: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering
# - hierarchical clustering (Ward): http://scikit-learn.org/0.16/modules/generated/sklearn.cluster.Ward.html
#
# (A starter sketch of the shared estimator interface follows below.)
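
# ## Sketches
#
# Below: one possible answer to the question above about putting cluster labels in correspondence with class labels. Cluster indices are arbitrary, so a simple fix is to relabel each cluster by the majority true class among its members. This is a minimal sketch of our own (the align_labels helper is not part of the exercise), and it applies to the GMM and the K-means predictions alike:

# In[ ]:

#-map each cluster index to the most frequent true class among its points
def align_labels(y_true, y_clust):
    mapping = {}
    for c in np.unique(y_clust):
        # majority vote: the most common true class inside cluster c
        mapping[c] = np.bincount(y_true[y_clust == c]).argmax()
    return np.array([mapping[c] for c in y_clust])

y_aligned = align_labels(y, y_pred)
sum(y != y_aligned)*1.0 / y.size  # error rate after aligning the labels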
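
# The MiniBatchKMeans interface mirrors KMeans, as a minimal sketch on the same data shows (the batch_size value here is an arbitrary choice, not a recommendation):

# In[ ]:

from sklearn.cluster import MiniBatchKMeans

#-same fit/predict pattern as KMeans; the centers are updated on small
# random batches instead of the full data set at each iteration
mb_clst = MiniBatchKMeans(n_clusters=n_classes, batch_size=50)
mb_clst.fit(X)
mb_clst.cluster_centers_  # one center per row, just as for KMeans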
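
# The color quantization example boils down to: treat the pixels as points in RGB space, cluster them with KMeans, and replace each pixel by its cluster center. A minimal sketch, using a random synthetic image instead of the sample photo from the linked example:

# In[ ]:

#-a random "image" stands in for the photo used in the linked example
img = np.random.rand(32, 32, 3)  # height x width x RGB, values in [0, 1]
pixels = img.reshape(-1, 3)      # one row per pixel

#-cluster the pixel colors into a small palette, then rebuild the image
# from the palette colors
n_colors = 8
km = KMeans(n_clusters=n_colors).fit(pixels)
quantized = km.cluster_centers_[km.predict(pixels)].reshape(img.shape)
plt.imshow(quantized)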
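
# As a starting point for the TODO above, the three listed methods share the usual estimator interface; the parameter values below are placeholders to tune, not recommendations. Note that in recent scikit-learn versions Ward clustering is available as AgglomerativeClustering with linkage='ward':

# In[ ]:

from sklearn.cluster import MeanShift, SpectralClustering, AgglomerativeClustering

#-mean shift estimates the number of clusters itself, so no n_clusters
ms = MeanShift().fit(X)
print(np.unique(ms.labels_))  # the cluster indices it found

#-spectral and Ward (agglomerative) clustering take n_clusters, like KMeans;
# both expose the assignments via fit_predict / labels_ rather than predict
sc_labels = SpectralClustering(n_clusters=n_classes).fit_predict(X)
ward_labels = AgglomerativeClustering(n_clusters=n_classes,
                                      linkage='ward').fit_predict(X)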