# -*- coding: utf-8 -*-
"""
Created on Tue Sep 30 17:30:39 2014

@author: vlad
"""

%load_ext autoreload
%autoreload 2

import os
os.chdir('/home/vlad/ownCloud/Work/Teach/Pattern recognition/code')

import numpy as np
from sklearn import datasets
from matplotlib import pyplot as plt

# check the help for the 'datasets.make_classification' function
X, y = datasets.make_classification(1000, n_features=2,
                                    n_informative=2, n_redundant=0,
                                    n_clusters_per_class=1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)

# In the following, the fitted linear models will contain .coef_ and
# .intercept_ attributes that you can inspect and use to plot the decision
# surface (a hyperplane); see the plotting sketch at the end of this file.

##########################
# 1. LDA:
# -train and classify
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# (in old scikit-learn versions this class lived in sklearn.lda)
# check the help of the LDA class
model = LDA()
model.fit(X[0:500, :], y[0:500])

# check the performance on the new (held-out) data:
yp = model.predict(X[500:, :])
np.mean(yp != y[500:])            # error rate on the last 500 samples

# transform the data into the maximally separating subspace
Xt = model.transform(X)
plt.hist([Xt[y == 0, 0], Xt[y == 1, 0]], bins=30, color=['blue', 'red'])

# check the fitted posterior probabilities:
prb = model.predict_proba(X)
plt.hist([prb[y == 0, 0], prb[y == 1, 0]], bins=30, color=['blue', 'red'])

# see how the class label is chosen:
df = model.decision_function(X)   # returns h(x) = w'x + w_0

## TODO:
## -Plot the transformed data separately for the training data and the testing data.
## -Check how the sign (use numpy.sign()) of df is mapped onto class labels
##  (a sketch is given at the end of this file).
## -Generate a higher-dimensional dataset (n_features=5) - call it differently -
##  and study:
##  --the optimal subspace (plot different pairs of coordinates)
##  --select to have more redundant or linearly dependent features and see
##    how the LD subspace changes

##########################
# 2. Logistic regression:
# -train and classify
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=1000)   # large values imply less penalty (weaker regularization)

#X, y = datasets.make_classification(1000, n_features=2,
#                                    n_informative=2, n_redundant=0,
#                                    n_clusters_per_class=1)
#plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.Paired)

model.fit(X[0:500, :], y[0:500])
yp = model.predict(X[500:, :])
np.mean(yp != y[500:])            # compare with LDA

# check the fitted posterior probabilities:
prb = model.predict_proba(X)
plt.hist([prb[y == 0, 0], prb[y == 1, 0]], bins=30, color=['blue', 'red'])

## TODO:
## -Check what the .transform() method does (recent scikit-learn versions have
##  removed it from LogisticRegression; see the note at the end of this file).
## -Generate a higher-dimensional space, with some correlation structure
##  between variables, and see how the subspace generated by .transform() changes.

############################
# 3. Linear SVM
# -train and classify
from sklearn.svm import LinearSVC
# check the documentation!
model = LinearSVC()
model.fit(X[0:500, :], y[0:500])
yp = model.predict(X[500:, :])
np.mean(yp != y[500:])            # compare with LDA and logistic regression

## TODO:
## -Use model.decision_function(X) * (2*y - 1) to get the margins
##  (the labels must be mapped from {0, 1} to {-1, +1} first).
## -Plot the margin density (histogram); change C (e.g. 0.1, 10, 1000)
##  and see how the margin distribution changes (sketch at the end of this file).
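
##########################
# Sketch: plotting the decision line from .coef_ and .intercept_
# A minimal sketch, not part of the exercises above: it assumes the 2-D
# dataset X, y generated at the top and any fitted binary linear model
# (LDA, LogisticRegression or LinearSVC). The helper name
# 'plot_decision_line' is illustrative, not a scikit-learn function.
def plot_decision_line(fitted_model, X, y):
    w = fitted_model.coef_.ravel()        # weight vector (w_1, w_2)
    w0 = fitted_model.intercept_[0]       # bias term
    xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
    # points on the boundary satisfy w_1*x_1 + w_2*x_2 + w_0 = 0
    ys = -(w[0] * xs + w0) / w[1]
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.plot(xs, ys, 'k-')
    plt.show()

# example: plot_decision_line(model, X, y)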
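
##########################
# Sketch: how the sign of the decision function maps onto class labels
# Assumes a fitted binary linear model ('model') and the data X from above;
# with classes coded 0/1, predict() should return 1 exactly where the
# decision function is positive.
df = model.decision_function(X)
yp = model.predict(X)
print(np.all((np.sign(df) > 0) == (yp == 1)))   # expected: True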
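
##########################
# Sketch: LDA subspace for a higher-dimensional dataset
# Illustrative only: the names X5, y5, lda5 are new. With 3 classes the LDA
# subspace has at most n_classes - 1 = 2 dimensions, so the transformed
# coordinates can be plotted directly; n_redundant controls how many features
# are linear combinations of the informative ones.
X5, y5 = datasets.make_classification(1000, n_features=5, n_informative=3,
                                      n_redundant=2, n_classes=3,
                                      n_clusters_per_class=1)
lda5 = LDA()
lda5.fit(X5[:500, :], y5[:500])
X5t = lda5.transform(X5)                  # at most 2 discriminant coordinates
plt.scatter(X5t[:, 0], X5t[:, 1], c=y5, cmap=plt.cm.Paired)
plt.show()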
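
##########################
# Note/sketch: coefficient-based feature selection with logistic regression
# In recent scikit-learn versions LogisticRegression no longer has a
# .transform() method; SelectFromModel provides a comparable, coefficient-based
# feature selection and is used here only as a stand-in (an assumption about
# what the original TODO was after). X5, y5 come from the sketch above.
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(LogisticRegression(C=1000)).fit(X5, y5)
X5sel = selector.transform(X5)            # keeps features with large |coef_|
print(X5sel.shape)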
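
##########################
# Sketch: margin distribution of the linear SVM for different C values
# A sketch assuming the margin of sample i is s_i * f(x_i), where
# s_i = 2*y_i - 1 maps the labels {0, 1} to {-1, +1} and f is the decision
# function of the fitted LinearSVC.
for C in [0.1, 10, 1000]:
    svc = LinearSVC(C=C)
    svc.fit(X[:500, :], y[:500])
    margins = svc.decision_function(X[500:, :]) * (2 * y[500:] - 1)
    plt.hist(margins, bins=30, alpha=0.5, label='C=%g' % C)
plt.legend()
plt.show()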