# -*- coding: utf-8 -*-
"""
Created on Sun Oct  5 18:35:20 2014

@author: vlad
"""

#########
## Objectives:
##
## - learn how to compute performance parameters
## - use cross-validation for performance estimation
## - compare the performance of two classifiers
##

import os
os.chdir('/home/vlad/ownCloud/Work/Teach/Pattern recognition/pr04/ex')

import numpy as np
from sklearn import datasets
from sklearn import metrics
from matplotlib import pylab as plt

# check the help for the 'datasets.make_classification' function
X, y = datasets.make_classification(1000, n_features=2, weights=[0.4, 0.6],
                                    n_informative=2, n_redundant=0,
                                    n_clusters_per_class=1, flip_y=0.01,
                                    shuffle=True)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)

# train a model (LDA) on the first 200 points:
from sklearn.lda import LDA

model = LDA()
model.fit(X[0:200, :], y[0:200])

# confusion matrix (true values are by rows, predicted values by columns)
cm_train = metrics.confusion_matrix(y[0:200], model.predict(X[0:200, :]))

### TODO ###
### Replace the "pass" code below with actual code computing the parameters.
### One possible solution is filled in; it assumes the sklearn convention
### above (true class by rows, predicted by columns, class 1 = "positive"),
### so C[0,0] = TN, C[0,1] = FP, C[1,0] = FN and C[1,1] = TP.
###
def perf(C, param="TPF"):
    """
    Simple function for computing several performance parameters, based
    on a 2x2 confusion matrix (binary classification case).

    Usage:
        p = perf(C, param="TPF")

    Parameters:
        C: numpy.ndarray
            2x2 confusion matrix with counts
        param: string
            which parameter to compute. Default: "TPF". Possible values:
            "TPF", "FPF", "Err", "Sensitivity", "Specificity", "NPV", "PPV".

    Return:
        float
            the value of the parameter of interest.
    """
    if C.shape != (2, 2):
        raise ValueError("The confusion matrix must be a 2x2 matrix.")

    param = param.lower()

    # advanced coding: use dictionaries instead of "if-elif-else"
    if param == "tpf":
        # true positive fraction: TP / (TP + FN)
        return float(C[1, 1]) / float(C[1, 0] + C[1, 1])
    elif param == "fpf":
        # false positive fraction: FP / (FP + TN)
        return float(C[0, 1]) / float(C[0, 0] + C[0, 1])
    elif param == "err":
        # error rate: (FP + FN) / total count
        return float(C[0, 1] + C[1, 0]) / float(C.sum())
    elif param == "sensitivity":
        # sensitivity is another name for the TPF
        return float(C[1, 1]) / float(C[1, 0] + C[1, 1])
    elif param == "specificity":
        # specificity: TN / (TN + FP) = 1 - FPF
        return float(C[0, 0]) / float(C[0, 0] + C[0, 1])
    elif param == "npv":
        # negative predictive value: TN / (TN + FN)
        return float(C[0, 0]) / float(C[0, 0] + C[1, 0])
    elif param == "ppv":
        # positive predictive value: TP / (TP + FP)
        return float(C[1, 1]) / float(C[1, 1] + C[0, 1])
    else:
        raise ValueError("Unknown parameter.")

## ROC, AUC:
z = model.predict_proba(X[200:1000, :])[:, 1]  # keep the posteriors for class 1 only
fpf, tpf, th = metrics.roc_curve(y[200:1000], z)

plt.plot(fpf, tpf, lw=3, label='ROC')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Fraction')
plt.ylabel('True Positive Fraction')
plt.title('ROC')

## ...and the AUC:
metrics.roc_auc_score(y[200:1000], z)

## Cross-validation
## have a look at
## http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cross_validation
from sklearn import cross_validation

X, y = datasets.make_classification(1000, n_features=2, weights=[0.4, 0.6],
                                    n_informative=2, n_redundant=0,
                                    n_clusters_per_class=1, flip_y=0.01,
                                    shuffle=True)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)

K = 5
kf = cross_validation.KFold(X.shape[0], n_folds=K)
tpf = np.zeros(K)  # one TPF estimate per fold
i = 0
for train_index, test_index in kf:
    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]
    model.fit(X_train, y_train)
    yp = model.predict(X_test)
    cm = metrics.confusion_matrix(y_test, yp)
    tpf[i] = perf(cm, "TPF")
    i += 1

print("Estimated TPF: ", np.mean(tpf))
print("Std. dev: ", np.std(tpf))
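
## A minimal cross-check sketch (my addition, not part of the original
## exercise): the explicit fold loop above can also be written with sklearn's
## cross_val_score helper. TPF is not a built-in scorer, so this sketch
## estimates the AUC (scoring='roc_auc') instead.
auc_cv = cross_validation.cross_val_score(model, X, y, cv=K, scoring='roc_auc')
print("CV-estimated AUC: ", np.mean(auc_cv), "+/-", np.std(auc_cv))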
dev: ", np.std(tpf)) ### TODO ### ### Implement Agresti-Coull approximation def prop_CI(p, n, alpha=0.05, method="normal"): from scipy import stats from scipy.stats import norm method = method.lower() ci_lo = ci_hi = 0.0 if method == "normal": # normal approximation for binomial CIs z = norm.ppf(1-alpha/2) # quantile ci_lo = p - z*np.sqrt(p*(1-p)/n) ci_hi = p + z*np.sqrt(p*(1-p)/n) elif method == "agresti": pass else: raise ValueError("Unknown method.") return ci_lo, ci_hi ### TODO ### ### Use the CV above to estimate error rate and then produce ### different CI for the result. ## More advanced example. from ## http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html#example-plot-roc-crossval-py import numpy as np from scipy import interp import matplotlib.pyplot as plt from sklearn import svm, datasets from sklearn.metrics import roc_curve, auc from sklearn.cross_validation import StratifiedKFold ############################################################################### # Data IO and generation # import some data to play with iris = datasets.load_iris() X = iris.data y = iris.target X, y = X[y != 2], y[y != 2] n_samples, n_features = X.shape # Add noisy features random_state = np.random.RandomState(0) X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] ############################################################################### # Classification and ROC analysis # Run classifier with cross-validation and plot ROC curves cv = StratifiedKFold(y, n_folds=6) classifier = svm.SVC(kernel='linear', probability=True, random_state=random_state) mean_tpr = 0.0 mean_fpr = np.linspace(0, 1, 100) all_tpr = [] for i, (train, test) in enumerate(cv): probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test]) # Compute ROC curve and area the curve fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 roc_auc = auc(fpr, tpr) plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc)) plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck') mean_tpr /= len(cv) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2) plt.xlim([-0.05, 1.05]) plt.ylim([-0.05, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right") plt.show() ### TODO ### ### Train a linear SVM and an LDA on each partition of a 5-fold CV and ### recurd the predicted values for the left-out samples. Build the ### McNemar contingency table and compute the chi^2. ### ### Use scipy.stats.chi2(1).ppf(1-alpha/2) to obtain the quantiles for ### chi^2 distribution with 1 degree of freedom. ### ### ### TODO ### ### Simulate a large data set (say 10000 samples) and estimate the ### performance (TPF, FPF) of some classifier (your choice) by ### 10x5-CV on modeling sets of sizes 100, 500, 1000,..., 5000. ### For each estimate provide 95% CI, but using estimated standard ### deviation divided by sqrt(n), instead of sqrt(p(1-p)n). ### Compare the estimates with observed performance on 5000 samples ### not used in the modeling (and estimate 95% CI again).