# -*- coding: utf-8 -*-
"""
Created on Sun Oct  5 18:35:20 2014

@author: vlad
"""
#########
## Objectives:
##
## -learn how to compute performance parameters
## -use cross-validation for performance estimation
## -compare the performance of two classifiers
##

import os
os.chdir('/home/vlad/ownCloud/Work/Teach/Pattern recognition/pr04/ex')

import numpy as np
from sklearn import datasets
from sklearn import metrics

from matplotlib import pylab as plt

# check the help for 'datasets.make_classification' function
X, y = datasets.make_classification(1000, n_features=2, weights=[0.4,0.6],
                                    n_informative=2, n_redundant=0, 
                                    n_clusters_per_class=1, flip_y=0.01,
                                    shuffle=True)
                                    
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.Paired)     


# train a model (LDA) on 200 points:
from sklearn.lda import LDA

model = LDA()
model.fit(X[0:200,:], y[0:200])

# confusion matrix (true values are by rows, predicted by column)
cm_train = metrics.confusion_matrix(y[0:200], model.predict(X[0:200,:]))


### TODO
###
### replace the "pass" code below with actual code computing the parameters
###
def perf(C, param="TPF"):
    """
    Simple function for computing several performance parameters, based
    on a 2x2 confusion matrix (binary classification case).
    
    Usage:
        p = perf(C, param="TFP")
       
    Parameters:
        C: numpy.ndarray
            2x2 confusion matrix with counts
        param: string
            what parameter to compute. Default: "TPF". Possible values:
            "TPF", "FPF", "Err", "Sensitivity", "Specificity", "NPV", "PPV".
            
    Return:
        float
            the value of the parameter of interest.
    """
    if C.shape != (2,2):
        raise ValueError("The confusion matrix must be a 2x2 matrix.")
        
    param = param.lower()
    
    # advanced coding: use dictionaries instead of "if-elif-else"
    if param == "tpf":
        return float(C[1,1]) / float(C[1,0]+C[1,1])
    elif param == "fpf":
        pass
    elif param == "err":
        pass
    elif param == "sensitivity":
        pass
    elif param == "specificity":
        pass
    elif param == "npv":
        pass
    elif param == "ppv":
        pass
    else:
        raise ValueError("Unknown parameter.")
        
    return
    
    
## ROC, AUC:
z = model.predict_proba(X[200:1000,:])[:,1]  # take the posteriors for only 1 class
fpf, tpf, th = metrics.roc_curve(y[200:1000], z)
plt.plot(fpf, tpf, lw=3, label='ROC')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Fraction')
plt.ylabel('True Positive Fraction')
plt.title('ROC')

## ...and the AUC
metrics.roc_auc_score(y[200:1000], z)


## Cross-validation
## have a look at
## http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cross_validation
from sklearn import cross_validation

X, y = datasets.make_classification(1000, n_features=2, weights=[0.4,0.6],
                                    n_informative=2, n_redundant=0, 
                                    n_clusters_per_class=1, flip_y=0.01,
                                    shuffle=True)
                                    
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.Paired)     

K = 5
kf = cross_validation.KFold(X.shape[0], n_folds=K)

tpf = np.zeros(5)
i = 0
for train_index, test_index in kf:
    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test   = X[test_index,:], y[test_index]
    
    model.fit(X_train, y_train)
    yp = model.predict(X_test)
    
    cm = metrics.confusion_matrix(y_test, yp)
    tpf[i] = perf(cm, "TPF")
    i += 1
    
print("Estimated TPF: ", np.mean(tpf))
print("Std. dev: ", np.std(tpf))

### TODO
###
### Implement Agresti-Coull approximation
def prop_CI(p, n, alpha=0.05, method="normal"):
    from scipy import stats
    from scipy.stats import norm
    
    method = method.lower()
    
    ci_lo = ci_hi = 0.0
    
    if method == "normal":
        # normal approximation for binomial CIs
        z = norm.ppf(1-alpha/2)  # quantile
        ci_lo = p - z*np.sqrt(p*(1-p)/n)
        ci_hi = p + z*np.sqrt(p*(1-p)/n)
    elif method == "agresti":
        pass
    else: 
        raise ValueError("Unknown method.")
        
    return ci_lo, ci_hi
    
### TODO
###
### Use the CV above to estimate error rate and then produce
### different CI for the result.
    

## More advanced example. from
## http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html#example-plot-roc-crossval-py
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold

###############################################################################
# Data IO and generation

# import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# Add noisy features
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

###############################################################################
# Classification and ROC analysis

# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(y, n_folds=6)
classifier = svm.SVC(kernel='linear', probability=True,
                     random_state=random_state)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()


### TODO
###
### Train a linear SVM and an LDA on each partition of a 5-fold CV and 
### recurd the predicted values for the left-out samples. Build the
### McNemar contingency table and compute the chi^2.
###
### Use scipy.stats.chi2(1).ppf(1-alpha/2) to obtain the quantiles for 
### chi^2 distribution with 1 degree of freedom.
###
###
### TODO
###
### Simulate a large data set (say 10000 samples) and estimate the
### performance (TPF, FPF) of some classifier (your choice) by 
### 10x5-CV on modeling sets of sizes 100, 500, 1000,..., 5000. 
### For each estimate provide 95% CI, but using estimated standard
### deviation divided by sqrt(n), instead of sqrt(p(1-p)n). 
### Compare the estimates with observed performance on 5000 samples 
### not used in the modeling (and estimate 95% CI again).