import numpy as np
import matplotlib.pyplot as plt

####### Gene expression data - real world...
### Download data file mda.zip from IS (sources/mda.zip)

Xtr = np.load('X-train.npy')    # 22283 variables, 130 observations
Ytr = np.load('Y-train.npy')    # Ytr[:,0] - ER positive; Ytr[:,1] - pCR
Ytr = Ytr.astype('int32')       # make sure the labels are INTs

Xts = np.load('X-test.npy')     # 22283 variables, 100 observations
Yts = np.load('Y-test.npy')     # Yts[:,0] - ER positive; Yts[:,1] - pCR
Yts = Yts.astype('int32')       # make sure the labels are INTs

####### 1. AdaBoost
#### Read the docs:
####    http://scikit-learn.org/stable/modules/ensemble.html
####
#### and have a look at the examples:
####    http://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_hastie_10_2.html
####    http://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_twoclass.html

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import zero_one_loss

#### TODO:
#### Run the examples below, and try other base learners: e.g. a decision
#### tree with 2 levels (a sketch follows the two examples below)...

###### Classical AdaBoost: Discrete AdaBoost algorithm
## Weak learner: decision stumps
T = 200
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=T, algorithm='SAMME')

# fit the model (as usual)
bdt.fit(Xtr, Ytr[:,0])

# The result of AdaBoost with decision stumps can be analyzed
# to find the most important variables from the data set:
np.where(bdt.feature_importances_ > 0.01)
# gives the indexes of variables with importance score
# higher than a threshold (0.01)

# Get the errors, per step:
# - train error
err_tr = np.zeros((T,))
for i, yp in enumerate(bdt.staged_predict(Xtr)):
    err_tr[i] = zero_one_loss(yp, Ytr[:,0])

# - test error
err_ts = np.zeros((T,))
for i, yp in enumerate(bdt.staged_predict(Xts)):
    err_ts[i] = zero_one_loss(yp, Yts[:,0])

fig = plt.figure()
ax = plt.subplot(111)
ax.set_ylim(-0.001, 0.5)
ax.set_xlim(0, T+2)
ax.plot(np.arange(T)+1, err_tr, color='blue')   # train error (blue)
ax.plot(np.arange(T)+1, err_ts, color='red')    # test error (red)
plt.show()

###### Real AdaBoost
## Weak learner: decision stumps
T = 200
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=T, algorithm='SAMME.R')

# fit the model (as usual)
bdt.fit(Xtr, Ytr[:,0])

# The result of AdaBoost with decision stumps can be analyzed
# to find the most important variables from the data set:
np.where(bdt.feature_importances_ > 0.01)
# gives the indexes of variables with importance score
# higher than a threshold (0.01)

# Get the errors, per step:
# - train error
err_tr = np.zeros((T,))
for i, yp in enumerate(bdt.staged_predict(Xtr)):
    err_tr[i] = zero_one_loss(yp, Ytr[:,0])

# - test error
err_ts = np.zeros((T,))
for i, yp in enumerate(bdt.staged_predict(Xts)):
    err_ts[i] = zero_one_loss(yp, Yts[:,0])

fig = plt.figure()
ax = plt.subplot(111)
ax.set_ylim(-0.001, 0.5)
ax.set_xlim(0, T+2)
ax.plot(np.arange(T)+1, err_tr, color='blue')   # train error (blue)
ax.plot(np.arange(T)+1, err_ts, color='red')    # test error (red)
plt.show()
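#### One possible take on the TODO above -- a sketch only, not a prescribed
#### solution: the same Discrete AdaBoost run, but with 2-level decision
#### trees (max_depth=2) as the base learner. The names bdt2, err_tr2 and
#### err_ts2 are introduced here just for this illustration; data, T and
#### the error curves are reused from the examples above.
bdt2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                          n_estimators=T, algorithm='SAMME')
bdt2.fit(Xtr, Ytr[:,0])

# staged train/test error, exactly as before
err_tr2 = np.zeros((T,))
for i, yp in enumerate(bdt2.staged_predict(Xtr)):
    err_tr2[i] = zero_one_loss(yp, Ytr[:,0])

err_ts2 = np.zeros((T,))
for i, yp in enumerate(bdt2.staged_predict(Xts)):
    err_ts2[i] = zero_one_loss(yp, Yts[:,0])

fig = plt.figure()
ax = plt.subplot(111)
ax.set_ylim(-0.001, 0.5)
ax.set_xlim(0, T+2)
ax.plot(np.arange(T)+1, err_tr2, color='blue')  # train error (blue)
ax.plot(np.arange(T)+1, err_ts2, color='red')   # test error (red)
plt.show()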
######### 2. Random Forest
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)
clf.fit(Xtr, Ytr[:,0])

## Train error:
zero_one_loss(clf.predict(Xtr), Ytr[:,0])
## Test error:
zero_one_loss(clf.predict(Xts), Yts[:,0])

# Other parameters
clf = RandomForestClassifier(n_estimators=10, max_depth=2)
clf.fit(Xtr, Ytr[:,0])

## Train error:
zero_one_loss(clf.predict(Xtr), Ytr[:,0])
## Test error:
zero_one_loss(clf.predict(Xts), Yts[:,0])

#### TODO:
####  - What can you say about the error rate on the test set in the 2nd
####    example (with respect to the 1st example)?
####  - Try other parameter combinations (one possible sketch below)...
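#### One possible way to explore the second TODO item -- a sketch, assuming
#### an arbitrary small grid of parameter combinations; the grid itself and
#### the settings max_features='sqrt' and random_state=0 are illustrative
#### choices, not part of the original exercise.
for n, d in [(10, None), (50, None), (50, 2), (200, 2), (200, 5)]:
    clf = RandomForestClassifier(n_estimators=n, max_depth=d,
                                 max_features='sqrt', random_state=0)
    clf.fit(Xtr, Ytr[:,0])
    err_tr = zero_one_loss(clf.predict(Xtr), Ytr[:,0])
    err_ts = zero_one_loss(clf.predict(Xts), Yts[:,0])
    print('n_estimators=%s, max_depth=%s: train error=%.3f, test error=%.3f'
          % (n, d, err_tr, err_ts))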