import numpy as np
import matplotlib.pyplot as plt

####### Gene expression data - real world...
### Download data file mda.zip from IS (sources/mda.zip)

Xtr = np.load('X-train.npy')    # 22283 variables, 130 observations
Ytr = np.load('Y-train.npy')    # Ytr[:,0] - ER positive; Ytr[:,1] - pCR
Ytr = Ytr.astype('int32')       # make sure the labels are INTs

Xts = np.load('X-test.npy')     # 22283 variables, 100 observations
Yts = np.load('Y-test.npy')     # Yts[:,0] - ER positive; Yts[:,1] - pCR
Yts = Yts.astype('int32')       # make sure the labels are INTs

####### 1. AdaBoost
#### Read the docs:
####    http://scikit-learn.org/stable/modules/ensemble.html
####
#### and have a look at the examples:
####    http://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_hastie_10_2.html
####    http://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_twoclass.html

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import zero_one_loss

#### TODO:
#### Run the examples below, and try other base learners: e.g. a decision
#### tree with 2 levels (a sketch follows the two examples below)...

###### Classical AdaBoost: Discrete AdaBoost algorithm
## Weak learner: decision stumps
T = 200
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=T, algorithm='SAMME')

# fit the model (as usual)
bdt.fit(Xtr, Ytr[:,0])

# The result of AdaBoost with decision stumps can be analyzed
# to find the most important variables from the data set:
np.where(bdt.feature_importances_ > 0.01)
# gives the indexes of variables with importance score
# higher than a threshold (0.01)

# Get the errors, per step:
# - train error
err_tr = np.zeros((T,))
for i, yp in enumerate(bdt.staged_predict(Xtr)):
    err_tr[i] = zero_one_loss(yp, Ytr[:,0])

# - test error
err_ts = np.zeros((T,))
for i, yp in enumerate(bdt.staged_predict(Xts)):
    err_ts[i] = zero_one_loss(yp, Yts[:,0])

fig = plt.figure()
ax = plt.subplot(111)
ax.set_ylim(-0.001, 0.5)
ax.set_xlim(0, T+2)
ax.plot(np.arange(T)+1, err_tr, color='blue')   # train error (blue)
ax.plot(np.arange(T)+1, err_ts, color='red')    # test error (red)
plt.show()

###### Real AdaBoost
## Weak learner: decision stumps
T = 200
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=T, algorithm='SAMME.R')

# fit the model (as usual)
bdt.fit(Xtr, Ytr[:,0])

# The result of AdaBoost with decision stumps can be analyzed
# to find the most important variables from the data set:
np.where(bdt.feature_importances_ > 0.01)
# gives the indexes of variables with importance score
# higher than a threshold (0.01)

# Get the errors, per step:
# - train error
err_tr = np.zeros((T,))
for i, yp in enumerate(bdt.staged_predict(Xtr)):
    err_tr[i] = zero_one_loss(yp, Ytr[:,0])

# - test error
err_ts = np.zeros((T,))
for i, yp in enumerate(bdt.staged_predict(Xts)):
    err_ts[i] = zero_one_loss(yp, Yts[:,0])

fig = plt.figure()
ax = plt.subplot(111)
ax.set_ylim(-0.001, 0.5)
ax.set_xlim(0, T+2)
ax.plot(np.arange(T)+1, err_tr, color='blue')   # train error (blue)
ax.plot(np.arange(T)+1, err_ts, color='red')    # test error (red)
plt.show()
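#### One possible take on the TODO above -- a sketch only, not a prescribed
#### solution: the same Discrete AdaBoost run, but with 2-level decision
#### trees (max_depth=2) as the base learner. The names bdt2, err_tr2 and
#### err_ts2 are introduced here just for this illustration; data, T and
#### the error curves are reused from the examples above.
bdt2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                          n_estimators=T, algorithm='SAMME')
bdt2.fit(Xtr, Ytr[:,0])

# staged train/test error, exactly as before
err_tr2 = np.zeros((T,))
for i, yp in enumerate(bdt2.staged_predict(Xtr)):
    err_tr2[i] = zero_one_loss(yp, Ytr[:,0])

err_ts2 = np.zeros((T,))
for i, yp in enumerate(bdt2.staged_predict(Xts)):
    err_ts2[i] = zero_one_loss(yp, Yts[:,0])

fig = plt.figure()
ax = plt.subplot(111)
ax.set_ylim(-0.001, 0.5)
ax.set_xlim(0, T+2)
ax.plot(np.arange(T)+1, err_tr2, color='blue')  # train error (blue)
ax.plot(np.arange(T)+1, err_ts2, color='red')   # test error (red)
plt.show()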
######### 2. Random Forest
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)
clf.fit(Xtr, Ytr[:,0])

## Train error:
zero_one_loss(clf.predict(Xtr), Ytr[:,0])
## Test error:
zero_one_loss(clf.predict(Xts), Yts[:,0])

# Other parameters
clf = RandomForestClassifier(n_estimators=10, max_depth=2)
clf.fit(Xtr, Ytr[:,0])

## Train error:
zero_one_loss(clf.predict(Xtr), Ytr[:,0])
## Test error:
zero_one_loss(clf.predict(Xts), Yts[:,0])

#### TODO:
####  - What can you say about the error rate on the test set in the 2nd
####    example (with respect to the 1st example)?
####  - Try other parameter combinations (one possible sketch below)...
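#### One possible way to explore the second TODO item -- a sketch, assuming
#### an arbitrary small grid of parameter combinations; the grid itself and
#### the settings max_features='sqrt' and random_state=0 are illustrative
#### choices, not part of the original exercise.
for n, d in [(10, None), (50, None), (50, 2), (200, 2), (200, 5)]:
    clf = RandomForestClassifier(n_estimators=n, max_depth=d,
                                 max_features='sqrt', random_state=0)
    clf.fit(Xtr, Ytr[:,0])
    err_tr = zero_one_loss(clf.predict(Xtr), Ytr[:,0])
    err_ts = zero_one_loss(clf.predict(Xts), Yts[:,0])
    print('n_estimators=%s, max_depth=%s: train error=%.3f, test error=%.3f'
          % (n, d, err_tr, err_ts))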