%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import datasets
from sklearn.calibration import CalibratedClassifierCV
from tqdm import tqdm
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
We want to control the error rate for a classification task to a prespecified error rate $\epsilon$ by refusing to make predictions on some inputs. In the following, we will use two different approaches to this problem:
1. Probability calibration: We calibrate our classifier using the existing probability calibration tools (sigmoid and isotonic) in the scikit-learn package and decide whether to refuse or predict based on the calibrated probability estimates.
2. Conjugate prediction: We calibrate a threshold instead of probability distributions: we find an appropriate threshold value (which we call the "acceptance threshold") on calibration data and refuse to predict on a test point when no label's estimated probability is above the threshold.
Given training data $(X_1,Y_1), \ldots, (X_n,Y_n) \sim \mathcal{D}^n$ for some fixed but unknown $\mathcal{D}$ and a tolerance level $\epsilon \in (0,1)$, we want to train a selective classifier $C:\mathcal{X}\rightarrow \mathcal{Y} \cup \{\text{refuse}\}$ such that \begin{eqnarray} P\left(C(X) \neq Y ~ | ~ C(X) \neq \text{refuse} \right) & \leq & \epsilon \end{eqnarray} where $(X,Y) \sim \mathcal{D}$ is independent of the training data. Intuitively, the error rate on the data points for which we do make a prediction is at most $\epsilon$.
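As a small sketch of the quantity we will track empirically (the boolean arrays err_demo and ref_demo below are made-up per-point error and refusal flags, in the same spirit as the evaluation code later in this notebook):
# SKETCH: EMPIRICAL COUNTERPART OF THE GUARANTEE, ON MADE-UP ERROR/REFUSAL FLAGS
err_demo = np.array([True, False, False, True, False])   # PREDICTION WAS WRONG
ref_demo = np.array([True, False, True, False, False])   # PREDICTION WAS REFUSED
selective_error = np.sum(err_demo & ~ref_demo) / np.sum(~ref_demo)  # ERROR RATE GIVEN THAT WE PREDICTED
refusal_rate = np.mean(ref_demo)
print(selective_error, refusal_rate)  # 0.333... AND 0.4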
# LOAD THE DATA
# MNIST DATA SET (MULTI-CLASS)
digits = datasets.fetch_mldata('mnist-original')
X, y = digits.data, digits.target
y = y.astype(int)
# THE OTHER DATA-SETS
# MAKE SURE THE LABELS ARE INTEGERS STARTING FROM 0.
# COD
#cod = datasets.fetch_mldata('cod-rna')
#X, y = cod.data, cod.target
#y = (y + 1)/2
#y = y.astype(int)
## CONNECT
#connect = datasets.fetch_mldata('connect-4')
#X, y = connect.data, connect.target
#y = (y +1)/2
#y = y.astype(int)
## COVER
#covtype = datasets.fetch_covtype()
#X, y = covtype.data, covtype.target
## LETTER
#letter = datasets.fetch_mldata('letter')
#X, y = letter.data, letter.target
#y = y - 1
#y = y.astype(int)
## SATI
#sati = datasets.fetch_mldata('satimage')
#X, y = sati.data, sati.target
#y = y - 1
#y = y.astype(int)
## SENSIT
#sensit = datasets.fetch_mldata('sensit-vehicle-combined')
#X, y = sensit.data, sensit.target
#y = y - 1
#y = y.astype(int)
# SPLIT THE DATA TRAIN:TEST 3:1
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.75)
# COMPUTE THE BASE ERROR LEVEL (FOR DEMONSTRATION)
classifier = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, oob_score=False, n_jobs=-1)
classifier.fit(train_X,train_y)
epsilon_0 = 1 - classifier.score(test_X,test_y)
# CHOOSE THE TARGET ERROR RATE AS 0.01
epsilon = 0.01
print(epsilon_0)
In this section we first approach the problem by employing probability calibration methods from Scikit-Learn. For detailed information about these methods we refer to the documentation of the calibration module and the references therein.
Most ML algorithms compute probability estimates as well as simple point predictions, i.e. they estimate $\hat{P}\left(Y~|~X\right)$ and then make the prediction $\hat{Y} = \mathrm{argmax}_{y}~\hat{P}\left(y~|~X\right)$. In scikit-learn these probability estimates can be obtained via the predict_proba method for many of the classifiers.
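As a quick sketch (using the random forest fitted above; predict_proba and classes_ are standard scikit-learn attributes), the point prediction is simply the argmax of the estimated class probabilities, up to tie-breaking:
# SKETCH: predict() COINCIDES WITH THE ARGMAX OF predict_proba() (UP TO TIE-BREAKING)
proba = classifier.predict_proba(test_X[:5])                 # SHAPE (5, n_classes); EACH ROW SUMS TO 1
argmax_pred = classifier.classes_[np.argmax(proba, axis=1)]
print(proba)
print(argmax_pred, classifier.predict(test_X[:5]))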
Calibration is needed because machine learning tools often give inaccurate probability estimates (usually systematically under- or over-confident), so scikit-learn provides calibration to correct for those inaccuracies. We will show later that these corrections aren't always, well, correct.
Probability calibration is typically implemented as learning an appropriate monotonic transformation of the estimated probabilities from a hold-out set (the "calibration set" from now on). The goal is to find a transformation such that, among the calibration points with transformed probability $x$, roughly a fraction $x$ are correctly labelled.
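As a small illustration of this goal (a sketch on a synthetic binary problem, since scikit-learn's calibration_curve expects binary labels; the dataset and forest below are made up for the plot), a reliability diagram compares the mean predicted probability in each bin against the observed fraction of positives:
# SKETCH: RELIABILITY DIAGRAM ON A SYNTHETIC BINARY PROBLEM (ASSUMED SETUP, FOR ILLUSTRATION ONLY)
from sklearn.calibration import calibration_curve
Xb, yb = datasets.make_classification(n_samples=2000, random_state=0)
rf_b = RandomForestClassifier(n_estimators=100, n_jobs=-1).fit(Xb[:1000], yb[:1000])
prob_pos = rf_b.predict_proba(Xb[1000:])[:, 1]
frac_pos, mean_pred = calibration_curve(yb[1000:], prob_pos, n_bins=10)
plt.plot(mean_pred, frac_pos, 'o-', label="random forest")
plt.plot([0, 1], [0, 1], '--k', label="perfectly calibrated")
plt.xlabel("Mean predicted probability"); plt.ylabel("Fraction of positives"); plt.legend(loc="best")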
In this demo, we will use two of the most common calibration techniques: Platt's regression and isotonic regression. The main difference between the two is that Platt's regression assumes the transformation has a sigmoid shape (i.e., $f(x) = (1+e^{ax+b})^{-1}$) and infers the parameters $a$ and $b$, whereas isotonic regression is non-parametric; in particular, it fits a monotone piecewise-constant function.
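The shapes of the two transformations can be sketched directly on toy (score, outcome) pairs; this is only an illustration of the two function classes, not of what CalibratedClassifierCV does internally, and the data below are made up. Platt scaling amounts to a one-dimensional logistic regression on the score, while isotonic regression fits a monotone step function.
# SKETCH: SIGMOID (PLATT) VS ISOTONIC FITS ON TOY, OVER-CONFIDENT SCORES (ASSUMED DATA, FOR ILLUSTRATION)
from sklearn.isotonic import IsotonicRegression
rng = np.random.RandomState(0)
scores = rng.uniform(size=500)                                # UNCALIBRATED SCORES IN [0, 1]
outcomes = (rng.uniform(size=500) < scores**2).astype(int)    # TRUE PROBABILITY IS score^2, SO THE SCORES ARE OVER-CONFIDENT
platt = LogisticRegression().fit(scores.reshape(-1, 1), outcomes)
iso = IsotonicRegression(out_of_bounds='clip').fit(scores, outcomes)
grid = np.linspace(0, 1, 100)
plt.plot(grid, platt.predict_proba(grid.reshape(-1, 1))[:, 1], label="sigmoid (Platt)")
plt.plot(grid, iso.predict(grid), label="isotonic")
plt.plot(grid, grid**2, '--k', label="true probability")
plt.xlabel("Uncalibrated score"); plt.legend(loc="best")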
In the following, we first split the training data 2:1 into a core training set and a calibration set, train a random forest classifier on the core training set, and calibrate that classifier using the calibration set.
# SPLIT THE TRAINING DATA AS THE CORE TRAINING AND CALIBRATION SETS (2:1)
core_X, cal_X, core_y, cal_y = train_test_split(train_X, train_y, train_size=0.66)
n_core, n_features = core_X.shape
# TRAIN YOUR CLASSIFIER ON THE CORE SET
classifier = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, oob_score=False, n_jobs=-1)
## ALTERNATIVE CLASSIFIERS
## NOTE THE FOLLOWING CODE SIMPLY USES THE DEFAULT SETTING FOR THE HYPER PARAMETERS,
## ONE CAN CHOOSE BETTER PARAMETERS WITH VALIDATION FOR LOWER REFUSE RATES.
# classifier = svm.SVC()
# classifier = KNeighborsClassifier(n_neighbors=10)
# classifier = LogisticRegression()
# TRAIN THE CLASSIFIER
classifier.fit(core_X, core_y)
Up to here all three methods (isotonic, sigmoid, and our method) do the same thing. Below we use sigmoid and isotonic (the current Scikit-Learn methods).
# CALIBRATE THE CLASSIFIER
classifier_sigmoid = CalibratedClassifierCV(base_estimator=classifier, method='sigmoid', cv='prefit')
classifier_sigmoid.fit(cal_X, cal_y)
classifier_isotonic = CalibratedClassifierCV(base_estimator=classifier, method='isotonic', cv='prefit')
classifier_isotonic.fit(cal_X, cal_y)
Next, we predict the labels of the test points while refusing whenever the classifier is not confident enough. To quantify confidence, we plug the calibrated probabilities into Chow's rule (once with the sigmoid calibration and once with the isotonic one): predict $\hat{Y} = \mathrm{argmax}_{y}~\hat{P}\left(y~|~X\right)$ if $\max_{y}~\hat{P}\left(y~|~X\right) \geq 1-\epsilon$, and refuse otherwise.
Note that, with perfect estimation of the posteriors, this method would give a stronger guarantee than the conjugate approach: in particular, it would also control the error rate conditioned on the test object $X$.
# UNDERLYING ALGORITHM (e.g. RANDOM FORESTS) PREDICTS THE LABELS FOR THE TEST SET
pred = classifier.predict(test_X)
err = pred != test_y
# NOW FILTER OUT REFUSED PREDICTIONS.
# PROBABILITY CALIBRATION (Maximum Likelihood)
# REFUSE PREDICTIONS IF NO LABEL HAS CONFIDENCE MORE THAN 1-epsilon
ref_sigmoid = (classifier_sigmoid.predict_proba(test_X) >= 1-epsilon).sum(axis=1) == 0
ref_isotonic = (classifier_isotonic.predict_proba(test_X) >= 1-epsilon).sum(axis=1) == 0
print("MNIST")
print("Dimensions:\t\t\t\t" ,X.shape, "\nNumber of Labels:\t\t\t",10, "\nError rate on all predictions:\t\t {:0.4f}".format(np.mean(err)))
print("\nPLATT'S REGRESSION ")
print("Error Rate on non-refused predictions:\t{:0.4f}".format(np.sum(err*(1-ref_sigmoid))/np.sum(1-ref_sigmoid)))
print("Refuse Rate:\t{:0.4f}".format(np.mean(ref_sigmoid)))
print("\nISOTONIC REGRESSION ")
print("Error Rate on non-refused predictions:\t{:0.4f}".format(np.sum(err*(1-ref_isotonic))/np.sum(1-ref_isotonic)))
print("Refuse Rate:\t{:0.4f}".format(np.mean(ref_isotonic)))
For now, observe that the error rates are well below our target rate (1%) and, as a result, the refusal rates are unnecessarily high.
Instead of trying to calibrate the probabilities, we infer an acceptance threshold $\alpha$ directly from the errors on the calibration set. That is:
Calibration Step: For a given $\alpha$, refuse whenever $\hat{P}\left(y~|~X\right) \leq \alpha$ for all labels. Operationally, we start from the minimum $\alpha$ ($=0$, for which we refuse no data point, so our error rate equals the error rate on the whole calibration set) and increase it until the error rate on the non-refused calibration points satisfies the condition below.
Mathematically: increase $\alpha$ until $$ \frac{E_\alpha + 1 }{E_\alpha + C_\alpha + 1 } < \epsilon \frac{1+ 1/E_0}{1 + 1/N_{cal}} $$ where $E_x$ and $C_x$ denote the number of errors and correct predictions, respectively, among the non-refused calibration points when $\alpha = x$, and $N_{cal}$ is the size of the calibration set.
We look for the minimum such $\alpha$ because it yields the smallest number of refusals.
def conj_calibrate(C, Xcal, ycal, eps):
    # COMPUTE THE ACCEPTANCE THRESHOLD FOR CLASSIFIER C, CALIBRATION DATA (Xcal, ycal) AND TARGET RATE eps
    # NOTE: WORKS ONLY FOR CLASSIFIERS THAT SUPPORT predict_proba()
    A = C.predict_proba(Xcal)
    pred_cal = C.predict(Xcal)
    n_cal = Xcal.shape[0]
    # THE POTENTIAL THRESHOLD POINTS: THE MAXIMUM ESTIMATED PROBABILITY OF EACH CALIBRATION POINT
    pt = A.max(axis=1)
    # FIND THE SMALLEST THRESHOLD THAT SATISFIES THE CONDITION ABOVE
    a = np.argsort(pt)
    errors = np.sum(pred_cal != ycal) + 1   # ERRORS ON THE WHOLE CALIBRATION SET, PLUS ONE
    eps_ = eps * (1 + 1/errors) / (1 + 1/n_cal)
    predictions = n_cal + 1                 # PREDICTIONS BEFORE ANY REFUSALS, PLUS ONE
    threshold = 1
    if errors/predictions > eps_:
        for i in a:  # EACH STEP RAISES THE THRESHOLD, REFUSING THE NEXT LEAST CONFIDENT CALIBRATION POINT
            if pred_cal[i] != ycal[i]:
                errors = errors - 1  # WE GOT RID OF ONE ERROR
            predictions = predictions - 1
            if errors/predictions <= eps_:
                threshold = pt[i]
                break
    else:
        threshold = 0
    return threshold
def conj_test(C, Xtest, threshold):
    # PREDICT LABELS OF THE POINTS IN Xtest AND REFUSE RELATIVE TO THE ACCEPTANCE THRESHOLD threshold
    test_A = C.predict_proba(Xtest)
    pred_test = C.predict(Xtest)
    # REFUSE A POINT IF NO LABEL'S ESTIMATED PROBABILITY EXCEEDS THE THRESHOLD
    refused = np.sum(test_A > threshold, axis=1) == 0
    return pred_test, refused
By combining these two steps:
def conj_calibrate_test(C, Xcal, ycal, Xtest, eps):
    # CALIBRATE CLASSIFIER C ON (Xcal, ycal) WITH TARGET RATE eps AND PREDICT THE CORRESPONDING LABELS FOR Xtest
    th = conj_calibrate(C, Xcal, ycal, eps)
    return conj_test(C, Xtest, th)
# PREDICT THE TEST LABELS
pred, ref = conj_calibrate_test(classifier, cal_X,cal_y,test_X,epsilon)
err = pred != test_y
print("MNIST")
print("Dimensions:\t\t\t" ,X.shape, "\nNumber of Labels:\t\t",10, "\nError rate on all predictions:\t {:0.4f}".format(np.mean(err)))
print("\nCONJUGATE PREDICTION")
print("Target Error Rate: \t{:0.4f}".format(epsilon))
print("Selective Error Rate: \t{:0.4f}".format(np.sum(err*(1-ref))/np.sum(1-ref)),"\nRefusal Rate:\t\t{:0.4f}".format(np.mean(ref)))
In this setup, we apply the described methods to predict the upcoming point in a data stream $(X_1,Y_1), (X_2,Y_2), \ldots$. To do so, we first split the data stream into $L$ sub-streams $S_1,\ldots,S_L$, such that $S_l = (X_l,Y_l), (X_{l+L},Y_{l+L}), (X_{l+2L},Y_{l+2L}), \ldots$ for all $l=1,\ldots, L$.
Next, to predict the $t^{\mathrm{th}}$ data point we use substream $t \bmod L$ as the calibration set and all previously seen points from the other substreams as the core training set, applying the offline procedure described above (a sketch of the splitting is shown below).
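This sketch mirrors the update loop later in this section, assuming $L = 2$ and only the first six points of the stream: point $t$ joins the calibration set of substream $t \bmod L$ and the core training set of every other substream.
# SKETCH: SUBSTREAM ASSIGNMENT FOR L = 2 (MIRRORS THE UPDATE LOOP BELOW)
L_demo = 2
tr_demo = [[] for l in range(L_demo)]
cal_demo = [[] for l in range(L_demo)]
for t in range(6):
    for l in range(L_demo):
        (cal_demo if t % L_demo == l else tr_demo)[l].append(t)
print("CALIBRATION SETS:", cal_demo)    # [[0, 2, 4], [1, 3, 5]]
print("CORE TRAINING SETS:", tr_demo)   # [[1, 3, 5], [0, 2, 4]]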
# LOAD THE DATA
# MNIST DATA SET (MULTI-CLASS)
digits = datasets.fetch_mldata('mnist-original')
X, y = digits.data, digits.target
y = y.astype(int)
# THE OTHER DATA-SETS
# PLEASE MAKE SURE THE LABELS ARE INTEGERS STARTING FROM 0.
# COD
#cod = datasets.fetch_mldata('cod-rna')
#X, y = cod.data, cod.target
#y = (y + 1)/2
#y = y.astype(int)
## CONNECT
#connect = datasets.fetch_mldata('connect-4')
#X, y = connect.data, connect.target
#y = (y +1)/2
#y = y.astype(int)
## COVER
# covtype = datasets.fetch_covtype()
# X, y = covtype.data, covtype.target
## LETTER
#letter = datasets.fetch_mldata('letter')
#X, y = letter.data, letter.target
#y = y - 1
#y = y.astype(int)
## SATI
#sati = datasets.fetch_mldata('satimage')
#X, y = sati.data, sati.target
#y = y - 1
#y = y.astype(int)
## SENSIT
#sensit = datasets.fetch_mldata('sensit-vehicle-combined')
#X, y = sensit.data, sensit.target
#y = y - 1
#y = y.astype(int)
## OPTIONAL: TAKE THE FIRST 10000 DATAPOINTS
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=0)
X = X[:10000,:]
y = y[:10000]
T = y.size
# MAKE SURE WE HAVE SEEN EACH LABEL AT LEAST ONCE BEFORE WE START PREDICTING
last_seen = np.full(y.max() + 1, -1)  # -1 MEANS THE LABEL HAS NOT BEEN SEEN YET
for tl in range(T):
    last_seen[y[tl]] = tl
    if np.all(last_seen >= 0):
        break
print('Predict from',tl,'to',T)
# CHOOSE A TARGET ERROR RATE
epsilon = 0.05
# SPLIT INTO L STREAMS
L = 2
# CORE TRAINING SET FOR EACH SUBSTREAM KEPT AS AN ELEMENT OF THIS LIST
tr_seq = [[] for l in range(L)]
# CALIBRATION SET FOR EACH SUBSTREAM KEPT AS AN ELEMENT OF THIS LIST
cal_seq = [[] for l in range(L)]
# SET THE BASE PREDICTOR
predictor = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, oob_score=False, n_jobs=-1)
# ERROR AND REFUSE FLAGS
err_0 = np.zeros(T - tl)
err = np.zeros(T - tl)
ref = np.zeros(T - tl)
for t in tqdm(range(T)):
    # PREDICT THE t^TH LABEL (ONCE EVERY LABEL HAS BEEN SEEN AT LEAST ONCE)
    if t > tl:
        l = t % L
        predictor.fit(X[tr_seq[l], :], y[tr_seq[l]])
        pred_t, ref_t = conj_calibrate_test(predictor, X[cal_seq[l], :], y[cal_seq[l]], X[t, :].reshape(1, -1), epsilon)
        ref[t-tl] = ref_t[0]
        err[t-tl] = (pred_t[0] != y[t]) * (1 - ref[t-tl])
        err_0[t-tl] = pred_t[0] != y[t]
    # UPDATE THE TRAINING AND CALIBRATION SETS
    for l in range(L):
        if t % L != l:
            tr_seq[l].append(t)
        else:
            cal_seq[l].append(t)
# PLOT THE ERROR AND REFUSE RATES
time = np.arange(1, T - tl + 1)
with plt.style.context('fivethirtyeight'):
    plt.plot(time, np.cumsum(ref)/time, 'r', linewidth=2, label="Refusal Rate")
    plt.plot(time, np.array([epsilon]*len(time)), '--k', linewidth=2, label="Target Error Rate")
    plt.plot(time, np.cumsum(err_0)/time, 'g', linewidth=2, label="Error Rate w/o Refusals")
    plt.plot(time - np.cumsum(ref), np.cumsum(err)/(time - np.cumsum(ref)), 'b', linewidth=2, label="Error Rate with Refusals")
    plt.ylim(0, 1.1); plt.xlabel("Number of Data Points")
    plt.xlim(0, T - tl)
    plt.legend(loc="best")