import numpy as np
from sklearn import datasets
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
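# Optional refinement (an assumption, not part of the original flow): passing
# stratify=y keeps the three iris classes balanced in both splits, e.g.
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y)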
#data_preprocessing
#scale
def my_SCALE(X):
    ask_scaled = input("scale or not: ")
    if ask_scaled == 'yes':
        X_scaled = preprocessing.scale(X)
        return X_scaled
    else:
        return X
#MinMaxScaler
def my_MMS(X):
    ask_mms = input("MinMaxScaler or not: ")
    if ask_mms == 'yes':
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train_minmax = min_max_scaler.fit_transform(X)
        return X_train_minmax
    else:
        return X
#MaxAbsScaler
def my_MAS(X):
    ask_mas = input("MaxAbsScaler or not: ")
    if ask_mas == 'yes':
        max_abs_scaler = preprocessing.MaxAbsScaler()
        X_train_maxabs = max_abs_scaler.fit_transform(X)
        return X_train_maxabs
    else:
        return X
# Normalizer
def my_NM(X):
    ask_nm = input("Normalizer or not: ")
    if ask_nm == 'yes':
        normalizer = preprocessing.Normalizer().fit(X)
        X_train_normalized = normalizer.transform(X)
        return X_train_normalized
    else:
        return X
#Binary
def my_BI(X):
    ask_bi = input("Binary or not: ")
    if ask_bi == 'yes':
        binarizer = preprocessing.Binarizer(threshold=0)
        X_BI = binarizer.fit_transform(X)
        return X_BI
    else:
        return X
#pca
def my_PCA(X):
    ask_pca = input("pca or not: ")
    if ask_pca == 'yes':
        n_com = 2
        pca = PCA(n_components=n_com)
        X_pca = pca.fit_transform(X)
        return X_pca
    else:
        return X
def data_process(X):
    # chain the optional steps; each prompt decides whether that step is applied
    X_PCA = my_PCA(X)
    X_SCALE = my_SCALE(X_PCA)
    X_MMS = my_MMS(X_SCALE)
    X_MAS = my_MAS(X_MMS)
    X_NM = my_NM(X_MAS)
    X_BI = my_BI(X_NM)
    return X_BI
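
# A minimal alternative sketch (an assumption, not used by the script below):
# fit the preprocessing once on the training split and reuse the fitted
# transformers on the test split, so both splits share the same parameters.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def pipeline_process(X_fit, X_apply, n_com=2):
    """Fit PCA + standard scaling on X_fit, then apply the same fit to X_apply."""
    prep = make_pipeline(PCA(n_components=n_com), StandardScaler())
    X_fit_out = prep.fit_transform(X_fit)   # learn the transform parameters
    X_apply_out = prep.transform(X_apply)   # reuse them unchanged
    return X_fit_out, X_apply_out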
#estimator
#Random Forest
def RFC():
    clf = RandomForestClassifier()
    params = {'min_samples_leaf': range(1, 10), 'min_samples_split': range(2, 8)}
    predictor = GridSearchCV(clf, params, n_jobs=-1)
    return predictor
#Logistic Regression
def LR():
    clf = LogisticRegression(max_iter=1000)  # raise the iteration cap so the solver converges
    #print(clf.get_params())
    params = {'C': [1, 2, 4]}
    predictor = GridSearchCV(clf, params, n_jobs=-1)
    return predictor
#Gradboost
def GBC():
    clf = GradientBoostingClassifier()
    params = {'min_samples_leaf': range(8, 10), 'min_samples_split': range(2, 8)}
    predictor = GridSearchCV(clf, params, n_jobs=-1)
    return predictor
# Naive Bayes
def MNB():
    clf = MultinomialNB()
    params = {'alpha': [1.0, 2.0]}
    predictor = GridSearchCV(clf, params, n_jobs=-1)
    return predictor
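# Caveat: MultinomialNB only accepts non-negative features, so it will raise an
# error if the data was standardized or PCA-transformed above (an assumption
# about how the prompts are answered). A quick guard before fitting could be:
#     if (X_final_train < 0).any():
#         print("skip MNB: features contain negative values")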
#SVM
def SVM():
    clf = svm.SVC()
    params = {'kernel': ('linear', 'rbf'), 'C': [1, 2, 4], 'gamma': [0.125, 0.25, 0.5, 1, 2, 4]}
    predictor = GridSearchCV(clf, params, n_jobs=-1)
    return predictor
#KNN
def KNN():
    clf = neighbors.KNeighborsClassifier()
    params = {'n_neighbors': range(3, 10), 'leaf_size': range(20, 40)}
    predictor = GridSearchCV(clf, params, n_jobs=-1)
    return predictor
# Decision Tree
def TDC():
    #x = int(input())
    #y = int(input())
    clf = tree.DecisionTreeClassifier()
    #print(clf.get_params())
    params = {'min_samples_leaf': range(5, 10)}
    predictor = GridSearchCV(clf, params, n_jobs=-1)
    return predictor
def getRecognitionRate(xtestData, ytestData):
    # xtestData holds the predicted labels, ytestData the matching true labels
    testNum = len(xtestData)
    rightNum = 0
    for i in range(0, testNum):
        if ytestData[i] == xtestData[i]:
            rightNum += 1
    return float(rightNum) / float(testNum)
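# getRecognitionRate is ordinary classification accuracy; an equivalent call
# with scikit-learn's built-in metric (shown only as a sketch) would be:
#     from sklearn.metrics import accuracy_score
#     accuracy_score(y_test, clf_RFC.predict(X_final_test))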
def validation(X, Y, Z):
    # X is a fitted estimator, Y the feature matrix, Z the labels
    choose = input("held-out or cross: ")
    if choose == "cross":
        score1 = cross_val_score(X, Y, Z).mean()
        return score1
    else:
        score2 = X.score(Y, Z)
        return score2
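# Note: cross_val_score defaults to 5-fold CV on recent scikit-learn releases
# (3-fold on older ones); the fold count can be pinned explicitly, e.g.
#     cross_val_score(X, Y, Z, cv=5).mean()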
def finalPredictor():
    # preprocessing
    print("For X_train data: ")
    X_final_train = data_process(X_train)
    #print(X_final_train)
    print()
    print("For X_test data: ")
    X_final_test = data_process(X_test)
    print()
    # get predictor
    clf_RFC = RFC()
    clf_LR = LR()
    clf_GBC = GBC()
    clf_MNB = MNB()
    clf_SVM = SVM()
    clf_KNN = KNN()
    clf_TDC = TDC()
    # fit each grid search on the preprocessed training data, then validate
    # on the same representation so shapes and scales match
    print("RFC validation: ")
    clf_RFC.fit(X_final_train, y_train)
    RFC_score = validation(clf_RFC, X_final_train, y_train)
    print("LR validation: ")
    clf_LR.fit(X_final_train, y_train)
    LR_score = validation(clf_LR, X_final_train, y_train)
    print("GBC validation: ")
    clf_GBC.fit(X_final_train, y_train)
    GBC_score = validation(clf_GBC, X_final_train, y_train)
    print("MNB validation: ")
    clf_MNB.fit(X_final_train, y_train)
    MNB_score = validation(clf_MNB, X_final_train, y_train)
    print("SVM validation: ")
    clf_SVM.fit(X_final_train, y_train)
    SVM_score = validation(clf_SVM, X_final_train, y_train)
    print("KNN validation: ")
    clf_KNN.fit(X_final_train, y_train)
    KNN_score = validation(clf_KNN, X_final_train, y_train)
    print("TDC validation: ")
    clf_TDC.fit(X_final_train, y_train)
    TDC_score = validation(clf_TDC, X_final_train, y_train)
    print()
    # score
    print('RFC score: ', RFC_score)
    print('LR score: ', LR_score)
    print('GBC score: ', GBC_score)
    print('MNB score: ', MNB_score)
    print('SVM score: ', SVM_score)
    print('KNN score: ', KNN_score)
    print('TDC score: ', TDC_score)
    print()
    # recognition rate on the held-out test split
    print('RFC recognition rate: ', getRecognitionRate(clf_RFC.predict(X_final_test), y_test))
    print('LR recognition rate: ', getRecognitionRate(clf_LR.predict(X_final_test), y_test))
    print('GBC recognition rate: ', getRecognitionRate(clf_GBC.predict(X_final_test), y_test))
    print('MNB recognition rate: ', getRecognitionRate(clf_MNB.predict(X_final_test), y_test))
    print('SVM recognition rate: ', getRecognitionRate(clf_SVM.predict(X_final_test), y_test))
    print('KNN recognition rate: ', getRecognitionRate(clf_KNN.predict(X_final_test), y_test))
    print('TDC recognition rate: ', getRecognitionRate(clf_TDC.predict(X_final_test), y_test))
    print()
if __name__ == '__main__':
    finalPredictor()
    print('The End.')