# In [10]:  (Jupyter cell marker — notebook-export residue, not code)
# Third-party imports (NumPy, scikit-learn).
import numpy as np
from sklearn import datasets
from sklearn.datasets import load_iris
# BUG FIX: sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20 — cross_val_score now lives in sklearn.model_selection, alongside
# train_test_split and GridSearchCV.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.decomposition import PCA

# Load the iris data set and hold out 20% as the test split; the fixed
# random_state makes the split reproducible across runs.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#data_preprocessing

#scale
def my_SCALE(X):
    """Interactively offer to standardize X; return X unchanged otherwise."""
    answer = input("scale or not: ")
    if answer != 'yes':
        return X
    return preprocessing.scale(X)

#MinMaxScaler
def my_MMS(X):
    """Interactively offer min-max scaling of X; return X unchanged otherwise."""
    answer = input("MinMaxScaler or not: ")
    if answer != 'yes':
        return X
    scaler = preprocessing.MinMaxScaler()
    return scaler.fit_transform(X)

#MaxAbsScaler
def my_MAS(X):
    """Interactively offer max-abs scaling of X; return X unchanged otherwise."""
    answer = input("MaxAbsScaler or not: ")
    if answer != 'yes':
        return X
    scaler = preprocessing.MaxAbsScaler()
    return scaler.fit_transform(X)

# Normalizer
def my_NM(X):
    """Interactively offer sample-wise normalization of X; return X unchanged otherwise."""
    answer = input("Normalizer or not: ")
    if answer != 'yes':
        return X
    return preprocessing.Normalizer().fit(X).transform(X)

#Binary
def my_BI(X):
    """Interactively offer binarization of X (threshold 0); return X unchanged otherwise."""
    answer = input("Binary or not: ")
    if answer != 'yes':
        return X
    return preprocessing.Binarizer(threshold=0).transform(X)

#pca

def my_PCA(X, n_components=2):
    """Interactively offer PCA dimensionality reduction of X.

    Args:
        X: 2-D feature matrix.
        n_components: number of principal components to keep when the user
            answers 'yes'. Generalized from the previously hard-coded 2;
            the default preserves the original behavior.

    Returns:
        The projected data when the user answers 'yes', otherwise X unchanged.
    """
    answer = input("pca or not: ")
    if answer == 'yes':
        pca = PCA(n_components=n_components)
        return pca.fit_transform(X)
    return X

def data_process(X):
    """Run X through the interactive preprocessing pipeline in fixed order.

    Order: PCA, standard scaling, min-max, max-abs, normalizer, binarizer.
    Each step asks the user whether to apply itself.
    """
    for step in (my_PCA, my_SCALE, my_MMS, my_MAS, my_NM, my_BI):
        X = step(X)
    return X

#estimator
#Random Forest
def RFC():
    """Random forest classifier wrapped in a grid search over leaf/split sizes."""
    grid = {'min_samples_leaf': range(1, 10), 'min_samples_split': range(2, 8)}
    return GridSearchCV(RandomForestClassifier(), grid, n_jobs=-1)

#Logistic Regression
def LR():
    """Logistic regression wrapped in a grid search over the C penalty."""
    grid = {'C': [1, 2, 4]}
    return GridSearchCV(LogisticRegression(), grid, n_jobs=-1)

#Gradboost
def GBC():
    """Gradient boosting classifier wrapped in a grid search over leaf/split sizes."""
    grid = {'min_samples_leaf': range(8, 10), 'min_samples_split': range(2, 8)}
    return GridSearchCV(GradientBoostingClassifier(), grid, n_jobs=-1)

#Native Bayes
def MNB():
    """Multinomial naive Bayes wrapped in a grid search over the alpha smoothing."""
    grid = {'alpha': [1.0, 2.0]}
    return GridSearchCV(MultinomialNB(), grid, n_jobs=-1)

#SVM
def SVM():
    """SVC wrapped in a grid search over kernel, C and gamma."""
    grid = {
        'kernel': ('linear', 'rbf'),
        'C': [1, 2, 4],
        'gamma': [0.125, 0.25, 0.5, 1, 2, 4],
    }
    return GridSearchCV(svm.SVC(), grid, n_jobs=-1)

#KNN
def KNN():
    """K-nearest-neighbors classifier wrapped in a grid search over k and leaf size."""
    grid = {'n_neighbors': range(3, 10), 'leaf_size': range(20, 40)}
    return GridSearchCV(neighbors.KNeighborsClassifier(), grid, n_jobs=-1)

# Decision Tree
def TDC():
    """Decision tree classifier wrapped in a grid search over the leaf size."""
    grid = {'min_samples_leaf': range(5, 10)}
    return GridSearchCV(tree.DecisionTreeClassifier(), grid, n_jobs=-1)
#predictor = predictor.fit(X_test, y_test)
#predictor.predict_proba(X_test)

#print(y_test)
#predictor.predict_proba(X_train)

def getRecognitionRate(xtestData, ytestData):
    """Return the fraction of positions where the two label sequences agree.

    Args:
        xtestData: sequence of predicted labels.
        ytestData: sequence of true labels (same length as xtestData).

    Returns:
        float in [0, 1]. Returns 0.0 for empty input — the original
        implementation raised ZeroDivisionError in that case.
    """
    total = len(xtestData)
    if total == 0:
        return 0.0
    # Count positions where prediction and truth agree (was a manual
    # index loop; zip/sum is the idiomatic equivalent).
    right = sum(1 for pred, truth in zip(xtestData, ytestData) if pred == truth)
    return right / total

def validation(X, Y, Z):
    """Score estimator X on data Y with labels Z.

    Asks the user which scheme to use: 'cross' runs cross-validation and
    returns the mean fold score; anything else returns the held-out score
    of the already-fitted estimator.
    """
    mode = input("held-out or cross: ")
    if mode != "cross":
        return X.score(Y, Z)
    return cross_val_score(X, Y, Z).mean()

def finalPredictor():
    """Interactively preprocess the iris split, fit seven grid-searched
    classifiers, and report validation scores and test-set recognition rates.

    Reads the module-level X_train, X_test, y_train, y_test. Each
    preprocessing step and the validation scheme are chosen via input().
    """
    # Preprocessing (user-driven) for both splits.
    print("For X_train data: ")
    X_final_train = data_process(X_train)
    print()

    # BUG FIX: this prompt said "For y_train data:" but it processes the
    # *test* features, not the training labels.
    print("For X_test data: ")
    X_final_test = data_process(X_test)
    print()

    # One (name, grid-searched estimator) pair per model; the loop below
    # replaces seven copy-pasted fit/validate stanzas.
    classifiers = [
        ('RFC', RFC()),
        ('LR', LR()),
        ('GBC', GBC()),
        ('MNB', MNB()),
        ('SVM', SVM()),
        ('KNN', KNN()),
        ('TDC', TDC()),
    ]

    # Fit and validate each classifier, preserving the original output order.
    scores = {}
    for name, clf in classifiers:
        print(name + " validation: ")
        # BUG FIX: fit on the *preprocessed* training data — the original
        # fit on raw X_train but validated and predicted on the
        # preprocessed matrices, which is inconsistent whenever any
        # preprocessing step is enabled.
        clf.fit(X_final_train, y_train)
        scores[name] = validation(clf, X_final_train, y_train)
    print()

    # Validation scores.
    for name, _ in classifiers:
        print(name + ' score: ', scores[name])
    print()

    # Test-set recognition rates.
    for name, clf in classifiers:
        print(name + ' recognition rate: ',
              getRecognitionRate(clf.predict(X_final_test), y_test))
    print()

# Script entry point: run the full interactive train/validate/test pipeline.
if __name__ == '__main__':  
    finalPredictor()
    print('The End.')
# --- Captured interactive session output (notebook transcript, not code) ---
# For X_train data: 
# pca or not: no
# scale or not: no
# MinMaxScaler or not: no
# MaxAbsScaler or not: no
# Normalizer or not: no
# Binary or not: no
#
# For y_train data: 
# pca or not: no
# scale or not: no
# MinMaxScaler or not: no
# MaxAbsScaler or not: no
# Normalizer or not: no
# Binary or not: no
#
# RFC validation: 
# held-out or cross: cross
# LR validation: 
# held-out or cross: cross
# GBC validation: 
# held-out or cross: cross
# MNB validation: 
# held-out or cross: cross
# SVM validation: 
# held-out or cross: cross
# KNN validation: 
# held-out or cross: cross
# TDC validation: 
# held-out or cross: cross
#
# RFC score:  0.941442568272
# LR score:  0.950192828851
# GBC score:  0.941859495518
# MNB score:  0.924124452783
# SVM score:  0.95811965812
# KNN score:  0.93393266625
# TDC score:  0.949572649573
#
# RFC recognition rate:  1.0
# LR recognition rate:  1.0
# GBC recognition rate:  0.9666666666666667
# MNB recognition rate:  0.9
# SVM recognition rate:  1.0
# KNN recognition rate:  1.0
# TDC recognition rate:  1.0
#
# The End.