"""Tutorial script: train scikit-learn classifiers on a numeric CSV file.

Reads ``mydata.csv`` (a header row followed by rows of numbers whose last
column is the integer class label), splits the rows into training and test
sets, fits a decision tree and a random forest, and prints the predictions,
the tree's feature importances and a few evaluation metrics.
"""
from __future__ import print_function

import numpy as np
import sklearn
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import train_test_split

# Read some CSV file with floats; the first row holds the column headers.
# The context manager closes the file handle as soon as it has been read.
with open('mydata.csv') as csv_file:
    mydata = [line.split(',') for line in csv_file.read().strip().split('\n')]
print("type(mydata)", type(mydata))
print("mydata", mydata)

# Save the headers
headers = mydata[0]

# So mydata is a first list of headers and then a bunch of rows of floats.

# Every column except the last of each non-header row goes into the matrix
data = np.array([[float(value) for value in line[:-1]] for line in mydata[1:]])

# Target (what we're predicting) is the last column; it goes into its own array
targets = np.array([int(line[-1]) for line in mydata[1:]])

print("\n\nWe will be predicting these:")
print(targets)
print("\n\nUsing these:")
print(data)

# print("\n\nTo be explicit, each element in these two arrays has a 1-to-1 relationship like this:")
# print(list(zip(targets, data)))

# Sklearn can shuffle the data for us keeping the X-y correspondence intact
X_train, X_test, y_train, y_test = train_test_split(data, targets, train_size=0.8)
print("\n\ntrain_test_split both shuffles and splits the data, "
      "while maintaining the 1-to-1 relationship")

print("\n\nOur training set looks like this:")
# list() so the pairs themselves are printed on Python 3, where zip() is lazy.
print(list(zip(y_train, X_train)))

print("\n\nNow that the data is parsed we can start training different classifiers:")
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

print("\n\nFor these targets in our test set:")
print(y_test)
print("\n\nOur classifier predicts:")
print(clf.predict(X_test))

print("\n\nIt ranks the following feature importances:")
# headers still contains the target column's name, while the importances cover
# only the feature columns; zip() stops at the shorter sequence, so the
# trailing target header is dropped.
feature_importances = list(zip(clf.feature_importances_, headers))
top_features = sorted(feature_importances, key=lambda pair: pair[0], reverse=True)
for i in top_features:
    print(i)

# *SAMPLE RANDOM FOREST*
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.predict(X_test)  # result discarded here; shown only as a usage sample

# *EASY EVALUATION*
classification_predictions = clf.predict(X_test)
classification_scores = clf.predict_proba(X_test)

# Current scikit-learn defaults precision/recall to binary averaging, which
# raises ValueError for targets with more than two classes. Ask for weighted
# averaging in that case only, so the two-class call stays exactly as before.
metric_kwargs = {} if np.unique(targets).size <= 2 else {'average': 'weighted'}

print(accuracy_score(y_test, classification_predictions))
print(precision_score(y_test, classification_predictions, **metric_kwargs))
print(recall_score(y_test, classification_predictions, **metric_kwargs))
print(classification_report(y_test, classification_predictions))

# from sklearn.datasets import load_boston
# boston = load_boston()