import numpy as np
import sklearn
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
# DecisionTreeClassifier is exported by sklearn.tree, not sklearn.ensemble.
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the old module
    from sklearn.cross_validation import train_test_split

# Read a tab-separated file of floats, skipping the header row.
# The context manager closes the file handle, and blank lines (such as the
# empty string left by a trailing newline) are dropped so they do not turn
# into empty rows.
with open('mydata.csv') as csv_file:
    mydata = [line.split('\t')
              for line in csv_file.read().split('\n')[1:]
              if line.strip()]

# All columns except the last one go into the feature matrix
data = np.array([[float(value) for value in line[:-1]] for line in mydata])

# The last column is the label and goes into its own array.
# line[-1] picks that single field; line[:-1] is the list of feature fields
# and cannot be converted with float().
labels = np.array([float(line[-1]) for line in mydata])

# Sklearn can shuffle the data for us keeping the X-y correspondence intact
X_train, X_test, y_train, y_test = train_test_split(data, labels,
                                                    train_size=0.8)

# Fit a single decision tree on the training split and run it on the
# held-out split (the predictions are not stored in this sample).
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf.predict(X_test)

# Pair every importance score with the index of its column in `data`.
# The header row of the CSV was skipped, so no feature names are available.
# list(...) keeps the pairs reusable on Python 3, where zip() is single-use.
feature_importances = list(zip(clf.feature_importances_,
                               range(data.shape[1])))

# Most important column first
top_features = sorted(feature_importances, key=lambda x: x[0], reverse=True)


# *SAMPLE RANDOM FOREST*

# Load the bundled iris dataset and hold out 20% of it for testing
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    train_size=0.8)

# Fit a random forest on the training split and run it on the held-out split
# (the predictions are not stored in this sample).
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.predict(X_test)

# Importances are reported per input column, so they are paired with
# iris.feature_names (column names), not iris.target_names (class names).
# list(...) keeps the pairs reusable on Python 3, where zip() is single-use.
feature_importances = list(zip(clf.feature_importances_, iris.feature_names))

# Most important feature first
top_features = sorted(feature_importances, key=lambda x: x[0], reverse=True)



# *SAMPLE CART TO RANDOM FOREST*

# Step 1: fit a single decision tree to rank the features
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf.predict(X_test)

# Importances are reported per input column, so they are paired with
# iris.feature_names (column names), not iris.target_names (class names).
feature_importances = list(zip(clf.feature_importances_, iris.feature_names))

# Most important feature first
top_features = sorted(feature_importances, key=lambda x: x[0], reverse=True)

# Map the two best (importance, name) pairs back to their column positions
top2_features_indexes = [list(iris.feature_names).index(name)
                         for _, name in top_features[:2]]

# Grab all rows, only columns in the top indexes
dataset = iris.data[:, top2_features_indexes]

X_train, X_test, y_train, y_test = train_test_split(dataset, iris.target,
                                                    train_size=0.8)

# Step 2: train the random forest on the reduced two-column dataset
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.predict(X_test)




# *EASY EVALUATION*

from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

# Hard class predictions and class-probability estimates for the held-out
# split, taken from the most recently fitted `clf`.
classification_predictions = clf.predict(X_test)
classification_scores = clf.predict_proba(X_test)

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(accuracy_score(y_test, classification_predictions))

# precision/recall default to binary averaging, which scikit-learn does not
# accept for multiclass targets such as the iris labels used above; an
# explicit averaging strategy is required. 'weighted' averages the per-class
# scores weighted by class support.
print(precision_score(y_test, classification_predictions, average='weighted'))
print(recall_score(y_test, classification_predictions, average='weighted'))

print(classification_report(y_test, classification_predictions))

# from sklearn.datasets import load_boston

# boston = load_boston()


