import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
%matplotlib inline
plt.style.use('ggplot')
df = pd.read_excel('juan_data1.xlsx', skipfooter=25)
df['source'] = 'datafile1'
df2 = pd.read_excel('juan_data3_mod.xlsx')
df2['source'] = 'datafile3'
df3 = pd.read_excel('MRI_extension_data327x341.xlsx')
df3['source'] = 'MRI'
df4 = pd.read_excel('Juan_NSAID_SSRI.xlsx')
df4['source'] = 'NSAID_Supplement'
merged = pd.merge(df, df2, on='RID', suffixes=('_original', '_redundant1'))
merged = pd.merge(merged, df3, on='RID', suffixes=('_redundant2', '_redundant3'))
merged = pd.merge(merged, df4, on='RID', suffixes=('_redundant4', '_redundant5'))
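Since the chained inner joins silently drop any subject missing from a file and would duplicate rows if RID were repeated, a defensive variant (a sketch; the validate keyword is standard pandas, but this check is not part of the original pipeline) can assert the join cardinality:
# Sketch: validate='one_to_one' makes pandas raise if RID is duplicated
# on either side of the join (not part of the original pipeline).
checked = df.merge(df2, on='RID', suffixes=('_original', '_redundant1'),
                   validate='one_to_one')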
assert (merged.target_original == merged.target_redundant1).all()
merged['target'] = merged.target_original
# Targets 1-8 encode (time bin, stability): consecutive label pairs share a
# time bin, and odd labels mark stable subjects.
merged['time_bin'] = merged.target.apply(lambda x: (x + 1) // 2)
merged['stable'] = merged.target.apply(lambda x: int(x % 2 != 0))
merged[['target','time_bin', 'stable']].head()
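A quick sanity check of the decomposition, assuming the raw targets are the integers 1-8 shown in the label plot below:
# Each consecutive pair of targets shares a time bin; odd targets are stable.
for t in range(1, 9):
    print(t, (t + 1) // 2, int(t % 2 != 0))  # target, time_bin, stable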
targets = list(merged['target_original'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('Label')
plt.ylabel('% of Samples')
plt.title('Label Distribution in Dataset\n')
plt.xticks([1,2,3,4,5,6,7,8])
plt.figure()
targets = list(merged['stable'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('State')
plt.ylabel('% of Samples')
plt.title('State Distribution in Dataset\n')
plt.xticks([0,1],['Transitioning', 'Stable'])
plt.figure()
targets = list(merged['time_bin'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('Bin')
plt.ylabel('% of Samples')
plt.title('Time Bin Distribution in Dataset\n')
plt.xticks([1,2,3,4], ['>4.25', '4.25-2.25', '2.25-1.25', '1.25-0'])
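Each bar chart above can also be produced straight from pandas; a minimal sketch for the raw labels:
# Equivalent one-liner: normalized counts, ordered by label.
merged['target'].value_counts(normalize=True).sort_index().plot(kind='bar')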
print('Distributions:')
Right now this just involves ignoring copies of features from different data sources; it will grow to ignore features that share the same "sortable" values.
polluting_label_fragments = ['target','RID', 'time_bin', 'stable', 'source', 'redundant', 'Unnamed']
valid_classifier_columns = [c for c in merged.columns if not any(dirt in c for dirt in polluting_label_fragments)]
ignored_columns = [c for c in merged.columns if any(dirt in c for dirt in polluting_label_fragments)]
print('Ignoring {0} columns.\nNon-redundant columns ignored: {1}'.format(
    len(ignored_columns), [i for i in ignored_columns if 'redundant' not in i]))
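A minimal sketch of that planned value-based filter (a hypothetical helper, not yet wired into the pipeline):
def drop_value_duplicates(frame, columns):
    # Hypothetical extension: compare columns by their sorted string values,
    # so exact duplicates hiding under different names are ignored too.
    seen, keep = set(), []
    for c in columns:
        key = tuple(frame[c].astype(str).sort_values().values)
        if key not in seen:
            seen.add(key)
            keep.append(c)
    return keep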
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import LeaveOneOut
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
X = merged[valid_classifier_columns].values
y = merged['stable'].values
loo = LeaveOneOut()
predictions = []
class_predictions = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit the imputer on the training fold only, so the held-out sample
    # never leaks into the mean estimates.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test = imp.transform(X_test)
    clf = RandomForestClassifier(n_estimators=300)
    clf.fit(X_train, y_train)
    y_score = clf.predict_proba(X_test)[:, 1]
    predictions.append(y_score[0])
    class_predictions.append(clf.predict(X_test)[0])
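The loop above can also be written with a Pipeline and cross_val_predict, which keeps the imputer fitted on each training fold automatically; a sketch of the equivalent formulation:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_predict

# Sketch: same LeaveOneOut split, imputation confined to each training fold.
pipe = make_pipeline(SimpleImputer(strategy='mean'),
                     RandomForestClassifier(n_estimators=300))
loo_probs = cross_val_predict(pipe, X, y, cv=LeaveOneOut(),
                              method='predict_proba')[:, 1]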
# Compute Precision-Recall and plot curve
precision, recall, _ = precision_recall_curve(y, predictions)
average_precision = average_precision_score(y, predictions)
plt.plot(recall, precision, c='r', alpha=0.8)
plt.title('Average precision = {0:.3f}'.format(average_precision))
plt.figure()
fpr, tpr, thresholds = roc_curve(y, predictions)
plt.plot(fpr,tpr,'r', alpha=0.8)
plt.plot([0,1],[0,1],'--',color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC = {0:.3f}'.format(auc(fpr, tpr)))
from sklearn.metrics import classification_report
print "0 = Transitioning, 1 = Stable\n"
print classification_report(y, class_predictions)
from sklearn.metrics import precision_score, recall_score
import random
from numpy.random import choice
probability_of_stable = y.sum()/float(len(y))
probability_of_transition = 1 - probability_of_stable
precision_threshold = precision_score(y, np.array(class_predictions))
recall_threshold = recall_score(y, np.array(class_predictions))
guesses_above_precision_threshold = 0
guesses_above_recall_threshold = 0
n_trials = 100000
for i in range(n_trials):
    # Draw a random labelling that matches the empirical class balance,
    # then check whether it beats the classifier's scores.
    guess = choice([0, 1], len(y), p=[probability_of_transition, probability_of_stable])
    guesses_above_precision_threshold += int(precision_score(y, guess) >= precision_threshold)
    guesses_above_recall_threshold += int(recall_score(y, guess) >= recall_threshold)
print "Out of {0} trials:".format(n_trials)
print "\t{0:4} had better or equal precision (p={1:.6f})".format(guesses_above_precision_threshold, guesses_above_precision_threshold/float(n_trials))
print "\t{0:4} had better or equal recall (p={1:.6f})".format(guesses_above_recall_threshold, guesses_above_recall_threshold/float(n_trials))
from sklearn.tree import export_graphviz
import os
os.makedirs('trees', exist_ok=True)
# Note: clf here is the forest fitted in the final leave-one-out fold.
for i, sample_tree in enumerate(random.sample(clf.estimators_, 10)):
    name = 'trees/sample_tree_' + str(i)
    export_graphviz(sample_tree, out_file='{0}.dot'.format(name), feature_names=valid_classifier_columns)
    os.system("dot -Tpng {0}.dot -o {0}.png".format(name))
pd.DataFrame(sorted(valid_classifier_columns), columns=['Feature']).to_csv('features_used.txt', index=False)