In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer
%matplotlib inline
plt.style.use('ggplot')

Read and Merge All Datafiles on RID

In [2]:
df = pd.read_excel('juan_data1.xlsx', skip_footer=25)
df['source'] = 'datafile1'
df2 = pd.read_excel('juan_data3_mod.xlsx')
df2['source'] = 'datafile3'
df3 = pd.read_excel('MRI_extension_data327x341.xlsx')
df3['source'] = 'MRI'
df4 = pd.read_excel('Juan_NSAID_SSRI.xlsx')
df4['source'] = 'NSAID_Supplement'

merged = pd.merge(df, df2, on='RID', suffixes=('_original', '_redundant1'))
merged = pd.merge(merged, df3, on='RID', suffixes=('_redundant2', '_redundant3'))
merged = pd.merge(merged, df4, on='RID', suffixes=('_redundant4', '_redundant5'))
assert(all(merged.target_original==merged.target_redundant1))
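A defensive check worth running before a chain of inner merges like this (a minimal sketch, not part of the original run): if any RID appears more than once in one of the files, pd.merge will silently multiply rows.

for name, frame in [('datafile1', df), ('datafile3', df2), ('MRI', df3), ('NSAID_Supplement', df4)]:
    print name, 'duplicate RIDs:', frame['RID'].duplicated().sum()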

Assign Stability and Time-Bin Flags based on the Target/Label

In [3]:
merged['target'] = merged.target_original
merged['time_bin'] = merged.target.apply(lambda x: (x+1) / 2)
merged['stable'] = merged.target.apply(lambda x: int(x%2 != 0))
merged[['target','time_bin', 'stable']].head()
Out[3]:
   target  time_bin  stable
0       3         2       1
1       3         2       1
2       8         4       0
3       3         2       1
4       7         4       1
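A quick sanity check of this encoding (same arithmetic as the two lambdas above, with Python 2 integer division): consecutive target values share a time bin, and odd targets are flagged as stable.

for t in range(1, 9):
    print 'target={0}  time_bin={1}  stable={2}'.format(t, (t + 1) / 2, int(t % 2 != 0))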

Check the Distribution of Labels in the Dataset

In [4]:
targets = list(merged['target_original'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('Label')
plt.ylabel('% of Samples')
plt.title('Label Distribution in Dataset\n')
plt.xticks([1,2,3,4,5,6,7,8])

plt.figure()
targets = list(merged['stable'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('State')
plt.ylabel('% of Samples')
plt.title('State Distribution in Dataset\n')
plt.xticks([0,1],['Transitioning', 'Stable'])

plt.figure()
targets = list(merged['time_bin'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('Bin')
plt.ylabel('% of Samples')
plt.title('Time Bin Distribution in Dataset\n')
plt.xticks([1,2,3,4], ['>4.25', '4.25-2.25', '2.25-1.25', '1.25-0'])

print 'Distributions:'
Distributions:
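The same proportions can be read off numerically with value_counts (a quick alternative to the manual counting above; assumes a pandas version that supports normalize=True):

for col in ['target', 'time_bin', 'stable']:
    print merged[col].value_counts(normalize=True).sort_index()
    print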

Ignore Columns that Could Pollute the Classifier

Right now this just involves ignoring copies of features from the different data sources; it will grow to ignore features with the same "sortable" value (one possible sketch follows the cell below).

In [5]:
polluting_label_fragments = ['target','RID', 'time_bin', 'stable', 'source', 'redundant', 'Unnamed']
valid_classifier_columns = [c for c in merged.columns if not any(dirt in c for dirt in polluting_label_fragments)]
ignored_columns = [c for c in merged.columns if  any(dirt in c for dirt in polluting_label_fragments)]
print 'Ignoring {0} Columns.\nNon-redundant columns ignored: {1}'.format(len(ignored_columns), [i for i in ignored_columns if 'redundant' not in i])
Ignoring 189 Columns.
Non-redundant columns ignored: [u'RID', u'target_original', 'source_original', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'target', 'time_bin', 'stable']
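One reading of the planned "same sortable value" extension is to group candidate columns whose sorted values are identical, so only one copy of a duplicated feature is kept. A hypothetical sketch, assuming the candidate columns are numeric (which the Imputer/forest pipeline below already requires):

signatures = {}
for c in valid_classifier_columns:
    # map NaNs to +inf so tuples of values compare cleanly
    key = tuple(np.sort(merged[c].astype(float).fillna(np.inf).values))
    signatures.setdefault(key, []).append(c)
duplicate_groups = [cols for cols in signatures.values() if len(cols) > 1]
print 'Groups of columns with identical sorted values:', duplicate_groups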

Create Precision-Recall and ROC Curves for the RandomForest Classifier

In [6]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn import cross_validation

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])

loo = cross_validation.LeaveOneOut(n=merged.shape[0])

X = merged[valid_classifier_columns].as_matrix()
y = merged['stable'].as_matrix()
predictions = []
class_predictions = []

for train_index, test_index in loo:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test =  y[train_index],  y[test_index]
    
    # Impute with training-fold column means only, so the held-out sample never leaks into the statistics
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp = imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test = imp.transform(X_test)

    clf = RandomForestClassifier(n_estimators=300)
    clf.fit(X_train, y_train)
    # predicted probability of the positive class (stable = 1) for the single held-out sample
    y_score = clf.predict_proba(X_test)[:,1]
    
    predictions.append(y_score[0])
    class_predictions.append(clf.predict(X_test)[0])

# Compute Precision-Recall and plot curve
n_classes = 2
average_precision = dict()

precision, recall, _ = precision_recall_curve(y,predictions)
average_precision = average_precision_score(y, predictions)

plt.plot(recall, precision, c='r', alpha=0.8)

plt.figure()

fpr, tpr, thresholds = roc_curve(y, predictions)
plt.plot(fpr,tpr,'r', alpha=0.8)
plt.plot([0,1],[0,1],'--',color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC = {0}'.format(auc(fpr, tpr)))
[Precision-Recall and ROC curve figures]
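Leave-one-out here means fitting 320 separate 300-tree forests, which is slow. If exact leave-one-out is not required, a stratified k-fold split gives a similar curve at a fraction of the cost. A hypothetical sketch, using the same (since-deprecated) sklearn.cross_validation API as above, not the code that produced the figures:

skf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True, random_state=0)
cv_scores = np.zeros(len(y))
for train_index, test_index in skf:
    # impute with training-fold means, then score the held-out fold
    imp_cv = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(X[train_index])
    clf_cv = RandomForestClassifier(n_estimators=300)
    clf_cv.fit(imp_cv.transform(X[train_index]), y[train_index])
    cv_scores[test_index] = clf_cv.predict_proba(imp_cv.transform(X[test_index]))[:, 1]
fpr_cv, tpr_cv, _ = roc_curve(y, cv_scores)
print 'Stratified 5-fold AUC = {0:.3f}'.format(auc(fpr_cv, tpr_cv))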
In [7]:
from sklearn.metrics import classification_report

print "0 = Transitioning, 1 = Stable\n"
print classification_report(y, class_predictions)
0 = Transitioning, 1 = Stable

             precision    recall  f1-score   support

          0       0.67      0.78      0.72       171
          1       0.69      0.56      0.62       149

avg / total       0.68      0.68      0.68       320
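As a consistency check, the reported f1-scores are just the harmonic mean of the precision and recall shown above:

print 'class 0 f1: {0:.2f}'.format(2 * 0.67 * 0.78 / (0.67 + 0.78))   # 0.72
print 'class 1 f1: {0:.2f}'.format(2 * 0.69 * 0.56 / (0.69 + 0.56))   # 0.62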

In [8]:
from sklearn.metrics import precision_score, recall_score
import random
from numpy.random import choice


probability_of_stable = y.sum()/float(len(y))
probability_of_transition = 1 - probability_of_stable

precision_threshold = precision_score(y, np.array(class_predictions))
recall_threshold = recall_score(y, np.array(class_predictions))

guesses_above_precision_threshold = 0
guesses_above_recall_threshold = 0

n_trials = 100000

# Monte Carlo baseline: draw random labels at the empirical class prevalence and count
# how often they match or beat the classifier's precision and recall
for i in range(n_trials):
    guess = choice([0,1], len(y), p=[probability_of_transition, probability_of_stable])
    guesses_above_precision_threshold += int(precision_score(y, guess) >= precision_threshold)
    guesses_above_recall_threshold += int(recall_score(y, guess) >= recall_threshold)

print "Out of {0} trials:".format(n_trials)
print "\t{0:4} had better or equal precision (p={1:.6f})".format(guesses_above_precision_threshold, guesses_above_precision_threshold/float(n_trials))
print "\t{0:4} had better or equal recall (p={1:.6f})".format(guesses_above_recall_threshold, guesses_above_recall_threshold/float(n_trials))

    
Out of 100000 trials:
	   0 had better or equal precision (p=0.000000)
	1006 had better or equal recall (p=0.010060)
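The asymmetry between the two p-values is expected. A guesser whose labels are independent of the truth has expected precision equal to the prevalence of the stable class, well below the classifier's 0.69, while its expected recall (also the prevalence) is close enough to the classifier's 0.56 to be matched by chance about 1% of the time. Using the support counts from the report above:

print 'chance-level precision ~= recall ~= {0:.3f}'.format(149 / 320.0)   # ~0.466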
In [9]:
from sklearn.tree import export_graphviz
import os
for i, sample_tree in enumerate(random.sample(clf.estimators_, 10)):
    export_graphviz(sample_tree, out_file='{0}.dot'.format('trees/sample_tree_'+str(i)), feature_names=valid_classifier_columns)
    os.system("dot -Tpng {0}.dot -o {0}.png".format('trees/sample_tree_'+str(i)))
    
In [10]:
pd.DataFrame(sorted(valid_classifier_columns), columns=['Feature']).to_csv('features_used.txt', index=False)
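Beyond listing the features used and exporting sample trees, the forest's aggregate feature importances give a more compact summary. A minimal sketch; note that clf is the forest fit on the final leave-one-out split (the same object sampled for trees above), so the ranking could shift slightly if refit on all rows:

top = np.argsort(clf.feature_importances_)[::-1][:10]
for i in top:
    print '{0:.4f}  {1}'.format(clf.feature_importances_[i], valid_classifier_columns[i])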