import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer
%matplotlib inline
plt.style.use('ggplot')
df = pd.read_excel('juan_data1.xlsx', skip_footer=25)
df['source'] = 'datafile1'
df2 = pd.read_excel('juan_data3_mod.xlsx')
df['source'] = 'datafile3'
df3 = pd.read_excel('MRI_extension_data327x341.xlsx')
df['source'] = 'MRI'
merged = pd.merge(pd.merge(df,df2,on='RID', suffixes=('_original', '_redundant1')), df3, on='RID', suffixes=('_redundant2', '_redundant3'))
assert(all(merged.target_original==merged.target_redundant1))
merged['target'] = merged.target_original
merged['time_bin'] = merged.target.apply(lambda x: (x+1) / 2)
merged['stable'] = merged.target.apply(lambda x: int(x%2 != 0))
merged[['target','time_bin', 'stable']].head()
targets = list(merged['target_original'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('Label')
plt.ylabel('% of Samples')
plt.title('Label Distribution in Dataset\n')
plt.xticks([1,2,3,4,5,6,7,8])
plt.figure()
targets = list(merged['stable'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('State')
plt.ylabel('% of Samples')
plt.title('State Distribution in Dataset\n')
plt.xticks([0,1],['Transitioning', 'Stable'])
plt.figure()
targets = list(merged['time_bin'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('Bin')
plt.ylabel('% of Samples')
plt.title('Time Bin Distribution in Dataset\n')
plt.xticks([1,2,3,4], ['>4.25', '4.25-2.25', '2.25-1.25', '1.25-0'])
print 'Distributions:'
Right not this just involves ignoring copies of features from different data-sources, will grow to ignore features with the same "sortable" value
polluting_label_fragments = ['target','RID', 'time_bin', 'stable', 'source', 'redundant']
valid_classifier_columns = [c for c in merged.columns if not any(dirt in c for dirt in polluting_label_fragments)]
ignored_columns = [c for c in merged.columns if any(dirt in c for dirt in polluting_label_fragments)]
print 'Ignoring {0} Columns.\nNon-redundant: {1}'.format(len(ignored_columns), [i for i in ignored_columns if 'redundant' not in i])
In order to check the predictive value of different features, we will run 100 trials of predicting the "stability" label. This is not the most useful measurement, but it gives us a good view of feature importance to potentially select them later for stronger classifiers.
aucs = []
feature_importances = []
plt.plot([0,1],[0,1],'--',color='k')
for trial in range(200):
X_train, X_test, y_train, y_test = train_test_split(merged[valid_classifier_columns], merged['stable'], train_size=0.8)
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp = imp.fit(X_train)
X_train = imp.transform(X_train)
X_test = imp.transform(X_test)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, pred)
plt.plot(fpr,tpr,'r', alpha=0.3)
aucs.append(auc(fpr, tpr))
feature_importances.append(clf.feature_importances_)
print 'Average AUC:{0:.2f} (Publishable at ≥0.7)'.format(np.mean(aucs))
We print the top 10 average feature importances overall but ALSO, we weight feature importances by AUC so we can see which features are most informative in good models.
average_feature_importance = np.mean(np.array(feature_importances),axis=0)
print "\nTop Average Features (TOP 10):\n"
for i, f in sorted(zip(average_feature_importance, valid_classifier_columns), reverse=True)[:10]:
print '{0:.3f}\t{1}'.format(i,f)
print '\n'+'='*20+'\n'
weighted_feature_importance = np.mean(np.array(feature_importances) * np.array(aucs)[:,None], axis=0)
print "\nTop Average Features, weighted for good AUC (TOP 50):\n"
for i, f in sorted(zip(weighted_feature_importance, valid_classifier_columns), reverse=True)[:50]:
print '{0:.3f}\t{1}'.format(i,f)
vp = plt.violinplot(weighted_feature_importance)
plt.title('Average Weighted Feature Importance Distribution\n')
plt.xticks([])
plt.xlabel('Frequency')
plt.ylabel('Weighted Importance')
plt.setp(vp['bodies'], color='teal')
plt.setp(vp['cbars'], color='teal')
plt.setp(vp['cmaxes'], color='teal')
plt.setp(vp['cmins'], color='teal')
print