In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer
%matplotlib inline
plt.style.use('ggplot')

Read and Merge All Datafiles on RID

In [2]:
df = pd.read_excel('juan_data1.xlsx', skip_footer=25)
df['source'] = 'datafile1'
df2 = pd.read_excel('juan_data3_mod.xlsx')
df['source'] = 'datafile3'
df3 = pd.read_excel('MRI_extension_data327x341.xlsx')
df['source'] = 'MRI'
merged = pd.merge(pd.merge(df,df2,on='RID', suffixes=('_original', '_redundant1')), df3, on='RID', suffixes=('_redundant2', '_redundant3'))
assert(all(merged.target_original==merged.target_redundant1))

Assign Stability and Time-Bin Flags based on the Target/Label

In [3]:
merged['target'] = merged.target_original
merged['time_bin'] = merged.target.apply(lambda x: (x+1) / 2)
merged['stable'] = merged.target.apply(lambda x: int(x%2 != 0))
merged[['target','time_bin', 'stable']].head()
Out[3]:
target time_bin stable
0 3 2 1
1 3 2 1
2 8 4 0
3 3 2 1
4 7 4 1

Check the Distribution of Labels in the Dataset

In [4]:
targets = list(merged['target_original'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('Label')
plt.ylabel('% of Samples')
plt.title('Label Distribution in Dataset\n')
plt.xticks([1,2,3,4,5,6,7,8])

plt.figure()
targets = list(merged['stable'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('State')
plt.ylabel('% of Samples')
plt.title('State Distribution in Dataset\n')
plt.xticks([0,1],['Transitioning', 'Stable'])

plt.figure()
targets = list(merged['time_bin'].values)
labels = sorted(list(set(targets)))
plt.bar(labels, [targets.count(l)/float(len(targets)) for l in labels], align='center')
plt.xlabel('Bin')
plt.ylabel('% of Samples')
plt.title('Time Bin Distribution in Dataset\n')
plt.xticks([1,2,3,4], ['>4.25', '4.25-2.25', '2.25-1.25', '1.25-0'])

print 'Distributions:'
Distributions:

Ignore Columns that Could Pollute the Classifier

Right not this just involves ignoring copies of features from different data-sources, will grow to ignore features with the same "sortable" value

In [5]:
polluting_label_fragments = ['target','RID', 'time_bin', 'stable', 'source', 'redundant']
valid_classifier_columns = [c for c in merged.columns if not any(dirt in c for dirt in polluting_label_fragments)]
ignored_columns = [c for c in merged.columns if  any(dirt in c for dirt in polluting_label_fragments)]
print 'Ignoring {0} Columns.\nNon-redundant: {1}'.format(len(ignored_columns), [i for i in ignored_columns if 'redundant' not in i])
Ignoring 162 Columns.
Non-redundant: [u'RID', u'target_original', 'source', 'target', 'time_bin', 'stable']

Test Simple Random Forest Classifier on Stability

In order to check the predictive value of different features, we will run 100 trials of predicting the "stability" label. This is not the most useful measurement, but it gives us a good view of feature importance to potentially select them later for stronger classifiers.

In [6]:
aucs = []
feature_importances = []
plt.plot([0,1],[0,1],'--',color='k')

for trial in range(200):
    X_train, X_test, y_train, y_test = train_test_split(merged[valid_classifier_columns], merged['stable'], train_size=0.8)

    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    imp = imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test = imp.transform(X_test)


    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    pred = clf.predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    plt.plot(fpr,tpr,'r', alpha=0.3)
    
    aucs.append(auc(fpr, tpr))
    feature_importances.append(clf.feature_importances_)
    
print 'Average AUC:{0:.2f} (Publishable at ≥0.7)'.format(np.mean(aucs))
Average AUC:0.64 (Publishable at ≥0.7)

Calculate Average Feature Importance, Weighted by How Well The Classifier Performed

We print the top 10 average feature importances overall but ALSO, we weight feature importances by AUC so we can see which features are most informative in good models.

In [7]:
average_feature_importance = np.mean(np.array(feature_importances),axis=0)
print "\nTop Average Features (TOP 10):\n"
for i, f in sorted(zip(average_feature_importance, valid_classifier_columns), reverse=True)[:10]:
    print '{0:.3f}\t{1}'.format(i,f)


print '\n'+'='*20+'\n'
    
    
weighted_feature_importance = np.mean(np.array(feature_importances) * np.array(aucs)[:,None], axis=0)
print "\nTop Average Features, weighted for good AUC (TOP 50):\n"
for i, f in sorted(zip(weighted_feature_importance, valid_classifier_columns), reverse=True)[:50]:
    print '{0:.3f}\t{1}'.format(i,f)
     
vp = plt.violinplot(weighted_feature_importance)
plt.title('Average Weighted Feature Importance Distribution\n')
plt.xticks([])
plt.xlabel('Frequency')
plt.ylabel('Weighted Importance')
plt.setp(vp['bodies'], color='teal')
plt.setp(vp['cbars'], color='teal')
plt.setp(vp['cmaxes'], color='teal')
plt.setp(vp['cmins'], color='teal')
print
Top Average Features (TOP 10):

0.013	ADNI_MEM450.bl_original
0.011	ST40TA_apc_L.MidTemp
0.011	temp_comp_apc
0.009	ST37SV_apc_L.LatVent
0.008	ST24CV
0.008	ST123TS
0.007	R_amygdala_pcntICV.bsc
0.007	ST96SV_apc_R.LatVent
0.007	ST123TA
0.006	ST83TA

====================


Top Average Features, weighted for good AUC (TOP 50):

0.008	ADNI_MEM450.bl_original
0.007	ST40TA_apc_L.MidTemp
0.007	temp_comp_apc
0.006	ST37SV_apc_L.LatVent
0.005	ST24CV
0.005	ST123TS
0.005	ST96SV_apc_R.LatVent
0.005	R_amygdala_pcntICV.bsc
0.004	ST123TA
0.004	ST83TA
0.004	ST71SV
0.004	L_amygdala_pcntICV.bsc
0.004	L_Hip_pcntICV.bsc
0.004	ST29SV
0.004	ST12SV
0.003	FAC3_10
0.003	CReactiveProteinCRPugmL.bl_original
0.003	ST83CV
0.003	ST64TA
0.003	ST64TS
0.003	ST72CV
0.003	ST52TA_apc_L.Prec
0.003	ST40CV
0.003	ST35CV
0.003	ST13TA
0.003	ST64CV
0.003	ST123CV
0.003	ST118TA
0.003	ST99CV
0.002	ST99TA
0.002	ST88SV
0.002	R_Hip_pcntICV.bsc
0.002	ST19SV_apc_L.GM_Ctx
0.002	ST29SV_apc_L.Hip
0.002	ST85CV
0.002	ST24TA_apc_L.ERC
0.002	ST40TA
0.002	ST78SV_apc_R.GM_Ctx
0.002	ST72SA
0.002	ST59CV
0.002	ST24TA
0.002	MacrophageInflammatoryProtein3alphapgml.bl_original
0.002	ST52TA
0.002	ST26TA
0.002	ST90TA
0.002	ST130TA
0.002	ST50TA_apc_L.PostCing
0.002	ST111TS
0.002	R_InfLatVentr_pcntICV.bsc
0.002	ApolipoproteinAIIApoAIIngml.bl_original

In [ ]:
 
In [ ]: