In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
df = pd.read_excel('data_july.xlsx','core1+FAC+MRI.m6')
In [3]:
df = df.dropna()
In [4]:
df.shape
Out[4]:
(329, 223)
In [5]:
predictor_keys = [u'Alpha1MicroglobulinA1Microugml.bl', u'Alpha2MacroglobulinA2MacromgmL.bl', u'Alpha1AntichymotrypsinAACTugml.bl', u'Alpha1AntitrypsinAATmgmL.bl', u'AngiotensinConvertingEnzymeACEngml.bl', u'AdiponectinugmL.bl', u'AlphaFetoproteinAFPngmL.bl', u'AgoutiRelatedProteinAGRPpgmL.bl', u'Angiopoietin2ANG2ngmL.bl', u'AngiotensinogenngmL.bl', u'ApolipoproteinAIApoAImgmL.bl', u'ApolipoproteinAIIApoAIIngml.bl', u'ApolipoproteinAIVApoAIVugml.bl', u'ApolipoproteinBApoBugml.bl', u'ApolipoproteinCIApoCIngml.bl', u'ApolipoproteinCIIIApoCIIIugmL.bl', u'ApolipoproteinDApoDugml.bl', u'ApolipoproteinEApoEugml.bl', u'ApolipoproteinHApoHugmL.bl', u'AXLReceptorTyrosineKinaseAXLngmL.bl', u'Beta2MicroglobulinB2MugmL.bl', u'BrainDerivedNeurotrophicFactorBDNFngmL.bl', u'BLymphocyteChemoattractantBLCpgml.bl', u'BoneMorphogeneticProtein6BMP6ngmL.bl', u'BrainNatriureticPeptideBNPpgml.bl', u'BetacellulinBTCpgmL.bl', u'ComplementC3C3mgmL.bl', u'CancerAntigen199CA199UmL.bl', u'CalcitoninpgmL.bl', u'CD40antigenCD40ngmL.bl', u'CD40LigandCD40LngmL.bl', u'CD5CD5Lngml.bl', u'CarcinoembryonicAntigenCEAngmL.bl', u'ChromograninACgAngmL.bl', u'CreatineKinaseMBCKMBngmL.bl', u'ClusterinCLUugml.bl', u'CiliaryNeurotrophicFactorCNTFpgmL.bl', u'ComplementFactorHugml.bl', u'CortisolCortisolngml.bl', u'Cpeptidengml.bl', u'CReactiveProteinCRPugmL.bl', u'CystatinCngml.bl', u'EpidermalGrowthFactorEGFpgmL.bl', u'EpidermalGrowthFactorReceptorEGFRngmL.bl', u'EpithelialDerivedNeutrophilActivatingngmL.bl', u'Eotaxin1pgmL.bl', u'Eotaxin3pgmL.bl', u'ESelectinngmL.bl', u'FattyAcidBindingProteinheartFABPngmL.bl', u'FactorVIIngmL.bl', u'FASLGReceptorFASngmL.bl', u'FasLigandFasLpgmL.bl', u'FetuinAugml.bl', u'FibroblastGrowthFactor4FGF4pgmL.bl', u'FibrinogenmgmL.bl', u'FerritinFRTNngmL.bl', u'FollicleStimulatingHormoneFSHmIUmL.bl', u'GrowthHormoneGHngmL.bl', u'GrowthRegulatedalphaproteinGROalphpgmL.bl', u'GlutathioneSTransferasealphaGSTalpngml.bl', u'HaptoglobinmgmL.bl', u'HeparinBindingEGFLikeGrowthFactorpgmL.bl', u'ChemokineCC4HCC4ngmL.bl', u'HepatocyteGrowthFactorHGFngmL.bl', u'TLymphocyteSecretedProteinI309I3pgmL.bl', u'IntercellularAdhesionMolecule1ICAMngmL.bl', u'ImmunoglobulinAIgAmgmL.bl', u'ImmunoglobulinEIgEngmL.bl', u'InsulinlikeGrowthFactorBindingProtengmL.bl', u'ImmunoglobulinMIGMmgmL.bl', u'Interleukin13IL13pgmL.bl', u'Interleukin16IL16pgmL.bl', u'Interleukin18IL18pgmL.bl', u'Interleukin3IL3ngmL.bl', u'Interleukin6receptorIL6rngmL.bl', u'Interleukin8IL8pgmL.bl', u'InsulinuIUmL.bl', u'InterferongammaInducedProtein10IPpgml.bl', u'KidneyInjuryMolecule1KIM1ngml.bl', u'LeptinngmL.bl', u'LuteinizingHormoneLHmIUmL.bl', u'ApolipoproteinaLpaugmL.bl', u'MonocyteChemotacticProtein1MCP1pgmL.bl', u'MonocyteChemotacticProtein2MCP2pgml.bl', u'MonocyteChemotacticProtein3MCP3pgmL.bl', u'MonocyteChemotacticProtein4MCP4pgml.bl', u'MacrophageColonyStimulatingFactor1ngmL.bl', u'MacrophageDerivedChemokineMDCpgmL.bl', u'MacrophageMigrationInhibitoryFactorngmL.bl', u'MonokineInducedbyGammaInterferonMIpgml.bl', u'MacrophageInflammatoryProtein1alphapgmL.bl', u'MacrophageInflammatoryProtein1betapgmL.bl', u'MacrophageInflammatoryProtein3alphapgml.bl', u'MatrixMetalloproteinase1MMP1ngml.bl', u'MatrixMetalloproteinase10MMP10ngml.bl', u'MatrixMetalloproteinase2MMP2ngmL.bl', u'MatrixMetalloproteinase7MMP7ngml.bl', u'MatrixMetalloproteinase9MMP9ngmL.bl', u'MatrixMetalloproteinase9totalMMP9ngml.bl', u'MyeloidProgenitorInhibitoryFactor1ngmL.bl', u'MyeloperoxidaseMPOngmL.bl', u'MyoglobinngmL.bl', u'NeutrophilGelatinaseAssociatedLipocalngml.bl', u'NeuronalCellAdhesionMoleculeNrCAMngmL.bl', u'Osteopontinngml.bl', u'PlasminogenActivatorInhibitor1PAI1ngmL.bl', u'ProstaticAcidPhosphatasePAPngmL.bl', u'PregnancyAssociatedPlasmaProteinAPmIUmL.bl', u'PulmonaryandActivationRegulatedChemongmL.bl', u'PlateletDerivedGrowthFactorBBPDGFpgml.bl', u'PlacentaGrowthFactorPLGFpgml.bl', u'PancreaticPolypeptidePPPpgml.bl', u'ProlactinPRLngml.bl', u'ProinsulinIntactpM.bl', u'ProinsulinTotalpM.bl', u'PeptideYYPYYpgmL.bl', u'ReceptorforadvancedglycosylationendngmL.bl', u'TCellSpecificProteinRANTESRANTESngmL.bl', u'Resistinngml.bl', u'SerumAmyloidPComponentSAPugmL.bl', u'StemCellFactorSCFpgmL.bl', u'SerumGlutamicOxaloaceticTransaminaseugmL.bl', u'SexHormoneBindingGlobulinSHBGnmolL.bl', u'SuperoxideDismutase1SolubleSOD1ngmL.bl', u'SortilinngmL.bl', u'ThyroxineBindingGlobulinTBGugmL.bl', u'ThymusExpressedChemokineTECKngmL.bl', u'TestosteroneTotalngml.bl', u'TrefoilFactor3TFF3ugml.bl', u'TammHorsfallUrinaryGlycoproteinTHPugml.bl', u'Thrombospondin1ngmL.bl', u'TissueInhibitorofMetalloproteinases1ngmL.bl', u'ThrombomodulinTMngml.bl', u'TenascinCTNCngmL.bl', u'TumorNecrosisFactoralphaTNFalphapgmL.bl', u'TumorNecrosisFactorReceptorLike2TngmL.bl', u'ThrombopoietinngmL.bl', u'TNFRelatedApoptosisInducingLigandRengmL.bl', u'SerotransferrinTransferrinmgdl.bl', u'ThyroidStimulatingHormoneTSHuIUmL.bl', u'TransthyretinTTRmgdl.bl', u'VascularCellAdhesionMolecule1VCAMngmL.bl', u'VascularEndothelialGrowthFactorVEGFpgmL.bl', u'Vitronectinugml.bl', u'VitaminKDependentProteinSVKDPSugml.bl', u'vonWillebrandFactorvWFugmL.bl', u'BMI', u'Gender_num', u'WHITMATHYP_450', u'ADNI_MEM450.bl', u'ADNI_EF450.bl', u'VSBPSYS', u'ApoE_gene_risk', u'age_123', u'New_IL6_ord_450', u'FAC1_10', u'FAC2_10', u'FAC3_10', u'FAC4_10', u'FAC5_10', u'L_Hip_pcntICV.bsc', u'R_Hip_pcntICV.bsc', u'L_amygdala_pcntICV.bsc', u'R_amygdala_pcntICV.bsc', u'L_InfLatVentr_pcntICV.bsc', u'R_InfLatVentr_pcntICV.bsc', u'L_Accumbens_pcntICV.bsc', u'R_Accumbens_pcntICV.bsc', u'L_CerebCtx_pcntICV.bsc', u'R_CerebCtx_pcntICV.bsc', u'L_CerebCtx_WM_pcntICV.bsc', u'R_CerebCtx_WM_pcntICV.bsc', u'L_VentDorsCol_pcntICV.bsc', u'R_VentDorsCol_pcntICV.bsc', u'L_LatVentr_pcntICV.bsc', u'R_LatVentr_pcntICV.bsc', u'L_SupParietal_pcntICV.bsc', u'R_SupParietal_pcntICV.bsc', u'L_Hip_pcntICV.m06', u'R_Hip_pcntICV.m06', u'L_amygdala_pcntICV.m06', u'R_amygdala_pcntICV.m06', u'L_InfLatVentr_pcntICV.m06', u'R_InfLatVentr_pcntICV.m06', u'L_Accumbens_pcntICV.m06', u'R_Accumbens_pcntICV.m06', u'L_CerebCtx_pcntICV.m06', u'R_CerebCtx_pcntICV.m06', u'L_CerebCtx_WM_pcntICV.m06', u'R_CerebCtx_WM_pcntICV.m06', u'L_VentDorsCol_pcntICV.m06', u'R_VentDorsCol_pcntICV.m06', u'L_LatVentr_pcntICV.m06', u'R_LatVentr_pcntICV.m06', u'L_SupParietal_pcntICV.m06', u'R_SupParietal_pcntICV.m06', u'FAC1_11', u'FAC2_11', u'FAC3_11', u'FAC4_11', u'FAC5_11', u'ST29SV_apc_L.Hip', u'ST88SV_apc_R.Hip', u'ST83TA_apc_R.ERC', u'ST52TA_apc_L.Prec', u'ST111TA_apc_R.Prec', u'ST50TA_apc_L.PostCing', u'ST109TA_apc_R.PostCing', u'ST37SV_apc_L.LatVent', u'ST96SV_apc_R.LatVent', u'ST20SV_apc_L.WM_Ctx', u'ST24TA_apc_L.ERC', u'ST19SV_apc_L.GM_Ctx', u'ST79SV_apc_R.WM_Ctx', u'ST78SV_apc_R.GM_Ctx', u'ST40TA_apc_L.MidTemp', u'ST99TA_apc_R.MidTemp', u'ST12SV_apc_L.Amyg', u'ST71SV_apc_R.Amyg', u'ST71SV_apc_R.Amyg.1', u'temp_comp_apc']
In [6]:
df['target_vs8'] = map(lambda x: 0 if x in [1,2,3,4,5,6,7] else 1 if x in [8] else np.nan, df['target'])
df['target_vs68'] = map(lambda x: 0 if x in [1,2,3,4,5] else 1 if x in [6, 8] else np.nan, df['target'])
df['target_vs468'] = map(lambda x: 0 if x in [1,2,3] else 1 if x in [4, 6, 8] else np.nan, df['target'])
In [7]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cross_validation import StratifiedKFold, permutation_test_score
In [8]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
In [10]:
for target in ["target_vs8", "target_vs68", "target_vs468"]:
    rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=10, n_estimators=10, n_jobs=1,
            oob_score=False,
            warm_start=False) 

    ab = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=20) 

    relevant_frame = df[predictor_keys + [target]].dropna()
    
    X, y = relevant_frame[predictor_keys], relevant_frame[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    for clf in [rf, ab]:
        clf.fit(X_train, y_train)
        
        predict_on_training = [score[1] for score in clf.predict_proba(X_train)]
        fpr, tpr, thresholds = roc_curve(y_train, predict_on_training)
        optfpr, opttpr, optimal = min(zip(fpr, tpr, thresholds), key=lambda a: abs(a[1] - (1 - a[0])))
        y_score = [score[1] for score in clf.predict_proba(X_test)]
        y_pred = [int(s>=optimal) for s in y_score]
#         print ['{0:.2f}'.format(val) for val in y_score]
        cm = confusion_matrix(y_test, y_pred)
#         fpr, tpr, thresholds = roc_curve(y_test, y_score)
        
        plt.figure()
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC {0} {1}'.format(str(clf)[:str(clf).find('(')], target))
        plt.legend(loc="lower right")
        plt.show()
    
    
    
        fig, axes = plt.subplots(2,1, figsize=(10,4), sharex=True)
        axes[0].scatter(predict_on_training, y_train, c='purple')
        axes[1].scatter(y_score, y_test, c=y_pred, cmap='bwr')
        axes[0].yaxis.set_ticks([0, 1])
        axes[1].yaxis.set_ticks([0, 1])
        axes[0].set_title('TRAINING SET PREDICTIONS VS VALIDATION SET PREDICTIONS')
        axes[1].set_xlabel('Score')
        axes[0].set_ylabel('True Label')
        axes[1].set_ylabel('True Label')
        axes[0].plot([optimal,optimal],[-1,2],'r--')
        axes[1].plot([optimal,optimal],[-1,2],'r--')
        fig.show()
        
        print "Optimal Split At fpr:{0:.2f} tpr:{1:.2f} using thredhold: {2:.2f}".format(optfpr, opttpr, optimal)

        
        plt.style.use('ggplot')
        plt.matshow(cm, cmap=plt.get_cmap('YlGnBu'))
        plt.colorbar()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()
        
        print classification_report(y_test, y_pred)

        if hasattr(clf, 'feature_importances_'):
            print 'TOP 10 FEATURES (IMPORTANCE):\n'
            for k,i in sorted(zip(predictor_keys,clf.feature_importances_), key=lambda x: x[1], reverse=True)[:10]:
                print "{1:40.3f}  {0:40}".format(k,i)
        print '\n'+'='*100
    
Optimal Split At fpr:0.01 tpr:1.00 using thredhold: 0.34
             precision    recall  f1-score   support

          0       0.79      0.83      0.81        63
          1       0.35      0.30      0.32        20

avg / total       0.68      0.70      0.69        83

TOP 10 FEATURES (IMPORTANCE):

                                   0.033  L_Hip_pcntICV.m06                       
                                   0.031  NeutrophilGelatinaseAssociatedLipocalngml.bl
                                   0.024  FAC3_10                                 
                                   0.024  ST96SV_apc_R.LatVent                    
                                   0.021  ThrombopoietinngmL.bl                   
                                   0.021  FAC3_11                                 
                                   0.020  ST40TA_apc_L.MidTemp                    
                                   0.019  R_CerebCtx_pcntICV.m06                  
                                   0.019  ADNI_MEM450.bl                          
                                   0.018  GlutathioneSTransferasealphaGSTalpngml.bl

====================================================================================================
Optimal Split At fpr:0.02 tpr:0.97 using thredhold: 0.48
             precision    recall  f1-score   support

          0       0.81      0.90      0.86        63
          1       0.54      0.35      0.42        20

avg / total       0.75      0.77      0.75        83

TOP 10 FEATURES (IMPORTANCE):

                                   0.100  ADNI_MEM450.bl                          
                                   0.050  Alpha1AntitrypsinAATmgmL.bl             
                                   0.050  ApolipoproteinAIIApoAIIngml.bl          
                                   0.050  ApolipoproteinDApoDugml.bl              
                                   0.050  ComplementFactorHugml.bl                
                                   0.050  CystatinCngml.bl                        
                                   0.050  ImmunoglobulinAIgAmgmL.bl               
                                   0.050  MacrophageInflammatoryProtein1betapgmL.bl
                                   0.050  MatrixMetalloproteinase9MMP9ngmL.bl     
                                   0.050  NeutrophilGelatinaseAssociatedLipocalngml.bl

====================================================================================================
Optimal Split At fpr:0.04 tpr:0.96 using thredhold: 0.48
             precision    recall  f1-score   support

        0.0       0.67      0.72      0.69        47
        1.0       0.48      0.41      0.44        29

avg / total       0.60      0.61      0.60        76

TOP 10 FEATURES (IMPORTANCE):

                                   0.073  ST40TA_apc_L.MidTemp                    
                                   0.044  ADNI_EF450.bl                           
                                   0.023  ST71SV_apc_R.Amyg                       
                                   0.021  R_amygdala_pcntICV.m06                  
                                   0.021  ADNI_MEM450.bl                          
                                   0.020  CreatineKinaseMBCKMBngmL.bl             
                                   0.019  L_Hip_pcntICV.m06                       
                                   0.018  L_CerebCtx_pcntICV.bsc                  
                                   0.018  R_Hip_pcntICV.bsc                       
                                   0.016  R_CerebCtx_pcntICV.bsc                  

====================================================================================================
Optimal Split At fpr:0.05 tpr:0.95 using thredhold: 0.49
             precision    recall  f1-score   support

        0.0       0.78      0.66      0.71        47
        1.0       0.56      0.69      0.62        29

avg / total       0.69      0.67      0.68        76

TOP 10 FEATURES (IMPORTANCE):

                                   0.100  ADNI_MEM450.bl                          
                                   0.050  ApolipoproteinCIIIApoCIIIugmL.bl        
                                   0.050  CD40LigandCD40LngmL.bl                  
                                   0.050  ChromograninACgAngmL.bl                 
                                   0.050  ComplementFactorHugml.bl                
                                   0.050  CReactiveProteinCRPugmL.bl              
                                   0.050  MacrophageColonyStimulatingFactor1ngmL.bl
                                   0.050  Vitronectinugml.bl                      
                                   0.050  BMI                                     
                                   0.050  VSBPSYS                                 

====================================================================================================
Optimal Split At fpr:0.03 tpr:0.98 using thredhold: 0.58
             precision    recall  f1-score   support

        0.0       0.59      0.61      0.60        31
        1.0       0.67      0.65      0.66        37

avg / total       0.63      0.63      0.63        68

TOP 10 FEATURES (IMPORTANCE):

                                   0.044  FAC1_11                                 
                                   0.031  ST40TA_apc_L.MidTemp                    
                                   0.026  R_InfLatVentr_pcntICV.m06               
                                   0.023  R_amygdala_pcntICV.m06                  
                                   0.022  R_Hip_pcntICV.bsc                       
                                   0.022  ST19SV_apc_L.GM_Ctx                     
                                   0.021  L_Hip_pcntICV.bsc                       
                                   0.019  Beta2MicroglobulinB2MugmL.bl            
                                   0.018  L_CerebCtx_pcntICV.m06                  
                                   0.018  R_CerebCtx_pcntICV.bsc                  

====================================================================================================
Optimal Split At fpr:0.03 tpr:0.98 using thredhold: 0.50
             precision    recall  f1-score   support

        0.0       0.59      0.65      0.62        31
        1.0       0.68      0.62      0.65        37

avg / total       0.64      0.63      0.63        68

TOP 10 FEATURES (IMPORTANCE):

                                   0.100  ADNI_MEM450.bl                          
                                   0.050  AlphaFetoproteinAFPngmL.bl              
                                   0.050  BrainNatriureticPeptideBNPpgml.bl       
                                   0.050  CancerAntigen199CA199UmL.bl             
                                   0.050  ChromograninACgAngmL.bl                 
                                   0.050  ESelectinngmL.bl                        
                                   0.050  FactorVIIngmL.bl                        
                                   0.050  Interleukin13IL13pgmL.bl                
                                   0.050  MacrophageMigrationInhibitoryFactorngmL.bl
                                   0.050  MyeloperoxidaseMPOngmL.bl               

====================================================================================================
In [ ]:
 
In [ ]: