In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import random

chickweight = pd.read_csv('chick_weight.csv')
In [19]:
import numpy as np
from numpy.polynomial.polynomial import polyfit
import matplotlib.pyplot as plt
from itertools import combinations
In [20]:
def get_slope(X, Y):
    z = np.polyfit(X, Y, 1)
    p = np.poly1d(z)
    xp = np.linspace(0, 22, 100)
    return z[0]
In [21]:
def bootstrap(x):
    samp_x = []
    for i in range(len(x)):
        samp_x.append(random.choice(x))
    return samp_x
In [22]:
def bootstrap_data(data):
    new_df = pd.DataFrame(columns=["weight","Time","Chick","Diet"])
    for i in range(22):
        temp = data.loc[data['Time']==i]
        count_row = temp.shape[0]
        if(count_row==0):
            continue
        else:
            temp['weight'] = bootstrap(temp['weight'].values)
            new_df = pd.concat([new_df, temp], ignore_index=True)
    return new_df
In [23]:
def get_slopes(chickweight):
    weight = 0
    time = 1
    uniqueDiets = chickweight.Diet.unique()
    chickweight = chickweight.drop(['Unnamed: 0'], axis=1)
    num_shuffles = 100
    slope_out_dict = {}
    for diet in uniqueDiets:
        slope_out = []
        for i in range(num_shuffles):
            if(i%10 == 0):
                print("Done: ", i)
            df_this_diet = chickweight[:][chickweight.Diet == diet]
            bootstrapped_df = bootstrap_data(df_this_diet)
            df_this_diet_values = bootstrapped_df.values
            X = np.array(df_this_diet_values[:,time], dtype=float)
            Y = np.array(df_this_diet_values[:,weight], dtype=float)
            
            slope = get_slope(X, Y)
            slope_out.append(slope)
        slope_out_dict[diet] = slope_out
    return slope_out_dict
In [24]:
observed_slope = {1: 6.8417972,
                  2: 8.60913629,
                  3: 11.42287097,
                  4: 9.71436556}
In [25]:
slope_out = get_slopes(chickweight)
Done:  0
/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
Done:  0
Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
Done:  0
Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
Done:  0
Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
In [26]:
import math
conf_interval = 0.9
for key, value in slope_out.items():
    diet = key
    value.sort()
    
    tails = (1 - conf_interval) / 2
    # in case our lower and upper bounds are not integers,
    # we decrease the range (the values we include in our interval),
    # so that we can keep the same level of confidence
    lower_bound = int(math.ceil(100 * tails))
    upper_bound = int(math.floor(100 * (1 - tails)))

    ######################################
    #
    # Output
    #
    ######################################

    # print observed value and then confidence interval
    print("**********Diet: ", diet,"**********")
    print ("Observed slope: %.2f" % observed_slope[key])
    print ("We have", conf_interval * 100, "% confidence that the true slope", end=" ")
    print ("is between: %.2f" % value[lower_bound], "and %.2f" % value[upper_bound])
    print (" ")
**********Diet:  1 **********
Observed slope: 6.84
We have 90.0 % confidence that the true slope is between: 6.18 and 7.41
 
**********Diet:  2 **********
Observed slope: 8.61
We have 90.0 % confidence that the true slope is between: 7.71 and 9.55
 
**********Diet:  3 **********
Observed slope: 11.42
We have 90.0 % confidence that the true slope is between: 10.63 and 12.49
 
**********Diet:  4 **********
Observed slope: 9.71
We have 90.0 % confidence that the true slope is between: 9.15 and 10.27