import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import random
# Load the chick-weight table from 'chick_weight.csv' (relative path).
# Code further down reads its 'weight', 'Time' and 'Diet' columns and drops
# a column named 'Unnamed: 0'.
chickweight = pd.read_csv('chick_weight.csv')
import numpy as np
from numpy.polynomial.polynomial import polyfit
import matplotlib.pyplot as plt
from itertools import combinations
def get_slope(X, Y):
    """Return the slope of the least-squares straight line fitted to Y over X.

    Args:
        X: sequence of x coordinates.
        Y: sequence of y values, one per entry of X.

    Returns:
        The first-degree coefficient of the degree-1 polynomial fit.
    """
    # np.polyfit lists coefficients from the highest power down, so for a
    # degree-1 fit the first entry is the slope and the second the intercept.
    slope, _intercept = np.polyfit(X, Y, 1)
    return slope
def bootstrap(x):
    """Draw one bootstrap resample of *x*.

    Picks ``len(x)`` items from *x* with ``random.choice`` (so the same item
    may be drawn more than once) and returns them as a new list. *x* itself
    is left untouched.
    """
    return [random.choice(x) for _ in range(len(x))]
def bootstrap_data(data):
    """Return a copy of *data* whose weights are resampled within each Time.

    For every distinct value of the 'Time' column, taken in ascending order,
    the 'weight' values of the rows sharing that time are replaced by a
    bootstrap resample of those same values (see ``bootstrap``). Every other
    column is carried over unchanged and *data* itself is not modified.

    Args:
        data: DataFrame with at least 'Time' and 'weight' columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Its columns start with
        'weight', 'Time', 'Chick', 'Diet' (in that order), followed by any
        further columns of *data*. When *data* has no rows, an empty frame
        with just those four columns is returned.
    """
    base_columns = ["weight", "Time", "Chick", "Diet"]

    # Group by the times actually present instead of probing a fixed range,
    # so no row is dropped merely because its time falls outside that range.
    # groupby yields the groups in ascending key order and keeps the original
    # row order inside each group.
    pieces = [
        # assign() builds a new frame, so the caller's data is never written to.
        rows.assign(weight=bootstrap(rows['weight'].values))
        for _, rows in data.groupby('Time', sort=True)
    ]
    if not pieces:
        return pd.DataFrame(columns=base_columns)

    # One concat at the end instead of re-copying the growing frame per time.
    new_df = pd.concat(pieces, ignore_index=True)

    # Keep the documented leading column order regardless of how *data*
    # ordered its columns; a missing base column comes out as all-NaN.
    extra_columns = [c for c in new_df.columns if c not in base_columns]
    return new_df.reindex(columns=base_columns + extra_columns)
def get_slopes(chickweight, num_shuffles=100):
    """Bootstrap the weight-over-time slope separately for every diet.

    For each distinct value of the 'Diet' column the rows of that diet are
    resampled ``num_shuffles`` times with ``bootstrap_data`` and a straight
    line is fitted to weight against Time with ``get_slope``.

    Args:
        chickweight: DataFrame with 'weight', 'Time' and 'Diet' columns. A
            column named 'Unnamed: 0' is discarded if present.
        num_shuffles: number of bootstrap resamples per diet (default 100).

    Returns:
        dict mapping each diet value to the list of its bootstrap slopes, in
        the order they were generated.

    Side effects:
        Prints a progress line on every 10th resample of every diet.
    """
    unique_diets = chickweight.Diet.unique()
    # errors='ignore' lets frames that never had this column pass through.
    chickweight = chickweight.drop(columns=['Unnamed: 0'], errors='ignore')

    slope_out_dict = {}
    for diet in unique_diets:
        # The rows of one diet do not change between resamples: filter once.
        df_this_diet = chickweight[chickweight.Diet == diet]
        slope_out = []
        for i in range(num_shuffles):
            if i % 10 == 0:
                print("Done: ", i)
            bootstrapped_df = bootstrap_data(df_this_diet)
            # Select by column name rather than by position so the fit does
            # not depend on the column order of the resampled frame.
            X = np.asarray(bootstrapped_df['Time'], dtype=float)
            Y = np.asarray(bootstrapped_df['weight'], dtype=float)
            slope_out.append(get_slope(X, Y))
        slope_out_dict[diet] = slope_out
    return slope_out_dict
import math

# Reference slope printed next to each diet's bootstrap interval, keyed by
# the Diet value.
# NOTE(review): presumably fitted on the un-resampled data -- confirm source.
observed_slope = {1: 6.8417972,
                  2: 8.60913629,
                  3: 11.42287097,
                  4: 9.71436556}

slope_out = get_slopes(chickweight)

conf_interval = 0.9
# Share of the sorted bootstrap slopes cut off at each end.
tails = (1 - conf_interval) / 2

for key, value in slope_out.items():
    value.sort()  # in place: the lists inside slope_out end up sorted
    n_samples = len(value)
    # in case our lower and upper bounds are not integers,
    # we decrease the range (the values we include in our interval),
    # so that we can keep the same level of confidence
    lower_bound = int(math.ceil(n_samples * tails))
    # Derived from the real sample count and clamped to the last valid index
    # so a confidence level near 1 cannot index past the end of the list.
    upper_bound = min(int(math.floor(n_samples * (1 - tails))), n_samples - 1)

    ######################################
    #
    # Output
    #
    ######################################
    # print observed value and then confidence interval
    print("**********Diet: ", key, "**********")
    print("Observed slope: %.2f" % observed_slope[key])
    print("We have", conf_interval * 100, "% confidence that the true slope", end=" ")
    print("is between: %.2f" % value[lower_bound], "and %.2f" % value[upper_bound])
    print(" ")