import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
chickweight = pd.read_csv('chick_weight.csv')
print("----HEAD----")
print(chickweight.head())
# print("----INFO----")
# print(chickweight.info())
# print("----DESCRIBE----")
# print(chickweight.describe())
# print("----COLUMNS----")
# print(chickweight.columns)
# print("Type")
# print(type(chickweight))
uniqueDiets = chickweight.Diet.unique()
DataFrameDict = {elem : pd.DataFrame for elem in uniqueDiets}
X = {elem : pd.DataFrame for elem in uniqueDiets}
Y = {elem : pd.DataFrame for elem in uniqueDiets}
for key in DataFrameDict.keys():
temp = chickweight[:][chickweight.Diet == key]
# temp = temp.drop(columns=['Diet'])
# temp = temp.drop(columns=['Unnamed: 0'])
DataFrameDict[key] = temp
# print(DataFrameDict[1])
X = {}
Y = {}
for key in DataFrameDict.keys():
df = DataFrameDict[key]
numpyArray = df.values
weight = 1
time = 2
chick = 3
X[key] = numpyArray[:,time]
Y[key] = numpyArray[:,weight]
print(X)
print("***********")
print(Y)
import numpy as np
from numpy.polynomial.polynomial import polyfit
import matplotlib.pyplot as plt
print("The following plots show the fitted curve(orange curve) and the 95% confidence interval(green curves)","\n")
for key in X:
print("---Diet ",key,"---")
x = X[key]
y = Y[key]
z = np.polyfit(x, y, 1) #least squares polynomial fit. Fit a polynomial of degree deg to points (x,y) np.polyfit(x, y, deg)
print("Polynomial coefficients: ",z)
p = np.poly1d(z) #poly1d consructs the 1d polynomial from the given coefficients
xp = np.linspace(0, 22, 100) #linspace returns evenly spaced numbers over a specified interval np.linspace(start, stop, num_of_samples)
stdev = np.sqrt(sum((p(x) - y)**2) / (len(y) - 2))
#95% Confidence interval (Gaussian critical value for significance level 95% is 1.96)
conf_l = p(xp) - 1.96*stdev
conf_h = p(xp) + 1.96*stdev
#Plotting the three curves
plt.plot(x,y,'.',xp, p(xp),'-',xp, conf_l, 'g-', xp, conf_h, 'g-')
plt.show()
mean_final_weights_per_diet = {}
final_weights_df = {}
for key in DataFrameDict.keys():
df = DataFrameDict[key]
temp = df.loc[df['Time'] == 21]
final_weights_df[key] = temp
mean_final_weights_per_diet[key] = temp["weight"].mean()
print("Mean Final Weights Per Diet")
print(mean_final_weights_per_diet)
Diet 3 gives the highest weight, hence we do a shuffle test of Diet 3 against all the three diets
def shuffle(grps):
num_grps = len(grps)
pool = []
# pool all values
for i in range(num_grps):
pool.extend(grps[i])
# mix them up
random.shuffle(pool)
# reassign to groups of same size as original groups
new_grps = []
start_index = 0
end_index = 0
for i in range(num_grps):
end_index = start_index + len(grps[i])
new_grps.append(pool[start_index:end_index])
start_index = end_index
return new_grps
# subtracts group a mean from group b mean and returns result
def meandiff(grpA, grpB):
return sum(grpB) / float(len(grpB)) - sum(grpA) / float(len(grpA))
diets = [1,2,4]
import random
samples = [None, None]
samples[1] = list(final_weights_df[3]["weight"])
for diet in diets:
samples[0] = (list(final_weights_df[diet]["weight"]))
a = 0
b = 1
observed_mean_diff = meandiff(samples[a], samples[b])
count = 0
num_shuffles = 10000
for i in range(num_shuffles):
new_samples = shuffle(samples)
mean_diff = meandiff(new_samples[a], new_samples[b])
# if the observed difference is negative, look for differences that are smaller
# if the observed difference is positive, look for differences that are greater
if observed_mean_diff < 0 and mean_diff <= observed_mean_diff:
count = count + 1
elif observed_mean_diff >= 0 and mean_diff >= observed_mean_diff:
count = count + 1
######################################
#
# Output
#
######################################
print("**********Diet: ", diet, " vs Diet: 3**********")
print ("Observed difference of two means: %.2f" % observed_mean_diff)
print (count, "out of", num_shuffles, "experiments had a difference of two means", end=" ")
if observed_mean_diff < 0:
print ("less than or equal to", end=" ")
else:
print ("greater than or equal to", end=" ")
print ("%.2f" % observed_mean_diff, ".")
print ("The chance of getting a difference of two means", end=" ")
if observed_mean_diff < 0:
print ("less than or equal to", end=" ")
else:
print ("greater than or equal to", end=" ")
print ("%.2f" % observed_mean_diff, "is", (count / float(num_shuffles)), "\n")
# x is an array of sample values
# returns a new array with randomly picked
# (with replacement) values from x
def bootstrap(x):
samp_x = []
for i in range(len(x)):
samp_x.append(random.choice(x))
return samp_x
import random
import math
conf_interval = 0.9
samples = [None, None]
samples[1] = list(final_weights_df[3]["weight"])
for diet in diets:
samples[0] = (list(final_weights_df[diet]["weight"]))
a = 0
b = 1
observed_mean_diff = meandiff(samples[a], samples[b])
num_resamples = 10000 # number of times we will resample from our original samples
out = [] # will store results of each time we resample
for i in range(num_resamples):
# get bootstrap samples for each of our groups
# then compute our statistic of interest
# append statistic to out
bootstrap_samples = [] # list of lists
for sample in samples:
bootstrap_samples.append(bootstrap(sample))
# now we have a list of bootstrap samples, run meandiff
out.append(meandiff(bootstrap_samples[a], bootstrap_samples[b]))
out.sort()
tails = (1 - conf_interval) / 2
# in case our lower and upper bounds are not integers,
# we decrease the range (the values we include in our interval),
# so that we can keep the same level of confidence
lower_bound = int(math.ceil(num_resamples * tails))
upper_bound = int(math.floor(num_resamples * (1 - tails)))
######################################
#
# Output
#
######################################
# print observed value and then confidence interval
print("**********Diet: ", diet, " vs Diet: 3**********")
print ("Observed difference between the means: %.2f" % observed_mean_diff)
print ("We have", conf_interval * 100, "% confidence that the true difference between the means", end=" ")
print ("is between: %.2f" % out[lower_bound], "and %.2f" % out[upper_bound])
print (" ")
import numpy as np
from numpy.polynomial.polynomial import polyfit
import matplotlib.pyplot as plt
from itertools import combinations
def get_slope(X, Y):
z = np.polyfit(X, Y, 1)
p = np.poly1d(z)
xp = np.linspace(0, 22, 100)
# plt.plot(X,Y,'.',xp, p(xp),'-')
# plt.show()
return z[0]
def get_slope_diff(diet_1_data, diet_2_data):
X1, Y1 = diet_1_data
X2, Y2 = diet_2_data
return (get_slope(X1, Y1) - get_slope(X2, Y2))
def shuffle_diet_labels(data):
new_df = pd.DataFrame(columns=["weight","Time","Chick","Diet"])
for i in range(22):
temp = data.loc[data['Time']==i]
count_row = temp.shape[0]
if(count_row==0):
continue
else:
temp['Diet'] = np.random.permutation(temp['Diet'].values)
new_df = pd.concat([new_df, temp], ignore_index=True)
return new_df
def significance_test_diet_slope(chickweight):
weight = 0
time = 1
uniqueDiets = chickweight.Diet.unique()
diet_comparisons = list(combinations(uniqueDiets, 2))
diff_out_dict = {}
for ele in diet_comparisons:
diff_out = []
diet_1, diet_2 = ele
temp_1 = chickweight.loc[(chickweight['Diet'] == diet_1) | (chickweight['Diet'] == diet_2)]
temp_1 = temp_1.drop(['Unnamed: 0'], axis=1)
for i in range(500):
if(i%100 ==0):
print("Done: ", i)
new_df = shuffle_diet_labels(temp_1)
new_df_diet_1 = new_df[:][new_df.Diet == diet_1]
new_df_diet_2 = new_df[:][new_df.Diet == diet_2]
new_df_diet_1_values = new_df_diet_1.values
new_df_diet_2_values = new_df_diet_2.values
X1 = np.array(new_df_diet_1_values[:,time], dtype=float)
Y1 = np.array(new_df_diet_1_values[:,weight], dtype=float)
X2 = np.array(new_df_diet_2_values[:,time], dtype=float)
Y2 = np.array(new_df_diet_2_values[:,weight], dtype=float)
slope_diff = get_slope_diff((X1,Y1),(X2,Y2))
diff_out.append(slope_diff)
diff_out_dict[ele] = diff_out
return diff_out_dict
diff_out = significance_test_diet_slope(chickweight)
import math
observed_diff = {(1,2): -1.769,(1,3): -4.5828,(1,4): -2.874,(2,3): -2.8137,(2,4): -1.105,(3,4):1.7085}
conf_interval = 0.9
for key, value in diff_out.items():
diet1 = key[0]
diet2 = key[1]
value.sort()
tails = (1 - conf_interval) / 2
# in case our lower and upper bounds are not integers,
# we decrease the range (the values we include in our interval),
# so that we can keep the same level of confidence
lower_bound = int(math.ceil(500 * tails))
upper_bound = int(math.floor(500 * (1 - tails)))
######################################
#
# Output
#
######################################
# print observed value and then confidence interval
print("**********Diet: ", diet1, " vs Diet: ",diet2,"**********")
print ("Observed difference between the means: %.2f" % observed_diff[key])
print ("We have", conf_interval * 100, "% confidence that the true difference between the means", end=" ")
print ("is between: %.2f" % value[lower_bound], "and %.2f" % value[upper_bound])
print (" ")