In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
chickweight = pd.read_csv('chick_weight.csv')
print("----HEAD----")
print(chickweight.head())
# print("----INFO----")
# print(chickweight.info())
# print("----DESCRIBE----")
# print(chickweight.describe())
# print("----COLUMNS----")
# print(chickweight.columns)

# print("Type")
# print(type(chickweight))

uniqueDiets = chickweight.Diet.unique()


DataFrameDict = {elem : pd.DataFrame for elem in uniqueDiets}
X = {elem : pd.DataFrame for elem in uniqueDiets}
Y = {elem : pd.DataFrame for elem in uniqueDiets}
for key in DataFrameDict.keys():
     temp = chickweight[:][chickweight.Diet == key]
#      temp = temp.drop(columns=['Diet'])
#      temp = temp.drop(columns=['Unnamed: 0'])
     DataFrameDict[key] = temp
# print(DataFrameDict[1])
----HEAD----
   Unnamed: 0  weight  Time  Chick  Diet
0           1      42     0      1     1
1           2      51     2      1     1
2           3      59     4      1     1
3           4      64     6      1     1
4           5      76     8      1     1
In [2]:
X = {}
Y = {}
for key in DataFrameDict.keys():
    df = DataFrameDict[key]
    numpyArray = df.values
    weight = 1
    time = 2
    chick = 3
    X[key] = numpyArray[:,time]
    Y[key] = numpyArray[:,weight]
print(X)
print("***********")
print(Y)
{1: array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8,
       10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18,
       20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,
        6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14,
       16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,
        2,  4,  6,  8, 10, 12, 14, 16, 18, 20,  0,  2,  4,  6,  8, 10, 12,
       14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,
        0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8,
       10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18,
       20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,
        6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12,  0,  2,  4,  6,  8,
       10, 12, 14, 16, 18, 20, 21,  0,  2,  0,  2,  4,  6,  8, 10, 12, 14,
       16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21]), 2: array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8,
       10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18,
       20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,
        6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14,
       16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,
        2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10,
       12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20,
       21]), 3: array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8,
       10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18,
       20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,
        6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14,
       16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,
        2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10,
       12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20,
       21]), 4: array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8,
       10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18,
       20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18,  0,  2,  4,  6,  8,
       10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18,
       20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,
        6,  8, 10, 12, 14, 16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14,
       16, 18, 20, 21,  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 21])}
***********
{1: array([ 42,  51,  59,  64,  76,  93, 106, 125, 149, 171, 199, 205,  40,
        49,  58,  72,  84, 103, 122, 138, 162, 187, 209, 215,  43,  39,
        55,  67,  84,  99, 115, 138, 163, 187, 198, 202,  42,  49,  56,
        67,  74,  87, 102, 108, 136, 154, 160, 157,  41,  42,  48,  60,
        79, 106, 141, 164, 197, 199, 220, 223,  41,  49,  59,  74,  97,
       124, 141, 148, 155, 160, 160, 157,  41,  49,  57,  71,  89, 112,
       146, 174, 218, 250, 288, 305,  42,  50,  61,  71,  84,  93, 110,
       116, 126, 134, 125,  42,  51,  59,  68,  85,  96,  90,  92,  93,
       100, 100,  98,  41,  44,  52,  63,  74,  81,  89,  96, 101, 112,
       120, 124,  43,  51,  63,  84, 112, 139, 168, 177, 182, 184, 181,
       175,  41,  49,  56,  62,  72,  88, 119, 135, 162, 185, 195, 205,
        41,  48,  53,  60,  65,  67,  71,  70,  71,  81,  91,  96,  41,
        49,  62,  79, 101, 128, 164, 192, 227, 248, 259, 266,  41,  49,
        56,  64,  68,  68,  67,  68,  41,  45,  49,  51,  57,  51,  54,
        42,  51,  61,  72,  83,  89,  98, 103, 113, 123, 133, 142,  39,
        35,  43,  48,  55,  62,  65,  71,  82,  88, 106, 120, 144, 157,
        41,  47,  54,  58,  65,  73,  77,  89,  98, 107, 115, 117]), 2: array([ 40,  50,  62,  86, 125, 163, 217, 240, 275, 307, 318, 331,  41,
        55,  64,  77,  90,  95, 108, 111, 131, 148, 164, 167,  43,  52,
        61,  73,  90, 103, 127, 135, 145, 163, 170, 175,  42,  52,  58,
        74,  66,  68,  70,  71,  72,  72,  76,  74,  40,  49,  62,  78,
       102, 124, 146, 164, 197, 231, 259, 265,  42,  48,  57,  74,  93,
       114, 136, 147, 169, 205, 236, 251,  39,  46,  58,  73,  87, 100,
       115, 123, 144, 163, 185, 192,  39,  46,  58,  73,  92, 114, 145,
       156, 184, 207, 212, 233,  39,  48,  59,  74,  87, 106, 134, 150,
       187, 230, 279, 309,  42,  48,  59,  72,  85,  98, 115, 122, 143,
       151, 157, 150]), 3: array([ 42,  53,  62,  73,  85, 102, 123, 138, 170, 204, 235, 256,  41,
        49,  65,  82, 107, 129, 159, 179, 221, 263, 291, 305,  39,  50,
        63,  77,  96, 111, 137, 144, 151, 146, 156, 147,  41,  49,  63,
        85, 107, 134, 164, 186, 235, 294, 327, 341,  41,  53,  64,  87,
       123, 158, 201, 238, 287, 332, 361, 373,  39,  48,  61,  76,  98,
       116, 145, 166, 198, 227, 225, 220,  41,  48,  56,  68,  80,  83,
       103, 112, 135, 157, 169, 178,  41,  49,  61,  74,  98, 109, 128,
       154, 192, 232, 280, 290,  42,  50,  61,  78,  89, 109, 130, 146,
       170, 214, 250, 272,  41,  55,  66,  79, 101, 120, 154, 182, 215,
       262, 295, 321]), 4: array([ 42,  51,  66,  85, 103, 124, 155, 153, 175, 184, 199, 204,  42,
        49,  63,  84, 103, 126, 160, 174, 204, 234, 269, 281,  42,  55,
        69,  96, 131, 157, 184, 188, 197, 198, 199, 200,  42,  51,  65,
        86, 103, 118, 127, 138, 145, 146,  41,  50,  61,  78,  98, 117,
       135, 141, 147, 174, 197, 196,  40,  52,  62,  82, 101, 120, 144,
       156, 173, 210, 231, 238,  41,  53,  66,  79, 100, 123, 148, 157,
       168, 185, 210, 205,  39,  50,  62,  80, 104, 125, 154, 170, 222,
       261, 303, 322,  40,  53,  64,  85, 108, 128, 152, 166, 184, 203,
       233, 237,  41,  54,  67,  84, 105, 122, 155, 175, 205, 234, 264,
       264])}

Regression Plots

In [3]:
import numpy as np
from numpy.polynomial.polynomial import polyfit
import matplotlib.pyplot as plt

print("The following plots show the fitted curve(orange curve) and the 95% confidence interval(green curves)","\n")
for key in X:
    print("---Diet ",key,"---")
    x = X[key]
    y = Y[key]
    z = np.polyfit(x, y, 1) #least squares polynomial fit. Fit a polynomial of degree deg to points (x,y) np.polyfit(x, y, deg)
    print("Polynomial coefficients: ",z)
    p = np.poly1d(z) #poly1d consructs the 1d polynomial from the given coefficients
    xp = np.linspace(0, 22, 100) #linspace returns evenly spaced numbers over a specified interval np.linspace(start, stop, num_of_samples)
    stdev = np.sqrt(sum((p(x) - y)**2) / (len(y) - 2))
    #95% Confidence interval (Gaussian critical value for significance level 95% is 1.96)
    conf_l = p(xp) - 1.96*stdev 
    conf_h = p(xp) + 1.96*stdev
    
    #Plotting the three curves
    plt.plot(x,y,'.',xp, p(xp),'-',xp, conf_l, 'g-', xp, conf_h, 'g-')
    plt.show()
The following plots show the fitted curve(orange curve) and the 95% confidence interval(green curves) 

---Diet  1 ---
Polynomial coefficients:  [ 6.8417972  30.93098028]
---Diet  2 ---
Polynomial coefficients:  [ 8.60913629 28.63359552]
---Diet  3 ---
Polynomial coefficients:  [11.42287097 18.25032522]
---Diet  4 ---
Polynomial coefficients:  [ 9.71436556 30.79211951]
In [4]:
mean_final_weights_per_diet = {}
final_weights_df = {}
for key in DataFrameDict.keys():
    df = DataFrameDict[key]
    temp = df.loc[df['Time'] == 21]
    final_weights_df[key] = temp
    mean_final_weights_per_diet[key] = temp["weight"].mean()
print("Mean Final Weights Per Diet")
print(mean_final_weights_per_diet)
Mean Final Weights Per Diet
{1: 177.75, 2: 214.7, 3: 270.3, 4: 238.55555555555554}

Diet 3 gives the highest weight, hence we do a shuffle test of Diet 3 against all the three diets

Shuffle_Test

In [5]:
def shuffle(grps):
	num_grps = len(grps)
	pool = []
	
	# pool all values
	for i in range(num_grps):
		pool.extend(grps[i])
	# mix them up
	random.shuffle(pool)
	# reassign to groups of same size as original groups
	new_grps = []
	start_index = 0
	end_index = 0
	for i in range(num_grps):
		end_index = start_index + len(grps[i])
		new_grps.append(pool[start_index:end_index])
		start_index = end_index
	return new_grps
In [6]:
# subtracts group a mean from group b mean and returns result
def meandiff(grpA, grpB):
	return sum(grpB) / float(len(grpB)) - sum(grpA) / float(len(grpA))
In [7]:
diets = [1,2,4]
In [8]:
import random
samples = [None, None]
samples[1] = list(final_weights_df[3]["weight"])
for diet in diets:
    samples[0] = (list(final_weights_df[diet]["weight"]))
    a = 0
    b = 1
    observed_mean_diff = meandiff(samples[a], samples[b])

    count = 0
    num_shuffles = 10000
    for i in range(num_shuffles):
        new_samples = shuffle(samples)
        mean_diff = meandiff(new_samples[a], new_samples[b])
        # if the observed difference is negative, look for differences that are smaller
        # if the observed difference is positive, look for differences that are greater
        if observed_mean_diff < 0 and mean_diff <= observed_mean_diff:
            count = count + 1
        elif observed_mean_diff >= 0 and mean_diff >= observed_mean_diff:
            count = count + 1

    ######################################
    #
    # Output
    #
    ######################################
    print("**********Diet: ", diet, " vs Diet: 3**********")

    print ("Observed difference of two means: %.2f" % observed_mean_diff)
    print (count, "out of", num_shuffles, "experiments had a difference of two means", end=" ")
    if observed_mean_diff < 0:
        print ("less than or equal to", end=" ")
    else:
        print ("greater than or equal to", end=" ")
    print ("%.2f" % observed_mean_diff, ".")
    print ("The chance of getting a difference of two means", end=" ")
    if observed_mean_diff < 0:
        print ("less than or equal to", end=" ")
    else:
        print ("greater than or equal to", end=" ")
    print ("%.2f" % observed_mean_diff, "is", (count / float(num_shuffles)), "\n")
**********Diet:  1  vs Diet: 3**********
Observed difference of two means: 92.55
7 out of 10000 experiments had a difference of two means greater than or equal to 92.55 .
The chance of getting a difference of two means greater than or equal to 92.55 is 0.0007 

**********Diet:  2  vs Diet: 3**********
Observed difference of two means: 55.60
561 out of 10000 experiments had a difference of two means greater than or equal to 55.60 .
The chance of getting a difference of two means greater than or equal to 55.60 is 0.0561 

**********Diet:  4  vs Diet: 3**********
Observed difference of two means: 31.74
1355 out of 10000 experiments had a difference of two means greater than or equal to 31.74 .
The chance of getting a difference of two means greater than or equal to 31.74 is 0.1355 

Confidence Interval

In [ ]:
# x is an array of sample values
# returns a new array with randomly picked 
# (with replacement) values from x
def bootstrap(x):
	samp_x = []
	for i in range(len(x)):
		samp_x.append(random.choice(x))
	return samp_x
In [ ]:
import random
import math
conf_interval = 0.9
samples = [None, None]
samples[1] = list(final_weights_df[3]["weight"])
for diet in diets:
    samples[0] = (list(final_weights_df[diet]["weight"]))
    a = 0
    b = 1
    observed_mean_diff = meandiff(samples[a], samples[b])

    num_resamples = 10000   # number of times we will resample from our original samples
    out = []                # will store results of each time we resample

    for i in range(num_resamples):
        # get bootstrap samples for each of our groups
        # then compute our statistic of interest
        # append statistic to out
        bootstrap_samples = []  # list of lists
        for sample in samples:
            bootstrap_samples.append(bootstrap(sample))
        # now we have a list of bootstrap samples, run meandiff
        out.append(meandiff(bootstrap_samples[a], bootstrap_samples[b]))

    out.sort()

    tails = (1 - conf_interval) / 2

    # in case our lower and upper bounds are not integers,
    # we decrease the range (the values we include in our interval),
    # so that we can keep the same level of confidence
    lower_bound = int(math.ceil(num_resamples * tails))
    upper_bound = int(math.floor(num_resamples * (1 - tails)))

    ######################################
    #
    # Output
    #
    ######################################

    # print observed value and then confidence interval
    print("**********Diet: ", diet, " vs Diet: 3**********")
    print ("Observed difference between the means: %.2f" % observed_mean_diff)
    print ("We have", conf_interval * 100, "% confidence that the true difference between the means", end=" ")
    print ("is between: %.2f" % out[lower_bound], "and %.2f" % out[upper_bound])
    print (" ")

Confidence Interval - slopes

In [2]:
import numpy as np
from numpy.polynomial.polynomial import polyfit
import matplotlib.pyplot as plt
from itertools import combinations
In [1]:
def get_slope(X, Y):
    z = np.polyfit(X, Y, 1)
    p = np.poly1d(z)
    xp = np.linspace(0, 22, 100)
#     plt.plot(X,Y,'.',xp, p(xp),'-')
#     plt.show()
    return z[0]

def get_slope_diff(diet_1_data, diet_2_data):
    X1, Y1 = diet_1_data
    X2, Y2 = diet_2_data
    return (get_slope(X1, Y1) - get_slope(X2, Y2))

def shuffle_diet_labels(data):
    new_df = pd.DataFrame(columns=["weight","Time","Chick","Diet"])
    for i in range(22):
        temp = data.loc[data['Time']==i]
        count_row = temp.shape[0]
        if(count_row==0):
            continue
        else:
            temp['Diet'] = np.random.permutation(temp['Diet'].values)
            new_df = pd.concat([new_df, temp], ignore_index=True)
    return new_df

def significance_test_diet_slope(chickweight):
    weight = 0
    time = 1
    uniqueDiets = chickweight.Diet.unique()
    diet_comparisons = list(combinations(uniqueDiets, 2))
    diff_out_dict = {}
    for ele in diet_comparisons:
        diff_out = []
        diet_1, diet_2 = ele
        
        temp_1 = chickweight.loc[(chickweight['Diet'] == diet_1) | (chickweight['Diet'] == diet_2)]
        temp_1 = temp_1.drop(['Unnamed: 0'], axis=1)
        
        
        for i in range(500):
            if(i%100 ==0):
                print("Done: ", i)
            new_df = shuffle_diet_labels(temp_1)
        
            new_df_diet_1 = new_df[:][new_df.Diet == diet_1]
            new_df_diet_2 = new_df[:][new_df.Diet == diet_2]

            new_df_diet_1_values = new_df_diet_1.values
            new_df_diet_2_values = new_df_diet_2.values

            X1 = np.array(new_df_diet_1_values[:,time], dtype=float)
            Y1 = np.array(new_df_diet_1_values[:,weight], dtype=float)
            X2 = np.array(new_df_diet_2_values[:,time], dtype=float)
            Y2 = np.array(new_df_diet_2_values[:,weight], dtype=float)

            slope_diff = get_slope_diff((X1,Y1),(X2,Y2))   
            diff_out.append(slope_diff)
        diff_out_dict[ele] = diff_out    
    return diff_out_dict
In [10]:
diff_out = significance_test_diet_slope(chickweight)
Done:  0
/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Done:  100
Done:  200
Done:  300
Done:  400
Done:  0
Done:  100
Done:  200
Done:  300
Done:  400
Done:  0
Done:  100
Done:  200
Done:  300
Done:  400
Done:  0
Done:  100
Done:  200
Done:  300
Done:  400
Done:  0
Done:  100
Done:  200
Done:  300
Done:  400
Done:  0
Done:  100
Done:  200
Done:  300
Done:  400
In [19]:
import math
observed_diff = {(1,2): -1.769,(1,3): -4.5828,(1,4):  -2.874,(2,3): -2.8137,(2,4): -1.105,(3,4):1.7085}
In [25]:
conf_interval = 0.9
for key, value in diff_out.items():
    diet1 = key[0]
    diet2 = key[1]
    value.sort()
    
    tails = (1 - conf_interval) / 2
    # in case our lower and upper bounds are not integers,
    # we decrease the range (the values we include in our interval),
    # so that we can keep the same level of confidence
    lower_bound = int(math.ceil(500 * tails))
    upper_bound = int(math.floor(500 * (1 - tails)))

    ######################################
    #
    # Output
    #
    ######################################

    # print observed value and then confidence interval
    print("**********Diet: ", diet1, " vs Diet: ",diet2,"**********")
    print ("Observed difference between the means: %.2f" % observed_diff[key])
    print ("We have", conf_interval * 100, "% confidence that the true difference between the means", end=" ")
    print ("is between: %.2f" % value[lower_bound], "and %.2f" % value[upper_bound])
    print (" ")
**********Diet:  1  vs Diet:  2 **********
Observed difference between the means: -1.77
We have 90.0 % confidence that the true difference between the means is between: -1.19 and 1.13
 
**********Diet:  1  vs Diet:  3 **********
Observed difference between the means: -4.58
We have 90.0 % confidence that the true difference between the means is between: -1.45 and 1.29
 
**********Diet:  1  vs Diet:  4 **********
Observed difference between the means: -2.87
We have 90.0 % confidence that the true difference between the means is between: -1.07 and 1.05
 
**********Diet:  2  vs Diet:  3 **********
Observed difference between the means: -2.81
We have 90.0 % confidence that the true difference between the means is between: -1.62 and 1.62
 
**********Diet:  2  vs Diet:  4 **********
Observed difference between the means: -1.10
We have 90.0 % confidence that the true difference between the means is between: -1.23 and 1.34
 
**********Diet:  3  vs Diet:  4 **********
Observed difference between the means: 1.71
We have 90.0 % confidence that the true difference between the means is between: -1.24 and 1.19