#!/usr/bin/python ###################################### # Difference between Two Means Confidence Interval # From: Statistics is Easy! By Dennis Shasha and Manda Wilson # # Uses shuffling & bootstrapping to get a 90% confidence interval for the difference between two means. # # Author: Manda Wilson # # Example of FASTA formatted input file (this is only for 2 groups): # >placebo_vals # 54 51 58 44 55 52 42 47 58 46 # >drug_vals # 54 73 53 70 73 68 52 65 65 # # Pseudocode: # # 1. Measure the difference between the two group means. The difference in means is measured # by sum(grpA) / len(grpA) - sum(grpB) / len(grpB). In this example the difference between # the two group means is 12.97. # # 2. Do the following 10,000 times: # a. For each sample we have get a bootstrap sample: # i. Create a new array of the same size as the original sample # ii. Fill the array with randomly picked values from the original sample (randomly picked with replacement) # b. Measure the difference between the two bootstrap group means, just as we did in step (1) # with the original samples. # # 3. Sort the differences computed in step (2). # # 4. Compute the size of each interval tail. If we want a 90% confidence interval, then 1 - 0.9 yields the # portion of the interval in the tails. We divide this by 2 to get the size of each tail, in this case 0.05. # # 5. Compute the upper and lower bounds. To get the lower bound we multiply the tail size by the number of # bootstraps we ran and round this value up to the nearest integer (to make sure this value maps # to an actual boostrap we ran). In this case we multiple 0.05 by 10,000 and get 500, which rounds up to 500. # # To compute the upper bound we subtract the tail size from 1 and then multiply that by the number of bootstraps # we ran. Round the result of the last step down to the nearest integer. Again, this is to ensure this value # maps to an actual bootstrap we ran. We round the lower bound up and the upper bound down to reduce the # confidence interval size, so that we can still say we have as much confidence in this result. # # 6. The bootstrap values at the lower bound and upper bound give us our confidence interval. # ###################################### import random import math import sys ###################################### # # Adjustable variables # ###################################### input_file = "Diff2Mean.vals" conf_interval = 0.9 ###################################### # # Subroutines # ###################################### # x is an array of sample values # returns a new array with randomly picked # (with replacement) values from x def bootstrap(x): samp_x = [] for i in range(len(x)): samp_x.append(random.choice(x)) return samp_x # subtracts group a mean from group b mean and returns result def meandiff(grpA, grpB): return sum(grpB) / float(len(grpB)) - sum(grpA) / float(len(grpA)) ###################################### # # Computations # ###################################### # list of lists samples = [] a = 0 b = 1 # file must be in FASTA format infile=open(input_file) for line in infile: if line.startswith('>'): # start of new sample samples.append([]) elif not line.isspace(): # line must contain values for previous sample samples[len(samples) - 1] += map(float,line.split()) infile.close() observed_mean_diff = meandiff(samples[a], samples[b]) num_resamples = 10000 # number of times we will resample from our original samples out = [] # will store results of each time we resample for i in range(num_resamples): # get bootstrap samples for each of our groups # then compute our statistic of interest # append statistic to out bootstrap_samples = [] # list of lists for sample in samples: bootstrap_samples.append(bootstrap(sample)) # now we have a list of bootstrap samples, run meandiff out.append(meandiff(bootstrap_samples[a], bootstrap_samples[b])) out.sort() tails = (1 - conf_interval) / 2 # in case our lower and upper bounds are not integers, # we decrease the range (the values we include in our interval), # so that we can keep the same level of confidence lower_bound = int(math.ceil(num_resamples * tails)) upper_bound = int(math.floor(num_resamples * (1 - tails))) ###################################### # # Output # ###################################### # print observed value and then confidence interval print ("Observed difference between the means: %.2f" % observed_mean_diff) print ("We have", conf_interval * 100, "% confidence that the true difference between the means", end=" ") print ("is between: %.2f" % out[lower_bound], "and %.2f" % out[upper_bound])