#!/usr/bin/python ###################################### # Linear Correlation Confidence Interval # From: Statistics is Easy! By Dennis Shasha and Manda Wilson # # Uses shuffling & bootstrapping to get a 90% confidence interval for the r statistic. # # Author: Manda Wilson # # Example of FASTA formatted input file: # >grp_x # 1350 1510 1420 1210 1250 1300 1580 1310 1290 1320 1490 1200 1360 # >grp_y # 3.6 3.8 3.7 3.3 3.9 3.4 3.8 3.7 3.5 3.4 3.8 3.0 3.1 # # Included in the code, but NOT in the pseudocode, # is the bias-corrected confidence interval. # See the chapter titled "Bias Corrected Confidence Intervals" in Statistics is Easy! # for more information on bias-corrected cofidence intervals. # # Pseudocode: # # 1. Calculate r for the observed values (in our example it is .58). # a. mean_x = sum of x values / number of x values # b. mean_y = sum of y values / number of y values # c. Calculate the sum of products: # i. Initialize total to 0 # ii. For each pair of x, y values: # I. product = (x - mean_x) * (y - mean_y) # II. total += product # d. Calculate the sum of squares for the x values: # i. Initialize total to 0 # ii. For each x value: # I. diff_sq = (x - mean_x)^2 # II. total += diff_sq # e. Calculate the sum of squares for the y values as we did for the x values in step (1d) # f. r = sum of products / square root(sum of squares x * sum of squares y) # # 2. Do the following 10,000 times: # a. # i. Create two new arrays of the same size as the original sample arrays (one for x and one for y) # ii. Fill the new arrays with randomly picked pairs from the original arrays (randomly picked with replacement) # b. Compute the regression line on the bootstrap samples, just as we did in step (1) # with the original samples. # # 3. Sort the differences computed in step (2). # # 4. Compute the size of each interval tail. If we want a 90% confidence interval, then 1 - 0.9 yields the # portion of the interval in the tails. We divide this by 2 to get the size of each tail, in this case 0.05. # # 5. Compute the upper and lower bounds. To get the lower bound we multiply the tail size by the number of # bootstraps we ran and round this value up to the nearest integer (to make sure this value maps # to an actual boostrap we ran). In this case we multiple 0.05 by 10,000 and get 500, which rounds up to 500. # # To compute the upper bound we subtract the tail size from 1 and then multiply that by the number of bootstraps # we ran. Round the result of the last step down to the nearest integer. Again, this is to ensure this value # maps to an actual bootstrap we ran. We round the lower bound up and the upper bound down to reduce the # confidence interval size, so that we can still say we have as much confidence in this result. # # 6. The bootstrap values at the lower bound and upper bound give us our confidence interval. # ###################################### import math import random ###################################### # # Adjustable variables # ###################################### input_file = 'Correlation.vals' conf_interval = 0.9 ###################################### # # Subroutines # ###################################### # maps proportion of values above statistic # to number of standard deviations above statistic # keys will be index / 100 \:[0-9]\.[0-9][0-9]\, area_to_sd_map = [0.0000, 0.0040, 0.0080, 0.0120, 0.0160, 0.0199, 0.0239, 0.0279, 0.0319, 0.0359, 0.0398, 0.0438, 0.0478, 0.0517, 0.0557, 0.0596, 0.0636, 0.0675, 0.0714, 0.0753, 0.0793, 0.0832, 0.0871, 0.0910, 0.0948, 0.0987, 0.1026, 0.1064, 0.1103, 0.1141, 0.1179, 0.1217, 0.1255, 0.1293, 0.1331, 0.1368, 0.1406, 0.1443, 0.1480, 0.1517, 0.1554, 0.1591, 0.1628, 0.1664, 0.1700, 0.1736, 0.1772, 0.1808, 0.1844, 0.1879, 0.1915, 0.1950, 0.1985, 0.2019, 0.2054, 0.2088, 0.2123, 0.2157, 0.2190, 0.2224, 0.2257, 0.2291, 0.2324, 0.2357, 0.2389, 0.2422, 0.2454, 0.2486, 0.2517, 0.2549, 0.2580, 0.2611, 0.2642, 0.2673, 0.2704, 0.2734, 0.2764, 0.2794, 0.2823, 0.2852, 0.2881, 0.2910, 0.2939, 0.2967, 0.2995, 0.3023, 0.3051, 0.3078, 0.3106, 0.3133, 0.3159, 0.3186, 0.3212, 0.3238, 0.3264, 0.3289, 0.3315, 0.3340, 0.3365, 0.3389, 0.3413, 0.3438, 0.3461, 0.3485, 0.3508, 0.3531, 0.3554, 0.3577, 0.3599, 0.3621, 0.3643, 0.3665, 0.3686, 0.3708, 0.3729, 0.3749, 0.3770, 0.3790, 0.3810, 0.3830, 0.3849, 0.3869, 0.3888, 0.3907, 0.3925, 0.3944, 0.3962, 0.3980, 0.3997, 0.4015, 0.4032, 0.4049, 0.4066, 0.4082, 0.4099, 0.4115, 0.4131, 0.4147, 0.4162, 0.4177, 0.4192, 0.4207, 0.4222, 0.4236, 0.4251, 0.4265, 0.4279, 0.4292, 0.4306, 0.4319, 0.4332, 0.4345, 0.4357, 0.4370, 0.4382, 0.4394, 0.4406, 0.4418, 0.4429, 0.4441, 0.4452, 0.4463, 0.4474, 0.4484, 0.4495, 0.4505, 0.4515, 0.4525, 0.4535, 0.4545, 0.4554, 0.4564, 0.4573, 0.4582, 0.4591, 0.4599, 0.4608, 0.4616, 0.4625, 0.4633, 0.4641, 0.4649, 0.4656, 0.4664, 0.4671, 0.4678, 0.4686, 0.4693, 0.4699, 0.4706, 0.4713, 0.4719, 0.4726, 0.4732, 0.4738, 0.4744, 0.4750, 0.4756, 0.4761, 0.4767, 0.4772, 0.4778, 0.4783, 0.4788, 0.4793, 0.4798, 0.4803, 0.4808, 0.4812, 0.4817, 0.4821, 0.4826, 0.4830, 0.4834, 0.4838, 0.4842, 0.4846, 0.4850, 0.4854, 0.4857, 0.4861, 0.4864, 0.4868, 0.4871, 0.4875, 0.4878, 0.4881, 0.4884, 0.4887, 0.4890, 0.4893, 0.4896, 0.4898, 0.4901, 0.4904, 0.4906, 0.4909, 0.4911, 0.4913, 0.4916, 0.4918, 0.4920, 0.4922, 0.4925, 0.4927, 0.4929, 0.4931, 0.4932, 0.4934, 0.4936, 0.4938, 0.4940, 0.4941, 0.4943, 0.4945, 0.4946, 0.4948, 0.4949, 0.4951, 0.4952, 0.4953, 0.4955, 0.4956, 0.4957, 0.4959, 0.4960, 0.4961, 0.4962, 0.4963, 0.4964, 0.4965, 0.4966, 0.4967, 0.4968, 0.4969, 0.4970, 0.4971, 0.4972, 0.4973, 0.4974, 0.4974, 0.4975, 0.4976, 0.4977, 0.4977, 0.4978, 0.4979, 0.4979, 0.4980, 0.4981, 0.4981, 0.4982, 0.4982, 0.4983, 0.4984, 0.4984, 0.4985, 0.4985, 0.4986, 0.4986, 0.4987, 0.4987, 0.4987, 0.4988, 0.4988, 0.4989, 0.4989, 0.4989, 0.4990, 0.4990] def sd_to_area(sd): sign = 1 if sd < 0: sign = -1 sd = math.fabs(sd) # get the absolute value of sd index = int(sd * 100) if len(area_to_sd_map) <= index: return sign * area_to_sd_map[-1] # return last element in array if index == (sd * 100): return sign * area_to_sd_map[index] return sign * (area_to_sd_map[index] + area_to_sd_map[index + 1]) / 2 def area_to_sd(area): sign = 1 if area < 0: sign = -1 area = math.fabs(area) for a in range(len(area_to_sd_map)): if area == area_to_sd_map[a]: return sign * a / 100 if 0 < a and area_to_sd_map[a - 1] < area and area < area_to_sd_map[a]: # our area is between this value and the previous # for simplicity, we will just take the sd half way between a - 1 and a return sign * (a - .5) / 100 return sign * (len(area_to_sd_map) - 1) / 100 # x, y are arrays of sample values # returns two new arrays with randomly picked # (with replacement) pairs from x and y def bootstrap(x, y): samp_x = [] samp_y = [] num_samples = len(x) for i in range(num_samples): index = random.randint(0, num_samples - 1) samp_x.append(x[index]) samp_y.append(y[index]) return (samp_x, samp_y) def sumofsq(vals, mean): count = len(vals) total = 0 for i in range (count): diff_sq = (vals[i] - mean)**2 total += diff_sq return total def sumofproducts(x_vals, y_vals, mean_x, mean_y): count = len(x_vals) total = 0 for i in range (count): product = (x_vals[i] - mean_x) * (y_vals[i] - mean_y) total += product return total def corrcoef(x, y): sum_x = sum(x) sum_y = sum(y) count_x = float(len(x)) count_y = float(len(y)) mean_x = sum_x / count_x mean_y = sum_y / count_y # get the sum of products sum_of_prod = sumofproducts(grp_x, grp_y, mean_x, mean_y) # get the sum of squares for x and y sum_of_sq_x = sumofsq(grp_x, mean_x) sum_of_sq_y = sumofsq(grp_y, mean_y) return sum_of_prod / math.sqrt(sum_of_sq_x * sum_of_sq_y) ###################################### # # Computations # ###################################### # list of lists samples = [] # file must be in FASTA format infile=open(input_file) for line in infile: if line.startswith('>'): # start of new sample samples.append([]) elif not line.isspace(): # line must contain values for previous sample samples[len(samples) - 1] += map(float,line.split()) infile.close() grp_x = samples[0] grp_y = samples[1] observed_r = corrcoef(grp_x, grp_y) num_resamples = 10000 # number of times we will resample from our original samples num_below_observed = 0 # count the number of bootstrap values below the observed sample statistic out = [] # will store results of each time we resample for i in range(num_resamples): # bootstrap - then compute our statistic of interest # keep pairs together # append statistic to out (boot_x, boot_y) = bootstrap(grp_x, grp_y) # now we have a list of bootstrap samples, run corrcoef boot_r = corrcoef(boot_x, boot_y) if boot_r < observed_r: num_below_observed += 1 out.append(boot_r) out.sort() # standard confidence interval computations tails = (1 - conf_interval) / 2 # in case our lower and upper bounds are not integers, # we decrease the range (the values we include in our interval), # so that we can keep the same level of confidence lower_bound = int(math.ceil(num_resamples * tails)) upper_bound = int(math.floor(num_resamples * (1 - tails))) # bias-corrected confidence interval computations p = num_below_observed / float(num_resamples) # proportion of bootstrap values below the observed value dist_from_center = p - .5 # if this is negative, the original is below the center, if positive, it is above z_0 = area_to_sd(dist_from_center) # now we want to find the proportion that should be between the mean and one of the tails tail_sds = area_to_sd(conf_interval / 2) z_alpha_over_2 = 0 - tail_sds z_1_minus_alpha_over_2 = tail_sds # in case our lower and upper bounds are not integers, # we decrease the range (the values we include in our interval), # so that we can keep the same level of confidence bias_corr_lower_bound = int(math.ceil(num_resamples * (0.5 + sd_to_area(z_alpha_over_2 + (2 * z_0))))) bias_corr_upper_bound = int(math.floor(num_resamples * (0.5 + sd_to_area(z_1_minus_alpha_over_2 + (2 * z_0))))) ###################################### # # Output # ###################################### # print observed value and then confidence interval print "Observed r: %.2f" % observed_r print "We have ", conf_interval * 100, "% confidence that the true r", print "is between: %.2f" % out[lower_bound], "and %.2f" % out[upper_bound] print "***** Bias Corrected Confidence Interval *****" print "We have", conf_interval * 100, "% confidence that the true r", print "is between: %.2f" % out[bias_corr_lower_bound], "and %.2f" % out[bias_corr_upper_bound]