#!/usr/bin/python

######################################
# Difference between Two Means Significance Test
# From: Statistics is Easy! By Dennis Shasha and Manda Wilson
# 
# Assuming that there is no significant difference in the means of 
# the two samples, tests to see the probability of getting a difference 
# greater than or equal to the observed difference in the means by chance 
# alone.  Uses shuffling & bootstrapping to get a distribution to compare 
# to the observed statistic.
# 
# Author: Manda Wilson
#
# Example of FASTA formatted input file (this is only for 2 groups): 
# >placebo_vals
# 54 51 58 44 55 52 42 47 58 46
# >drug_vals
# 54 73 53 70 73 68 52 65 65
#
# Pseudocode:
#
# 1. Measure the difference between the two group means.  The difference in means is measured
#    by (sum(grpA) / len(grpA)) - (sum(grpB) / len(grpB)).  In this example the difference between
#    the two group means is 12.97.
#
# 2. Set a counter to 0, this will count the number of times we get a difference
#    between the means greater than or equal to 12.97.  
#
# 3. Do the following 10,000 times:
#    a. Shuffle the original measurements.  To do this:
#       i. put the values from all the groups into one array but remembering the start and end indexes of each group
#       ii. shuffle the values in the array, effectively reassigning the values to different groups
#    b. Measure the difference between the two group means, just as we did in step (1).
#    c. If the difference from step (3b) is greater than or equal to 12.97, increment our counter 
#       from step (2). Note: if our original difference between the means were a negative value 
#       we would check for values less than or equal to that value.
#
# 4. counter / 10,000 equals the probability of getting our observed difference of two means greater than
#    or equal to 12.97, if there is in fact no significant difference.
#
######################################

import random

######################################
#
# Adjustable variables
#
######################################

input_file = "Diff2Mean.vals"

######################################
#
# Subroutines
#
######################################

# takes a list of groups (two or more)
# pools all values, shuffles them, and makes new groups 
# of same size as original groups
# returns these new groups
# example of shuffle with more than two groups: OneWayAnovaSig.py
def shuffle(grps):
	num_grps = len(grps)
	pool = []
	
	# pool all values
	for i in range(num_grps):
		pool.extend(grps[i])
	# mix them up
	random.shuffle(pool)
	# reassign to groups of same size as original groups
	new_grps = []
	start_index = 0
	end_index = 0
	for i in range(num_grps):
		end_index = start_index + len(grps[i])
		new_grps.append(pool[start_index:end_index])
		start_index = end_index
	return new_grps

# subtracts group a mean from group b mean and returns result
def meandiff(grpA, grpB):
	return sum(grpB) / float(len(grpB)) - sum(grpA) / float(len(grpA))

######################################
#
# Computations
#
######################################

# list of lists
samples = [] 
a = 0
b = 1

# file must be in FASTA format
infile=open(input_file)
for line in infile:
	if line.startswith('>'):
		# start of new sample
		samples.append([])
	elif not line.isspace():
		# line must contain values for previous sample
		samples[len(samples) - 1] += map(float,line.split())
infile.close()

observed_mean_diff = meandiff(samples[a], samples[b])

count = 0
num_shuffles = 10000

for i in range(num_shuffles):
	new_samples = shuffle(samples)
	mean_diff = meandiff(new_samples[a], new_samples[b])
	# if the observed difference is negative, look for differences that are smaller
	# if the observed difference is positive, look for differences that are greater
	if observed_mean_diff < 0 and mean_diff <= observed_mean_diff:
		count = count + 1
	elif observed_mean_diff >= 0 and mean_diff >= observed_mean_diff:
		count = count + 1

######################################
#
# Output
#
######################################

print( "Observed difference of two means: %.2f " % observed_mean_diff )
print( count, "out of ", num_shuffles, " experiments had a difference of two means", end = " ")
if observed_mean_diff < 0:
	print( "less than or equal to", end="")
else: 
	print( "greater than or equal to", end = " ")
print( " %.2f " % observed_mean_diff, ".")
print( "The chance of getting a difference of two means", end = " ")
if observed_mean_diff < 0:
	print( "less than or equal to", end = " ")
else:
	print( "greater than or equal to", end = " ")
print( "%.2f" % observed_mean_diff, "is", (count / float(num_shuffles)), ".")