#!/usr/bin/python

######################################
# Linear Correlation Significance Test
# From: Statistics is Easy! By Dennis Shasha and Manda Wilson
#
# Assuming that x is not a good predictor of y, tests to see the probability
# of getting a r by chance alone greater than or equal to the one observed (less
# than or equal to the one observed if the observed r is negative).
# Uses shuffling to get a distribution to compare
# to the observed r.
#
# Author: Manda Wilson
#
# Example of FASTA formatted input file:
# >grp_x
# 1350 1510 1420 1210 1250 1300 1580 1310 1290 1320 1490 1200 1360
# >grp_y
# 3.6 3.8 3.7 3.3 3.9 3.4 3.8 3.7 3.5 3.4 3.8 3.0 3.1
# 
# Pseudocode:
#
# 1. Calculate r for the observed values (in our example it is .58).
#    a. mean_x = sum of x values / number of x values
#    b. mean_y = sum of y values / number of y values
#    c. Calculate the sum of products:
#       i. Initialize total to 0
#       ii. For each pair of x, y values:
#           I. product = (x - mean_x) * (y - mean_y)
#           II. total += product
#    d. Calculate the sum of squares for the x values:
#       i. Initialize total to 0
#       ii. For each x value:
#           I. diff_sq = (x - mean_x)^2
#           II. total += diff_sq
#    e. Calculate the sum of squares for the y values as we did for the x values in step (1d) 
#    f. r = sum of products / square root(sum of squares x  * sum of squares y) 
#
# 2. Set a counter to 0, this will count the number of times we get a r 
#    greater than or equal to 0.58.  Note: if you have a negative r, count
#    the number of times you get a slope less than or equal to the original negative r.
#
# 3. Do the following 10,000 times:
#    a. Shuffle the y values.
#    b. Calculate r on the results from step (3a), just as we did in step (1)
#    c. If r from step (3b) is greater than or equal to our observed r (0.58),
#       increment our counter from step (2).
#
# 3. counter / 10,000 equals the probability of getting a r greater than or equal to
#    0.58, assuming there is no difference between the groups
#
######################################

import math
import random

######################################
#
# Adjustable variables
#
######################################

input_file = 'Correlation.vals'

######################################
#
# Subroutines
#
######################################

def sumofsq(vals, mean):
	count = len(vals)
	total = 0
	for i in range (count):
		diff_sq = (vals[i] - mean)**2
		total += diff_sq
	return total

def sumofproducts(x_vals, y_vals,  mean_x, mean_y):
	count = len(x_vals)
	total = 0
	for i in range (count):
		product = (x_vals[i] - mean_x) * (y_vals[i] - mean_y)
		total += product
	return total

def corrcoef(x, y):
	sum_x = sum(x)
	sum_y = sum(y)
	count_x = float(len(x))
	count_y = float(len(y))
	mean_x = sum_x / count_x
	mean_y = sum_y / count_y

	# get the sum of products
	sum_of_prod = sumofproducts(grp_x, grp_y, mean_x, mean_y)

	# get the sum of squares for x and y
	sum_of_sq_x = sumofsq(grp_x, mean_x)
	sum_of_sq_y = sumofsq(grp_y, mean_y)

	return sum_of_prod / math.sqrt(sum_of_sq_x * sum_of_sq_y)


######################################
#
# Computations
#
######################################
# list of lists
samples = []

# file must be in FASTA format
infile=open(input_file)
for line in infile:
        if line.startswith('>'):
                # start of new sample
                samples.append([])
        elif not line.isspace():
                # line must contain values for previous sample
                samples[len(samples) - 1] += map(float,line.split())
infile.close()

grp_x = samples[0]
grp_y = samples[1]

observed_r = corrcoef(grp_x, grp_y)

count = 0
num_shuffles = 10000

for i in range(num_shuffles):
	# we want to break the relationship between the pairs, so just shuffle one group 
	random.shuffle(grp_y)
	r = corrcoef(grp_x, grp_y)
	if (observed_r > 0 and r >= observed_r) or (observed_r < 0 and r <= observed_r):
		count = count + 1

######################################
#
# Output
#
######################################

print "Observed r: %.2f" % observed_r 
print count, "out of 10000 experiments had a r",
if observed_r > 0:
	print "greater than or equal to",
else:
	print "less than or equal to",
print "%.2f" % observed_r
print "Probability that chance alone gave us a r",
if observed_r > 0:
	print "greater than or equal to",
else:
	print "less than or equal to",
print "%.2f" %observed_r, "is %.2f" % (count / float(num_shuffles))