#!/usr/bin/python
######################################
# Matrix comparison significance test
# By Jacopo Cirrone and Dennis Shasha
#
# Given two arrays, compare every two locations to determine whether
# they have the same relation to one another in the two arrays, i.e.
# for all i < j compare v1[i] <= v1[j] with v2[i] <= v2[j].
# If they are both true or both false, then add one to the similarity score.
# Otherwise, do not.
# Then shuffle one array numshuffles times (1000 as configured below) and
# recompute the score each time.
# The fraction of shuffles scoring at least as high as the original
# arrangement is reported as the p-value.
#

import random

input_file = "vivinput"

######################################
#
# Subroutines
#
######################################


def shuffle(myarray):
    """Shuffle ``myarray`` in place and return that same list object."""
    random.shuffle(myarray)
    return myarray


def countsim(arr1, arr2):
    """Count index pairs whose ordering agrees between the two arrays.

    For every pair of positions i < j, the pair scores one point when
    ``arr1[i] <= arr1[j]`` and ``arr2[i] <= arr2[j]`` both hold, or when
    ``arr1[i] > arr1[j]`` and ``arr2[i] > arr2[j]`` both hold.

    Returns the number of agreeing pairs. If the arrays differ in length,
    prints a message and returns 0.
    """
    if len(arr1) != len(arr2):
        print("Array lengths are not the same.")
        return 0
    size = len(arr1)
    mysim = 0
    for i in range(size):
        for j in range(i + 1, size):
            # The two tests are spelled out (rather than comparing the two
            # booleans) so incomparable values such as NaN never score.
            if arr1[i] <= arr1[j] and arr2[i] <= arr2[j]:
                mysim += 1
            elif arr1[i] > arr1[j] and arr2[i] > arr2[j]:
                mysim += 1
    return mysim


def findpvalue(orig1, orig2, numshuffles):
    """Estimate a p-value for the similarity of two arrays by permutation.

    The similarity of ``orig1`` and ``orig2`` is computed with ``countsim``.
    Then, ``numshuffles`` times, a random permutation of ``orig2`` is scored
    against ``orig1``. ``orig2`` itself is not modified.

    Returns the fraction of permutations whose score is greater than or
    equal to the original score. If the arrays differ in length, prints a
    message and returns 1. Raises ZeroDivisionError if ``numshuffles`` is 0.
    """
    if len(orig1) != len(orig2):
        print("Array lengths are not the same.")
        return 1
    origcount = countsim(orig1, orig2)
    countbetter = 0
    for _ in range(numshuffles):
        # random.sample with k == len(...) yields a shuffled copy.
        randarr = random.sample(orig2, k=len(orig2))
        if countsim(orig1, randarr) >= origcount:
            countbetter += 1
    # float() keeps this a true division even if the shebang resolves to a
    # Python 2 interpreter, where int / int would truncate to 0.
    return float(countbetter) / numshuffles


######################################
#
# Computations
#
######################################

numshuffles = 1000

# file must be in format name followed by numbers (e.g.
# a four by four matrix will have 16 numbers).
# Each input line holds a measurement name followed by that measurement's
# numbers, all separated by whitespace.
allnames = []
allmat = []
# The context manager closes the file even if a value fails to parse.
with open(input_file, 'r') as infile:
    for line in infile:
        # split() with no argument discards the trailing newline and treats
        # runs of blanks as one separator, so a final line without a newline
        # keeps its last character and doubled spaces do not yield empty
        # fields that float() would reject.
        fields = line.split()
        if not fields:
            # Blank lines carry no measurement.
            continue
        allnames.append(fields[0])
        allmat.append([float(e) for e in fields[1:]])

# Compare every unordered pair of measurements exactly once.
for i in range(len(allmat)):
    for j in range(i + 1, len(allmat)):
        pval = findpvalue(allmat[i], allmat[j], numshuffles)
        print("Measurements " + allnames[i] + " and " + allnames[j] + " have similarity p-value of: " + str(pval))