#!/usr/bin/python
# Author: Dennis Shasha (shasha@courant.nyu.edu) 20150513

# THIS IS THE ONLY PART TO CHANGE: list the fold-change CSV files to analyze.
#inputfiles = ['FoldChange1.csv.tmp']
#inputfiles = ['GSE5612_FoldChange.csv','E-MEXP-1299_FoldChange.csv','NASCexp196_FoldChange.csv','GSE10016_FoldChange.csv','E-MEXP-1304_FoldChange.csv']
inputfiles = ['Final_FoldChange.csv']
# END OF PART TO CHANGE

# As of this writing (April 2015) the target gene is always the one at
# position 0 of the boolean data.
#
# We generate the data booldata ourselves rather than having it delivered,
# and we also generate the plausible sets of genes (the index sets) to test:
# either every subset of a whole range of indexes, or specific hand-picked
# sets.  The method measures impurity and can identify the correct triple
# of gene indexes that influence the target.
#
# Inference works on matrices whose columns are val1 .. valk, valk+1.
# We collect certain columns by index and ask how well the earlier members
# of a group predict the last member.  So the core function takes a set of
# "experiments" (each with k+1 columns) and a set of index sets, each a
# sequence of S numbers between 0 and k, where S is the number of putative
# causal elements.
#
# Algorithm, in two steps:
#   1. For each index set, walk every experiment and collect the values at
#      those columns into a dictionary keyed by the flattened index set,
#      e.g. for indexes 13, 4, 7, 91:
#        '13_4_7_91': [[values from first experiment],
#                      [values from second experiment], ...]
#   2. Walk each dictionary entry and evaluate its "purity".
# Purity: take all but the last element of each value-array, group rows by
# that prefix, and see how consistent the final (target) element is within
# each group.  For example, given rows
#   [0,0,0,1] [0,0,0,0] [0,0,1,1] [0,0,1,1] [0,1,0,0] [0,1,0,1] [0,1,0,0]
# total "impurity" = sum over groups of
#   (group size) * (1 - fraction holding the group's most frequent value)
# here 2 * (1-1/2) + 2 * 0 + 3 * (1-2/3).
# We want the combinations with minimum impurity.

import sys
import math
import csv
import os
import copy
import operator
import doctest
import collections
import datetime
import random
from operator import itemgetter, attrgetter
from itertools import groupby
# FIX: the "sets" module was deprecated in Python 2.6 and removed in
# Python 3; fall back to the builtin set so the file imports everywhere.
# (Set is not referenced elsewhere in this file, but the name is kept.)
try:
    from sets import Set
except ImportError:
    Set = set

sys.setrecursionlimit(20000)

now = datetime.datetime.now()
currentyear = now.year

# BASICS

# x is a list; return the index of the FIRST maximum element.
def argmax(x):
    # FIX: the original recomputed max(x) for every element inside a
    # comprehension (accidental O(n^2)); list.index on the hoisted max
    # gives the same first-occurrence answer in O(n).
    return x.index(max(x))

# x is a list; return the index of the first element equal to func(x)
# (e.g. func=min gives argmin).  func is assumed to be pure.
def argfunc(x, func):
    target = func(x)  # FIX: call func once, not once per element
    L = [i for i in range(len(x)) if x[i] == target]
    return L[0]

# APPLICATION-SPECIFIC

# flatten an array into a string by concatenating str() of each element
def flatten(arr):
    # FIX: str.join instead of repeated += (quadratic in the worst case)
    return "".join(str(a) for a in arr)

# fill in ansarray with independent values only (nothing on target):
# return every length-num combination of logic values, each mapped through
# convertlogicval.  Relies on the global numlogicvalues and on
# convertlogicval, both defined later in this file.
def genanswerarray(num):
    out = []
    i = 0
    while(i < numlogicvalues):
        if (num >= 2):
            # prefix every shorter combination with the current value
            for r in genanswerarray(num-1):
                oneval = [convertlogicval(i)]
                for y in r:
                    oneval.append(y)
                out.append(oneval)
        else:
            out.append([convertlogicval(i)])
        i+=1
    return out

# convert a logic val from 0..numlogicvalues-1 to something symmetric
# around 0 (the value shown is what 0 maps to):
# if numlogicvalues = 2 then 0 and 1, no change
# if numlogicvalues = 3 then -1, 0, 1, so -1
# if numlogicvalues = 4 then -1, 0, 1, 2, so -1
# if numlogicvalues = 5 then -2, -1, 0, 1, 2, so -2
# if numlogicvalues = 6 then -2, -1, 0, 1, 2, 3, so -2
# if numlogicvalues = 7 then
# ... -3, -2, -1, 0, 1, 2, 3, so -3
# (continuing the table above: shift the raw value band to centre near 0)
def convertlogicval(x):
    """Shift x from [0, numlogicvalues) down so the band is roughly
    symmetric around 0 (see the table above)."""
    half = int(math.floor((numlogicvalues - 1) / 2))
    return x - half

# fill in the ans dictionary from ansarray; for now the critical indexes
# are always the first numpre
def genans(ansarray):
    """Key the flattened first numpre values of each row to a uniformly
    random logic value."""
    table = {}
    for row in ansarray:
        key = flatten(row[:numpre])
        draw = random.randint(0, numlogicvalues - 1)
        table[key] = convertlogicval(draw)
    return table

# fill in the ans dictionary from ansarray, but in a monotonic manner
def genansmonotone(ansarray, hiflag, currenttargetval, pchange):
    """Like genans, but the assigned target value drifts monotonically:
    up when hiflag == 1, down when hiflag == 0, stepping with probability
    5*pchange per row and clamped to the legal logic-value range."""
    table = {}
    for row in ansarray:
        table[flatten(row[:numpre])] = currenttargetval
        if random.random() < 5 * pchange:
            if hiflag == 1 and currenttargetval < convertlogicval(numlogicvalues - 1):
                currenttargetval += 1
            if hiflag == 0 and currenttargetval > convertlogicval(0):
                currenttargetval -= 1
    return table

# generate boolean data based on the probability of correctness
def genbooldata():
    """Simulate numexperiments rows: numpre random predictor values, then
    the answer from ans (corrupted with probability 1 - probcorrect), then
    random filler values out to numtotalgenes columns."""
    rows = []
    for _ in range(numexperiments):
        exper = []
        for _ in range(numpre):
            exper.append(convertlogicval(random.randint(0, numlogicvalues - 1)))
        goodanswer = ans[flatten(exper)]
        if probcorrect >= random.random():
            exper.append(goodanswer)
        else:
            # deliberately take any answer other than the correct one
            wrong = convertlogicval(random.randint(0, numlogicvalues - 1))
            while wrong == goodanswer:
                wrong = convertlogicval(random.randint(0, numlogicvalues - 1))
            exper.append(wrong)
        for _ in range(numpre, numtotalgenes):
            exper.append(convertlogicval(random.randint(0, numlogicvalues - 1)))
        rows.append(exper)
    return rows

# given the boolean data and the sets of indexes (the genes to consider):
# the target is always index 0; for each index set, go through each row
# of booldata and see how well it does.
def findscores(booldata, indexsets):
    """Score every index set against the boolean data.

    For each index set, gather (chosen-column values + target value from
    column 0) out of every experiment row and compute the set's impurity.

    Returns a list of pairs [[myind], impurity].  Note the EXTRA list
    wrapped around myind: downstream code indexes the result as
    pair[0][0], so the wrapping must stay.
    """
    impurityscore = []
    for myind in indexsets:
        # per row: values at the chosen indexes, then the target (column 0)
        myset = []
        for myrow in booldata:
            vals = [myrow[i] for i in myind]
            vals.append(myrow[0])
            myset.append(vals)
        # FIX: dropped the original's unused local flatten(myind)
        impurityscore.append([[myind], evaluateimpurity(myset)])
    return impurityscore

# find the impurity in a set
def evaluateimpurity(myset):
    """Group rows by their flattened predictor prefix (all but the last
    element) and sum, per group, the number of rows that do NOT hold the
    group's majority target value.

    (1 - m/s) * s == s - m, where s = group size and m = majority count:
    i.e. the expected number of wrong predictions if we always answer with
    the group's plurality value.  Returns a float (int 0 for empty input).
    """
    mydict = {}
    for s in myset:
        mydict.setdefault(flatten(s[:-1]), []).append(s[-1])
    myimpurity = 0
    for myvals in mydict.values():
        # Counter replaces the original sort-then-count-runs loop; the
        # majority count m is identical either way.
        counts = collections.Counter(myvals)
        m = max(counts.values())
        myimpurity += (0.0 + len(myvals)) - m   # float, as in the original
    return myimpurity

# given the boolean data and a single set of indexes (genes to consider),
# target always index 0: find out what each boolean combination of inputs
# predicts about the output.
def findpredict(booldata, myind, impurityscore):
    """Build the best-prediction report for index set myind.

    Returns [xh, table] where xh is [header of each chosen column...,
    header of the target, overall fraction correct] and table comes from
    findplurality.  (impurityscore is accepted for interface compatibility
    but is not used here.)
    """
    xh = []
    for i in myind:
        xh.append(headers[i])
    xh.append(headers[0])
    # per row: values at the chosen indexes, then the target (column 0)
    myset = []
    for myrow in booldata:
        x = [myrow[i] for i in myind]
        x.append(myrow[0])
        myset.append(x)
    subpair = findplurality(myset)
    xh.append(subpair[1])  # overall fraction the plurality rule gets right
    return [xh, subpair[0]]

# find the basic prediction for each set of boolean input values
def findplurality(myset):
    """For each distinct predictor prefix, find the plurality target value.

    Returns (out, fraction_correct): each entry of out is
    [prefix, best value, majority count, fraction, p-value], sorted, and
    fraction_correct is the overall fraction of rows matching their
    group's plurality value (rounded to 2 places).
    """
    countcorrect = 0
    counttotal = 0
    mydict = {}
    for s in myset:
        mydict.setdefault(flatten(s[:-1]), []).append(s[-1])
    out = []
    # mydict is grouped by predictors
    for d in mydict:
        myvals = sorted(mydict[d])          # groupby needs sorted input
        mycounts = []
        myuniques = []
        for k, g in groupby(myvals, lambda v: v):
            mycounts.append(len(list(g)))
            myuniques.append(k)
        m = max(mycounts)
        countcorrect += m
        counttotal += len(myvals)
        bestval = myuniques[argmax(mycounts)]
        out.append([d, bestval, m, (m + 0.0) / len(myvals),
                    computepvalue(m, len(myvals), 1.0 / len(set(myvals)))])
    out.sort()
    return (out, round(((countcorrect + 0.0) / counttotal), 2))

# This generates index sets of size mynum from the list myinputindexes,
# with first elements taken from position i onward.
def genindexsets(mynum, myinputindexes, i):
    """Recursively enumerate the size-mynum subsets of myinputindexes[i:].

    Each subset is emitted with later indexes first (callers reverse them).

    BUG FIX: the loop bound was `i < len - mynum`, which silently dropped
    every subset whose first element sits at position len - mynum — e.g.
    for 3 indexes and mynum = 2 it produced only 2 of the 3 pairs (and
    returned [] when mynum == len, which the main script works around
    with a deepcopy special case).  The bound is now inclusive.
    """
    out = []
    if (mynum == 1):
        while(i < len(myinputindexes)):
            out.append([myinputindexes[i]])
            i += 1
        return out
    newmynum = mynum - 1
    while(i <= (len(myinputindexes) - mynum)):   # inclusive upper bound
        for x in genindexsets(newmynum, myinputindexes, i+1):
            x.append(myinputindexes[i])
            out.append(x)
        i += 1
    return out

# Decision tree portion

# computes the Shannon entropy (base 2) of a single vector
def entropy(vec):
    answercount = {}  # value -> occurrence count
    totlen = len(vec)
    for x in vec:
        if x in answercount:
            answercount[x] += 1.0
        else:
            answercount[x] = 1.0
    ent = 0
    for x in answercount:
        prob = answercount[x] / totlen
        ent += -prob * math.log(prob, 2)
    return ent

# computes the conditional entropy of vec2 depending
# computes the entropy of vec2 depending on vec1, i.e. H(vec2 | vec1)
def condentropy(vec1, vec2):
    # -1 is an error code for mismatched lengths; NOTE(review): callers in
    # buildtree compare the result against an entropy, where -1 would always
    # "win" — lengths always match there, but verify if reused elsewhere.
    if len(vec1) != len(vec2):
        return -1 # error
    totlen = len(vec1)
    answervec = {}   # vec1 value -> list of co-occurring vec2 values
    i = 0
    while i < totlen:
        x = vec1[i]
        if x in answervec:
            answervec[x].append(vec2[i])
        else:
            answervec[x] = []
            answervec[x].append(vec2[i])
        i+= 1
    # print "Debugging answervec: ", answervec
    # weighted sum of the entropy of vec2 within each vec1-group
    condent = 0
    for x in answervec:
        weight = (0.0+ len(answervec[x])) / totlen # cond entropy weight of x
        # print ("Debugging letter: "), x, (" has weight: "), weight
        condent += weight * entropy(answervec[x])
    return condent

# s are the distinct values in predictvec; we also get predictvec and target.
# We print the best predicted value for each such value v, the number of
# times it is correct, and the p-value assuming all target values are
# equally likely.  Recurses (via buildtree) while there is still room to
# split on further attributes.
def fillin(myindent, s, predictvec, target, mydict, bestk):
    out = []   # NOTE(review): never used — dead local kept as-is
    for v in s:
        # targets corresponding to this value of predict vec
        myvals_unsorted = [target[i] for i in range(len(target)) if predictvec[i] == v]
        myvals = sorted(myvals_unsorted)
        # run-length encode the sorted targets: counts + distinct values
        mycounts = []
        myuniques = []
        for k, g in groupby(myvals, lambda x:x):
            mycounts.append(len(list(g)))
            myuniques.append(k)
        j = argmax(mycounts)
        m = mycounts[j]          # size of the plurality
        bestval = myuniques[j]   # the plurality target value
        # myx is just indentation padding for the tree printout
        myx = ""
        r = 0
        while(r < myindent):
            myx+= " "
            r+=1
        if 1 < len(set(myvals)):
            print myx, headers[bestk].rjust(myindent), "=", v, " predicts ", bestval, m, " times out of ", len(myvals), " with prob correct: ",round((m+0.0)/len(myvals),3), " and p-value ", computepvalue(m, len(myvals), 1.0/len(set(myvals)))
        else:
            print myx, headers[bestk].rjust(myindent), "=", v, " predicts unique value: ", bestval, m, "times."
        if (2 < len(mydict)) and (1 < len(set(myvals))) and (1 < len(s)):
            # still room to recurse: restrict every remaining attribute to
            # the rows where the chosen attribute equals v, drop the chosen
            # attribute, and build the next tree level
            newmydict = {}
            for k in mydict:
                if (k != bestk):
                    newmydict[k] =[mydict[k][i] for i in range(len(mydict[k])) if mydict[bestk][i] == v]
            buildtree(myindent+4, newmydict)
    # print ' '

# find next level of decision tree
# Given booldata (the remaining uncertainty) and myind (indexes into
# booldata), find which of the columns in myind best predicts column 0.
# Then for each value of that column figure out the majority value and see
# whether you need to keep going.
def findpredictinfo(booldata, myind):
    xh = []
    mydict = {}
    for i in myind:
        xh.append(headers[i])
        mydict[i] = []
    xh.append(headers[0])
    mydict[0] = []
    # xh has headers of all columns including target plus the impurity score
    # mydict maps each column index (0 = target) to that column's values
    for myrow in booldata:
        x = []   # NOTE(review): x is built but never used — dead local
        for i in myind:
            x.append(myrow[i])
            mydict[i].append(myrow[i])
        mydict[0].append(myrow[0])
    buildtree(1, mydict)

# find the p-value of getting numbersuccess successes in numbertotal tries
# with per-try probability probsuccess, by Monte Carlo simulation
# (1000 trials; never returns exactly 0 — returns 1/numtries instead).
def computepvalue(numbersuccess, numbertotal, probsuccess):
    i = 0
    num = 0.0       # number of simulated runs at least as successful
    numtries = 1000.0
    # print 'numbersuccess is:', numbersuccess
    # print 'numbertotal is:', numbertotal
    # print 'probsuccess is:', probsuccess
    while i < numtries:
        mycount = 0
        j = 0
        while j < numbertotal:
            if (random.random() <= probsuccess):
                mycount+= 1
            j+= 1
        if (mycount >= numbersuccess):
            num+= 1.0
        i+= 1
    if (num == 0):
        return 1.0/numtries
    return num/numtries

# build a decision tree line at indent level and then
# recurse.
# mydict has the various variables, with mydict[0] representing the target.
# Pick the attribute whose conditional entropy most reduces the target's
# entropy, then delegate the per-value printing/recursion to fillin.
def buildtree(indentlevel, mydict):
    target = mydict[0]
    currentent = entropy(target) # entropy of target
    bestk = 0
    if (currentent > 0):
        p = []   # NOTE(review): never used — dead local kept as-is
        for k in mydict:
            # NOTE(review): condentropy is computed twice per winning k;
            # harmless but wasteful
            if (k != 0) and (condentropy(mydict[k], target) < currentent):
                bestk = k
                currentent = condentropy(mydict[k], target)
        # print "debugging: bestk = ", headers[bestk]
        # print "debugging: condentropy = ", condentropy(mydict[bestk], target)
        # print headers[bestk].rjust(indentlevel), " = "
        # NOTE(review): placement of the two lines below inside the
        # `if (currentent > 0)` branch is reconstructed — confirm against
        # the original layout.
        s = set(mydict[bestk])
        fillin(indentlevel, s, mydict[bestk], target, mydict, bestk)

# DATA
numlogicvalues = 3

for f in inputfiles:
    inputbool = f
    # NOTE(review): the file handle is never closed; 'rb' mode for csv is
    # Python-2 style
    boolin = csv.reader(open(inputbool, 'rb'), delimiter =',', quotechar='"')
    booldata = []
    i = 0
    # first row is headers; the first 3 columns of every row are skipped
    for b in boolin:
        if (i == 0):
            headers = copy.copy(b[3:])
            # print headers
        else:
            booldata.append(b[3:])
        i+= 1
    # print 'booldata is', booldata
    print ' '
    print 'File ', f, 'has len(booldata[0]) of', len(booldata[0])
    # EXECUTION
    # Excluding location 0 (the target), consider all other columns
    numpre = 5
    inputindexes = range(1,len(booldata[0]))
    print 'original inputindexes is', inputindexes
    k = 1
    # try predictor sets of growing size k = 1 .. numpre
    while (k <= numpre) and (k <= len(inputindexes)):
        print 'inputindexes is', inputindexes
        # This generates index sets of size k from the list inputindexes
        # starting at location 0 in inputindexes
        if (k == len(inputindexes)):
            # special case: the only size-k set is all the indexes
            x = copy.deepcopy(inputindexes)
            indexsets = [x]
        else:
            indexsets = genindexsets(k, inputindexes, 0)
        print 'len(indexsets) is', len(indexsets)
        # genindexsets emits each set with later indexes first; restore order
        for t in indexsets:
            t.reverse()
        print "k is : ", k
        print "Just before loop, indexsets: ", indexsets
        impscore = findscores(booldata, indexsets)
        # print 'impscores are: ', impscore
        # pick the index set with minimum impurity
        minscore = 10000
        bestpair = []
        print "Just before loop: impscore: ", impscore
        for mypair in impscore:
            if mypair[1] < minscore:
                bestpair = mypair[0] # the indexes
                minscore = mypair[1]
        print "Just before findpredict: bestpair: ", bestpair
        # NOTE(review): this rebinds the global name `ans` used by the
        # simulation helpers — presumably intentional, but verify
        ans = findpredict(booldata, bestpair[0], minscore)
        print ' Best predicting genes: ', ans[0][0:-2], ' for target: ', ans[0][-2], 'with percentage correct: ', ans[0][-1]
        print ' For set size = ', k, ' All gene sets that are at least close: '
        # collect every non-best index set whose score is within 40% of best
        alldecents = []
        for mypair in impscore:
            decents =[]
            decentindexes = []
            for x in mypair[0][0]:
                if (not mypair[0] == bestpair) and mypair[1] < 1.4 * minscore:
                    decents.append(headers[x])
                    decentindexes.append(x)
            # NOTE(review): append placement inside this `if` is a
            # reconstruction of ambiguous original indentation — confirm
            if mypair[1] < 1.4 * minscore:
                print " decents = ", decents, " with score ", mypair[1], " compared to minscore", minscore
                alldecents.append([mypair[1], decentindexes])
            # decent genes that are NOT already in the best set
            x = []
            for d in decentindexes:
                if not d in bestpair[0]:
                    x.append(headers[d])
            print "for k = ", k, " decent alternative genes = ", x
        # print "predictor vals, predicted target, number correct, percentage correct, pval"
        # for r in ans[1]:
        #     print r
        print 'Decision tree in lieu of a boolean circuit:'
        findpredictinfo(booldata, bestpair[0])
        k+= 1
        # prune the candidate pool for larger k: keep up to 14 indexes from
        # the best-scoring decent sets, topped up (to at most 12 extras)
        # from the old pool
        if (k >= 4) and (len(inputindexes) >= 14):
            alldecents.sort() # in ascending order by score
            x = []
            for a in alldecents:
                for ind in a[1]:
                    if not ind in x:
                        x.append(ind)
            x = x[:14]
            for i in inputindexes:
                if (not i in x) and (12 > len(x)):
                    x.append(i)
            inputindexes = copy.deepcopy(x)