#!/usr/bin/python ## Author: Dennis Shasha (shasha@courant.nyu.edu) 20180504 # If you use keeperfile, then numpre will start with k = 2 #from optparse import OptionParser # This adds the ability to take in a keeper file of those genes # that can be kept # Example: python infer2018.py -i FoldChange0.csv -k keeperfile import sys import math import csv import os import copy import operator import doctest import collections import datetime import random from operator import itemgetter, attrgetter from itertools import groupby from sets import Set sys.setrecursionlimit(20000) import optparse parser = optparse.OptionParser() usage = "usage: %prog [options]" parser.add_option('-i', type="string", help="input file (foldchange)", dest="foldchange") parser.add_option('-k', type="string", help="input file (keeperfile)", dest="keeperfile") parser.add_option('-n', type="int", help="nombre de TF", dest="nb") parser.add_option('-s', type="float", help="stringency", dest="stri") parser.add_option('-a', type="float", help="atleast", dest="atl") (options, args) = parser.parse_args() print("options: " + str(options)) print("args: " + str(args)) keeperflag = False if options.keeperfile == None: print("options.keeperfile is None: "+ str(options.keeperfile)) else: print("options.keeperfile is something: "+ str(options.keeperfile)) keepersall = csv.reader(open((options.keeperfile),'rb'), delimiter =',', quotechar='"') keeperflag = True for k in keepersall: keepers = copy.copy(k) print("keepers is: " + str(keepers)) if options.stri == None: stringency = 2.5 else: stringency = options.stri # fixed p-value so that the probability of an outcome is always 1/3 # This makes sense because the a priori probability of any outcome -1,0,1 # is 1/3. # pval_ceiling # for number of causative factors k: # we want at least 1 case where 0 ==> 0 with a pval <= pval_ceiling # we want at least 1 case where non-0 ==> non-0 with a pval <= pval_ceiling # non-0 here means all preconditions are non-0 # There should be two for each level of indentation. #pval_ceiling = 0.05 pval_ceiling = 0.05 # if onlyshow_belowceiling is True # then show the lines below the pvalue ceiling only # This removes the clutter though I don't recommend for final result. onlyshow_belowceiling = False ## Author: Dennis Shasha (shasha@courant.nyu.edu) 20160614 # Flag so we don't restrict our search to those that are important at first. # Flag trees in which the last gene is ALWAYS most influential when 0 # Why are we seeing the target gene in the display of the tree? # THIS IS THE ONLY PART TO CHANGE #inputfiles = ['FoldChange0.csv'] inputfiles = [options.foldchange] # nombre de TF #numpre = 1 numpre = options.nb ## moins stringent : ## Line 531: ## if (mypair[1] <= 1.01* minscore) and (not chosenbestpair == mypair[0]): ## Tu changes 1.01 a disons 1.1 #stringency = 4.0 # stringency = 2.5 # stringency = options.stri # how much allowance in impurity score; increasing stringency to 4 # for example allows additional gene sets of size k that # have 4 times the impurity of # the best set of size k. # 20160518: Remember the percentage correct of each set of genes and # consider a new set S only if its percentage error has been reduced by # the factor atleast # with respect to every one of its subsets. # So, for example atleast = 2 means that the error must be reduced # by a factor of 2. # So increasing this makes it harder for larger sets to be offered. #atleast = 1.4 #atleast = 1.2 #atleast = 0.9 atleast = options.atl # filtercandidates = True means that a set of size k = 4 or greater # will be considered # only if at least one of the genes in that set has been in a set of # smaller size. filtercandidates = False means that all genes will # be considered # Note that every gene will be a candidate for sets of size 3 or less. filtercandidates = False # elite = True means that a set of size k+1 will be made up only of # genes each of which is in some set of size k. This is stronger # than filtercandidates = True and applies to all k > 1. # elite = False doesn't do that elite = True # END OF PART TO CHANGE # As of this writing April, 2015, the target gene is always the one at position # 0 of the boolean data. # We are generating data booldata rather than having it delivered to us. # Also we generate the plausible sets of genes (corresponding to myindexes) # that we want to test. We have two ways of doing this: one just takes # a whole set of indexes and generates the necessary tests. # The other takes specific sets. # The method works, measures impurity properly # and can identify the correct triple of genes (indexes) that influence # the target. # We are doing inference on matrices having columns of the form # val1, val2, val3, ... valk, valk+1 # where we are going to collect certain columns by indexes and then # see whether we have good purity when inferring the last in a group # from earlier members in a group. # So our function will take a set of "experiments" each consisting of k+1 # columns and a set of index sets cach consisting of some sequence of # size S of # numbers between 0 and k # where S is the number of putative causal elements. # Algorithm works in two steps: # for each index set # go through each experiment and collect in a dictionary all the # values corresponding to that index set, e.g. for index set 13, 4, 7, 91 # you'll collect the values in those columns and index them by that index # set. # At the end, we'll have # '13_4_7_91': [ [values from first experiment], # [values from second experiment], ...] # Then we'll go through each element of this dictionary # and evaluate "purity". Purity means we take all but the last element # for each array of values and create a hash structure of those. Then # we'll see how often we get a consistent value for each of them. # For example, if we have # [0, 0, 0, 1] # [0, 0, 0, 0] # [0, 0, 1, 1] # [0, 0, 1, 1] # [0, 1, 0, 0] # [0, 1, 0, 1] # [0, 1, 0, 0] # etc. then the total "impurity" equals sum number of elements having # the same values of the three first attributes (in this case) * (1-fraction # of values having the most frequent value). # In this case 2 * (1-1/2) + 2 * 0 + 3 * (1-2/3). # We want combinations with minimum impurity. now = datetime.datetime.now() currentyear = now.year # BASICS # x is a list and we want index of max element # comprehension def argmax(x): L = [i for i in range(len(x)) if x[i] == max(x)] return L[0] # x is a list and we want index of func element def argfunc(x, func): L = [i for i in range(len(x)) if x[i] == func(x)] return L[0] # APPLICATION-SPECIFIC # flatten an array into a string def flatten(arr): out="" for a in arr: out+=str(a)+"_" return out[:-1] # fill in ansarray with independent values only (nothing on target) def genanswerarray(num): out = [] i = 0 while(i < numlogicvalues): oneval = [] if (num >= 2): g = genanswerarray(num-1) for r in g: oneval=[convertlogicval(i)] for y in r: oneval.append(y) out.append(oneval) else: out.append([convertlogicval(i)]) i+=1 return out # convert logic val from 0 to numlogicvalues to something symmetric around 0 # if numlogicvalues = 2 then 0 and 1, no change # if numlogicvalues = 3 then -1, 0, 1 , so -1 # if numlogicvalues = 4 then -1, 0, 1, 2, so -1 # if numlogicvalues = 5 then -2, -1, 0, 1, 2, so -2 # if numlogicvalues = 6 then -2, -1, 0, 1, 2, 2, so -2 # if numlogicvalues = 7 then -3, -2, -1, 0, 1, 2, 3, so -3 def convertlogicval(x): d = math.floor((numlogicvalues - 1)/2) return x - int(d) # fill in ans dictionary from ansarray # for now the critical index are always the first numpre def genans(ansarray): ans = {} for y in ansarray: r = y[:numpre] x = random.randint(0,numlogicvalues-1) ans[flatten(r)] = convertlogicval(x) return ans # fill in ans dictionary from ansarray but in a monotonic manner def genansmonotone(ansarray, hiflag, currenttargetval, pchange): ans = {} for y in ansarray: r = y[:numpre] ans[flatten(r)] = currenttargetval if (random.random() < 5*pchange): if (hiflag == 1) & (currenttargetval < convertlogicval(numlogicvalues - 1)): currenttargetval+= 1 if (hiflag == 0) & (currenttargetval > convertlogicval(0)): currenttargetval-= 1 return ans # generate boolean data based on probability of correctness def genbooldata(): i = 0 row = [] while(i < numexperiments): exper = [] j = 0 while(j < numpre): x = random.randint(0,numlogicvalues-1) x = convertlogicval(x) exper.append(x) j+=1 goodanswer = ans[flatten(exper)] if (probcorrect >= random.random()): exper.append(goodanswer) else: # take another answer x = random.randint(0,numlogicvalues-1) x = convertlogicval(x) while(x == goodanswer): x = random.randint(0,numlogicvalues-1) x = convertlogicval(x) exper.append(x) while(j < numtotalgenes): # continue with j x = random.randint(0,numlogicvalues-1) x = convertlogicval(x) exper.append(x) j+= 1 row.append(exper) i+= 1 return row # given the boolean data and the sets of indexes # (reflecting the genes to consider) # Our target is always index 0. # for each index set we're going to go through each # row of booldata and see how well it does. def findscores(booldata, indexsets): impurityscore = [] for myind in indexsets: pair = [[myind]] myindflat = flatten(myind) myset = [] for myrow in booldata: x = [] for i in myind: x.append(myrow[i]) x.append(myrow[0]) myset.append(x) y = evaluateimpurity(myset) pair.append(y) impurityscore.append(pair) return impurityscore # find the impurity in a set def evaluateimpurity(myset): mydict = {} for s in myset: x = flatten(s[:-1]) y = s[-1] if x in mydict: mydict[x].append(y) else: mydict[x]=[y] myimpurity = 0 for d in mydict: myvals = sorted(mydict[d]) counts = [] current = myvals[0] x = 0 for v in myvals: if (v == current): x+= 1 else: counts.append(x) current = v x = 1 counts.append(x) m = max(counts) s = 0.0 + len(myvals) # myimpurity += (1 - (m/s)) * s myimpurity += s - m # prob of being incorrect within this group # of input values times size of group # Note that (1 - (m/s)) is prob of being incorrect # within the group (anything but majority) and s # is size of group. (1 - (m/s)) * s = ((s-m)/s)*s # = s- m return myimpurity # given the boolean data and a single set of indexes # (reflecting the genes to consider) # Our target is always index 0. # find out what each boolean combination of inputs predicts about the output. def findpredict(booldata, myind, impurityscore, oldnumgoodpval): xh = [] for i in myind: xh.append(headers[i]) xh.append(headers[0]) # xh has headers of all columns including target plus the impurity score myset = [] for myrow in booldata: x = [] for i in myind: x.append(myrow[i]) x.append(myrow[0]) myset.append(x) # all values pertaining to these indexes subtrip = findplurality(myset, oldnumgoodpval) y = subtrip[0] xh.append(subtrip[1]) trip = [xh, y, subtrip[2]] return trip # find the basic prediction for each set of boolean input values def findplurality(myset, oldnumgoodpval): out = [] countcorrect = 0 counttotal = 0 mydict = {} numgoodpval = oldnumgoodpval for s in myset: x = flatten(s[:-1]) y = s[-1] if x in mydict: mydict[x].append(y) else: mydict[x]=[y] # mydict is grouped by predictors for d in mydict: myvals = sorted(mydict[d]) mycounts = [] myuniques = [] for k, g in groupby(myvals, lambda x:x): mycounts.append(len(list(g))) myuniques.append(k) m = max(mycounts) countcorrect+= m counttotal+= len(myvals) j = argmax(mycounts) bestval = myuniques[j] mypval = computepvalue(m, len(myvals), 1.0/3.0) # out.append([d, bestval, m, (m+0.0)/len(myvals), computepvalue(m, len(myvals), 1.0/len(set(myvals)))]) out.append([d, bestval, m, (m+0.0)/len(myvals), mypval]) dsplit = [int(x) for x in d.split("_")] dzero = (min(dsplit) == 0) and (max(dsplit) == 0) dnonzero = (min(dsplit) != 0) and (max(dsplit) != 0) if (int(bestval) == 0) and dzero and (mypval <= pval_ceiling): numgoodpval+= 1 if (int(bestval) != 0) and dnonzero and (mypval <= pval_ceiling): numgoodpval+= 1 out.sort() return (out, round(((countcorrect+0.0)/counttotal),2), numgoodpval) # This generates index sets of size mynum from the list inputindexes # starting at i def genindexsets(mynum, myinputindexes, i): out = [] if (mynum == 1): while(i < len(myinputindexes) ): out.append([myinputindexes[i]]) i+= 1 return out newmynum = mynum - 1 # 9 avril 2018 - correction Geoffrey # while(i < (len(myinputindexes) - mynum)): while(i < (len(myinputindexes) - newmynum)): listoflists = genindexsets(newmynum, myinputindexes, i+1) for x in listoflists: if (x != None): x.append(myinputindexes[i]) out.append(x) i+= 1 return out # Decision tree portion # computes the entropy of a single vector def entropy(vec): answercount = {} # empty dictionary totlen = len(vec) for x in vec: if x in answercount: answercount[x] += 1.0 else: answercount[x] = 1.0 ent = 0 for x in answercount: prob = (answercount[x])/totlen ent += - prob * math.log(prob,2) return ent # computes the entropy of vec2 depending on vec1 def condentropy(vec1, vec2): if len(vec1) != len(vec2): return -1 # error totlen = len(vec1) answervec = {} i = 0 while i < totlen: x = vec1[i] if x in answervec: answervec[x].append(vec2[i]) else: answervec[x] = [] answervec[x].append(vec2[i]) i+= 1 condent = 0 for x in answervec: weight = (0.0+ len(answervec[x])) / totlen # cond entropy weight of x condent += weight * entropy(answervec[x]) return condent # s are values in predictvec and then we have predictvec and target # We print out the best predicted value and the number of times that is correct # and the p-value assuming all values in target are equally likely # For each s we identify a best target value and then list that and the p-val def fillin(zeromajflag, myindent, s, predictvec, target, mydict, bestk): out = [] for v in s: # targets corresponding to this value of predict vec myvals_unsorted = [target[i] for i in range(len(target)) if predictvec[i] == v] myvals = sorted(myvals_unsorted) mycounts = [] myuniques = [] for k, g in groupby(myvals, lambda x:x): mycounts.append(len(list(g))) myuniques.append(k) j = argmax(mycounts) m = mycounts[j] bestval = myuniques[j] myx = "" r = 0 while(r < myindent): myx+= " " r+=1 if (1 < len(set(myvals))) or (not bestk == 0): if zeromajflag == False: mypval = computepvalue(m, len(myvals), 1.0/3.0) if (onlyshow_belowceiling == False) or (mypval <= pval_ceiling): print myx, headers[bestk].rjust(myindent), "=", v, "correctly predicts", bestval, m, "times out of", len(myvals), "with prob correct:",round((m+0.0)/len(myvals),3), "and p-value", mypval if False and 1 == len(set(myvals)): print "SINGLE VALUE: ", set(myvals) if zeromajflag == True: mypval = computepvalue(m, len(myvals), 1.0/3.0) if (onlyshow_belowceiling == False) or (mypval <= pval_ceiling): print myx, headers[bestk].rjust(myindent), "=", v, "correctly predicts", bestval, m, "times out of", len(myvals), "with prob correct:",round((m+0.0)/len(myvals),3), "and p-value", mypval, " ZeroMajority" if False and 1 == len(set(myvals)): print "SINGLE VALUE (ZeroMajority): ", set(myvals) if (2 < len(mydict)) and (1 < len(set(myvals))) and (1 < len(s)): # still room to recurse newmydict = {} for k in mydict: if (k != bestk): newmydict[k] =[mydict[k][i] for i in range(len(mydict[k])) if mydict[bestk][i] == v] # buildtree(zeromajflag, myindent+4, newmydict) buildtree(zeromajflag, myindent+2, newmydict) # s are values in predictvec and then we have predictvec and target # We compute the best predicted value and the number of times that is correct. # For each s we identify a best target value and then list that and the p-val def zerotest_fillin(myindent, s, predictvec, target, mydict, bestk): outvals = [] outcounts = [] for v in s: # targets corresponding to this value of predict vec myvals_unsorted = [target[i] for i in range(len(target)) if predictvec[i] == v] outvals.append(v) outcounts.append(len(myvals_unsorted)) if (2 < len(mydict)) and (1 < len(set(myvals_unsorted))) and (1 < len(s)): # still room to recurse newmydict = {} for k in mydict: if (k != bestk): newmydict[k] =[mydict[k][i] for i in range(len(mydict[k])) if mydict[bestk][i] == v] # zerotest_buildtree(myindent+4, newmydict) zerotest_buildtree(myindent+2, newmydict) i = argmax(outcounts) zeromajvectorindent.append(myindent) if (outvals[i] == '0') and (outcounts[i] >= 0.5 * sum(outcounts)): zeromajvectortruthval.append(True) else: zeromajvectortruthval.append(False) # find next level of decision tree # Given booldata (which is the remaining uncertainty) and myinds in booldata, try to find out which # of the columns in myinds best predict column 0 in booldata. # Then for each value of that column try to figure out the majority value in that column and see whether # you need to keep going. def findpredictinfo(zeromajflag, booldata, myind): xh = [] mydict = {} for i in myind: xh.append(headers[i]) mydict[i] = [] xh.append(headers[0]) mydict[0] = [] # xh has headers of all columns including target plus the impurity score for myrow in booldata: x = [] for i in myind: x.append(myrow[i]) mydict[i].append(myrow[i]) mydict[0].append(myrow[0]) buildtree(zeromajflag, 1, mydict) zeromajvectortruthval = [] zeromajvectorindent = [] # Just test if the very last level of the tree consists of zero in the majority # if so return True otherwise False # Uses the global zeromajvector def zerotest_findpredictinfo(booldata, myind): xh = [] mydict = {} for i in myind: xh.append(headers[i]) mydict[i] = [] xh.append(headers[0]) mydict[0] = [] # xh has headers of all columns including target plus the impurity score for myrow in booldata: x = [] for i in myind: x.append(myrow[i]) mydict[i].append(myrow[i]) mydict[0].append(myrow[0]) zerotest_buildtree(1, mydict) maxindent = max(zeromajvectorindent) # return truth values this entire decision tree for i in range(len(zeromajvectorindent)): if (zeromajvectorindent[i] == maxindent) and (zeromajvectortruthval[i] == False): return False return True # find the p-value of getting that number of successes in # numbertotal tries with probability probsuccess def computepvalue(numbersuccess, numbertotal, probsuccess): i = 0 num = 0.0 numtries = 1000.0 while i < numtries: mycount = 0 j = 0 while j < numbertotal: if (random.random() <= probsuccess): mycount+= 1 j+= 1 if (mycount >= numbersuccess): num+= 1.0 i+= 1 if (num == 0): return 1.0/numtries return num/numtries # build a decision tree line at indent level and then # recurse. mydict has the various variables with mydict[0] representing # the target def buildtree(zeromajflag, indentlevel, mydict): target = mydict[0] currentent = entropy(target) # entropy of target bestk = 0 if (currentent > 0): p = [] for k in mydict: if (k != 0) and (condentropy(mydict[k], target) < currentent): bestk = k currentent = condentropy(mydict[k], target) if False and bestk == 0: print "debugging: mydict is: ", mydict print "debugging: bestk = ", headers[bestk] print "debugging: condentropy = ",condentropy(mydict[bestk], target) print "debugging: currentent = ", currentent s = set(mydict[bestk]) fillin(zeromajflag, indentlevel, s, mydict[bestk], target, mydict, bestk) # test whether the last level of the decision tree has majority 0 cases def zerotest_buildtree(indentlevel, mydict): target = mydict[0] currentent = entropy(target) # entropy of target bestk = 0 returnval = True # if this is the last level then by default the answer is that we have # majority 0 cases if (currentent == 0): print 'currentent is 0 case in zerotest_buildtree' return True if (currentent > 0): p = [] for k in mydict: if (k != 0) and (condentropy(mydict[k], target) < currentent): bestk = k currentent = condentropy(mydict[k], target) s = set(mydict[bestk]) x = zerotest_fillin(indentlevel, s, mydict[bestk], target, mydict, bestk) return ([indentlevel, x]) # Only if this list of genes reduces error by the factor atleast # compared with its subsets already in memor do we return True def betterenough(setofgenes, percentcorrect): for m in memo: if (m[0].issubset(setofgenes)) and ((1-m[1])/atleast) <= (1-percentcorrect): return False return True # keep only the genes that are good def slimindexes(headers, keepers): goodindexes = [] i = 0 while (i < len(headers)): if headers[i] in keepers: goodindexes.append(i) i+= 1 print("goodindexes is: " + str(goodindexes)) return goodindexes # DATA memo = [] numlogicvalues = 3 for f in inputfiles: inputbool = f boolin = csv.reader(open(inputbool, 'rb'), delimiter =',', quotechar='"') booldata = [] i = 0 for b in boolin: if (i == 0): headers = copy.copy(b[3:]) if options.keeperfile != None: goodindexes = slimindexes(headers, keepers) headers = copy.copy([headers[i] for i in goodindexes]) print("headers is: " + str(headers)) # print headers else: if options.keeperfile == None: booldata.append(b[3:]) else: print("b[3:] is: " + str(b[3:])) print("goodindexes is: " + str(goodindexes)) booldata.append([(b[3:])[i] for i in goodindexes]) i+= 1 print 'File',f,'( genes:',len(booldata[0]),'- gene names:',len(headers),')' # EXECUTION # Excluding location 0 # numpre = 5 inputindexes = range(1,len(booldata[0])) k = 1 if (keeperflag == True): # if using the keeperfile, then start with k = 2 k = 2 while (k <= numpre): indexsets = genindexsets(k, inputindexes, 0) for t in indexsets: t.reverse() impscore = findscores(booldata, indexsets) minscore = 10000 # bestpair = [] # for mypair in impscore: # if mypair[1] < minscore: # bestpair = mypair[0] # the indexes # minscore = mypair[1] # if (0 < len(bestpair)): # ans = findpredict(booldata, bestpair[0], minscore, 0) # goaheadflag = True # else: # goaheadflag = False # if goaheadflag and ((1 == len(ans[0][0:-2])) or (betterenough(set(ans[0][0:-2]), ans[0][-1]))): # print ' ' # print '####' # print 'Best predicting genes:', ans[0][0:-2], 'for target:', ans[0][-2], 'with percentage correct:', ans[0][-1] # memo.append([set(ans[0][0:-2]), ans[0][-1]]) # zeromajvectortruthval = [] # zeromajvectorindent = [] # zeromajorityflag = zerotest_findpredictinfo(booldata, bestpair[0]) # findpredictinfo(zeromajorityflag, booldata, bestpair[0]) # if goaheadflag: # chosenbestpair = copy.deepcopy(bestpair) alldecents = [] for mypair in impscore: decents =[] decentindexes = [] if (mypair[1] <= stringency* minscore): ans = findpredict(booldata, mypair[0][0], minscore, 0) if ((1==len(ans[0][0:-2])) or (betterenough(set(ans[0][0:-2]),ans[0][-1]))) and (ans[2] >= 2) : memo.append([set(ans[0][0:-2]), ans[0][-1]]) zeromajvectortruthval = [] zeromajvectorindent = [] zeromajorityflag = zerotest_findpredictinfo(booldata, mypair[0][0]) print ' ' if zeromajorityflag == True: print 'All predicting genes:', ans[0][0:-2], 'for target:', ans[0][-2], 'with percentage correct:', ans[0][-1], 'ZeroMajority' elif zeromajorityflag == False: print 'Alternative predicting genes:', ans[0][0:-2], 'for target:', ans[0][-2], 'with percentage correct:', ans[0][-1] findpredictinfo(zeromajorityflag, booldata, mypair[0][0]) for x in mypair[0][0]: decents.append(headers[x]) decentindexes.append(x) alldecents.append([mypair[1], decentindexes]) x = [] print ' ' # print '===',decentindexes for d in decentindexes: if not d in mypair[0]: x.append(headers[d]) k+= 1 if (k >= 4) and (len(inputindexes) >= 14) and (filtercandidates): alldecents.sort() # in ascending order by score x = [] for a in alldecents: for ind in a[1]: if not ind in x: x.append(ind) x = x[:14] for i in inputindexes: if (not i in x) and (12 > len(x)): x.append(i) inputindexes = copy.deepcopy(list(set(x))) elif elite: xout = [] for x in alldecents: xout.extend(x[1]) inputindexes = copy.deepcopy(list(set(xout)))