#!/usr/bin/python
# Author: Dennis Shasha (shasha@courant.nyu.edu) 20150513

# THIS IS THE ONLY PART TO CHANGE: list the fold-change CSV files to analyze.
#inputfiles = ['FoldChange1.csv.tmp']
#inputfiles = ['GSE5612_FoldChange.csv','E-MEXP-1299_FoldChange.csv','NASCexp196_FoldChange.csv','GSE10016_FoldChange.csv','E-MEXP-1304_FoldChange.csv']
inputfiles = ['Final_FoldChange.csv']
# END OF PART TO CHANGE

# As of this writing (April 2015) the target gene is always the one at
# position 0 of the boolean data.
#
# We generate the data booldata ourselves rather than having it delivered,
# and we also generate the plausible sets of genes (the index sets) to test:
# either every subset of a whole range of indexes, or specific hand-picked
# sets.  The method measures impurity and can identify the correct triple
# of gene indexes that influence the target.
#
# Inference works on matrices whose columns are val1 .. valk, valk+1.
# We collect certain columns by index and ask how well the earlier members
# of a group predict the last member.  So the core function takes a set of
# "experiments" (each with k+1 columns) and a set of index sets, each a
# sequence of S numbers between 0 and k, where S is the number of putative
# causal elements.
#
# Algorithm, in two steps:
#   1. For each index set, walk every experiment and collect the values at
#      those columns into a dictionary keyed by the flattened index set,
#      e.g. for indexes 13, 4, 7, 91:
#        '13_4_7_91': [[values from first experiment],
#                      [values from second experiment], ...]
#   2. Walk each dictionary entry and evaluate its "purity".
# Purity: take all but the last element of each value-array, group rows by
# that prefix, and see how consistent the final (target) element is within
# each group.  For example, given rows
#   [0,0,0,1] [0,0,0,0] [0,0,1,1] [0,0,1,1] [0,1,0,0] [0,1,0,1] [0,1,0,0]
# total "impurity" = sum over groups of
#   (group size) * (1 - fraction holding the group's most frequent value)
# here 2 * (1-1/2) + 2 * 0 + 3 * (1-2/3).
# We want the combinations with minimum impurity.

import sys
import math
import csv
import os
import copy
import operator
import doctest
import collections
import datetime
import random
from operator import itemgetter, attrgetter
from itertools import groupby
# FIX: the "sets" module was deprecated in Python 2.6 and removed in
# Python 3; fall back to the builtin set so the file imports everywhere.
# (Set is not referenced elsewhere in this file, but the name is kept.)
try:
    from sets import Set
except ImportError:
    Set = set

sys.setrecursionlimit(20000)

now = datetime.datetime.now()
currentyear = now.year

# BASICS

# x is a list; return the index of the FIRST maximum element.
def argmax(x):
    # FIX: the original recomputed max(x) for every element inside a
    # comprehension (accidental O(n^2)); list.index on the hoisted max
    # gives the same first-occurrence answer in O(n).
    return x.index(max(x))

# x is a list; return the index of the first element equal to func(x)
# (e.g. func=min gives argmin).  func is assumed to be pure.
def argfunc(x, func):
    target = func(x)  # FIX: call func once, not once per element
    L = [i for i in range(len(x)) if x[i] == target]
    return L[0]

# APPLICATION-SPECIFIC

# flatten an array into a string by concatenating str() of each element
def flatten(arr):
    # FIX: str.join instead of repeated += (quadratic in the worst case)
    return "".join(str(a) for a in arr)

# fill in ansarray with independent values only (nothing on target):
# return every length-num combination of logic values, each mapped through
# convertlogicval.  Relies on the global numlogicvalues and on
# convertlogicval, both defined later in this file.
def genanswerarray(num):
    out = []
    i = 0
    while(i < numlogicvalues):
        if (num >= 2):
            # prefix every shorter combination with the current value
            for r in genanswerarray(num-1):
                oneval = [convertlogicval(i)]
                for y in r:
                    oneval.append(y)
                out.append(oneval)
        else:
            out.append([convertlogicval(i)])
        i+=1
    return out

# convert a logic val from 0..numlogicvalues-1 to something symmetric
# around 0 (the value shown is what 0 maps to):
# if numlogicvalues = 2 then 0 and 1, no change
# if numlogicvalues = 3 then -1, 0, 1, so -1
# if numlogicvalues = 4 then -1, 0, 1, 2, so -1
# if numlogicvalues = 5 then -2, -1, 0, 1, 2, so -2
# if numlogicvalues = 6 then -2, -1, 0, 1, 2, 3, so -2
# if numlogicvalues = 7 then
# ... -3, -2, -1, 0, 1, 2, 3, so -3
# (continuing the table above: shift the raw value band to centre near 0)
def convertlogicval(x):
    """Shift x from [0, numlogicvalues) down so the band is roughly
    symmetric around 0 (see the table above)."""
    half = int(math.floor((numlogicvalues - 1) / 2))
    return x - half

# fill in the ans dictionary from ansarray; for now the critical indexes
# are always the first numpre
def genans(ansarray):
    """Key the flattened first numpre values of each row to a uniformly
    random logic value."""
    table = {}
    for row in ansarray:
        key = flatten(row[:numpre])
        draw = random.randint(0, numlogicvalues - 1)
        table[key] = convertlogicval(draw)
    return table

# fill in the ans dictionary from ansarray, but in a monotonic manner
def genansmonotone(ansarray, hiflag, currenttargetval, pchange):
    """Like genans, but the assigned target value drifts monotonically:
    up when hiflag == 1, down when hiflag == 0, stepping with probability
    5*pchange per row and clamped to the legal logic-value range."""
    table = {}
    for row in ansarray:
        table[flatten(row[:numpre])] = currenttargetval
        if random.random() < 5 * pchange:
            if hiflag == 1 and currenttargetval < convertlogicval(numlogicvalues - 1):
                currenttargetval += 1
            if hiflag == 0 and currenttargetval > convertlogicval(0):
                currenttargetval -= 1
    return table

# generate boolean data based on the probability of correctness
def genbooldata():
    """Simulate numexperiments rows: numpre random predictor values, then
    the answer from ans (corrupted with probability 1 - probcorrect), then
    random filler values out to numtotalgenes columns."""
    rows = []
    for _ in range(numexperiments):
        exper = []
        for _ in range(numpre):
            exper.append(convertlogicval(random.randint(0, numlogicvalues - 1)))
        goodanswer = ans[flatten(exper)]
        if probcorrect >= random.random():
            exper.append(goodanswer)
        else:
            # deliberately take any answer other than the correct one
            wrong = convertlogicval(random.randint(0, numlogicvalues - 1))
            while wrong == goodanswer:
                wrong = convertlogicval(random.randint(0, numlogicvalues - 1))
            exper.append(wrong)
        for _ in range(numpre, numtotalgenes):
            exper.append(convertlogicval(random.randint(0, numlogicvalues - 1)))
        rows.append(exper)
    return rows

# given the boolean data and the sets of indexes (the genes to consider):
# the target is always index 0; for each index set, go through each row
# of booldata and see how well it does.
def findscores(booldata, indexsets):
    """Score every index set against the boolean data.

    For each index set, gather (chosen-column values + target value from
    column 0) out of every experiment row and compute the set's impurity.

    Returns a list of pairs [[myind], impurity].  Note the EXTRA list
    wrapped around myind: downstream code indexes the result as
    pair[0][0], so the wrapping must stay.
    """
    impurityscore = []
    for myind in indexsets:
        # per row: values at the chosen indexes, then the target (column 0)
        myset = []
        for myrow in booldata:
            vals = [myrow[i] for i in myind]
            vals.append(myrow[0])
            myset.append(vals)
        # FIX: dropped the original's unused local flatten(myind)
        impurityscore.append([[myind], evaluateimpurity(myset)])
    return impurityscore

# find the impurity in a set
def evaluateimpurity(myset):
    """Group rows by their flattened predictor prefix (all but the last
    element) and sum, per group, the number of rows that do NOT hold the
    group's majority target value.

    (1 - m/s) * s == s - m, where s = group size and m = majority count:
    i.e. the expected number of wrong predictions if we always answer with
    the group's plurality value.  Returns a float (int 0 for empty input).
    """
    mydict = {}
    for s in myset:
        mydict.setdefault(flatten(s[:-1]), []).append(s[-1])
    myimpurity = 0
    for myvals in mydict.values():
        # Counter replaces the original sort-then-count-runs loop; the
        # majority count m is identical either way.
        counts = collections.Counter(myvals)
        m = max(counts.values())
        myimpurity += (0.0 + len(myvals)) - m   # float, as in the original
    return myimpurity

# given the boolean data and a single set of indexes (genes to consider),
# target always index 0: find out what each boolean combination of inputs
# predicts about the output.
def findpredict(booldata, myind, impurityscore):
    """Build the best-prediction report for index set myind.

    Returns [xh, table] where xh is [header of each chosen column...,
    header of the target, overall fraction correct] and table comes from
    findplurality.  (impurityscore is accepted for interface compatibility
    but is not used here.)
    """
    xh = []
    for i in myind:
        xh.append(headers[i])
    xh.append(headers[0])
    # per row: values at the chosen indexes, then the target (column 0)
    myset = []
    for myrow in booldata:
        x = [myrow[i] for i in myind]
        x.append(myrow[0])
        myset.append(x)
    subpair = findplurality(myset)
    xh.append(subpair[1])  # overall fraction the plurality rule gets right
    return [xh, subpair[0]]

# find the basic prediction for each set of boolean input values
def findplurality(myset):
    """For each distinct predictor prefix, find the plurality target value.

    Returns (out, fraction_correct): each entry of out is
    [prefix, best value, majority count, fraction, p-value], sorted, and
    fraction_correct is the overall fraction of rows matching their
    group's plurality value (rounded to 2 places).
    """
    countcorrect = 0
    counttotal = 0
    mydict = {}
    for s in myset:
        mydict.setdefault(flatten(s[:-1]), []).append(s[-1])
    out = []
    # mydict is grouped by predictors
    for d in mydict:
        myvals = sorted(mydict[d])          # groupby needs sorted input
        mycounts = []
        myuniques = []
        for k, g in groupby(myvals, lambda v: v):
            mycounts.append(len(list(g)))
            myuniques.append(k)
        m = max(mycounts)
        countcorrect += m
        counttotal += len(myvals)
        bestval = myuniques[argmax(mycounts)]
        out.append([d, bestval, m, (m + 0.0) / len(myvals),
                    computepvalue(m, len(myvals), 1.0 / len(set(myvals)))])
    out.sort()
    return (out, round(((countcorrect + 0.0) / counttotal), 2))

# This generates index sets of size mynum from the list myinputindexes,
# with first elements taken from position i onward.
def genindexsets(mynum, myinputindexes, i):
    """Recursively enumerate the size-mynum subsets of myinputindexes[i:].

    Each subset is emitted with later indexes first (callers reverse them).

    BUG FIX: the loop bound was `i < len - mynum`, which silently dropped
    every subset whose first element sits at position len - mynum — e.g.
    for 3 indexes and mynum = 2 it produced only 2 of the 3 pairs (and
    returned [] when mynum == len, which the main script works around
    with a deepcopy special case).  The bound is now inclusive.
    """
    out = []
    if (mynum == 1):
        while(i < len(myinputindexes)):
            out.append([myinputindexes[i]])
            i += 1
        return out
    newmynum = mynum - 1
    while(i <= (len(myinputindexes) - mynum)):   # inclusive upper bound
        for x in genindexsets(newmynum, myinputindexes, i+1):
            x.append(myinputindexes[i])
            out.append(x)
        i += 1
    return out

# Decision tree portion

# computes the Shannon entropy (base 2) of a single vector
def entropy(vec):
    answercount = {}  # value -> occurrence count
    totlen = len(vec)
    for x in vec:
        if x in answercount:
            answercount[x] += 1.0
        else:
            answercount[x] = 1.0
    ent = 0
    for x in answercount:
        prob = answercount[x] / totlen
        ent += -prob * math.log(prob, 2)
    return ent

# computes the conditional entropy of vec2 depending
# computes the entropy of vec2 depending on vec1, i.e. H(vec2 | vec1)
def condentropy(vec1, vec2):
    # -1 is an error code for mismatched lengths; NOTE(review): callers in
    # buildtree compare the result against an entropy, where -1 would always
    # "win" — lengths always match there, but verify if reused elsewhere.
    if len(vec1) != len(vec2):
        return -1 # error
    totlen = len(vec1)
    answervec = {}   # vec1 value -> list of co-occurring vec2 values
    i = 0
    while i < totlen:
        x = vec1[i]
        if x in answervec:
            answervec[x].append(vec2[i])
        else:
            answervec[x] = []
            answervec[x].append(vec2[i])
        i+= 1
    # print "Debugging answervec: ", answervec
    # weighted sum of the entropy of vec2 within each vec1-group
    condent = 0
    for x in answervec:
        weight = (0.0+ len(answervec[x])) / totlen # cond entropy weight of x
        # print ("Debugging letter: "), x, (" has weight: "), weight
        condent += weight * entropy(answervec[x])
    return condent

# s are the distinct values in predictvec; we also get predictvec and target.
# We print the best predicted value for each such value v, the number of
# times it is correct, and the p-value assuming all target values are
# equally likely.  Recurses (via buildtree) while there is still room to
# split on further attributes.
def fillin(myindent, s, predictvec, target, mydict, bestk):
    out = []   # NOTE(review): never used — dead local kept as-is
    for v in s:
        # targets corresponding to this value of predict vec
        myvals_unsorted = [target[i] for i in range(len(target)) if predictvec[i] == v]
        myvals = sorted(myvals_unsorted)
        # run-length encode the sorted targets: counts + distinct values
        mycounts = []
        myuniques = []
        for k, g in groupby(myvals, lambda x:x):
            mycounts.append(len(list(g)))
            myuniques.append(k)
        j = argmax(mycounts)
        m = mycounts[j]          # size of the plurality
        bestval = myuniques[j]   # the plurality target value
        # myx is just indentation padding for the tree printout
        myx = ""
        r = 0
        while(r < myindent):
            myx+= " "
            r+=1
        if 1 < len(set(myvals)):
            print myx, headers[bestk].rjust(myindent), "=", v, " predicts ", bestval, m, " times out of ", len(myvals), " with prob correct: ",round((m+0.0)/len(myvals),3), " and p-value ", computepvalue(m, len(myvals), 1.0/len(set(myvals)))
        else:
            print myx, headers[bestk].rjust(myindent), "=", v, " predicts unique value: ", bestval, m, "times."
        if (2 < len(mydict)) and (1 < len(set(myvals))) and (1 < len(s)):
            # still room to recurse: restrict every remaining attribute to
            # the rows where the chosen attribute equals v, drop the chosen
            # attribute, and build the next tree level
            newmydict = {}
            for k in mydict:
                if (k != bestk):
                    newmydict[k] =[mydict[k][i] for i in range(len(mydict[k])) if mydict[bestk][i] == v]
            buildtree(myindent+4, newmydict)
    # print ' '

# find next level of decision tree
# Given booldata (the remaining uncertainty) and myind (indexes into
# booldata), find which of the columns in myind best predicts column 0.
# Then for each value of that column figure out the majority value and see
# whether you need to keep going.
def findpredictinfo(booldata, myind):
    xh = []
    mydict = {}
    for i in myind:
        xh.append(headers[i])
        mydict[i] = []
    xh.append(headers[0])
    mydict[0] = []
    # xh has headers of all columns including target plus the impurity score
    # mydict maps each column index (0 = target) to that column's values
    for myrow in booldata:
        x = []   # NOTE(review): x is built but never used — dead local
        for i in myind:
            x.append(myrow[i])
            mydict[i].append(myrow[i])
        mydict[0].append(myrow[0])
    buildtree(1, mydict)

# find the p-value of getting numbersuccess successes in numbertotal tries
# with per-try probability probsuccess, by Monte Carlo simulation
# (1000 trials; never returns exactly 0 — returns 1/numtries instead).
def computepvalue(numbersuccess, numbertotal, probsuccess):
    i = 0
    num = 0.0       # number of simulated runs at least as successful
    numtries = 1000.0
    # print 'numbersuccess is:', numbersuccess
    # print 'numbertotal is:', numbertotal
    # print 'probsuccess is:', probsuccess
    while i < numtries:
        mycount = 0
        j = 0
        while j < numbertotal:
            if (random.random() <= probsuccess):
                mycount+= 1
            j+= 1
        if (mycount >= numbersuccess):
            num+= 1.0
        i+= 1
    if (num == 0):
        return 1.0/numtries
    return num/numtries

# build a decision tree line at indent level and then
# recurse.
# mydict has the various variables, with mydict[0] representing the target.
# Pick the attribute whose conditional entropy most reduces the target's
# entropy, then delegate the per-value printing/recursion to fillin.
def buildtree(indentlevel, mydict):
    target = mydict[0]
    currentent = entropy(target) # entropy of target
    bestk = 0
    if (currentent > 0):
        p = []   # NOTE(review): never used — dead local kept as-is
        for k in mydict:
            # NOTE(review): condentropy is computed twice per winning k;
            # harmless but wasteful
            if (k != 0) and (condentropy(mydict[k], target) < currentent):
                bestk = k
                currentent = condentropy(mydict[k], target)
        # print "debugging: bestk = ", headers[bestk]
        # print "debugging: condentropy = ", condentropy(mydict[bestk], target)
        # print headers[bestk].rjust(indentlevel), " = "
        # NOTE(review): placement of the two lines below inside the
        # `if (currentent > 0)` branch is reconstructed — confirm against
        # the original layout.
        s = set(mydict[bestk])
        fillin(indentlevel, s, mydict[bestk], target, mydict, bestk)

# DATA
numlogicvalues = 3

for f in inputfiles:
    inputbool = f
    # NOTE(review): the file handle is never closed; 'rb' mode for csv is
    # Python-2 style
    boolin = csv.reader(open(inputbool, 'rb'), delimiter =',', quotechar='"')
    booldata = []
    i = 0
    # first row is headers; the first 3 columns of every row are skipped
    for b in boolin:
        if (i == 0):
            headers = copy.copy(b[3:])
            # print headers
        else:
            booldata.append(b[3:])
        i+= 1
    # print 'booldata is', booldata
    print ' '
    print 'File ', f, 'has len(booldata[0]) of', len(booldata[0])
    # EXECUTION
    # Excluding location 0 (the target), consider all other columns
    numpre = 5
    inputindexes = range(1,len(booldata[0]))
    print 'original inputindexes is', inputindexes
    k = 1
    # try predictor sets of growing size k = 1 .. numpre
    while (k <= numpre) and (k <= len(inputindexes)):
        print 'inputindexes is', inputindexes
        # This generates index sets of size k from the list inputindexes
        # starting at location 0 in inputindexes
        if (k == len(inputindexes)):
            # special case: the only size-k set is all the indexes
            x = copy.deepcopy(inputindexes)
            indexsets = [x]
        else:
            indexsets = genindexsets(k, inputindexes, 0)
        print 'len(indexsets) is', len(indexsets)
        # genindexsets emits each set with later indexes first; restore order
        for t in indexsets:
            t.reverse()
        print "k is : ", k
        print "Just before loop, indexsets: ", indexsets
        impscore = findscores(booldata, indexsets)
        # print 'impscores are: ', impscore
        # pick the index set with minimum impurity
        minscore = 10000
        bestpair = []
        print "Just before loop: impscore: ", impscore
        for mypair in impscore:
            if mypair[1] < minscore:
                bestpair = mypair[0] # the indexes
                minscore = mypair[1]
        print "Just before findpredict: bestpair: ", bestpair
        # NOTE(review): this rebinds the global name `ans` used by the
        # simulation helpers — presumably intentional, but verify
        ans = findpredict(booldata, bestpair[0], minscore)
        print ' Best predicting genes: ', ans[0][0:-2], ' for target: ', ans[0][-2], 'with percentage correct: ', ans[0][-1]
        print ' For set size = ', k, ' All gene sets that are at least close: '
        # collect every non-best index set whose score is within 40% of best
        alldecents = []
        for mypair in impscore:
            decents =[]
            decentindexes = []
            for x in mypair[0][0]:
                if (not mypair[0] == bestpair) and mypair[1] < 1.4 * minscore:
                    decents.append(headers[x])
                    decentindexes.append(x)
            # NOTE(review): append placement inside this `if` is a
            # reconstruction of ambiguous original indentation — confirm
            if mypair[1] < 1.4 * minscore:
                print " decents = ", decents, " with score ", mypair[1], " compared to minscore", minscore
                alldecents.append([mypair[1], decentindexes])
            # decent genes that are NOT already in the best set
            x = []
            for d in decentindexes:
                if not d in bestpair[0]:
                    x.append(headers[d])
            print "for k = ", k, " decent alternative genes = ", x
        # print "predictor vals, predicted target, number correct, percentage correct, pval"
        # for r in ans[1]:
        #     print r
        print 'Decision tree in lieu of a boolean circuit:'
        findpredictinfo(booldata, bestpair[0])
        k+= 1
        # prune the candidate pool for larger k: keep up to 14 indexes from
        # the best-scoring decent sets, topped up (to at most 12 extras)
        # from the old pool
        if (k >= 4) and (len(inputindexes) >= 14):
            alldecents.sort() # in ascending order by score
            x = []
            for a in alldecents:
                for ind in a[1]:
                    if not ind in x:
                        x.append(ind)
            x = x[:14]
            for i in inputindexes:
                if (not i in x) and (12 > len(x)):
                    x.append(i)
            inputindexes = copy.deepcopy(x)