#!/usr/bin/python
## Author: Dennis Shasha (shasha@courant.nyu.edu) 20180504
# If you use keeperfile, then numpre will start with k = 2

#from optparse import OptionParser
# This adds the ability to take in a keeper file of those genes
# that can be kept
# Example:  python infer2018.py -i FoldChange0.csv -k keeperfile
import sys
import math
import csv
import os
import copy
import operator
import doctest
import collections
import datetime
import random
from operator import itemgetter, attrgetter
from itertools import groupby
from sets import Set
sys.setrecursionlimit(20000) 
import optparse
parser = optparse.OptionParser()
usage = "usage: %prog [options]"
parser.add_option('-i', type="string", help="input file (foldchange)",
			dest="foldchange")
parser.add_option('-k', type="string", help="input file (keeperfile)",
			dest="keeperfile")
parser.add_option('-n', type="int", help="nombre de TF", dest="nb")
parser.add_option('-s', type="float", help="stringency",
			dest="stri")
parser.add_option('-a', type="float", help="atleast", dest="atl")
(options, args) = parser.parse_args()
print("options: " + str(options))
print("args: " + str(args))
keeperflag = False
if options.keeperfile == None:
   print("options.keeperfile is None: "+ str(options.keeperfile))
else: 
   print("options.keeperfile is something: "+ str(options.keeperfile))
   keepersall = csv.reader(open((options.keeperfile),'rb'), delimiter =',', quotechar='"')
   keeperflag = True
   for k in keepersall:
     keepers = copy.copy(k)
   print("keepers is: " + str(keepers))
if options.stri == None:
   stringency = 2.5
else:
   stringency = options.stri


# fixed p-value so that the probability of an outcome is always 1/3
# This makes sense because the a priori probability of any outcome -1,0,1 
# is 1/3.

# pval_ceiling
# for number of causative factors k:
# we want at least 1 case where 0 ==> 0 with a pval <= pval_ceiling
# we want at least 1 case where non-0 ==> non-0 with a pval <= pval_ceiling
# non-0 here means all preconditions are non-0
# There should be two for each level of indentation.
#pval_ceiling = 0.05
pval_ceiling = 0.05

# if onlyshow_belowceiling is True
# then show the lines below the pvalue ceiling only
# This removes the clutter though I don't recommend for final result.
onlyshow_belowceiling = False

## Author: Dennis Shasha (shasha@courant.nyu.edu) 20160614

# Flag so we don't restrict our search to those that are important at first.

# Flag trees in which the last gene is ALWAYS most influential when 0

# Why are we seeing the target gene in the display of the tree?

# THIS IS THE ONLY PART TO CHANGE
#inputfiles = ['FoldChange0.csv']
inputfiles = [options.foldchange]

# nombre de TF
#numpre = 1
numpre = options.nb

## moins stringent :
## Line 531:
## if (mypair[1] <= 1.01* minscore) and (not chosenbestpair == mypair[0]):
## Tu changes 1.01 a disons 1.1
#stringency = 4.0
# stringency = 2.5
# stringency = options.stri
# how much allowance in impurity score; increasing stringency to 4
# for example allows additional gene sets of size k that 
# have 4 times the impurity of 
# the best set of size k.


# 20160518: Remember the percentage correct of each set of genes and
# consider a new set S only if its percentage error has been reduced by
# the factor atleast
# with respect to every one of its subsets.
# So, for example atleast = 2 means that the error must be reduced
# by a factor of 2.
# So increasing this makes it harder for larger sets to be offered.
#atleast = 1.4
#atleast = 1.2
#atleast = 0.9
atleast = options.atl

# filtercandidates = True means that a set of size k = 4 or greater
#  will be considered
# only if at least one of the genes in that set has been in a set of 
# smaller size. filtercandidates = False means that all genes will
# be considered
# Note that every gene will be a candidate for sets of size 3 or less.
filtercandidates = False

# elite = True means that a set of size k+1 will be made up only of
# genes each of which is in some set of size k. This is stronger
# than filtercandidates = True and applies to all k > 1.
# elite = False doesn't do that
elite = True

# END OF PART TO CHANGE

# As of this writing April, 2015, the target gene is always the one at position
# 0 of the boolean data. 
# We are generating data booldata rather than having it delivered to us.
# Also we generate the plausible sets of genes (corresponding to myindexes) 
# that we want to test. We have two ways of doing this: one just takes
# a whole set of indexes and generates the necessary tests.
# The other takes specific sets.

# The method works, measures impurity properly 
# and can identify the correct triple of genes (indexes) that influence
# the target.


# We are doing inference on matrices having columns  of the form
# val1, val2, val3, ... valk, valk+1
# where we are going to collect certain columns by indexes and then
# see whether we have good purity when inferring the last in a group
# from earlier members in a group.
# So our function will take a set of "experiments" each consisting of k+1
# columns and a set of index sets cach consisting of some sequence of
# size S of
# numbers between 0 and k
# where S is the number of putative causal elements.

# Algorithm works in two steps:
# for each index set
#  go through each experiment and collect in a dictionary all the
# values corresponding to that index set, e.g. for index set 13, 4, 7, 91
# you'll collect the values in those columns and index them by that index
# set.
# At the end, we'll have 
# '13_4_7_91': [ [values from first experiment], 
# [values from second experiment], ...]
# Then we'll go through each element  of this dictionary
# and evaluate "purity". Purity means we take all but the last element
# for each array of values and create a hash structure of those. Then
# we'll see how often we get a consistent value for each of them.
# For example, if we have 
# [0, 0, 0, 1]
# [0, 0, 0, 0]
# [0, 0, 1, 1]
# [0, 0, 1, 1]
# [0, 1, 0, 0]
# [0, 1, 0, 1]
# [0, 1, 0, 0]
# etc. then the total "impurity" equals sum number of elements having
# the same values of the three first attributes (in this case) * (1-fraction
# of values having the most frequent value).
# In this case 2 * (1-1/2) + 2 * 0 + 3 * (1-2/3).
# We want combinations with minimum impurity.


now = datetime.datetime.now()
currentyear = now.year

# BASICS

# x is a list and we want index of max element
# comprehension
def argmax(x):
  L = [i for i in range(len(x)) if x[i] == max(x)]
  return L[0]

# x is a list and we want index of func element
def argfunc(x, func):
  L = [i for i in range(len(x)) if x[i] == func(x)]
  return L[0]

# APPLICATION-SPECIFIC

# flatten an array into a string
def flatten(arr):
     out=""
     for a in arr:
        out+=str(a)+"_"
     return out[:-1]


# fill in ansarray with independent values only (nothing on target)
def genanswerarray(num):
  out = []
  i = 0
  while(i < numlogicvalues):
	oneval = []
	if (num >= 2):
		g = genanswerarray(num-1)
		for r in g:
			oneval=[convertlogicval(i)]
			for y in r:
				oneval.append(y)
			out.append(oneval)
	else:
		out.append([convertlogicval(i)])
	i+=1
  return out


# convert logic val from 0 to numlogicvalues to something symmetric around 0
# if numlogicvalues = 2 then 0 and 1, no change
# if numlogicvalues = 3 then -1, 0, 1 , so -1
# if numlogicvalues = 4 then -1, 0, 1, 2, so -1
# if numlogicvalues = 5 then -2, -1, 0, 1, 2, so -2
# if numlogicvalues = 6 then -2, -1, 0, 1, 2, 2, so -2
# if numlogicvalues = 7 then -3, -2, -1, 0, 1, 2, 3, so -3
def convertlogicval(x):
	d = math.floor((numlogicvalues - 1)/2)
	return x - int(d)

# fill in ans dictionary from ansarray 
# for now the critical index are always the first numpre
def genans(ansarray):
    ans = {}
    for y in ansarray:
	r = y[:numpre]
    	x = random.randint(0,numlogicvalues-1)
	ans[flatten(r)] = convertlogicval(x)
    return ans

# fill in ans dictionary from ansarray but in a monotonic manner
def genansmonotone(ansarray, hiflag, currenttargetval, pchange):
    ans = {}
    for y in ansarray:
	r = y[:numpre]
	ans[flatten(r)] = currenttargetval
	if (random.random() < 5*pchange):
		if (hiflag == 1) & (currenttargetval < convertlogicval(numlogicvalues - 1)): 
			currenttargetval+= 1
		if (hiflag == 0) & (currenttargetval > convertlogicval(0)): 
			currenttargetval-= 1
    return ans


# generate boolean data based on probability of correctness
def genbooldata():
  i = 0
  row = []
  while(i < numexperiments):
	exper = []
	j = 0
	while(j < numpre):
		x = random.randint(0,numlogicvalues-1)
		x = convertlogicval(x)
		exper.append(x)
		j+=1
	goodanswer = ans[flatten(exper)]
	if (probcorrect >= random.random()):
		exper.append(goodanswer)
	else: # take another answer
		x = random.randint(0,numlogicvalues-1)
		x = convertlogicval(x)
		while(x == goodanswer):
			x = random.randint(0,numlogicvalues-1)
			x = convertlogicval(x)
		exper.append(x)
	while(j < numtotalgenes): # continue with j
		x = random.randint(0,numlogicvalues-1)
		x = convertlogicval(x)
		exper.append(x)
		j+= 1
	row.append(exper)
	i+= 1
  return row


# given the boolean data and the sets of indexes 
# (reflecting the genes to consider)
# Our target is always index 0.
# for each index set we're going to go through each
# row of booldata and see how well it does.
def findscores(booldata, indexsets):
  impurityscore = []
  for myind in indexsets:
	pair = [[myind]]
	myindflat = flatten(myind)
	myset = []
	for myrow in booldata:
		x = []
		for i in myind:
			x.append(myrow[i])
		x.append(myrow[0])
		myset.append(x)
	y = evaluateimpurity(myset)
	pair.append(y)
	impurityscore.append(pair)
  return impurityscore


# find the impurity in a set 
def evaluateimpurity(myset):
	mydict = {}
	for s in myset:
		x = flatten(s[:-1])
		y = s[-1]
		if x in mydict:
			mydict[x].append(y)
		else:
			mydict[x]=[y]
	myimpurity = 0
	for d in mydict:
		myvals = sorted(mydict[d])
		counts = []
		current = myvals[0]
		x = 0
		for v in myvals:
			if (v == current): 
				x+= 1
			else:
				counts.append(x)
				current = v
				x = 1
		counts.append(x)
		m = max(counts)
		s = 0.0 + len(myvals)
		# myimpurity += (1 - (m/s)) * s
		myimpurity += s - m
			# prob of being incorrect within this group
			# of input values times size of group
			# Note that (1 - (m/s)) is prob of being incorrect
			# within the group (anything but majority) and s
			# is size of group. (1 - (m/s)) * s = ((s-m)/s)*s
			# = s- m
	return myimpurity

# given the boolean data and a single set of indexes
# (reflecting the genes to consider)
# Our target is always index 0.
# find out what each boolean combination of inputs predicts about the output.
def findpredict(booldata, myind, impurityscore, oldnumgoodpval):
	xh = []
	for i in myind:
		xh.append(headers[i])
	xh.append(headers[0])
	# xh has headers of all columns including target plus the impurity score
	myset = []
	for myrow in booldata:
		x = []
		for i in myind:
			x.append(myrow[i])
		x.append(myrow[0])
		myset.append(x) # all values pertaining to these indexes
	subtrip = findplurality(myset, oldnumgoodpval)
	y = subtrip[0]
	xh.append(subtrip[1])
	trip = [xh, y, subtrip[2]] 
  	return trip


# find the basic prediction for each set of boolean input values
def findplurality(myset, oldnumgoodpval):
	out = []
	countcorrect = 0
	counttotal = 0
	mydict = {}
	numgoodpval = oldnumgoodpval
	for s in myset:
		x = flatten(s[:-1])
		y = s[-1]
		if x in mydict:
			mydict[x].append(y)
		else:
			mydict[x]=[y]
	# mydict is grouped by predictors
	for d in mydict:
		myvals = sorted(mydict[d])
		mycounts = []
		myuniques = []
		for k, g in groupby(myvals, lambda x:x):
			mycounts.append(len(list(g)))
			myuniques.append(k)
		m = max(mycounts)
		countcorrect+= m
		counttotal+= len(myvals)
		j = argmax(mycounts)
		bestval = myuniques[j]
		mypval = computepvalue(m, len(myvals), 1.0/3.0)
		# out.append([d, bestval, m, (m+0.0)/len(myvals), computepvalue(m, len(myvals), 1.0/len(set(myvals)))])
		out.append([d, bestval, m, (m+0.0)/len(myvals), mypval])
		dsplit = [int(x) for x in d.split("_")]
		dzero = (min(dsplit) == 0) and (max(dsplit) == 0)
		dnonzero = (min(dsplit) != 0) and (max(dsplit) != 0)
		if (int(bestval) == 0) and dzero and (mypval <= pval_ceiling):
			numgoodpval+= 1
		if (int(bestval) != 0) and dnonzero and (mypval <= pval_ceiling):
			numgoodpval+= 1
	out.sort()
	return (out, round(((countcorrect+0.0)/counttotal),2), numgoodpval)
	
# This generates index sets of size mynum from the list inputindexes
# starting at i
def genindexsets(mynum, myinputindexes, i):
	out = []
	if (mynum == 1):
		while(i < len(myinputindexes) ):
			out.append([myinputindexes[i]])
			i+= 1
		return out
	newmynum = mynum - 1
# 9 avril 2018 - correction Geoffrey
#	while(i < (len(myinputindexes) - mynum)):
	while(i < (len(myinputindexes) - newmynum)):
		listoflists = genindexsets(newmynum, myinputindexes, i+1)
		for x in listoflists:
			if (x != None): 
				x.append(myinputindexes[i])
				out.append(x)
		i+= 1
	return out

# Decision tree portion


# computes the entropy of a single vector
def entropy(vec):
	answercount = {} # empty dictionary
	totlen = len(vec)
	for x in vec:
	   if x in answercount:
		answercount[x] += 1.0
	   else: 
		answercount[x] = 1.0
	ent = 0
	for x in answercount:
	   prob = (answercount[x])/totlen
	   ent += - prob * math.log(prob,2)
	return ent

# computes the entropy of vec2 depending on vec1
def condentropy(vec1, vec2):
	if len(vec1) != len(vec2): return -1 # error
	totlen = len(vec1)
	answervec = {}
	i = 0
	while i < totlen:
	   x = vec1[i]
	   if x in answervec:
		answervec[x].append(vec2[i])
	   else:
		answervec[x] = []
		answervec[x].append(vec2[i])
	   i+= 1
	condent = 0
	for x in answervec:
		weight = (0.0+ len(answervec[x])) / totlen 
			# cond entropy weight of x
		condent += weight * entropy(answervec[x])
	return condent

# s are values in predictvec and then we have predictvec and target
# We print out the best predicted value and the number of times that is correct
# and the p-value assuming all values in target are equally likely
# For each s we identify a best target value and then list that and the p-val
def fillin(zeromajflag, myindent, s, predictvec, target, mydict, bestk):
  out = []
  for v in s:
	# targets corresponding to this value of predict vec
	myvals_unsorted = [target[i] for i in range(len(target)) if predictvec[i] == v]
	myvals = sorted(myvals_unsorted)
	mycounts = []
	myuniques = []
	for k, g in groupby(myvals, lambda x:x):
		mycounts.append(len(list(g)))
		myuniques.append(k)
	j = argmax(mycounts)
	m = mycounts[j]
	bestval = myuniques[j]
	myx = ""
	r = 0
	while(r < myindent): 
		myx+= " "
		r+=1
	if (1 < len(set(myvals))) or (not bestk == 0):
	    if zeromajflag == False:
		mypval = computepvalue(m, len(myvals), 1.0/3.0)
		if (onlyshow_belowceiling == False) or (mypval <= pval_ceiling):
			print myx, headers[bestk].rjust(myindent), "=", v, "correctly predicts", bestval, m, "times out of", len(myvals), "with prob correct:",round((m+0.0)/len(myvals),3), "and p-value", mypval
		if False and 1 == len(set(myvals)):
			print "SINGLE VALUE: ", set(myvals)
	    if zeromajflag == True:
		mypval = computepvalue(m, len(myvals), 1.0/3.0)
		if (onlyshow_belowceiling == False) or (mypval <= pval_ceiling):
			print myx, headers[bestk].rjust(myindent), "=", v, "correctly predicts", bestval, m, "times out of", len(myvals), "with prob correct:",round((m+0.0)/len(myvals),3), "and p-value", mypval, " ZeroMajority"
		if False and 1 == len(set(myvals)):
			print "SINGLE VALUE (ZeroMajority): ", set(myvals)
	if (2 < len(mydict)) and (1 < len(set(myvals))) and (1 < len(s)): # still room to recurse
	    newmydict = {}
	    for k in mydict:
	      if (k != bestk):
	    	newmydict[k] =[mydict[k][i] for i in range(len(mydict[k])) if mydict[bestk][i] == v]
#	    buildtree(zeromajflag, myindent+4, newmydict)
	    buildtree(zeromajflag, myindent+2, newmydict)

# s are values in predictvec and then we have predictvec and target
# We compute the best predicted value and the number of times that is correct.
# For each s we identify a best target value and then list that and the p-val
def zerotest_fillin(myindent, s, predictvec, target, mydict, bestk):
  outvals = []
  outcounts = []
  for v in s:
	# targets corresponding to this value of predict vec
	myvals_unsorted = [target[i] for i in range(len(target)) if predictvec[i] == v]
	outvals.append(v)
	outcounts.append(len(myvals_unsorted))
	if (2 < len(mydict)) and (1 < len(set(myvals_unsorted))) and (1 < len(s)): # still room to recurse
	    newmydict = {}
	    for k in mydict:
	      if (k != bestk):
	    	newmydict[k] =[mydict[k][i] for i in range(len(mydict[k])) if mydict[bestk][i] == v]
#	    zerotest_buildtree(myindent+4, newmydict)
	    zerotest_buildtree(myindent+2, newmydict)
  i = argmax(outcounts)  
  zeromajvectorindent.append(myindent)
  if (outvals[i] == '0') and (outcounts[i] >= 0.5 * sum(outcounts)):
	zeromajvectortruthval.append(True)
  else:
	zeromajvectortruthval.append(False)
  
# find next level of decision tree


# Given booldata (which is the remaining uncertainty) and myinds in booldata, try to find out which
# of the columns in myinds best predict column 0 in booldata.
# Then for each value of that column try to figure out the majority value in that column and see whether
# you need to keep going.
def findpredictinfo(zeromajflag, booldata, myind):
	xh = []
	mydict = {}
	for i in myind:
		xh.append(headers[i])
		mydict[i] = []
	xh.append(headers[0])
	mydict[0] = []
	# xh has headers of all columns including target plus the impurity score
	for myrow in booldata:
		x = []
		for i in myind:
			x.append(myrow[i])
			mydict[i].append(myrow[i]) 
		mydict[0].append(myrow[0]) 
	buildtree(zeromajflag, 1, mydict)


zeromajvectortruthval = []
zeromajvectorindent = []

# Just test if the very last level of the tree consists of zero in the majority
# if so return True otherwise False
# Uses the global zeromajvector
def zerotest_findpredictinfo(booldata, myind):
	xh = []
	mydict = {}
	for i in myind:
		xh.append(headers[i])
		mydict[i] = []
	xh.append(headers[0])
	mydict[0] = []
	# xh has headers of all columns including target plus the impurity score
	for myrow in booldata:
		x = []
		for i in myind:
			x.append(myrow[i])
			mydict[i].append(myrow[i]) 
		mydict[0].append(myrow[0]) 
	zerotest_buildtree(1, mydict)
	maxindent = max(zeromajvectorindent)
	# return truth values this entire decision tree
	for i in range(len(zeromajvectorindent)):
		if (zeromajvectorindent[i] == maxindent) and (zeromajvectortruthval[i] == False):
			return False
	return True

# find the p-value of getting that number of successes in
# numbertotal tries with probability probsuccess
def computepvalue(numbersuccess, numbertotal, probsuccess):
        i = 0
        num = 0.0
        numtries = 1000.0
        while i < numtries:
                mycount = 0
                j = 0
                while j < numbertotal:
                        if (random.random() <= probsuccess):
                                mycount+= 1
                        j+= 1
                if (mycount >= numbersuccess): 
                        num+= 1.0
                i+= 1
	if (num == 0): return 1.0/numtries
        return num/numtries


# build a decision tree line at indent level and then
# recurse. mydict has the various variables with mydict[0] representing
# the target
def buildtree(zeromajflag, indentlevel, mydict):
   target = mydict[0]
   currentent = entropy(target) # entropy of target
   bestk = 0
   if (currentent > 0):
	p = []
	for k in mydict:
	    if (k != 0) and (condentropy(mydict[k], target) < currentent):
		bestk = k
		currentent = condentropy(mydict[k], target)
	if False and bestk == 0:
	    print "debugging: mydict is: ", mydict
	    print "debugging: bestk = ", headers[bestk]
	    print "debugging: condentropy = ",condentropy(mydict[bestk], target)
	    print "debugging: currentent = ", currentent
	s = set(mydict[bestk])
	fillin(zeromajflag, indentlevel, s, mydict[bestk], target, mydict, bestk)

# test whether the last level of the decision tree has majority 0 cases
def zerotest_buildtree(indentlevel, mydict):
   target = mydict[0]
   currentent = entropy(target) # entropy of target
   bestk = 0
   returnval = True # if this is the last level then by default the answer is that we have
	# majority 0 cases
   if (currentent == 0):
	print 'currentent is 0 case in zerotest_buildtree'
	return True
   if (currentent > 0):
	p = []
	for k in mydict:
	    if (k != 0) and (condentropy(mydict[k], target) < currentent):
		bestk = k
		currentent = condentropy(mydict[k], target)
	s = set(mydict[bestk])
	x = zerotest_fillin(indentlevel, s, mydict[bestk], target, mydict, bestk)
	return ([indentlevel, x])


# Only if this list of genes reduces error by the factor atleast
# compared with its subsets already in memor do we return True
def betterenough(setofgenes, percentcorrect):
	for m in memo:
		if (m[0].issubset(setofgenes)) and ((1-m[1])/atleast) <= (1-percentcorrect):
			return False
	return True

# keep only the genes that are good
def slimindexes(headers, keepers):
   goodindexes = []
   i = 0
   while (i < len(headers)):
     if headers[i] in keepers:
       goodindexes.append(i)
     i+= 1
   print("goodindexes is: " + str(goodindexes))
   return goodindexes

# DATA

memo = []
numlogicvalues = 3 
for f in inputfiles:
  inputbool = f
  boolin = csv.reader(open(inputbool, 'rb'), delimiter =',', quotechar='"')
  booldata = []
  i = 0
  for b in boolin:
	if (i == 0):
		headers = copy.copy(b[3:])
		if options.keeperfile != None:
		  goodindexes = slimindexes(headers, keepers)
		  headers = copy.copy([headers[i] for i in goodindexes])
		  print("headers is: " + str(headers))
		# print headers
	else: 
		if options.keeperfile == None:
		  booldata.append(b[3:])
		else:
		  print("b[3:] is: " + str(b[3:]))
		  print("goodindexes is: " + str(goodindexes))
		  booldata.append([(b[3:])[i] for i in goodindexes])
	i+= 1

  print 'File',f,'( genes:',len(booldata[0]),'- gene names:',len(headers),')'
  
# EXECUTION

  # Excluding location 0
  # numpre = 5
  inputindexes = range(1,len(booldata[0]))
  k = 1
  if (keeperflag == True): # if using the keeperfile, then start with k = 2
    k = 2
  while (k <= numpre):
	indexsets = genindexsets(k, inputindexes, 0)
	for t in indexsets:
		t.reverse()

	impscore = findscores(booldata, indexsets)
	minscore = 10000
#	bestpair = []
#	for mypair in impscore:
#		if mypair[1] < minscore:
#			bestpair = mypair[0] # the indexes
#			minscore = mypair[1]
#	if (0 < len(bestpair)):
#		ans = findpredict(booldata, bestpair[0], minscore, 0)
#		goaheadflag = True
#	else:
#		goaheadflag = False
#	if goaheadflag and ((1 == len(ans[0][0:-2])) or (betterenough(set(ans[0][0:-2]), ans[0][-1]))):
#		print '    '
#		print '####'
#		print 'Best predicting genes:', ans[0][0:-2], 'for target:', ans[0][-2], 'with percentage correct:', ans[0][-1]
#		memo.append([set(ans[0][0:-2]), ans[0][-1]])
#		zeromajvectortruthval = []
#		zeromajvectorindent = []
#		zeromajorityflag = zerotest_findpredictinfo(booldata, bestpair[0])
#		findpredictinfo(zeromajorityflag, booldata, bestpair[0])
#	if goaheadflag:
#		chosenbestpair = copy.deepcopy(bestpair)
	alldecents = []
	for mypair in impscore:
		decents =[]
		decentindexes = []
		if (mypair[1] <= stringency* minscore):
			ans = findpredict(booldata, mypair[0][0], minscore, 0)
			if ((1==len(ans[0][0:-2])) or (betterenough(set(ans[0][0:-2]),ans[0][-1]))) and (ans[2] >=  2) :
				memo.append([set(ans[0][0:-2]), ans[0][-1]])
				zeromajvectortruthval = []
				zeromajvectorindent = []
				zeromajorityflag = zerotest_findpredictinfo(booldata, mypair[0][0])
				print '          '
				if zeromajorityflag == True:
					print 'All  predicting genes:', ans[0][0:-2], 'for target:', ans[0][-2], 'with percentage correct:', ans[0][-1], 'ZeroMajority'
				elif zeromajorityflag == False:
					print 'Alternative predicting genes:', ans[0][0:-2], 'for target:', ans[0][-2], 'with percentage correct:', ans[0][-1]
				findpredictinfo(zeromajorityflag, booldata, mypair[0][0])
				for x in mypair[0][0]:
					decents.append(headers[x]) 
					decentindexes.append(x)
				alldecents.append([mypair[1], decentindexes])
	x = []
	print '    '
#	print '===',decentindexes
	for d in decentindexes:
	   if not d in mypair[0]:
		x.append(headers[d])
	k+= 1
	if (k >= 4) and (len(inputindexes) >= 14) and (filtercandidates):
		alldecents.sort() # in ascending order by score
		x = []
		for a in alldecents:
		    for ind in a[1]:
		       if not ind in x:
			  x.append(ind)
		x = x[:14]
		for i in inputindexes:
		    if (not i in x) and (12 > len(x)):
			x.append(i)
		inputindexes = copy.deepcopy(list(set(x)))
	elif elite:
		xout = []
		for x in alldecents:
			xout.extend(x[1])
		inputindexes = copy.deepcopy(list(set(xout)))