#!/usr/bin/env python
#
# This program computes the entropy and conditional entropy of vectors and
# uses them to print a decision tree for a table read from a file
# michael yuen and luna dai
import math


def entropy(vec):
    """Return the Shannon entropy, in bits, of the values in ``vec``.

    Each distinct value is weighted by how often it occurs in ``vec``.
    An empty ``vec`` yields 0 because there is nothing to sum.
    """
    total = len(vec)

    # Tally occurrences as floats so the division below is a true division
    # even under Python 2.
    counts = {}
    for item in vec:
        counts[item] = counts.get(item, 0.0) + 1.0

    bits = 0
    for count in counts.values():
        p = count / total
        bits += -p * math.log(p, 2)
    return bits

# computes the entropy of vec2 conditioned on vec1
def condentropy(vec1, vec2):
    """Return the entropy of ``vec2`` given the matching entries of ``vec1``.

    The entries of ``vec2`` are grouped by the value found at the same
    position in ``vec1``; the result is the sum of each group's entropy
    weighted by the fraction of positions that fall in that group.

    Returns -1 when the two vectors differ in length.
    """
    total = len(vec1)
    if total != len(vec2):
        return -1  # error

    # Map each distinct vec1 value to the vec2 entries seen alongside it.
    groups = {}
    for key, value in zip(vec1, vec2):
        groups.setdefault(key, []).append(value)

    result = 0
    for bucket in groups.values():
        share = float(len(bucket)) / total  # cond entropy weight of this group
        result += share * entropy(bucket)
    return result

#-----------Data---------------

def filein(name):
    """Read a whitespace-delimited table from ``name`` into a dict of columns.

    The first line holds the column names separated by a single space.
    Every following line is one row whose fields are separated by two
    spaces; the n-th field is appended to the list of the n-th column.

    Lines that are empty (or only whitespace) after the header are skipped,
    so a trailing newline at the end of the file does not add a stray empty
    field to the first column.

    Returns a dict mapping each column name to its list of string values,
    or an empty dict when the file has no lines.  A row with more fields
    than the header has names raises IndexError.
    """
    # ``with`` closes the file even if reading fails part-way through.
    with open(name, "r") as handle:
        lines = [line.rstrip("\r\n") for line in handle]

    if not lines:
        return {}

    topics = lines[0].split(" ")
    parsed = {}
    for topic in topics:
        parsed[topic] = []

    for line in lines[1:]:
        if not line.strip():
            continue  # blank line: no row to record
        for w, value in enumerate(line.split("  ")):
            parsed[topics[w]].append(value)
    return parsed

# Path of the table handed to filein.
name = "MakeATree.py"
d = filein(name)

# Keep the "target" column in its own list and drop it from the table so
# that d holds only the remaining columns.
t = list(d["target"])
del d["target"]


# Module-level indentation counter; starts at the root level.
indent = 0
def makeatree(d, t, depth=0, attributes=("D1", "D2", "D3", "D4")):
    """Print a decision tree that predicts the labels ``t`` from columns of ``d``.

    At each level the column with the lowest conditional entropy of ``t`` is
    chosen (the last one listed in ``attributes`` wins a tie).  The rows are
    grouped by that column's values; groups whose labels are all the same
    are printed as leaves, and every remaining group is printed and then
    split again one level deeper.

    Parameters:
        d: dict mapping a column name to its list of values, one per row.
           It is not modified.
        t: list of target labels, one per row.
        depth: nesting level of this call; each level indents the output by
           four spaces.  Callers normally leave it at 0.
        attributes: names of the columns in ``d`` that may be split on.

    Raises:
        KeyError: a name in ``attributes`` is missing from ``d``.
        ValueError: a candidate column is not the same length as ``t``.
    """
    # The indentation is derived from the depth argument instead of shared
    # module state, so returning from a deeper call cannot disturb the
    # indentation of the branches that are printed afterwards.
    prefix = "    " * depth

    # condentropy signals a length mismatch by returning -1, which would
    # otherwise be mistaken for the lowest entropy below.
    for column_name in attributes:
        if len(d[column_name]) != len(t):
            raise ValueError(
                "column %r has %d values but there are %d targets"
                % (column_name, len(d[column_name]), len(t))
            )

    # Pick the column with the lowest conditional entropy; "<=" keeps the
    # last of several equally good columns.
    best = None
    best_name = None
    for column_name in attributes:
        ent = condentropy(d[column_name], t)
        if best is None or ent <= best:
            best = ent
            best_name = column_name

    # Stop when the labels are mixed and no column lowers their entropy.
    # Comparing against the entropy of t itself (rather than the constant 1)
    # also covers targets with more than two classes, and a column that does
    # not partition the rows at all, which would otherwise recurse forever.
    # Labels that are already pure are still grouped once so that their
    # counts get printed.
    base = entropy(t)
    if best_name is None or (base > 0 and best >= base):
        print(prefix + "no more d can be applied")
        return

    # Group the row indices and the labels by the chosen column's values,
    # remembering the values in first-seen order for a stable print order.
    order = []
    rows = {}
    baskets = {}
    for index, value in enumerate(d[best_name]):
        if value not in baskets:
            order.append(value)
            rows[value] = []
            baskets[value] = []
        rows[value].append(index)
        baskets[value].append(t[index])

    # Leaves first: every group whose labels are all identical.
    # Each print call passes one pre-formatted string so the output is the
    # same under the Python 2 print statement and the Python 3 function.
    unfinished = []
    for value in order:
        labels = baskets[value]
        if entropy(labels) == 0:
            print("%s%s:  %s" % (prefix, best_name, value))
            print("%s%s happens %s times" % (prefix, labels[0], len(labels)))
        else:
            unfinished.append(value)

    if not unfinished:
        print(prefix + "the branch ends")
        return

    # Groups that still have mixed labels are split again one level deeper,
    # using only the rows that belong to the group.
    for value in unfinished:
        print("%s%s:  %s" % (prefix, best_name, value))
        subset = {}
        for key, column in d.items():
            subset[key] = [column[index] for index in rows[value]]
        makeatree(subset, baskets[value], depth + 1, attributes)
    

# Print the tree for the columns in d against the target labels in t.
makeatree(d,t)

