import math
def prob(vec, check):
    """Return the relative frequency of ``check`` among the items of ``vec``.

    Args:
        vec: Sequence of observed values (must support ``len``).
        check: The value whose share of ``vec`` is wanted.

    Returns:
        The number of items equal to ``check`` divided by ``len(vec)`` as a
        float, or the integer ``0`` when no item matches (which includes the
        case of an empty ``vec``).
    """
    # Count directly instead of building a histogram of every distinct value;
    # keep the count as a float so the division below is always a true
    # division.
    matches = float(sum(1 for item in vec if item == check))
    if not matches:
        # Also covers the empty vector, so no division by zero can occur.
        return 0
    return matches / len(vec)

def entropy(vec):
    """Return the Shannon entropy (base 2) of the values in ``vec``.

    Each distinct value contributes ``-p * log2(p)`` where ``p`` is its share
    of ``vec``. An empty ``vec`` yields the integer ``0``.
    """
    size = len(vec)

    # Tally how often each distinct value occurs (as floats).
    tallies = {}
    for item in vec:
        tallies[item] = tallies.get(item, 0.0) + 1.0

    total = 0
    for tally in tallies.values():
        share = tally / size
        total += - share * math.log(share, 2)
    return total

def condent(vec1, vec2):
    """Return the conditional entropy of ``vec2`` given ``vec1``.

    The entries of ``vec2`` are grouped by the value found at the same
    position in ``vec1``; the result is the sum over groups of
    ``prob(vec1, value) * entropy(group)``.

    If the two vectors differ in length, the string
    ``"Error: Invalid Vector Length"`` is returned instead of a number.
    """
    size = len(vec1)
    if size != len(vec2):
        return "Error: Invalid Vector Length"

    # Bucket the vec2 entries under the vec1 value at the same index.
    groups = {}
    for idx, key in enumerate(vec1):
        groups.setdefault(key, []).append(vec2[idx])

    weighted = 0
    for key in groups:
        group_entropy = entropy(groups[key])
        weighted += prob(vec1, key) * group_entropy
    return weighted

def infogain(datax, datay):
    """Return the information gain of every attribute column in ``datax``.

    Args:
        datax: Mapping of attribute name to its column of values.
        datay: The target column, position-aligned with each attribute column.

    Returns:
        A dict mapping each attribute name to
        ``entropy(datay) - condent(column, datay)``. An empty ``datax`` gives
        an empty dict.

    Note:
        ``condent`` returns an error string when a column's length differs
        from ``datay``; the subtraction then raises ``TypeError``.
    """
    if not datax:
        return {}
    # The target entropy does not depend on the attribute, so compute it once
    # instead of once per column.
    base = entropy(datay)
    return {name: base - condent(datax[name], datay) for name in datax}

def filein(name):
    """Parse a column-oriented text file into a dict of columns.

    The first line holds the column names separated by a single space. Every
    following line is one row whose fields are separated by two spaces; the
    n-th field is appended to the column named by the n-th header entry.
    Lines that are empty or contain only whitespace are skipped, so a trailing
    blank line cannot leave the columns with unequal lengths.

    Args:
        name: Path of the file to read.

    Returns:
        A dict mapping each column name to the list of its values (strings),
        in file order. An empty file gives an empty dict.

    Raises:
        IndexError: If a row has more fields than there are column names.
    """
    # The context manager closes the file even if reading fails.
    with open(name, "r") as handle:
        lines = handle.readlines()
    if not lines:
        return {}

    def strip_newline(raw):
        # Drop exactly one trailing newline, leaving other whitespace alone.
        return raw[:-1] if raw.endswith("\n") else raw

    header = strip_newline(lines[0]).split(" ")
    parsed = {}
    for topic in header:
        parsed[topic] = []

    for raw in lines[1:]:
        row = strip_newline(raw)
        if not row.strip():
            continue
        for col, value in enumerate(row.split("  ")):
            parsed[header[col]].append(value)
    return parsed

def makedict(targetfile, item, p):
    """Load ``targetfile`` and keep only the rows whose column ``p`` equals ``item``.

    Args:
        targetfile: Path handed to ``filein``.
        item: The value a row must have in column ``p`` to be kept.
        p: Name of the column to filter on.

    Returns:
        A dict of columns like the one ``filein`` returns, restricted to the
        matching rows. If ``p`` is not a column of the file, the unfiltered
        data is returned.
    """
    data = filein(targetfile)
    if p not in data:
        return data
    # Collect the surviving row indices once, then rebuild every column from
    # them; this avoids repeatedly popping from the middle of each list.
    keep = [row for row, value in enumerate(data[p]) if value == item]
    return {column: [values[row] for row in keep]
            for column, values in data.items()}

def check(data, target, p):
    """Summarise the target values seen for each distinct value of column ``p``.

    The entries of ``data[target]`` are grouped by the value of ``data[p]`` at
    the same position and converted with ``int`` to count positives.

    Returns:
        A dict mapping each value of column ``p`` to a three-item list:
        ``[True, 0, count]`` when the group has no positives,
        ``[True, 1, count]`` when the group has only positives, and
        ``[False, negatives, positives]`` otherwise. ``None`` is returned
        when column ``p`` is empty.
    """
    grouped = {}
    for row, value in enumerate(data[p]):
        grouped.setdefault(value, []).append(data[target][row])

    predict = {}
    for value, labels in grouped.items():
        total = len(labels)
        positives = sum(int(label) for label in labels)
        negatives = total - positives
        if positives == 0:
            predict[value] = [True, 0, total]
        elif negatives == 0:
            predict[value] = [True, 1, total]
        else:
            predict[value] = [False, negatives, positives]

    return predict if predict != {} else None

def sort(data):
    """Return the keys of ``data`` ordered by their values, largest first."""
    keys = list(data)
    keys.sort(key=lambda name: data[name], reverse=True)
    return keys

def main(targetfile,target,data,ordkeys,i,next):
    """Build one node of the decision tree and recurse into its mixed branches.

    The node splits on ``ordkeys[0]``. Its entries are stored in the
    module-level dict ``tree`` (not passed in as a parameter) under the label
    ``"L" + str(i)``; the same ``tree`` object is returned.

    Args:
        targetfile: Path of the data file, re-read through ``makedict`` for
            every branch that needs a further split.
        target: Name of the target column.
        data: Dict of columns used to summarise this node via ``check``.
        ordkeys: Attribute names; only ``ordkeys[0]`` is used at this node.
        i: Number used to build this node's label.
        next: Number used to build the "yes" label of the first branch.

    Each entry appended to ``tree[label]`` is
    ``[question text, this label, "yes" label, summary list from check]``.
    """
    # check() returns None when data[ordkeys[0]] is empty, in which case the
    # .keys() call below raises AttributeError.
    result = check(data,target,ordkeys[0])
    ordered = sorted(result.keys())
    line = "L"+str(i)
    # Any entries previously stored under this label are discarded.
    tree[line] = []
    for x in ordered:
        text = line+": Is "+ordkeys[0]+" = "+x+"? (yes=L"+str(next)+")"
        next += 1
        i += 1
        # result[x][0] is False when the rows with this value have mixed
        # target values, so the branch needs another split.
        if not result[x][0]:
            # makedict reloads the whole file and keeps the rows where
            # ordkeys[0] == x.
            # NOTE(review): only the current attribute is filtered on, so
            # splits made higher up the tree do not appear to be applied to
            # this subset -- confirm whether that is intended.
            tempdata = makedict(targetfile,x,ordkeys[0])
            tempdatay = tempdata[target]
            # Rank the remaining attributes by information gain, leaving out
            # the target and the attribute just split on.
            del tempdata[target]
            del tempdata[ordkeys[0]]
            tempord = infogain(tempdata,tempdatay)
            tempord = sort(tempord)
            # Reload the subset because the two columns were deleted above.
            tempdata = makedict(targetfile,x,ordkeys[0])
            # The return value is ignored; the child fills the shared tree.
            # NOTE(review): the child stores itself under "L"+str(i) while the
            # question text above points at "L"+str(next) taken before the
            # increment; these agree for the initial call (i=0, next=1) but
            # should be verified for deeper levels.
            main(targetfile,target,tempdata,tempord,i,next+1)
        # 'L'+str(next-1) is the same "yes" label that was put into text.
        tree[line].append([text,line,'L'+str(next-1),result[x]])
    return tree

def printtree(line, result, tab, oldline):
    """Print the entries stored under ``line`` in ``result``, recursing into branches.

    Every entry is indexed as ``[question, _, child label, summary]``. The
    question is printed behind the current indentation ``tab``. When
    ``summary[0]`` is true a leaf line "<child>: <summary[1]> happens
    <summary[2]> times." follows one level deeper; otherwise ``tab`` grows by
    one level and the child label is printed recursively.

    Before each entry, ``tab`` loses one level if the last character of
    ``line`` sorts before the last character of ``oldline``.
    """
    step = "     "
    for entry in result[line]:
        if line[-1] < oldline[-1]:
            tab = tab[:len(tab) - len(step)]
        print(tab + entry[0])
        verdict = entry[3]
        if verdict[0]:
            pieces = [tab, step, entry[2], ": ", str(verdict[1]),
                      " happens ", str(verdict[2]), " times."]
            print("".join(pieces))
        else:
            # The deeper indentation also stays in effect for later siblings.
            tab += step
            printtree(entry[2], result, tab, line)


# Interactive driver: ask for the data file and target column, build the
# decision tree and print it.
targetfile = input("What is the name of the data file? ")
target = input("What is the name of the target column? ")
print()
# Shared tree storage; main() writes into this module-level dict by name.
tree={}


# Rank the attribute columns by information gain against the target column.
# A target name that is not a column of the file raises KeyError here.
data = filein(targetfile)
datay = data[target]
del data[target]
data = infogain(data, datay)
ordkeys = sort(data)
# Reload the full table: `data` was replaced by the gain scores above.
data = filein(targetfile)
i = 0
# Rebinds the builtin name `next` at module level.
next = 1
result = main(targetfile,target,data,ordkeys,i,next)
tab = ""
# NOTE(review): the last argument is "LO" with a capital letter O, not "L0".
# Since "0" < "O", printtree's last-character comparison is true for every
# root-level entry -- confirm this is intentional before "correcting" it.
printtree("L0",result,tab,"LO")
