/ Code author: Dennis Shasha, 2000, 2001, 2002, 2003, 2004, 2005.
/ Joint work with Tsong-Li Wang and Kaizhong Zhang
/ tree.k
/ This file computes the tree distance between two trees.
/ It implements the algorithms of the Zhang and Shasha Siam 89 paper
/ and the subsequent journal of algorithms paper.
/ Three options: tree-to-tree dist (treedist) 
/ or tree-to-tree dist within alloweddist (treedistwithin)
/ or all subtree-to-subtree distances (treedistmatrix)
/ The file treelayout has the dumped form of the tree.

/ As of March 2002, this spits out the matching when using +r.
/ As of February, 2002, this file implements all pairs.
/ k tree filename or k tree [+f filename] [+n num] [+s] [+b]
/ When filename is missing, then input data comes from tempin
/ When num is missing then an all-pairs test is done. 
/ Otherwise the first num trees from the input data are compared against
/ all the others.
/ When +s then the comparison is a tree to subtree (i.e. find out whether
/ T1 is a subtree or approximate subtree of T2)
/ When +b then the comparison is to find the best match only for each probe.
/ When +r report the label mapping between trees in standard out.
/ As of August 2005, I fixed the findmapping.
filename: "treein"
numtoprobe: -1
subtreeflag: 0
findbestonly: 0
reportmapping: 0
alphabetizeflag: 0
okflag: 1
processargs:{[args]
  i: 0
  while[(i < #args) & okflag
	okflag:: 0
	addextra: 0
	if[args[i] ~ "+f"
		filename:: args[i+1]
		addextra: 1
		okflag:: 1
	] 
	if[args[i] ~ "+n"
		numtoprobe:: 0 $ args[i+1]
		addextra: 1
		okflag:: 1
	] 
	if[args[i] ~ "+a"
		alphabetizeflag:: 1
		okflag:: 1
	] 
	if[args[i] ~ "+s"
		subtreeflag:: 1
		okflag:: 1
	]
	if[args[i] ~ "+b"
		findbestonly:: 1
		okflag:: 1
	]
	if[args[i] ~ "+r"
		reportmapping:: 1
		okflag:: 1
	]
	i+: 1 + addextra
	if[0 = okflag
		` 0: "format is k tree [+f filename] [+n num] [+a] [+s] [+b] [+r]\n"
		` 0: "filename is file for input\n"
		` 0: "num is distance allowed\n"
		` 0: "+a says rearrange the children of all nodes \n"
	         ` 0: "    in increasing alphabetical order (useful for unordered tree comparison)\n" 
		` 0: "+s says ask for best subtree in data tree\n"
		` 0: "+b says find best matching tree only\n"
		` 0: "+r says report the mapping\n"
	]
  ]
}
processargs[_i]

/ 
/ Several representations are possible:
/ 1. tabular:
/ #tree| treeid| parentid| childrenid 
/ treea |0 |1 2 5 6 8 
/ treea |2 |3 4 
/ treea |6 |7 
/ treeb |0 |1 2 3 6 
/ treeb |3 |4 5 
/ treeb |6 |7 
/ treeb |7 |8 
/ treeb |8 |9 

/ # treelabel|treeid|nodeid|label 
/ treea |0 |x 
/ treea |1 |y 
/ treea |2 |z 
/ treea |3 |w 
/ treea |4 |v 
/ treea |5 |q 

/ 2. Postorder:
/ Each tree is represented as a node, followed by its children.
/ e.g. a
/    /  \
/   b    c
/   |
/   d
/ is represented by 1 a, 2 b, 3 c, 4 d in treelabel. 
/ These should be laid out in preorder (breadth first search).
/ A second table (tree) encodes the parent child relationship in preorder.
/ A third relationship (treepost) associates a postorder number with each node.

/ 3. Indented in outline format
/ in1:("x"
/ "  y"
/ "  z"
/ "    w"
/ "    v"
/ "  q"
/ "  r"
/ "    p"
/ "  r")

/ Need to keep a backward pointer to best point we came from.
/ For each tree calc, we keep this matching and then recursively
/ fetch it.

/ How many subtrees are there? Each node has a subtree going
/ to the leaves. But it has many more when going down to 
/ any level of the tree.
/ What if we can say that either all the children of a node
/ are present or none are. Still no good. Try to do this by looking
/ at the post order with gaps.

/ finds the leftmost node in preorder given an id and parent/children
findleftmost:{[parent; children; nodeid]
  j: parent ? nodeid
  if[j = #parent; : nodeid]
  :findleftmost[parent; children; children[j;0]]
}

/ determines whether a node has a left sibling (return 1) or not (return 0)
hasleftsibling:{[parent; children; nodeid]
  j: & nodeid _in/: children
  if[0 = #j; :1] / if the root, then no left sibling, but still calculate
  : 0 < children[*j] ? nodeid / if 0th position then no left sibling, else yes
}

/ convertprepost converts from preorder to postorder
convertprepost:{[preordernodes; postordernodes; id]
   i: preordernodes ? id
   :postordernodes[i]
}

/ findparent findst the parent of a node in preorder
findparent:{[parent; children;id]
  j: & id _in/: children
  if[0 = #j; :id]
  :parent[*j]
}

/ fill the postorder table.
fillpost:{[treeid]
  i: & tree.treeid = treeid
  parent: tree.parentid[i]
  children: tree.childrenid[i]
  j: & treelabel.treeid = treeid
  labelnodeids: treelabel.nodeid[j]
  labellabels: treelabel.label[j]
  treepost.treeid,: (#j) # treeid
  treepost.postid,: !(#j)
  xx: differ[parent; ,/children] 
  xxi: parent ? *xx
  y: findpost[parent[xxi]; parent; children] / order preorder nodes in their
	/ display accoring to postorder
  treepost.nodeid,: y / preorder numberings of those nodes
  treepost.label,: labellabels[labelnodeids ?/: y]
  ypar: findparent[parent;children]'y
  treepost.parentid,: convertprepost[y; !(#j)]'ypar
  yleft: findleftmost[parent; children]'y
  treepost.leftmost,: convertprepost[y; !(#j)]'yleft
  yleftsib: hasleftsibling[parent;children]'y
  treepost.hasleftsib,: yleftsib
}

/ findpost takes a preorder traversal and produces a postorder numbering
findpost:{[parentid; parent; children]
  j: parent ? parentid
  if[j = #parent; :,parentid]
  kids: children[j]
  out: ()
  i: 0
  while[i < #kids
	out,: findpost[kids[i]; parent; children]
	i+: 1
  ]
  out,: parentid
  :out
}

/ END OF POSTORDER


/ BASICS


/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  x,: ()
  y,: ()
  if[(#x) < (#y)
    i: x ?/: y
    j: & i < #x
    :x[?i[j]]
  ]
  i: y ?/: x
  j: & i < #y
  :y[?i[j]]
}

/finds intersection of two lists
/ fastest of all
hasintersect: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  : (&/i) < #x
}

/ x is a proper subset of y
propersubset:{[x;y]
  x,: ()
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :(#x) = (#intersect[x;y])
}

subset:{[x;y] (#x) = (#intersect[x;y])}

subset:{[x;y] 
  j: y ?/: x
   / if any js are as big as #y, then there is a member of x not in y
  :0 = |/ j = #y
}

/ proportion of x that y covers
approxsubset:{[x;y]
  j: y ?/: x
  / any equal to #y are in x but not in y
  :(# & j < (#y)) % (#x)
}

/ finds indexes in x and y that intersect
/ If x and y are both sets, then the results will be of the same length
/ fastest of all
intersectindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :(i[j];j)
}

/finds indexes in x that intersect with y
intersectleftindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :i[j]
}

/finds intersection of two lists
/ and returns index pairs of matches. Assumes no duplicates
/ in either list
intersectbothindexes: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  pairs: (i ,' (!#y))
  k: & pairs[;0] < #x
  :pairs[k]
}


/ this is a set intersection so we remove duplicates
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 jj:  first ?/: (,/ ?:' lists[1+ !(size-1)]) / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}

/ A faster difference
differ:{[x;y]
  x,: ()
  y,: ()
  i: y ?/: x
  j: & i = #y
  :?x[j]
}

/ Count the difference
differcount:{[x;y]
  i: y ?/: x
  :# & i = #y
}

avg:{(+/ x) % # x}

var:{avg[_sqr x] - _sqr avg[x]}
std:{_sqrt var[x]}
cov:{avg[x * y] - avg[x] * avg[y]}
corr:{ (cov[x;y])%((std[x]) * (std[y]))}
/ delay based search

/ END OF BASICS


/ GEN TREE

/ generate a tree with random indents
gentree:{[size]
  labels: `a1 `b1 `c1 `d1 `e1
  out: ()
  words: $labels[size _draw #labels]
  out,: ,words[0]
  lastindent: 0 / in single step
  stepsize: 2
  i: 1
  while[i < size
	lastindent: |/(1 ; * 1 _draw lastindent+2)
	out,: ,((stepsize*lastindent) # " "),(words[i])
	i+: 1
  ]
  :out
}

/ END OF GEN TREE

/ READ TREE BY CONVERSION FROM A FILE OF THE FORM
/ # tree|treeid|parentid|childrenid
/ treea|0|1 2 5 6 8
/ treea|2|3 4
/ treea|6|7
/ 
/ # treelabel|treeid|nodeid|label
/ treea|0|x
/ treea|1|y
/ treea|2|z
/ treea|3|w
/ treea|4|v
/ treea|5|q
/ treea|6|r
/ treea|7|p
/ treea|8|r


inputfile: "treein"
if[0 < #_i
	inputfile: _i[0]
]
/ Null values should be represented by having only blanks in a field.
/ Tables are separated by a single blank line

/ a list is numeric if each member is either empty, a period
/ or a digit
isnumeric:{[list]
  s: ,/list
  nums: "1234567890"
  x: nums ?/: s
  y: (#nums) > |/x / if greater then everything in s met its match
  if[1 = y; :2] / 2 means integer
  nums: "1234567890."
  x: nums ?/: s
  :(#nums) > |/x / if greater then everything in s met its match
	/ 1 means float and 0 means not a number
}

/ parses a field based on vertical bars
getfields:{[line]
  i: line = "|"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  :(0,(j1 - !size)) _ line
}

/ get rid of blanks at either end of the string
delendblanks:{[string]
  if[0 = #string; :""]
  if[string ~ ,"" ; :""]
  i: & ~ string = " "
  if[(#string) = (#i); :string]
  if[0 = (#i); :""]
  string: (- ((#string)  - (1 + *|i))) _ string
  :(*i) _ string
}
  

/ Handles one line of input at a time according to table schema above.
/ A little more space efficient than the previous version.
processline:{[line]
 emptyorblank:{[line] (0 = #delendblanks[line])}
 emptyflag: emptyorblank[line]
 if[~emptyflag
  justempty:: 0
  if[line[0] = "#"
    newline: delendblanks'getfields[1 _ line]
    if[(1 < #newline) 
	globaltable:: *newline
	alltables,: ` $ globaltable
	globalatt:: 1 _ newline
	globalvals:: () / initialize
    ] / newline
  ] / table declaration
  if[~ line[0] = "#"
	newline: delendblanks'getfields[line]
	if[(1 < #newline) | (0 < #newline[0]) 
		globalvals,: ,newline
	]
  ] / end of test on data line
 ] / end of test on line is non-empty
 if[emptyflag & (0 = justempty)
	justempty:: 1
	/ if[ ((#globalatt) > 1) | (~ (#globalatt) = (^globalvals)[0])
		/ globalvals:: + globalvals / take transpose
	/ ]
	if[~ (#globalatt) = (#globalvals[0]); !-1]
	numericflag:()
	hh: 0
	while[hh < #globalatt
		numericflag,: isnumeric[globalvals[;hh]] 
		hh+: 1
	]
	inindex:: 0
	while[inindex < #globalatt
	  / table.att[2] :: ` $ vals[3]
		string1: globaltable, (".")
		string1,: globalatt[inindex]
		teststring: ("0 = #"), string1
		/ For big sizes, uncomment following two statements
		/ User will get an error message, but it will work.
		/ x: @[.: ; teststring; :]
		/ currentlyempty: *x / error means empty
		/ For big sizes, comment following
		currentlyempty: 1
		/ if empty, then assign, else append
		string2: :[currentlyempty; (":: "); (",: ")] 
		string3: :[0 = numericflag[inindex]
			"` $ "
			:[1 = numericflag[inindex]; "0.0 $ "; "0 $ "]]
		string3,: "globalvals[;inindex]"
		. string1, string2, string3
		if[1 = numericflag[inindex]
			globalindexes:: & 0N = . string1
			string3: string1, ("[globalindexes] :: `")
			. string3
		]
		inindex+: 1
	]
 ]
}


inputfromfile:{[filename]
  a: 0: filename
  globalvals:: ()
  oldglobalvals:: () / should not be necessary
  justempty:: 1 / as if we've just seen an empty line. should process table
  a,: ,,""
  processline'a
  makeintlist:{[line] (. $line), ()}
  // june 2007 eliminate parents without children
  jj: & ~ ` = tree.childrenid
  tree.treeid@: jj
  tree.parentid@: jj
  tree.childrenid@: jj
  // end change of june 2007
  tree.childrenid:: makeintlist'tree.childrenid
}

/ i is an index in the tree table (treeid, parentid, childrenid)
/ This routine will find the labels corresponding to the childrenid
/ of this tree using the treelabel table (treeid, nodeid, label)
/ and rearrange the childrenids appropriately to accord with the order.
ordertree:{[i]
 mykids: tree.childrenid[i]
 mytree: tree.treeid[i]
 mylabs: findlabel'mytree,/: mykids
 ii: < $mylabs
 tree.childrenid[i]:: mykids[ii]
}

findlabel:{[pair] treelabel.label[labpair ? pair]}


/ READ TREE IN INDENTED FORM

treeread:{[treeid; in]
  y: getline[in]'!#in
  indentlevel: y[;0]
  label: y[;1]
  num: #label
  treelabel.treeid,: num # treeid
  treelabel.label,: label
  treelabel.nodeid,: !num
  parent: ()
  parentlist: ()
  last: 0
  lastindent: indentlevel[0] / indent of last line
  pathindent: lastindent,() / indent of root
  path: 0,() / root node, in general this is the current path from root
  pathindex: #tree.childrenid / index where to put in children
  count: 0
  i: 1
  while[i < num
    if[~ lastindent < indentlevel[i]
	/ lastindent was equally or more indented than current
	/ so last node cannot have been a parent
	/ if[lastindent = indentlevel[i]
	lastindent: indentlevel[i]
	kk: pathindent ? lastindent / has to be there
 	tocut: kk - (#pathindent) 
	path: tocut _ path
	pathindent: tocut _ pathindent
	pathindex: tocut _ pathindex
	pathindent,: lastindent
	path,: i / potential parent
	pathindex,: #tree.childrenid
	lastindent: indentlevel[i]
	jj: (pathindent ? lastindent) - 1 / index of parent indent
	tree.childrenid[pathindex[jj]],: i
    ]
    if[lastindent < indentlevel[i]
	tree.parentid,: *|path / have a new parent-child relationship
	tree.childrenid,: ,,i / first child
	lastindent: indentlevel[i]
	pathindent,: lastindent
	path,: i / potential parent
	pathindex,: #tree.childrenid
	count+: 1
    ]
    i+: 1
  ]
  tree.treeid,: count # treeid
}

/ return number of indents plus label
getline:{[in;i]
  line: in[i],()
  j: (line = " "),()
  first: j ? 0
  :(first; ` $ line[first + !(#line) - first])
}
/ END OF READ TREE

/ DISPLAY.

fliptopost:{[treeid; nodeid ]
  ii: * & (treepost.treeid = treeid) & (treepost.nodeid = nodeid)
  :treepost.postid[ii]
}
  
  
/ spit out a tree in indented notation.
dumptree:{[treeid]
  out: ,("Tree: "), ($treeid)
  indent: 0
  ikids: & tree.treeid = treeid
  parent: tree.parentid[ikids]
  children: tree.childrenid[ikids]
  ilab: & treelabel.treeid = treeid
  / node: fliptopost[treeid]'treelabel.nodeid[ilab]
  node: treelabel.nodeid[ilab]
  label: treelabel.label[ilab]
  if[0 < #parent
    out,: dump[treeid; parent; children; node; label; 0; 0]
  ]
  if[0 = #parent
    out,: ,($label)
  ]
  :out
}

/ this is a recursive procedure
/ that does the actual printing where the level of indent is proportional 
/ to the level.
dump:{[treeid; parent; children; node; label; parentindex; indent]
  i: parentindex
  x: fliptopost[treeid; parent[i]]
  out: ,(indent # " "), ($label[node ? parent[i]]),(" ("),($x),(")")
  indent+: 2
  j: 0
  while[j < #children[i]
	kid: children[i;j]
	ii: parent ? kid
	if[ii < #parent
		out,: dump[treeid; parent; children; node; label; ii; indent]
	]
	if[ii = #parent
		x: fliptopost[treeid; kid]
		out,: ,(indent # " "), ($label[node ? kid]),(" ("),($x),(")")
  	]
	j+: 1
  ]
  :out
}
/ END OF DISPLAY.

/ DISTANCE CALCS: no optimizations, keyroots optimizations, and distance 
/ optimizations.


stack:()
match: ()


/ Find the match from one tree to the other.
/ mtree is the tree to tree distance between every relevant pair
/ mall is the forestdist(i;j).
/ labeli are the labels of the nodes in postorder
/ leftmosti are the leftmost numberings of tree i
/ offset is the offset needed for tree to subtree matching
/ Algorithm due to Kaizhong Zhang, March 2002.
/ Reimplemented as of March 2003.
/ Here is the recursive step. Given a tree rooted at k and l to match
/ (starting at the highest numbered nodes from each tree which are the roots
/ because of postorder)
/ Let left1 = leftmost leaf in subtree rooted at k
/ and left2 be leftmost leaf in subtree rooted at l
/ Recompute the forest to forest distance within the subtree (mall).
/ Now go back from k to left1 and from l to left2.
/ If you find a match between node i and j where tree(i,j) 
/ + mall(leftmost1(i)-1, leftmost(j)-1) is no bigger than removing
/ one node from one tree or the other, then
/ i and j are a match. So the subtree rooted 
/ at i matches the subtree rooted at j. 
/ If both leftmost(i)=left1 and leftmost(j) =left2, 
/ then keep going down 1 each and matches
/ end up in the final output.
/ If one isn't, say leftmost(i) > left1, then there are no matches to 
/ left1..l(i)-1.
/ Such matches are deferred by pushing them to a stack.
/ If neither is left1/left2, then continue going leftmost(i)-1 
/ and leftmost(j)-1
/ This match is also pushed to the stack.
/ Keep calling this traceback procedure until done.
/ DEBUGGING note: look at how x1, x2, and x3 are set and see whether
/ the calculation is making sense.
findmapping:{[mtree; mall; label1; leftmost1; label2; leftmost2; treename1; treename2; offset]
  / Need these because mall has an extra top row and leftmost column for
  / purposes of boundary conditions
  mall: 1 _ mall
  mall: 1 _' mall
  maxnum: 1 + (#mall[0])+(#mall[;0])
  stack:: ()
  match:: ,(treename1; treename2)
  push:{[x] stack,: x}
  pop:{[] x: *|stack; stack:: (-1) _ stack; :x}
/ Finds the mall for this subtree
/ left1 is a leftmost leaf in the tree for n1
/ left2 is a leftmost leaf in the tree for n2
/ Assume this is correct.
recomputemall:{[mtree;left1; n1; left2; n2]
   mall: ((n1-left1) + 2; (n2-left2) + 2) # -1
   mall[;0]: !2 + (n1-left1)
   mall[0]: !2 + (n2-left2)
   i1: left1
   i2: left2
   while[i1 < n1+1
	i2: left2
	while[i2 < n2 + 1
	  off1: 1 + (i1 - left1)
	  off2: 1 + (i2 - left2)
	  left_i1: leftmost1[i1]
	  left_i2: leftmost2[i2]
	  off_left_i1: 1 + (left_i1 - left1)
	  off_left_i2: 1 + (left_i2 - left2)
	  if[(left_i1 = left1) & (left_i2 = left2)
		/ COST FUNCTION implemented here. Note that &/ means take min
	    mall[off1; off2]: &/ (mall[off1;off2-1] + 1; mall[off1-1;off2] + 1
		mall[off1-1;off2-1] + ~ (label1[i1] ~ label2[i2]))
/ !-1
	    z: mall[off1; off2]
	  ]
	  / May use tree distance from i1 to i2 as a subroutine.
	  / It is not calculated in the normal course of events.
	  if[~ (left_i1 = left1) & (left_i2 = left2)
		/ COST FUNCTION implemented here. Note that &/ means take min
	    mall[off1; off2]: &/ (mall[off1;off2-1] + 1; mall[off1-1;off2] + 1
		mall[off_left_i1-1;off_left_i2-1] + mtree[i1;i2])
/ !-2
	  ]
	  i2+: 1
	]
	i1+: 1
   ]
   x: mall[(n1-left1)+1; (n2-left2)+1]
   :(mall)
}
  / go back from k;l
  / Need a proper mall to do this right.
  traceback:{[k;l]
	left1: leftmost1[k]
	left2: leftmost2[l]
	mall: recomputemall[mtree; left1; k; left2; l]
	i: k
	j: l
	/ while[(i > -1) & (j > -1) & ((i > leftmost1[k] - 1) | ( j > leftmost2[l] - 1))
	while[(i > (left1-1)) & (j > (left2-1)) 
		matchcost:  ~ (label1[i] ~ label2[j])
		x1: :[(i > left1) ; mall[1+(i-left1)-1;1+(j-left2)] +1; maxnum] 
	/ late august 2005: added the 1+ inside the indexes on both sides.
		x2: :[(j > left2) ; mall[1+(i-left1);1+(j-left2)-1] +1; maxnum] 
		x3: :[(leftmost1[i] > left1) & (leftmost2[j] > left2)
			mall[leftmost1[i]-(1+left1);leftmost2[j]-(1+left2)] + mtree[i;j] 
			mtree[i;j]+(0 | (leftmost1[i] - (1+left1))) + (0 | (leftmost2[j] - (1+left2))) ]
/ !-6
			/ third case is:
			/ number of elements that would have to be thrown
			/ out if we said we were matching subtrees at i and j
			/  plus subtree to subtree-to-subtree distance
		/ flag: x3 < (x1 & x2) / treematch is best; seems also to work
		/ flag: ~ x3 > (x1 & x2) / treematch is no worse
		flag: (x3 < (x1 & x2)) | ((x3 = (x1 & x2)) & matchcost = 0)
			/ critical to the debugging of august 2005
		flag2: (leftmost1[i]=leftmost1[k]) & (leftmost2[j]=leftmost2[l])
		if[flag & (flag2)
			match,: ,(i;label1[i]; j+offset; label2[j])
			/ added the +offset in aug 2005
			/ ` 0: ,("match "),($i), (" "), ($j)
			/ match,: ,(i;label1[i]; j; label2[j])
			i: i-1
			j: j-1
		]
		if[flag & (~ flag2)
			/ match,: ,(i;label1[i]; j+offset; label2[j])
			push[,(i;j+offset)]
			/ added the +offset in aug 2005
			/ ` 0: ,("push "),($i), (" "), ($j)
			i: leftmost1[i] - 1
			j: leftmost2[j]  - 1
		]
		if[(~flag)  / don't match at i and j
			:[x1 < x2; i-: 1; j-: 1]
			/ :[i > leftmost1[k]-1; i: i-1; j: j-1]
		]
	]
  }
  ii: (#label1) - 1
  jj: (#label2) - 1
  push[,(ii;jj)]
  while[0 < #stack
	mypair: pop[]
	traceback[mypair[0]; mypair[1]]
  ]
  :match
}

/ Find distance between two trees by building two arrays of trees.
/ No optimizations but return the subtree-to-subtree distances.
treedistmatrix:{[treeid1; treeid2]
   j: & treepost.treeid = treeid1
   node1: treepost.postid[j]
   parent1: treepost.parentid[j]
   leftmost1: treepost.leftmost[j]
   label1: treepost.label[j]
   j: & treepost.treeid = treeid2
   node2: treepost.postid[j]
   parent2: treepost.parentid[j]
   leftmost2: treepost.leftmost[j]
   label2: treepost.label[j]
   mtree: ((#node1) ; (#node2))  # -1
/ evaluates the tree to treedist
/ Note for example that mtree[i;j] is the distance of the tree rooted
/ at i with the tree rooted at j.
/ If l(i) is not 0, then when computing mtree[i+1; j], we may require
/ start to the left of l(i). That would give us a forest vs. tree computation,
/ so we would use mtree[i;j] as a subroutine.
evaldist:{[mtree;left1; n1; left2; n2]
   mall: ((n1-left1) + 2; (n2-left2) + 2) # -1
   mall[;0]: !2 + (n1-left1)
   mall[0]: !2 + (n2-left2)
   i1: left1
   i2: left2
   while[i1 < n1+1
	i2: left2
	while[i2 < n2 + 1
	  off1: 1 + (i1 - left1)
	  off2: 1 + (i2 - left2)
	  left_i1: leftmost1[node1 ? i1]
	  left_i2: leftmost2[node2 ? i2]
	  off_left_i1: 1 + (left_i1 - left1)
	  off_left_i2: 1 + (left_i2 - left2)
	  if[(left_i1 = left1) & (left_i2 = left2)
		/ COST FUNCTION implemented here. Note that &/ means take min
	    mall[off1; off2]: &/ (mall[off1;off2-1] + 1; mall[off1-1;off2] + 1
		mall[off1-1;off2-1] + ~ (label1[i1] ~ label2[i2]))
/ !-3
	  ]
	  / May use tree distance from i1 to i2 as a subroutine.
	  / It is not calculated in the normal course of events.
	  if[~ (left_i1 = left1) & (left_i2 = left2)
		/ COST FUNCTION implemented here. Note that &/ means take min
	    mall[off1; off2]: &/ (mall[off1;off2-1] + 1; mall[off1-1;off2] + 1
		mall[off_left_i1-1;off_left_i2-1] + mtree[i1;i2])
/ !-4
	  ]
	  i2+: 1
	]
	i1+: 1
   ]
   x: mall[(n1-left1)+1; (n2-left2)+1]
   :(x; mall)
}
   i: 0
   while[i < #node1
	j: 0
   	while[j < #node2
		n1: node1[i]
		n2: node2[j]
		/ mtree -- current tree to tree distance
		/ n1 is 
		mypair:evaldist[mtree;leftmost1[i];n1; leftmost2[j]; n2]
		mtree[n1; n2]:mypair[0]
		j+: 1
	]
	i+: 1
   ]
   / y: mtree[(#node1) - 1; (#node2) - 1]
    if[(reportmapping) & (~ treeid1 = treeid2)
		/ Doesn't work yet.
		/ We start by finding the best match for whole query tree
	  	rows: * ^ mtree
	  	cols: * | ^ mtree 
	  	bestdist: &/ mtree[rows - 1]
		jj: |/ & mtree[rows-1] = bestdist
		mypair: evaldist[mtree; 0; (#node1) - 1; leftmost2[jj]; jj]
		xx: leftmost2[jj]+!(1+jj-leftmost2[jj])
		/ THIS ONE NOT USED BY DEFAULT
		mappingout,: ,findmapping[mtree[;xx]; mypair[1]; label1; leftmost1; label2[xx]; leftmost2[xx]-leftmost2[jj]; treeid1; treeid2; leftmost2[jj]]
		/ we have to concentrate just on that subtree; that's
		/ why we do the subtractions.
    ]
   :(treeid1; treeid2; mtree)
}

/ Find distance between two trees by building two arrays of trees.
/ Use the LR_keyroot strategy, i.e. if l(p(i))  = l(i) then don't bother
/ with the l(i).
/ This amounts to saying: don't compute for i if i has a left sibling.
treedist:{[treeid1; treeid2]
   j: & treepost.treeid = treeid1
   node1: treepost.postid[j]
   parent1: treepost.parentid[j]
   leftmost1: treepost.leftmost[j]
   label1: treepost.label[j]
   hasleftsib1: treepost.hasleftsib[j]
   j: & treepost.treeid = treeid2
   node2: treepost.postid[j]
   parent2: treepost.parentid[j]
   leftmost2: treepost.leftmost[j]
   label2: treepost.label[j]
   hasleftsib2: treepost.hasleftsib[j]
   mtree:: ((#node1) ; (#node2))  # -1
/ evaluates the tree to tree dist
/ Note for example that mtree[i;j] is the distance of the tree rooted
/ at i with the tree rooted at j.
/ If l(i) is not 0, then when computing mtree[i+1; j], we may require
/ start to the left of l(i). That would give us a forest vs. tree computation,
/ so we would use mtree[i;j] as a subroutine.
evaldist:{[left1; n1; left2; n2]
   mall: ((n1-left1) + 2; (n2-left2) + 2) # -1
   mall[;0]: !2 + (n1-left1)
   mall[0]: !2 + (n2-left2)
   i1: left1
   i2: left2
   while[i1 < n1+1
	i2: left2
	while[i2 < n2 + 1
	  off1: 1 + (i1 - left1)
	  off2: 1 + (i2 - left2)
	  left_i1: leftmost1[node1 ? i1]
	  left_i2: leftmost2[node2 ? i2]
	  off_left_i1: 1 + (left_i1 - left1)
	  off_left_i2: 1 + (left_i2 - left2)
	  if[(left_i1 = left1) & (left_i2 = left2)
		/ COST FUNCTION implemented here. Note that &/ means take min
	    mall[off1; off2]: &/ (mall[off1;off2-1] + 1; mall[off1-1;off2] + 1
		mall[off1-1;off2-1] + ~ (label1[i1] ~ label2[i2]))
	    z: mall[off1; off2]
	    mtree[i1;i2]:: z
	  ]
	  / May use tree distance from i1 to i2 as a subroutine.
	  / It is not calculated in the normal course of events.
	  if[~ (left_i1 = left1) & (left_i2 = left2)
		/ COST FUNCTION implemented here. Note that &/ means take min
	    mall[off1; off2]: &/ (mall[off1;off2-1] + 1; mall[off1-1;off2] + 1
		mall[off_left_i1-1;off_left_i2-1] + mtree[i1;i2])
/ !-6
	  ]
	  i2+: 1
	]
	i1+: 1
   ]
   x: mall[(n1-left1)+1; (n2-left2)+1]
   :(x;mall)
}
   / consider only those having left sibling; all others
   / computed along the way.
   ilist: & hasleftsib1 = 1
   while[0 < #ilist
	i: *ilist
	jlist: &hasleftsib2 = 1
   	while[0 < #jlist
		j: *jlist
		n1: node1[i]
		n2: node2[j]
		mypair:evaldist[leftmost1[i];n1; leftmost2[j]; n2]
		mtree[n1; n2]::mypair[0]
		jlist: 1 _ jlist
	]
	ilist: 1 _ ilist
   ]
   y: mtree[(#node1) - 1; (#node2) - 1]
   if[(reportmapping) & (~ treeid1 = treeid2)
	mypair: evaldist[0; (#node1) - 1; 0; (#node2) - 1]
	/ THIS ONE USED BY DEFAULT
	mappingout,: ,findmapping[mtree; mypair[1]; label1; leftmost1; label2; leftmost2; treeid1; treeid2; 0]
   ]
   :(treeid1; treeid2; y)
}

/ Find distance between two trees by building two arrays of trees.
/ Use the LR_keyroot strategy, i.e. if l(p(i))  = l(i) then don't bother
/ with the l(i).
/ This amounts to saying: don't compute for i if i has a left sibling.
/ Compute treedist only if within a given distance.
treedistwithin:{[treeid1; treeid2; allowed]
   j: & treepost.treeid = treeid1
   node1: treepost.postid[j]
   parent1: treepost.parentid[j]
   leftmost1: treepost.leftmost[j]
   label1: treepost.label[j]
   hasleftsib1: treepost.hasleftsib[j]
   j: & treepost.treeid = treeid2
   node2: treepost.postid[j]
   if[allowed < _abs[(#node1) - (#node2)]; :(treeid1; treeid2; allowed+1)]
   parent2: treepost.parentid[j]
   leftmost2: treepost.leftmost[j]
   label2: treepost.label[j]
   hasleftsib2: treepost.hasleftsib[j]
   mtree:: ((#node1) ; (#node2))  # (allowed+1)
/ evaluates the tree to treedist
/ Note for example that mtree[i;j] is the distance of the tree rooted
/ at i with the tree rooted at j.
/ If l(i) is not 0, then when computing mtree[i+1; j], we may require
/ start to the left of l(i). That would give us a forest vs. tree computation,
/ so we would use mtree[i;j] as a subroutine.
evaldistwithin:{[left1; n1; left2; n2; allowed]
   mall: ((n1-left1) + 2; (n2-left2) + 2) # -1
   mall[;0]: !2 + (n1-left1)
   mall[0]: !2 + (n2-left2)
   i1: left1
   i2: left2
   while[i1 < n1+1
	i2: left2
	while[i2 < n2 + 1
	  off1: 1 + (i1 - left1)
	  off2: 1 + (i2 - left2)
	  left_i1: leftmost1[node1 ? i1]
	  left_i2: leftmost2[node2 ? i2]
	  off_left_i1: 1 + (left_i1 - left1)
	  off_left_i2: 1 + (left_i2 - left2)
	  if[(left_i1 = left1) & (left_i2 = left2)
		/ COST FUNCTION implemented here. Note that &/ means take min
	    mall[off1; off2]: &/ (mall[off1;off2-1] + 1; mall[off1-1;off2] + 1
		mall[off1-1;off2-1] + ~ (label1[i1] ~ label2[i2]))
/ !-7
	    z: mall[off1; off2]
	    mtree[i1;i2]:: z
	  ]
	  / May use tree distance from i1 to i2 as a subroutine.
	  / It is not calculated in the normal course of events.
	  if[~ (left_i1 = left1) & (left_i2 = left2)
		/ COST FUNCTION implemented here. Note that &/ means take min
	    mall[off1; off2]: &/ (mall[off1;off2-1] + 1; mall[off1-1;off2] + 1
		mall[off_left_i1-1;off_left_i2-1] + mtree[i1;i2])
/ !-8
	  ]
	  i2+: 1
	]
	i1+: 1
   ]
   x: mall[(n1-left1)+1; (n2-left2)+1]
   / :(x;mall)
   :x / don't need other part
}
   / consider only those having left sibling; all others
   / computed along the way.
   ilist: & hasleftsib1 = 1
   while[0 < #ilist
	i: *ilist
	jlist: &hasleftsib2 = 1
   	while[0 < #jlist
		j: *jlist
		n1: node1[i]
		n2: node2[j]
		if[(~ (_abs (n1 - n2) ) > allowed) & (~ (_abs ((n1-leftmost1[i]) - (n2-leftmost2[j]))) > allowed)
		  p: evaldistwithin[leftmost1[i];n1; leftmost2[j]; n2; allowed]
		  / April 24, 2002. Throw away other member of pair.
		  mtree[n1; n2]::p
		]
		jlist: 1 _ jlist
	]
	ilist: 1 _ ilist
   ]
   y: mtree[(#node1) - 1; (#node2) - 1]
   if[reportmapping & (~ treeid1 = treeid2)
	mypair: evaldist[0; (#node1) - 1; 0; (#node2) - 1]
	mappingout,: ,findmapping[mtree; mypair[1]; label1; leftmost1; label2; leftmost2; treeid1; treeid2; 0]
   ]
   :(treeid1; treeid2; y)
}

/ EXECUTION ROUTINES

/ tree-to-tree matches, but for each probe tree want
/ only best match of dbtrees.
/ This allows us to use treedistwithin.
/ If there is an intersection between probetrees and dbtrees,
/ we save matches so we can reuse them.
execfulltreebestmatch:{[alltrees; probetrees; dbtrees]
  out: ()
  intersectflag: hasintersect[probetrees; dbtrees]
  if[intersectflag
	cache.probetree: ()
	cache.dbtree: ()
	cache.distance: () / for the reverse distance
	cache.equalorlessthan: () / `"=" means exact `"<" means at least
		/ should always be equal I think, but leave for uniformity
  ]
  estimateclosest:{[id1; ids]
	findsize:{[id] # & treelabel.treeid = id }
	size1: findsize[id1]
	sizes: findsize'ids
	diffs: _abs[sizes - size1]
	: < diffs
  }
  i: 0
  while[i < #probetrees
    ptreeid: probetrees[i]
    mydbtrees: dbtrees _dv ptreeid
    rr: estimateclosest[ptreeid; mydbtrees]
    j: 0
    bestsofar: 9999999 / big number
    besttree: `
    dbtreesrr: mydbtrees[rr]
    while[j < #dbtreesrr
	mustcalculate: 1
	dbtreeid: dbtreesrr[j]
	kk: ()
	if[intersectflag
	  kk: & (cache.probetree = dbtreeid) & (cache.dbtree = ptreeid) 
		/ note that we are looking at reverse of old case
	]
	if[0 < #kk
	  dist: cache.dist[*kk]
	  sign: cache.equalorlessthan[*kk]
	  if[dist > bestsofar; mustcalculate: 0]
		/ no way this distance can help
	]
	if[mustcalculate
	  x: treedistwithin[ptreeid; dbtreeid; bestsofar]
	  x: x[2]
  	  if[intersectflag
		cache.probetree,: ptreeid
		cache.dbtree,: dbtreeid
		cache.distance,: x
		cache.equalorlessthan,: :[x < bestsofar; `"="; `"<"] 
 	  ]
	  if[x < bestsofar
		bestsofar: x
		besttree: dbtreeid
	  ]
	]
	j+: 1
    ]
    out,:,(ptreeid; besttree; bestsofar)
    i+: 1
  ]
  :out
}


/ tree-to-tree matches, and for each probe tree want
/ all matches of dbtrees.
/ This allows us to use treedistwithin.
/ If there is an intersection between probetrees and dbtrees,
/ we save matches so we can reuse them.
execfulltreeallmatch:{[alltrees; probetrees; dbtrees]
  out: ()
  intersectflag: hasintersect[probetrees; dbtrees]
  if[intersectflag
	cache.probetree: ()
	cache.dbtree: ()
	cache.distance: () / for the reverse distance
	cache.equalorlessthan: () / `"=" means exact `"<" means at least
		/ should always be equal I think, but leave for uniformity
  ]
  i: 0
  while[i < #probetrees
    ptreeid: probetrees[i]
    j: 0
    while[j < #dbtrees
	dbtreeid: dbtrees[j]
	kk: ()
	if[intersectflag
	  kk: & (cache.probetree = dbtreeid) & (cache.dbtree = ptreeid) & (cache.equalorlessthan = `"=")
		/ note that we are looking at reverse of old case
	]
	if[0 < #kk
	  out,:,(ptreeid; dbtreeid; cache.distance[*kk])
	]
	if[0 = #kk
    	  x: treedist[ptreeid; dbtreeid] / all nodes tree to tree
	  x: x[2]
	  bestdist: x
	  reversedist: x
	  out,:,(ptreeid; dbtreeid; bestdist)
  	  if[intersectflag
		cache.probetree,: ptreeid
		cache.dbtree,: dbtreeid
		cache.distance,: reversedist
		cache.equalorlessthan,: `"=" 
 	  ]
	]
	j+: 1
    ]
    i+: 1
  ]
  :out
}

/ tree to subtree match
execsubtree:{[alltrees; probetrees; dbtrees; findbestonly]
  out: ()
  intersectflag: hasintersect[probetrees; dbtrees]
  if[intersectflag
	cache.probetree: ()
	cache.dbtree: ()
	cache.distance: () / for the reverse distance
	cache.equalorlessthan: () / `"=" means exact `"<" means at least
		/ should always be equal I think, but leave for uniformity
  ]
  i: 0
  while[i < #probetrees
    ptreeid: probetrees[i]
    j: 0
    while[j < #dbtrees
	dbtreeid: dbtrees[j]
	kk: ()
	if[intersectflag
	  kk: & (cache.probetree = dbtreeid) & (cache.dbtree = ptreeid) & (cache.equalorlessthan = `"=")
		/ note that we are looking at reverse of old case
	]
	if[0 < #kk
	  out,:,(ptreeid; dbtreeid; cache.distance[*kk])
	]
	if[0 = #kk
    	  x: treedistmatrix[ptreeid; dbtreeid] / all nodes tree to tree
	  x: x[2] / matrix of tree dists
	  rows: * ^ x
	  cols: * | ^ x 
	  bestdist: &/ x[rows - 1]
	  reversedist: &/ x[;cols-1]
	  out,:,(ptreeid; dbtreeid; bestdist)
  	  if[intersectflag
		cache.probetree,: ptreeid
		cache.dbtree,: dbtreeid
		cache.distance,: reversedist
		cache.equalorlessthan,: `"=" 
 	  ]
	]
	j+: 1
    ]
    i+: 1
  ]
  if[findbestonly
	uniqtrees: ? out[;0]
	part: = out[;0]
	findmin:{[list] list ? &/ list}
	ii: part @' findmin'out[part;2]
	out2: uniqtrees,'out[ii;1],'out[ii;2]
	out: out2
  ]
  :out
}

/ numtoprobe: -1
/ subtreeflag: 0
/ findbestonly: 0
exec:{[numtoprobe; subtreeflag; findbestonly; reportmapping]
  alltrees: ? tree.treeid
  if[numtoprobe > #alltrees
	` 0: "You have asked to probe with more trees than present"
	!-1
  ]
  if[numtoprobe < 0
	  probetrees: alltrees
	  dbtrees: alltrees
  ]
  if[numtoprobe > 0
	probetrees: alltrees[!numtoprobe]
	dbtrees: numtoprobe _ alltrees
  ]
  pairs: ,/probetrees ,/:\: dbtrees
  i: 0
  out: ()
  if[(subtreeflag = 0) & (findbestonly = 1)
	out: execfulltreebestmatch[alltrees; probetrees; dbtrees]
  ]
  if[(subtreeflag = 0) & (findbestonly = 0)
	out: execfulltreeallmatch[alltrees; probetrees; dbtrees]
  ]
  if[(subtreeflag = 1) 
	out: execsubtree[alltrees; probetrees; dbtrees; findbestonly]
  ]
  :out
}

flatten:{[list] -2 _ ,/ ($list) ,\: (" | ")}

/ END OF EXECUTION ROUTINES

/ DATA

mappingout: ()

treelabel.treeid: ()
treelabel.nodeid: ()
treelabel.label: ()

tree.treeid: ()
tree.parentid: ()
tree.childrenid: ()

treepost.treeid: ()
treepost.nodeid: ()
treepost.postid: ()
treepost.label: ()
treepost.parentid: () / postorder number of parent
treepost.leftmost: () / postorder number of leftmost descendant
treepost.hasleftsib: () / binary vec. needed for keyroots optimization

inputfromfile[filename]
if[alphabetizeflag / rearrange the children
 labpair: treelabel.treeid,'treelabel.nodeid
 x: ordertree'!#tree.treeid
]

if[0 / exclude this
in1:("x"
"  y"
"  z"
"    w"
"    v"
"  q"
"  r"
"    p"
"  r")

in2:("d1"
 "  a1"
 "  e1"
 "  d1"
 "    b1"
 "    e1"
 "  e1"
 "    d1"
 "      d1"
 "        b1")

in3:("d1"
 "  x1"
 "  e1"
 "  d1"
 "    e1"
 "  e1"
 "    d1"
 "      d1"
 "        b1")

treeread[`treea; in1]
treeread[`treeb; in2]
treeread[`treec; in3]
]


if[0 / exclude/include
lets: `b `c `d 
treeids: ` $ ("tree") ,/: $lets
i: 0
while[i < (#lets) - 1
  in: gentree[50]
  treeread[treeids[i]; in]
  i+: 1
]
treeread[treeids[(#lets)-1]; in]
]


if[0 / exclude/include this
treelabel.treeid,: 4 # `tree1
treelabel.nodeid,: !4
treelabel.label,: `a `b `d `c
treelabel.treeid,: 4 # `tree2
treelabel.nodeid,: !4
treelabel.label,: `a `b `c `d
treelabel.treeid,: 6 # `tree3
treelabel.nodeid,: !6
treelabel.label,: `a `b `c `d `e `f
treelabel.treeid,: 6 # `tree4
treelabel.nodeid,: !6
treelabel.label,: `a `b `d `e `f `c
treelabel.treeid,: 6 # `tree5
treelabel.nodeid,: !6
treelabel.label,: `a `b `d `c `e `f
treelabel.treeid,: 6 # `tree6
treelabel.nodeid,: !6
treelabel.label,: `a `b `c `d `e `f
treelabel.treeid,: 6 # `tree7
treelabel.nodeid,: !6
treelabel.label,: `a `b `d `c `e `f

tree.treeid,: 2 # `tree1
tree.parentid,: 0 1
tree.childrenid,: (1 3;,2)
tree.treeid,: 2 # `tree2
tree.parentid,: 0 2
tree.childrenid,: (1 2;,3)
tree.treeid,: 2 # `tree3
tree.parentid,: 0 2
tree.childrenid,: (1 2;3 4 5)
tree.treeid,: 2 # `tree4
tree.parentid,: 0 1
tree.childrenid,: (1 5;2 3 4)
tree.treeid,: 3 # `tree5
tree.parentid,: 0 1 3
tree.childrenid,: (1 3;,2; 4 5)
tree.treeid,: 5 # `tree6
tree.parentid,: !5
tree.childrenid,: (,1; ,2; ,3; ,4; ,5)
tree.treeid,: 5 # `tree7
tree.parentid,: !5
tree.childrenid,: (,1; ,2; ,3; ,4; ,5)
] / all excluded

/ EXECUTION

unused: fillpost'?tree.treeid
/ `show $ `treepost
/ `show $ `treelabel
/ treedist[`tree1; `tree2]


out: ()
start: _t
if[okflag
out: flatten' exec[numtoprobe; subtreeflag; findbestonly; reportmapping]
]
end: _t
"treeout" 0: out
out

"treelayout" 0: ,/dumptree'?tree.treeid
("Time is "), ($ end - start), (" seconds.")
("Distance results are in the file treeout.")
("The outline format tree layout is in the file treelayout.")
mappingout