/ Code author: Dennis Shasha, 2002
/ Joint work with Tsong-Li Wang and Kaizhong Zhang.
/ cousins.k
/  k cousins +q treeincaps +d treeindb +m cousindistance +e exact +s

/ We will find the set of cousins up to a certain distance.
/ We will compare trees that way and we will find commonalities
/ by getting a threshold function.
/ Siblings have a common parent. My nephew is the grandchild of my parent.
/ First cousins are the other grandchildren of my grandparents etc.
/ This measure is sibling order independent and does not require
/ labels on interior nodes.

/ A second measure is to dress each interior node with the labels
/ of their leaf descendants and then compare those labels
/ Call this metric the subtree label.


/ TIME TESTING
/ \l time 


/ SET STUFF


/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  x,: ()
  y,: ()
  if[(#x) < (#y)
    i: x ?/: y
    j: & i < #x
    :x[?i[j]]
  ]
  i: y ?/: x
  j: & i < #y
  :y[?i[j]]
}

/ if we know the two sets are unique
intersectuniq: {[x;y]
  if[(#x) < (#y)
    i: x ?/: y
    j: & i < #x
    :y[j]
  ]
  i: y ?/: x
  j: & i < #y
  :x[j]
}

intersectbin: {[x;y]
  x,: ()
  y,: ()
  ii: x _bin/: y
  if[0 = #ii; :()]
  x,: -1
  jj: & x[ii] = y
  z:x[ii[jj]]
  :z[& z > -1]
}

/finds intersection of two lists
/ fastest of all
hasintersect: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  : (&/i) < #x
}

/ x is a proper subset of y
propersubset:{[x;y]
  x,: ()
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :(#x) = (#intersect[x;y])
}

subset:{[x;y] (#x) = (#intersect[x;y])}

subset:{[x;y] 
  j: y ?/: x
   / if any js are as big as #y, then there is a member of x not in y
  :0 = |/ j = #y
}

/ proportion of x that y covers
approxsubset:{[x;y]
  j: y ?/: x
  / any equal to #y are in x but not in y
  :(# & j < (#y)) % (#x)
}

/ finds indexes in x and y that intersect
/ If x and y are both sets, then the results will be of the same length
/ fastest of all
intersectindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :(i[j];j)
}

/finds indexes in x that intersect with y
intersectleftindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :i[j]
}

/ finds indexes in x that intersect with y
/ even if x is a multiset
intersectleftindexes_multi: {[x;y]
  k: y ?/: x
  : & k < #y
}

/finds intersection of two lists
/ and returns index pairs of matches. Assumes no duplicates
/ in either list
intersectbothindexes: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  pairs: (i ,' (!#y))
  k: & pairs[;0] < #x
  :pairs[k]
}


/ this is a set intersection so we remove duplicates
multiintersect:{[lists]
 size: #lists
 if[0 = #lists; :()]
 / if[2 > size; :lists]
 if[2 > size; :*lists]
 first: lists[0],()
 jj:  first ?/: (,/ ?:' lists[1+ !(size-1)]) / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}

/ this is a set intersection so we remove duplicates
/ just like multiintersect except that the maxdist allows
/ a result if it is present in (#lists) - maxdist
/ of the input lists
threshintersect:{[lists; maxdist]
 size: #lists
 if[2 > size; :lists]
 / first: lists[0],() / no good because might be empty
 first: (?,/lists) ,() / union
 / jj:  first ?/: (,/ ?:' lists[1+ !(size-1)]) / find indexes in first
 jj:  first ?/: (,/ ?:' lists) / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 / kk: & x > size - (2 + maxdist)
 kk: & x > size - (1 + maxdist)
 :first[kk]
}

/ A faster difference
differ:{[x;y]
  x,: ()
  y,: ()
  i: y ?/: x
  j: & i = #y
  :?x[j]
}

/ Count the difference
differcount:{[x;y]
  i: y ?/: x
  :# & i = #y
}

/ END OF SET STUFF
/ READ TREE BY CONVERSION FROM A FILE OF THE FORM
/ # tree|treeid|parentid|childrenid
/ treea|0|1 2 5 6 8
/ treea|2|3 4
/ treea|6|7
/ 
/ # treelabel|treeid|nodeid|label
/ treea|0|x
/ treea|1|y
/ treea|2|z
/ treea|3|w
/ treea|4|v
/ treea|5|q
/ treea|6|r
/ treea|7|p
/ treea|8|r


inputfile: "treein"
if[0 < #_i
	inputfile: _i[0]
]
/ Null values should be represented by having only blanks in a field.
/ Tables are separated by a single blank line

/ a list is numeric if each member is either empty, a period
/ or a digit
isnumeric:{[list]
  s: ,/list
  nums: "1234567890"
  x: nums ?/: s
  y: (#nums) > |/x / if greater then everything in s met its match
  if[1 = y; :2] / 2 means integer
  nums: "1234567890."
  x: nums ?/: s
  :(#nums) > |/x / if greater then everything in s met its match
	/ 1 means float and 0 means not a number
}

/ parses a field based on vertical bars
getfields:{[line]
  i: line = "|"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  :(0,(j1 - !size)) _ line
}

getfieldssymb :{[symbline]
  line: $symbline
  i: line = "|"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  :` $ (0,(j1 - !size)) _ line
}

/ get rid of blanks at either end of the string
delendblanks:{[string]
  if[0 = #string; :""]
  if[string ~ ,"" ; :""]
  i: & ~ string = " "
  if[(#string) = (#i); :string]
  if[0 = (#i); :""]
  string: (- ((#string)  - (1 + *|i))) _ string
  :(*i) _ string
}
  

/ Handles one line of input at a time according to table schema above.
/ A little more space efficient than the previous version.
processline:{[line]
 emptyorblank:{[line] (0 = #delendblanks[line])}
 emptyflag: emptyorblank[line]
 if[~emptyflag
  justempty:: 0
  if[line[0] = "#"
    newline: delendblanks'getfields[1 _ line]
    if[(1 < #newline) 
	globaltable:: *newline
	alltables,: ` $ globaltable
	globalatt:: 1 _ newline
	globalvals:: () / initialize
    ] / newline
  ] / table declaration
  if[~ line[0] = "#"
	newline: delendblanks'getfields[line]
	if[(1 < #newline) | (0 < #newline[0]) 
		globalvals,: ,newline
	]
  ] / end of test on data line
 ] / end of test on line is non-empty
 if[emptyflag & (0 = justempty)
	justempty:: 1
	/ if[ ((#globalatt) > 1) | (~ (#globalatt) = (^globalvals)[0])
		/ globalvals:: + globalvals / take transpose
	/ ]
	if[~ (#globalatt) = (#globalvals[0]); !-1]
	numericflag:()
	hh: 0
	while[hh < #globalatt
		numericflag,: isnumeric[globalvals[;hh]] 
		hh+: 1
	]
	inindex:: 0
	while[inindex < #globalatt
	  / table.att[2] :: ` $ vals[3]
		string1: globaltable, (".")
		string1,: globalatt[inindex]
		teststring: ("0 = #"), string1
		/ For big sizes, uncomment following two statements
		/ User will get an error message, but it will work.
		/ x: @[.: ; teststring; :]
		/ currentlyempty: *x / error means empty
		/ For big sizes, comment following
		currentlyempty: 1
		/ if empty, then assign, else append
		string2: :[currentlyempty; (":: "); (",: ")] 
		string3: :[0 = numericflag[inindex]
			"` $ "
			:[1 = numericflag[inindex]; "0.0 $ "; "0 $ "]]
		string3,: "globalvals[;inindex]"
		. string1, string2, string3
		if[1 = numericflag[inindex]
			globalindexes:: & 0N = . string1
			string3: string1, ("[globalindexes] :: `")
			. string3
		]
		inindex+: 1
	]
 ]
}


inputfromfile:{[filename]
  a: 0: filename
  globalvals:: ()
  oldglobalvals:: () / should not be necessary
  justempty:: 1 / as if we've just seen an empty line. should process table
  a,: ,,""
  processline'a
  makeintlist:{[line] (. $line), ()}
  tree.childrenid:: makeintlist'tree.childrenid
}


/ DISPLAY.


oldspit:{[list] 
  numtodisplay: 6
  x: ($list) ,\: " "
  out: ()
  while[0 < #x
	if[~ numtodisplay < #x
		out,: , (-1) _ ,/x
		x: ()
	]
	if[numtodisplay < #x
		out,: , (-1) _ ,/x[!numtodisplay]
		x: 6 _ x
	]
  ]
  :out
}

spit:{[list] (-1) _ ,/($list) ,\: "|"}

/ spit out a tree in indented notation.
dumptree:{[treeid]
  out: ,("Tree: "), ($treeid)
  indent: 0
  ikids: & tree.treeid = treeid
  parent: tree.parentid[ikids]
  children: tree.childrenid[ikids]
  ilab: & treelabel.treeid = treeid
  node: treelabel.nodeid[ilab]
  label: treelabel.label[ilab]
  if[0 < #parent
    out,: dump[parent; children; node; label; 0; 0]
  ]
  if[0 = #parent
    out,: ,($label)
  ]
  :out
}

/ this is a recursive procedure
/ that does the actual printing where the level of indent is proportional 
/ to the level.
/ Assumes that tree is in parent to child order (i.e. no child
/ is in the parent position before being in the child position).
dump:{[parent; children; node; label; parentindex; indent]
  i: parentindex
  out: ,(indent # " "), ($label[node ? parent[i]]),(" ("),($parent[i]),(")")
  indent+: 2
  j: 0
  while[j < #children[i]
	kid: children[i;j]
	ii: parent ? kid
	if[ii < #parent
		out,: dump[parent; children; node; label; ii; indent]
	]
	if[ii = #parent
		out,: ,(indent # " "), ($label[node ? kid]),(" ("),($kid),(")")
  	]
	j+: 1
  ]
  :out
}

/ END OF DISPLAY.

/ When filename is missing, then input data comes from tempin
/ When num is missing then an all-pairs test is done. 
/ Otherwise the first num trees from the input data are compared against
/ all the others.
/ exact tells us whether we want cousins of each distance
/ to exactly match one another or whether all close cousins
/ (within cousindistance) should match one another.
/ Program cousins with the +b <filename> will build a database
/ from that file and will call it <filename>db with an extension
/ (either .l or .K depending on the operating system but you need not care).
/ Program cousins with the +q queryfilename and +d dbfilename
/ will take the query file and the database file already produced (but
/ you need not specify the extension) and find out where the query is
/ in the database (exact or inexact cousin relationship)
/ Program cousins with the +f <filename> will take a file using
/ the normal tree format and will compare the first tree against all others.
/ +dumpquery says to dump the query tree in indented form 
/ to the file dumpedquery 
/ +dumpdb says to dump the database trees in indented form to the file dumpeddb
/ In this case, a difference is a path that is different.
unrootedflag: 0 / when set, we treat the tree as unrooted, 
	/ and compare distances between nodes (direction doesn't matter) 
	/ inappropriate with subtreeflag, 
	/ At this point, we also want leafonly = 1 to hold 
subtreelabelflag: 0 / when set we do subtree label distance where
	/ we take each root of a subtree and give it as its label
	/ the ordered sequence of its leaf labels and then compare them
phyloflag: 0 / when set we have a phylogeny file as input
leafonly: 1 / if set, care only about cousins for leaves (normal case)
	/ otherwise, anything counts, but exact is not allowed
opcode: `"randtree" / only for testing but that's the default
setcompare: 0 / if set, then don't care how many there are, 
	/ just set comparison
output: "data.out"
maxdist: 0 / 0 means just look for siblings, 1 means up to cousins,
	/ 1.5 means first cousins one removed etc.
exact: 0 / if set, then we want siblings to match siblings, 
	/ first cousins to match first cousins etc.
findcommon: 0 / if set find the relationships that are in common
	/ applies to a database that has been previously established
	/ or to an input file with a +f.
filenamefordb: "treein"
filename: "treein"
dbfilename: "dbfile" / should have a .l or .K extension, but take this away
queryfilename: "no queryfile"
dumpqueryflag: 0
dumpdbflag: 0
dumpmatchflag: 0
simthresh: 0
findcommonsimflag: 0
specifictreeflag: 0 / if 1 then we are looking for specific trees
/ and they will be placed in specifictreefile
/ Otherwise, we are looking for all trees already present in cousincount.
specifictreeindexes: () / to be used in findinsuf
specifictreefile: "ljlkjl"
specifictrees: () / list of trees to look at
okflag: 1
processargs:{[args]
  i: 0
  numinputs: 0
  while[(i < #args) & okflag
	okflag:: 0
	addextra: 0
	if[args[i] ~ "+exact"
		exact:: 1
		okflag:: 1
	]
	if[args[i] ~ "+unrooted"
		unrootedflag:: 1
		okflag:: 1
	]
	if[args[i] ~ "+subtreelabel"
		subtreelabelflag:: 1
		okflag:: 1
	]
	if[args[i] ~ "+phylo"
		phyloflag:: 1
		okflag:: 1
	]
	if[args[i] ~ "+allnodes"
		leafonly:: 0
		okflag:: 1
	]
	if[args[i] ~ "+setcompare"
		setcompare:: 1
		okflag:: 1
	]
	if[args[i] ~ "+findcommon"
		findcommon:: 1
		okflag:: 1
	]
	if[args[i] ~ "+findcommonsim"
	    if[(i+1) < #args
		findcommon:: 1
		findcommonsimflag:: 1
		simthresh:: 0.0 $ args[i+1] / number between 0 and 1
		addextra: 1
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+b"
	    if[(i+1) < #args
		filenamefordb:: args[i+1]
		addextra: 1
		numinputs+: 1
		opcode:: `"+b"
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+q"
	    if[(i+1) < #args
		queryfilename:: args[i+1]
		addextra: 1
		numinputs+: 0.5
		opcode:: `"+d"
		okflag:: 1
	    ]
	]
	if[args[i] ~ "+d"
	    if[(i+1) < #args
		dbfilename:: args[i+1]
		kk: dbfilename ? "."
		if[kk < #dbfilename; dbfilename@: !kk]
		addextra: 1
		numinputs+: 0.5
		opcode:: `"+d"
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+f"
	    if[(i+1) < #args
		filename:: args[i+1]
		addextra: 1
		numinputs+: 1
		opcode:: `"+f"
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+o"
	    if[(i+1) < #args
		output:: args[i+1]
		addextra: 1
		numinputs+: 1
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+s"
	    if[(i+1) < #args
		specifictreefile:: args[i+1]
		specifictrees:: 1: specifictreefile
		specifictreeflag:: 1
		addextra: 1
		numinputs+: 1
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+m"
	    if[(i+1) < #args
		maxdist:: 0.0 $ args[i+1]
		addextra: 1
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+dumpquery"
		dumpqueryflag:: 1
		okflag:: 1
  	]
	if[args[i] ~ "+dumpdb"
		dumpdbflag:: 1
		okflag:: 1
  	]
	if[args[i] ~ "+dumpmatch"
		/ the matching tree
		dumpmatchflag:: 1
		okflag:: 1
  	]
	i+: 1 + addextra
	if[0 = okflag
		x: "format is k cousins \n "
		x,: " (+b <filenamefordb> | +q <queryfilename> +d <dbfilename> | +f <filename>  ) \n "
		x,: " [+phylo] [+allnodes] [+exact] [+setcompare] \n "
		x,: " [+subtreelabel | +unrooted] \n"
		x,: " [+findcommon | +findcommonsim <number between 0 and 1]\n "
		x,: " [+m <maxdist> (0)] [+dumpquery] [+dumpdb] [+dumpmatch]\n "
		x,: " [+o <outputfile>] \n "
		x,: " [+n ] \n "
		x,: " +phylo asserts that input is in phylogenetic format \n "
		x,: "    as in treeinphylo \n "
		x,: " +allnodes asserts that all labeled nodes are to be \n "
		x,: "   compared, not only leaves \n "
		x,: " +exact asserts that the same pair of labels will be \n "
		x,: "   consider matcheed only if they are the same degree \n"
		x,: "   of cousin. \n "
		x,: " +setcompare ignores cardinality (default is to count \n "
		x,: "   how many times a given pair appears) \n "
		x,: " +subtreelabel says to compare based on subtree \n "
		x,: "   labels formed by taking leaves below in \n "
		x,: "   alphabetical order. \n "
		x,: " +findcommon finds common pairs among all trees \n "
		x,: " +findcommonsim 0.7 will find those pairs that are \n "
		x,: "    common to at least 0.7 of the trees \n "
		x,: " Examples: \n "
		x,: " k cousins +b treein +m 2 \n "
		x,: "   will form treeindb.l or treeindb.K \n"
		x,: "   with a cousincount data structure for each tree \n" 
		x,: "   up to second cousins. This must be enough for \n" 
		x,: "   future queries. (No error checking for this) \n" 
		x,: " k cousins +n \n"
		x,: "   means load the functions by themselves.\n"
		x,: " k cousins +q treeinquery +d treeindb +m 1.5\n"
		x,: "   will match the query tree against each \n"
		x,: "   tree in treeindb up to first cousins once removed.\n"
		x,: " k cousins +q treeinquery +d treeindb +m 2 +exact \n"
		x,: "   will match the query tree against each \n"
		x,: "   tree in treeindb and consider matches only\n"
		x,: "   between labels having the same cousin relationship \n" 
		x,: "   e.g. newphews to nephews, first cousins to first.\n" 
		x,: " k cousins +q treeinquery +d treeindb +m 2 +allnodes \n"
		x,: "   will match the query tree against each \n"
		x,: "   tree in treeindb and consider matches \n"
		x,: "   between any labels whether leaves or not. \n" 
		x,: "   +allnodes precludes +exact \n" 
		x,: " k cousins +d treeindb +findcommon \n"
		x,: "   will find the relationships among trees\n"
		x,: " k cousins +q treeinquery +d treeindb +setcompare \n"
		x,: "   will match the query tree against each \n"
		x,: "   tree in treeindb and consider matches only\n"
		x,: "   when the number of cousins is the same.\n"
		x,: "   will match the query tree against each \n"
		x,: "   tree in treeindb.\n"
		x,: " k cousins +q treeinquery +d treeindb +o fooout\n"
		x,: "   will find the query tree in treeindb and put output in fooout.\n"
		x,: " k cousins +q treeinquery +d treeindb +dumpquery +dumpmatch +m 2\n"
		x,: "   will find the query tree in treeindb within second cousins, put the\n"
		x,: "   query tree in indented form in the file dumpedquery,\n"
		x,: "   and the matching trees in the file dumpedmatch.\n"
		x,: " k cousins +f treein +dumpquery +dumpdb +m 2\n"
		x,: "   will make the first tree in treein be the query tree;\n"
		x,: "   the remaining trees in treein be the database trees; \n"
		x,: "   will look for the query tree among the database trees\n"
		x,: "   within second cousins; will put the query tree in\n"
		x,: "   indented form in the file dumpedquery, and the\n"
		x,: "   database trees in indented form in the file dumpedb.\n"
		x,: "   NB. When using +q and +d, the database trees are not available to be dumped.\n"
		x,: "   NB. Similarly when using +b, there is no query tree.\n"
		x,: " k cousins +phylo +f treeinphylo +m 2 +findcommonsim 0.7 +exact "
		x,: "   will find the features in treeinphylo that are common\n"
		x,: "   to 0.7 of the phylogenetic trees in treeinphylo\n"
		` 0: x
		. "\\\\"
	]
  ]
  xx: (~ opcode = `"+d") | (findcommon = 1) | (~ queryfilename ~ "no queryfile")
  if[xx = 0
	x: "If you are using a database, then either look for commonalities\n"
	x,: " using findcommon or include a query file using +q.\n"
	` 0: x
	. "\\\\"
  ]
  if[(maxdist = 0) & (subtreelabelflag =  0)
	x: "Remember that you are asking for siblings only.\n"
	x,: " If you want first cousins, say +m 1, second cousins +m 2 etc.\n"
	` 0: x
  ]
  if[(leafonly = 0) & (exact = 1)
	/ can't do exact if you don't want leaves
	/ (It's too hard to distinguish parents from uncles; really it is.)
	x: "Choosing allnodes precludes exact, so exact is set to 0 \n"
	x,: " implying that parents are treated like uncles and   \n"
	x,: " of course all node labels are considered. \n"
	` 0: x
	exact:: 0
  ] 
  / if[(findcommon = 0) & (maxdist > 0)
	/ can't do exact if you don't want leaves
	/ (It's too hard to distinguish parents from uncles; really it is.)
	/ x: "Setting +m has no effect when you are looking for  \n"
	/ x,: " commonalities. You should use findcommonsim with an argument.\n"
	/ ` 0: x
	/ exact:: 0
  / ] 
  if[(subtreelabelflag) & ((exact = 1) | (leafonly = 0) | (unrootedflag=1))
	x: "Choosing subtreeflag implicitly implies rooted, exact and only \n"
	x,: " leaves are taken into account.\n"
	` 0: x
	. "\\\\"
  ]
  if[(unrootedflag) & (leafonly = 0) 
	x: "Choosing unrootedflag implicitly implies only \n"
	x,: " leaves are taken into account.\n"
	x,: " This is not intrinsic, but is the normal case. "
	` 0: x
	. "\\\\"
  ]
  if[(unrootedflag) 
	x: "Note that distance here is number of links from leaf to leaf.\n"
	x,: "We hope that's what you mean.\n"
	maxdist:: 0 | ((maxdist-2) % 2)
		/ so that it counts the way the cousin counts
		/ e.g. if I'm interested up to 5 edges, then
		/ I get a distance of 1.5 (first cousins once removed)
	` 0: x
  ]
}

/ STATISTICS

avg:{(+/ x) % # x}
var:{avg[_sqr x] - _sqr avg[x]}
std:{_sqrt var[x]}
cov:{avg[x * y] - avg[x] * avg[y]}
corr:{ (cov[x;y])%((std[x]) * (std[y]))}
/ END OF STATISTICS
/ SUFFIX TREE PART

/ Construct a suffix array of a string and a tree_node id of the
/ beginning of the suffix.
/ This consists of forming all the suffixes, then then sort them.
/ Create an array that holds only the pointers to the array.

/ form suffixes and then spit out the array as symbols
/ with associated node positions.
makesuffixold:{[string; treevec]
  cuts: !(#string) 
  out: cuts _\: string
  j: <out
  out@: j
  :(out; treevec[j])
}

/ creates a suffix array
/ Greg Heil move.
makesuffixold2:{[string]
  l: #string
  : < (!l) _\: string
}

/ creates a suffix array
/ Greg Heil move, but with an improvemnt to take advantage that
/ for trees, we need not compare more than the max path length.
/ This one attempts to do the further optimization of making rs even smaller
/ than maxlen
/ Doesn't work that well.
makesuffixopt:{[maxlen; string]
  if[0 / set to 1 for testing
	string: "abcabdabbabaz"
	maxlen: 4
  ]
  s: #string
  rs: _ maxlen % 2
  if[s < rs
    : < (!s) _\: string
  ]
  x: string[(!(s-rs)) +\: !rs],string[(s-rs)+(!rs)+'!:'(rs-!rs)]
	/ all strings of length rs (or less towards the end)
  firstshot: < x / indexes in order
	/ some may be equal however and those should be extended.
  y: x[firstshot]
  equals: 0, ~': y / put a 1 in an index location i 
		/ if x[firstshot[i]] = x[firstshot[i-1]]
  firstshot: betterorder[firstshot; equals; string; maxlen]
  :firstshot
}

/ equals has a 1 in location i 
/ if x[firstshot[i]] = x[firstshot[i-1]]
/ (see calling routine)
/ we have to break the ties.
betterorder:{[firstshot; equals; string; maxlen]
  ii: & equals / indexes in firstshot such that for each index i in ii
	/ x[firstshot[i]] = x[firstshot[i-1]]
  j: 0
  while[j < #ii
	equivclass: (firstshot[ii[j]-1], firstshot[ii[j]])
	equivclassinds: (ii[j]-1; ii[j])
		/ where in firstshot this is
	flag: 1
	while[flag
	  flag: 0 / don't repeat unless there is good reason to
	  if[(j+1) < #ii
		if[ii[j+1] = ii[j]+1
			equivclass,: firstshot[ii[j+1]]
			equivclassinds,: ii[j+1]
			flag: 1
			j+: 1	
		]
	  ]
	]
	firstshot[equivclassinds]: reorder[equivclassinds; equivclass
		string; maxlen; firstshot]
	  / now have to break the ties ??? unfinished, too involved
    	j+: 1
  ]
  :firstshot
}

/ look at string at locations equivclass up to maxlen
/ return the sort order and rearrange equivclassinds that way
reorder:{[equivclassinds; equivclass; string; maxlen; firstshot]
  tocompare: ()
  slen: #string
  i: 0
  while[i < #equivclass
	ind: equivclass[i]
	len: (slen - ind) & maxlen
	tocompare,: ,string[ind+!len]
	i+: 1
  ]
  jj: < tocompare
  :firstshot[equivclassinds[jj]]
}


/ creates a suffix array
/ Greg Heil move, but with an improvement to take advantage that
/ for trees, we need not compare more than the max path length.
makesuffix:{[maxlen; string]
  s: #string
  rs: maxlen
  if[s < rs
    : < (!s) _\: string / form suffixes of whole string
  ]
  x: string[(!(s-rs)) +\: !rs],string[(s-rs)+(!rs)+'!:'(rs-!rs)]
	/ all strings of length rs (or less towards the end)
  :< x / indexes in order
}


/ here we cut off each suffix at the next delimiter character
makesuffixopt2:{[maxlen; string]
  delimiter: "#"
  s: #string
  i: 0
  out: ()
  while[i < s
	jj: string ? delimiter
	if[0 < jj; out,: ,string[!jj]]
	string: 1 _ string
	i+: 1
  ]
  :< out / indexes in order
}

/ here we cut off each suffix at the next delimiter character
/ still not better
makesuffixopt3:{[maxlen; string]
  delimiter: `"#"
  ii: & string = delimiter
  s: #string
  i: 0
  out: ()
  while[i < s-1 / because last one is #
	if[i = ii[0] 
		ii: 1 _ ii
	]
	if[~ i = ii[0] 
		out,: ,string[i+!(ii[0] - i)]
	]
	i+: 1
  ]
  :< out / indexes in order
}

/ returns 0 if s1 = s2, 1 if s1 > s2 and -1 if s1 < s2
/ assumes we have strings
strcmp:{[s1; s2]
  if[s1 ~ s2; :0]
  if[((s1 < s2) ? 1) < ((s1 > s2) ? 1); :-1]
  :1
}

strcmp:{[s1; s2]
  f1: (s1 < s2) ? 1
  f2: (s2 < s1) ? 1
  / if[f1 < f2; :-1]
  / if[f2 < f1; :1]
  / :0
  : :[f1 < f2; -1; f2 < f1; 1; 0] 
}

/ are there count indexes between any two dividing points
/ assume indexes are sorted.
isenough:{[indexes; divides; count; query; labelarray]
   while[0 < #divides
	ii: #& indexes < divides[0]
	if[~ count > ii; :1] / there are enough
	divides: 1 _ divides
	indexes: ii _ indexes
   ]
   :0
}

foundit:{[indexes; divides; count; query; labelarray]
   out: ()
   while[0 < #divides
	ii: & indexes < divides[0]
	mycount: #ii
	if[~ count > mycount
		if[(query ~ labelarray[indexes[ii]]) & (1 = |/ -': indexes[ii])
			out,: *indexes[ii]
		]
	]
	divides: 1 _ divides
	indexes: mycount _ indexes
   ]
   :out
}

/ Find out which roots have the query string
/ The query is in string notation.
/ suffixarray is a triple (allstring; sufarray; rootsufarray)
/ consisting of a string and indexes into that string
/ where nodes begin.
/ roots are the places in the tree where a path begins.
/ The trouble is that we need a sufarray and a rootsubarray because
/ the indexes for the roots are different from in the string since each root
/ represents only the beginning of a string.
/ DOESN'T QUITE WORK
searchintreenew:{[query; suffixarray; rootsintree]
  flag: 1
  if[optflag / try to filter out this part of suffixarray
	if[(1 = #query)
		ind: & suffixarray[0] = query[0]
		:rootsintree[ind]
		/ that's all you need
	]
	if[(1 < #query)
		/ linear time on suffixarray[0]
		indexes: intersectleftindexes_multi[suffixarray[0]; query]
		indexes@: <indexes
		divides: & suffixarray[0] = `"#"
		/ divides@: <divides / already sorted
		/ flag:isenough[indexes; divides; #query; query; suffixarray[0]]
		ind: foundit[indexes; divides; #query; query; suffixarray[0]]
    		:rootsintree[ind]
	]
  ]
  if[flag
    ind: searchstring[query; suffixarray]
    :rootsintree[ind]
  ]
}

searchintree:{[query; suffixarray; rootsintree]
  flag: 1
  if[optflag / try to filter out this part of suffixarray
	if[(1 = #query)
		flag: query[0] _in suffixarray[0] 
		/ that's all you need
	]
	if[(1 < #query)
		/ linear time on suffixarray[0]
		indexes: intersectleftindexes_multi[suffixarray[0]; query]
		indexes@: <indexes
		divides: & suffixarray[0] = `"#"
		/ divides@: <divides / already sorted
		flag:isenough[indexes; divides; #query; query; suffixarray[0]]
	]
  ]
  if[~ flag
	:()
  ]
  if[flag
    ind: searchstring[query; suffixarray]
    :rootsintree[ind]
  ]
}

/ given a query and roots, find paths that match that query
/ and start in those roots
locateintreeold:{[query; rootnodes; suffixarray; rootsintree]
   ind: searchstring[query; suffixarray] / indexes where query paths begin
   starting: rootsintree[ind] / the root nodes of each matching path 
   ind@: intersectleftindexes[starting ; rootnodes]
	/ only want to keep indexes where the rootnodes match
	/ starting locations where the whole structure matches
   sizeof:{[x] i: & query = "_"; 1 + #i}
   x: rootsintree[ind +\: !(sizeof[query])]
   :x 
}

locateintreeold:{[queries; rootnode; suffixarray; rootsintree]
   ind: searchstring[; suffixarray]'queries / indexes where query paths begin
   / lengths: 1 + #:' &:' (queries =\: "_") / lengths of how long to match
   lengths: 1 + |/#:' queries / how long to match
   findgood:{[rootnode;rootsintree;list]
	i: & rootsintree[list] = rootnode
	:list[*i]
   }
   ind: findgood[rootnode; rootsintree]'ind
	/ indexes consistent with roots
   x: findnodes[ rootsintree]'[ind;lengths]
   out:,?,/x
   :out
}

locateintree:{[queries; rootnode; suffixarray; rootsintree]
   ind: searchstring[; suffixarray]'queries / indexes where query paths begin
   / lengths: 1 + #:' &:' (queries =\: "_") / lengths of how long to match
   lengths: 1 + |/#:' queries / how long to match
   findgood:{[rootnode;rootsintree;list]
	i: & rootsintree[list] = rootnode
	:list[i]
   }
   ind: ,/findgood[rootnode; rootsintree]'ind
	/ indexes consistent with roots
   x: findnodes[ rootsintree]'[ind;lengths]
   out:,?,/x
   :out
}

/ for each index
findnodes:{[rootsintree; index; length] 
	:rootsintree[index+!(length & ((#rootsintree) - index))]
}

/ search for query in suffixarray
/ and find the positions of that query in suffixarray
/ Note that suffixarray is a pair (sequence of symbols; indexes in order)
searchstring:{[query; suffixarray]
  rootindexes: suffixarray[1] / should be same as indexes
  indexes: suffixarray[1]
  dbstring: suffixarray[0]
  qlen: #query
  dlen: #dbstring
  out: ()
  while[0 < #indexes
	half: _ (#indexes) % 2
	testind: indexes[half]
	roottestind: rootindexes[half]
	if[~ (testind + qlen) > dlen 
		/ e.g. if testind = 8, qlen = 1, and dlen = 9, then ok
		x: strcmp[query; dbstring[testind+!qlen]]
		if[x = 1 / query is too big
			indexes: (1 + half) _ indexes
			rootindexes: (1 + half) _ rootindexes
		]
		if[x = -1
			indexes@: !half
			rootindexes@: !half
		]
		if[x = 0  / match, may be one of several places
			/ all are consecutive
			out,: roottestind
			jj: half+1
			flag: 1
			while[(jj < #indexes) & (flag)
				testind: indexes[jj]
				roottestind: rootindexes[jj]
				if[(testind + qlen) > dlen
					flag: 0 / no way
				]
				if[~ (testind + qlen) > dlen
					x: strcmp[query;dbstring[testind+!qlen]]
					:[x = 0
						out,: roottestind
						flag: 0]
				]
				jj+: 1
			]	
			jj: half-1
			flag: 1
			while[(jj > -1) & (flag)
				testind: indexes[jj]
				roottestind: rootindexes[jj]
				if[(testind + qlen) > dlen
					flag: 0 / no way
				]
				if[~ (testind + qlen) > dlen
					x: strcmp[query;dbstring[testind+!qlen]]
					:[x = 0
						out,: roottestind
						flag: 0]
				]
				jj-: 1
			]	
			indexes: ()
			rootindexes: ()
		]
		
	]
	if[(testind + qlen) > dlen
		endlen: dlen - testind / distance in dbstring
		x: strcmp[query[!endlen]; dbstring[testind+!endlen]]
		if[x > (-1)
			/ even if 0, this is not a good position
			/ since we had to shorten query
			/ so real match should be greater
			indexes: (1 + half) _ indexes
			rootindexes: (1 + half) _ rootindexes
		]
		if[~ x > (-1)
			indexes@: !half
			rootindexes@: !half
		]
	]
  ]
  :out
}


/ END SUFFIX TREE PART


/ PATHFIX PROCESSING


/ take a path of numbers and extend it through all its children
extendpath:{[numberpath; parent; children]
  i: parent ? *|numberpath
  if[i = #parent; :,numberpath]
  if[1 = #children[i]
	y:  ,/extendpath[;parent; children]',numberpath,children[i]
	:y
  ]
  allpaths: numberpath,/:children[i] 
  x:  ,/extendpath[; parent;children]'allpaths
  :x
}
  

/ list of labels
formlab:{[x] :x}

/ assuming a symbol list is going to be extended to strings with _ in between,
/ here are the beginnings of the strings
findbeginnings:{[symblist]
  slist: $symblist
  counts: 1 + #:'slist
  :0, +\ (-1) _ counts
}

/ single label that is one long label
/ formlabnew:{[x]  y: (-1) _ ,/ ($x) ,\: ("_");  :y}
formlabnew:{[x]  y:  ("_") , (-1) _ ,/ ($x) ,\: ("_");  :y} 
	/ do want initial _ but not trailing one

/ take a path of node ids and find the corresponding labels
converttolabel: {[labellabels; labelnodeids; path]
  i: intersectleftindexes[labelnodeids; path,()]
  :formlab[labellabels[i]]
}

/ take a path of node ids and find the corresponding labels
addtreeid:{[treeid;path]
  : ` $ ($treeid) ,/: ("_") ,/: ($path)
}


/ In case where we have variable length don't cares in query tree,
/ form a partial tree given a bunch of nodes. Label this tree by its root.
/ Otherwise, algorithm is similar to producepaths.
producepaths_partial:{[treeid; root; nodes]
  i: & tree.treeid = treeid
  parent: tree.parentid[i]
  children: tree.childrenid[i]
  mm: intersectleftindexes_multi[parent;nodes]
  parent@: mm
  children@: mm
  children: intersect[nodes]'children / don't go beyond certain children
  counts: #:' children
  mm2: & 0 < counts
  parent@: mm2
  children@: mm2
  j: & treelabel.treeid = treeid
  labelnodeids: treelabel.nodeid[j] 
  labellabels: treelabel.label[j] 
  paths: extendpath[root; parent; children]
  labels: converttolabel[labellabels; labelnodeids]'paths
  newpaths: :[suffixarray 
	addtreeid[` $ ($treeid), ("+"), ($root)]'paths 
	paths]
	/ append numbers to trees or not
  / x: makesuffix'[labels; newpaths]
  / :x
  :(labels;newpaths)
}

/ cut up a tree having vldcs and question marks.
findpartitions:{[treeid]
  partroot:: () / have a root for each partition
  partnodes:: () / and a bunch of nodes
  i: & tree.treeid = treeid
  parent: tree.parentid[i]
  children: tree.childrenid[i]
  i: & treelabel.treeid = treeid
  nodes: treelabel.nodeid[i]
  labels: treelabel.label[i]
  root: * differ[parent; ,/children]
  partroot,: root / this can be a *???
  partnodes,: ,root
  rootlab: $ labels[nodes ? root]
  dontcareflag:: ("*") = rootlab[0] 
  dontcareflag|: ("?") = rootlab[0] 
  if[~dontcareflag
    recurfindpartitions[root; parent; children; nodes; labels; root]
  ]
  if[dontcareflag / root is a don't care
     j: parent ? root
     kids: children[j]
     kids: ,//finddontcare[parent;children;nodes;labels]'kids
     partroot,: kids
     partnodes,: ,:'kids
	/ assumes no dontcares among immediate children of root
	/ (march8)
	/ If we wanted that, then we could recursively look for kids
	/ that are don't cares and make them further separate partitions
	/ by putting them in partroot and partnodes. Then we would
	/ also need to change other places where we handle dontcareflag.
     recurfindpartitions[; parent; children; nodes; labels; ]'[kids;kids]
  ]
  :(partroot; partnodes)
}

/ find the non-don't cares underneath mynode 
finddontcare:{[parent;children;nodes;labels; mynode]
  out: ()
  j: nodes ? mynode
  mylab: $labels[j]
  flag: ("*") ~ mylab[0]
  flag|: ("?") ~ mylab[0]
  if[~ flag
	:mynode
  ]
  if[flag
	jj: parent ? mynode
	if[jj < #parent
		mykids: children[jj]
		out,:,/finddontcare[parent;children;nodes;labels]'mykids
	]
  ]
  :out
}

/ for a tree that has vldcs, divide up the tree
/ by identifying root and partitions
/ This is starting a new one at root.
recurfindpartitions:{[currentroot; parent; children; nodes; labels; next]
  j: parent ? next
  if[j = #parent; :,(root;root)] / root is by itself
  mm: partroot ? currentroot
  kids: children[j]
  i: 0
  labelind: nodes ?/: kids
  while[i < #kids
	kidlab: $ labels[labelind[i]]
	flag: ("*") = kidlab[0] / we don't care about vldcs having counts
	flag|: ("?") = kidlab[0] / we don't care about vldcs having counts
		/ if we want to match *[3] meaning three characters,
		/ then put in three question marks.
	if[flag
		jj: parent ? kids[i] / want children of this
		if[jj < #parent
			grandkids: children[jj]
			kk: 0
			/ Each grandkid starts its own tree which
			/ is what we want.
			/ If a grandkid is a don't care then skip to next gen
			while[kk < #grandkids
			   x: nodes ? grandkids[kk]
			   kidlab: $ labels[x]
			   flag2: ("*") = kidlab[0] 
			   flag2|: ("?") = kidlab[0] 
			   if[~flag2
				partroot,: grandkids[kk]
				partnodes,: ,grandkids[kk]
				recurfindpartitions[grandkids[kk]; parent; children; nodes; labels; grandkids[kk]]
			   ]
			   if[flag2
				/ skip to next generation
				jj1: parent ? grandkids[kk]
				if[jj1 < #parent
					grandkids,: children[jj1]
				]
			   ]
			   kk+: 1
			]
		]
	]
	if[~ flag
  		partnodes[mm],: kids[i]	
		recurfindpartitions[currentroot; parent; children; nodes; labels; kids[i]]
	]
	i+: 1
  ]
  :out
}

/ extend a suffix tree given paths and start nodes
buildsuffixtree:{[treeid; paths; startlists] 
  buildone[treeid]'[,/paths;,/startlists]
}

/ actually build the suffix tree
/ Start at the anchor and put the startlist there and then descend.
/ suffixtree.nextchars: () / an array of suffix tree chars
/ suffixtree.nextnodes: () / an array of suffix tree nodes
/ suffixtree.startnodes: () / nodes in the original trees
/ suffixtree.starttrees: () / original tree name
/ the branch from the root of the suffix tree
/ anchorsuffixtree.nextchar: () / first char of a path
/ anchorsuffixtree.nextnode: () / next place to look
buildone:{[treeid; path; startlist]
  p: path
  first: *p
  numstarts: #startlist
  ii: anchorsuffixtree.nextchar  ? first
  if[ii = #anchorsuffixtree.nextchar
	/ not present
	anchorsuffixtree.nextchar,: first
	globalnextnode+: 1
	anchorsuffixtree.nextnode,: globalnextnode
		/ initialize point in suffix tree
	suffixtree.nextchars,: ,()
	suffixtree.nextnodes,: ,()
	suffixtree.startnodes,: , ()
	suffixtree.starttrees,: , ()
  ]
  nextindex: anchorsuffixtree.nextnode[ii]
  p: 1 _ p
  while[0 < #p
	first: *p
	chars: suffixtree.nextchars[nextindex]
	kk: chars ? first
	if[kk = #chars
		suffixtree.nextchars[nextindex],: first
		globalnextnode+: 1
		suffixtree.nextnodes[nextindex],: globalnextnode
		/ initialize point in suffix tree
		suffixtree.nextchars,: ,()
		suffixtree.nextnodes,: ,()
		suffixtree.startnodes,: , ()
		suffixtree.starttrees,: , ()
		newindex: globalnextnode
	]
	if[kk < #chars
		newindex: suffixtree.nextnodes[nextindex; kk]
  	]
	nextindex: newindex
	p: 1 _ p
  ]
  suffixtree.startnodes[nextindex],: startlist
  suffixtree.starttrees[nextindex],: numstarts # treeid
}

/ suffixtree.nextchars: () / an array of suffix tree chars
/ suffixtree.nextnodes: () / an array of suffix tree nodes
/ the branch from the root of the suffix tree
/ anchorsuffixtree.nextchar: () / first char of a path
/ anchorsuffixtree.nextnode: () / next place to look
/ anchorsuffixtree.startnodes: () / nodes in the original trees
/ anchorsuffixtree.starttrees: () / original tree name
reorganizesuffixtree:{[]
  i: < anchorsuffixtree.nextchar
  anchorsuffixtree.nextnode@: i
  anchorsuffixtree.nextchar@: i
  jj: 0
  while[jj < #suffixtree.nextchars
	if[1 < #suffixtree.nextchars[jj]
		x: < suffixtree.nextchars[jj]
		suffixtree.nextchars[jj]@: x
		suffixtree.nextnodes[jj]@: x
		/ don't need to change other suffix tree terms
	]
	jj+: 1
  ]
}

/ is s1 a prefix of s2
isprefix:{[s1; s2]
  if[(#s1) > #s2; :0]
  :s1 ~ s2[!#s1]
}

/ find indexes of local superstrings
findsuperstrings:{[paths]
  jj: 0
  supers: ()
  while[jj < (#paths) - 1
	if[~ prefix[paths[jj]; paths[jj+1]]
		supers,: jj
	]
	jj+: 1
  ]
  supers,: jj / last one
}

/ creates structure of form (path; endpositions in path; starts
/ for those end positions)
reducetolongest:{[paths ; starts]
  outpath: ()
  outpositions: ()
  outstarts: ()
  superstrings: findsuperstrings[paths]
  groups: superstrings _ !#paths
  jj: 0
  while[jj < #groups
	outpath,: ,paths[*|groups[jj]]
	
	
	jj+: 1
  ]
  :(outpath; outpositions; outstarts)
}

/ END OF PATHFIX tree

/ COUSIN APPLICATION

ceil:{[x] :[x = _ x; _ x; 1 + _ x]}
sort: {[list] list[<list]}
sortunderbar: {[list] 
  : ` $ (-1) _ ,/ ($list[<list]) ,\: "_"
}

/ find all cousin pairs up to degree maxdist
/ Care only about leaves.
/ Note that we count siblings as 0, nephews as 0.5; first cousins as 1;
/ first cousins once removed as 1.5; etc.
/ No more than once removed.
/ This function has a side effect to cousincount.
setcousincount:{[treeid]
  i: & tree.treeid = treeid
  parent: tree.parentid[i]
  children: tree.childrenid[i]
  if[leafonly / leaves only
	leaves: ? differ[,/children; parent]
  ]
  if[~leafonly
  	leaves: ?(,/children), parent / all nodes are leaves
  ]
  / get node ids and labels
  j: & treelabel.treeid = treeid
  labelnodeids: treelabel.nodeid[j] 
  labellabels: treelabel.label[j] 
  / have node ids and labels
  leaflabels: ()
  mynode: ()
  mynodeindex: ()
  mypar: ()
  myparlevel: ()
  kk: 0
  / find immediate parent relationships
  / mynodeindex is the index of the children set in the input file
  / e.g. if we have
  / treey |0 |2 1
  / treey |2 |3 4 9
  / treey |1 |5 6 7
  / then 0 is the parent and 2 1 are children in index set 0
  / 2 is the parent and 3, 4, 9 in index set 1 etc.
  / We need this for all nodes even those that are not children,
  / because we will use that in the next loop.
  while[kk < #children
	/ c: :[leafonly = 1; children[kk]; intersect[children[kk]; leaves]]
	/ I think above is exactly wrong and we should count 
	/ leaves only if leafonly is set to 1  ???
	c: :[leafonly = 0; children[kk]; intersect[children[kk]; leaves]]
	/ No because definition of leaves is different. So I think
	/ it is good.
	mm: intersectleftindexes[labelnodeids; c]	
	leaflabels,: ,differ[labellabels[mm]; unknownlabels]
	mynode,: children[kk]
	mynodeindex,: (#children[kk]) # kk / index of children set
	mypar,: (#children[kk]) # parent[kk]
	myparlevel,: (#children[kk]) # 1
	kk+: 1
  ]
  kk: 1
  / find grandparents, great grandparents etc. Take advantage of fact that
  / parent level 1 is first to be found when using ?
  / A further optimization is possible if we consider extending
  / only those nodes that are leaves.
  while[kk < 1 + 2*ceil[maxdist]
	ii: & myparlevel = kk / those with a high parent level
	ii@: intersectleftindexes[mynode[ii]; leaves] / extend only leaves	
	jj: mynode ?/: mypar[ii] / find all immediate parent pairs if any
		/ immediate means immediate parents
	jjindex: & jj < #mynode
	mynode,: mynode[ii[jjindex]] / good starting nodes
	mynodeindex,: mynodeindex[ii[jjindex]] / starting indexes
	mypar,: mypar[jj[jjindex]] / good ending nodes
	myparlevel,: (#jjindex) # (kk+1) / 1 means parent, 2 grandparent etc.
	if[0 = #jjindex; kk+: 2*ceil[maxdist]]
	kk+: 1
  ]
  / Now, rearrange so we telescope this uniquely to 
  / indexes. For each child index of the original 
  / tree table, figure out how many it takes to get
  / to a certain ancestor.
  part: = mynodeindex,'myparlevel
  uniqs: ? mynodeindex,'myparlevel
  goodindex: uniqs[;0] / index of children id
  goodpar: mypar[*:'part] / parents at each level
  goodlevel: uniqs[;1] / level of those parents
  uniqgoodpar: ? goodpar
  partgoodpar: = goodpar
  partgoodpar,: ,()
  uniqgoodlevel: ?goodlevel
  partgoodlevel: = goodlevel
  partgoodlevel,: ,()
  / Now, leaflabels has the labels of leaves in each children set.
  / None of these labels are unknowns.
  kk: 0
  todo: () / children indexes to handle
  hasleaves: 0 < #:' leaflabels
  hasleaveindexes: & hasleaves
  goodindexwithleaves: goodindex
  mm1: hasleaveindexes ?/: goodindex
  mm1index: & mm1 = #hasleaveindexes
  goodindexwithleaves[mm1index]: -1 / ones without leaves
  uniqgoodindexwithleaves: ? goodindexwithleaves
  partgoodindexwithleaves: = goodindexwithleaves
  partgoodindexwithleaves,: ,()
  / Prepare level counts
	/ Ok, so here's how it goes: i = 0.5 means we go
	/ to parent of kk and find grandchildren of those.
	/ These are nephews. 
	/ (parlevel 1, parlevel 2)
	/ i = 1 means I go to 
	/ grandparents and then find others that are their grandchildren
	/ excepting myself.
	/ (parlevel 2, parlevel 2)
	/ i = 1.5 I go to my grandparents and find great grandchildren
	/ excluding my own. These are first cousins once removed.
	/ (parlevel 2, parlevel 3)
	/ i = 2 I go to great grandparents and then find their
	/ great grandchildren (parlevel 3 on each side)
	/ So, given i, take residual over floor and double;
	/ take remainder and add 1 to each side.
	/ These will always be lists of one element for unrooted case.
  if[unrootedflag = 0 / have root and am interested only in once removal
    myselflevel: (4 + _ 3 * maxdist) # ,()
    myotherlevel: (4 + _ 3 * maxdist) # ,()
    i: 0.5
    while[ ~ i > maxdist / i <= maxdist
	residual: _ 2 * (i - _ i) / 1 if remainder is 0.5
	myselflevel[_ 2 * i],: 1 + _ i
	myotherlevel[_ 2 * i],: residual + myselflevel[_ 2 * i]
	i+: 0.5
    ]
  ]
  if[unrootedflag = 1 / no root; if the maxdist = 0, we want
    / length 2, if 0.5 length 3 and so on. (We have done the reverse
    / conversion above where we processed the flags.)
    / In this case the levels can go from 1 up for me
    / to 1 up for the other guy.
    / e.g. maxdist = 0 is 1 up for both of us (common parent)
    / maxdist = 0.5 is 1 up for me and 2 up for the other or vice versa
    / maxdist = 1 is 1 up for me, 3 up for the other; 2 and 2; 3 and 1.
    / maxdist = 1.5 is 1 up for me, 4 up for the other; 2 and 3 etc.
    / In general, the formula is 1..(2*maxdist) + 1 on each side
    / This requires lists in myselflevel and myotherlevel
    / myselflevel: 1 + !((_ 2*i)+1)
    / e.g. for i = 1.5 we get 1+!4 vs. 1 + |!4, which is 
    / all ways to get 5 edges 1,4; 2,3; 3;2; 4;1
    / To save computation however so we don't double-count
    / on the other side, we could say 1 2 vs. 4 3
    / but we don't do that yet
    / Compared to the rooted case, this gives a variety of possibilities
    / e.g. if the rooted case has 2 for myselflevel and 3 for
    / myotherlevel then unrooted has 1 4, 2 3, 3 2, 4 1
    / This seems to work.
    myselflevel: (4 + _ 3 * maxdist) # ,()
    myotherlevel: (4 + _ 3 * maxdist) # ,()
    i: 0.5
    while[ ~ i > maxdist / i <= maxdist
	residual: -1 / just to have something defined
	/ Following two lines work, though they double count.
	/ Those are the unoptimized
	/ myselflevel[_ 2 * i],: 1 + !((_ 2*i)+1)
	/ myotherlevel[_ 2 * i],: 1 + |(!((_ 2*i)+1)) / reverses first 
	/ Here are the optimized lines
	/ e.g. instead of getting 1 2 3 and 3 2 1
	/ we will get 1 2 and 3 2
	/ instead of 1 2 3 4 and 4 3 2 1
	/ we will get 1 2 and 4 3
	xx1: 1 + !((_ 2*i)+1)
	xx2: 1 + |(!((_ 2*i)+1)) / reverses first 
	myhalf: ceil[(#xx1)%2]	
	myselflevel[_ 2 * i],: xx1[!myhalf]
	myotherlevel[_ 2 * i],: xx2[!myhalf]
	/ This is better but it double counts the case where we have
	/ 1 2 and 3 2. This is compensated for by a check in the
	/ addition to pairs.
	/ So the net result is correct
	i+: 0.5
    ]
  ]
  / critical loop: here we find pairs of indexes and their relationships.
  / Time-consuming step.
  / STARTING HERE
  while[kk < #leaflabels / leaflabels is a set of lists from children
    if[hasleaves[kk] / otherwise, we don't care
	new: ,(kk, 0) / level 0 cousins
  	allpairs: ,kk / a variable to eliminate from distant cousins
		/ the pairs that are already recognized as closer cousins
		/ This is a zeroth cousin of kk (the implicit first member
		/ of the pair).
	i: 0.5
	while[ ~ i > maxdist / i <= maxdist
	   selflevellist: myselflevel[_ 2*i] / of length 1 if unrooted
	   otherlevellist: myotherlevel[_ 2*i]
	   while[(0 < #selflevellist)
		selflevel: *selflevellist
		otherlevel: *otherlevellist
		selflevellist: 1 _ selflevellist
		otherlevellist: 1 _ otherlevellist
		/ i1: & (kk = goodindex) & (selflevel = goodlevel)
		/ i1: & (kk = goodindexwithleaves) & (selflevel = goodlevel)
		x11: partgoodindexwithleaves[uniqgoodindexwithleaves ? kk]
		x12: partgoodlevel[uniqgoodlevel ? selflevel]
		i1: intersectuniq[x11; x12]
		/ if[0 = #i1; i: maxdist+1] / exit loop
			/ Not sure if this is allowed in unrooted case
		/ Optimization idea: if I have no grandparents, then surely no
		/ great grandparents.
		if[0 < #i1
			if[1 < #i1; !-11] / should be only one such entry
			i1: * i1
			/ i2: & (goodpar[i1] = goodpar)&(otherlevel = goodlevel)
			x11: partgoodpar[uniqgoodpar ? goodpar[i1]]
			x12: partgoodlevel[uniqgoodlevel ? otherlevel]
			i2: intersectuniq[x11; x12]
			/ i2: i2 _dv i1 / should not be needed
			/ if[0 = #i2; i: maxdist+1] / exit loop
			/ Optimization doesn't work. The fact that I have no
			/ nephews does not imply that I have no second cousins.
			if[0 < #i2
			     / x: intersect[goodindex[i2]; hasleaveindexes]
			     / trying to find those indexes that
			     / have leaves.
			     x: goodindexwithleaves[i2]
			     x@: & x > 0
			     if[0 < #x
				pairs: x
				/ Implicitly these are pairs 
				/ since they all will have kk as first member.
				pairs: differ[pairs; allpairs]
				if[selflevel = otherlevel
					/ to avoid double-counting
					/ in both unrooted and rooted cases
				   pairs@: & pairs > kk 
				]
				allpairs,: pairs
				new,: pairs,\: i
			     ] / test on #x
			]
		]
	   ] / using the levellists
	   i+: 0.5
	] / based on i
	bits:  kk > new[;0]
	mm1: & bits=1
	mm2: & bits=0
	todo,: ((new[mm1;0]) ,' kk ,/: new[mm1;1]), (kk,/: new[mm2])
	/ todo,: (((new[mm1;0]) ,\: kk ), (kk,/: new[mm2;0])) ,' new[mm1,mm2;1]
    ] / non-empty leaflabels
    kk+: 1
  ]
  / ENDING HERE
  if[unrootedflag = 0; todo?:] / ?? Not really sure we should do this.
  / Now that we have the indexes and their relationships, we can
  / do the crossproduct of the labels, being careful not to count twice.
  / x: ,/setcousins[leaflabels; labellabels; children; goodindex; goodpar]'todo
  x: ,/setcousins[leaflabels]'todo
  / now put in cousincount
  part: = x[;0] ,' x[;1] / the degree
  uniqs: ? x[;0] ,' x[;1]
  counts: #:' part
  cousincount.treeid,: (#uniqs) # treeid
  / if[unrootedflag = 0; cousincount.degree,: uniqs[;1]]
  / if[unrootedflag = 1; cousincount.degree,: _ 2 + (2*uniqs[;1])]
  cousincount.degree,: uniqs[;1] / keep the distance in the way
		/ we count for cousins and perhaps convert if we
		/ dump the comparison
  cousincount.pair,: uniqs[;0]
  cousincount.count,: counts
}

/ leaflabels are the leaves at each id 
/ The triple consists of ind1, ind2 into leaflabels and level.
/ ind1, ind2 are indexes
/ level is the cousin relationship (0.5 is for once removed).
/ puts output in the form label_label level
setcousins:{[leaflabels; triple]
	ind1: triple[0]
	ind2: triple[1]
	level: triple[2]
  	pairs: ()
	if[level = 0 
		/ must do something a bit more complicated 
		/ to avoid self-label pairs
		myinds: !#leaflabels[ind1]
		mypairs: ,/ myinds ,/:\:  myinds
		mypairs@: & mypairs[;0] < mypairs[;1]
		pairs: leaflabels[ind1; mypairs]
	]
	if[level > 0
		lab1: leaflabels[ind1]
		lab2: leaflabels[ind2]
		pairs: ,/lab1 ,/:\: lab2
		if[0 & (leafonly = 0) / don't do this
			/ it doesn't block parents anyway.
			indpairs: ,/ children[ind1] ,/:\: children[ind2]
			noanc:{[children;  goodindex; goodpar; mypair]
				ind1: * & mypair[0] _in/: children
				parind2: mypair[1] / this is a node which
					/ could be a parent of index ind1
				jj: & goodindex = ind1
				x: parind2 _in goodpar[jj]
				if[x = 1; :0] / there is an intersection
				/ now reverse it
				ind2: * & mypair[1] _in/: children
				parind1: mypair[0] / this is a node which
					/ could be a parent of index ind2
				jj: & goodindex = ind2
				x: parind1 _in goodpar[jj]
				:~ x
			}
			indpairs@: & noanc[children;goodindex;goodpar]'indpairs
			if[0 < #indpairs
			  pairs:alllabels[indpairs[;0]],'alllabels[indpairs[;1]]
			  pairs@: & (~ pairs[;0] _in\: unknownlabels)
			  pairs@: & (~ pairs[;1] _in\: unknownlabels)
			]
		]
	]
	pairs: sortunderbar'pairs
	x:pairs,\: level
	:x
}

/ given two trees that are both in treecount,
/ compare them up to cousin distance maxdist
/ If exact is 1 then comparisons are exact only if they match
/ on the degree of cousin; otherwise, all cousins compared. 
/ If setcompare is 1 then comparisons disregard cardinality
/ (i.e. how many instances of each pair).
/ cousincount[treeid, degree, pair, count]
/ degree is degree of cousins, pair are the pair of labels, count is
/ the number of pairs having that degree.
/ Returns the intersection and the union.
comparetrees:{[treeid1; treeid2; exact; maxdist; setcompare; dumpmatchflag]
  i1: & (cousincount.treeid = treeid1) & (~ cousincount.degree > maxdist)
  i2: & (cousincount.treeid = treeid2) & (~ cousincount.degree > maxdist)
  if[(setcompare = 1)
	if[exact = 0 / any close cousins will do
		x: intersect[(?cousincount.pair[i1]); (?cousincount.pair[i2])]
		y: ?(cousincount.pair[i1]), (cousincount.pair[i2])
		z: spit (treeid1; treeid2; (#x)%(#y)) 
		if[dumpmatchflag
			new: ,z
			new,: ,"Detailed matches: "
			additional: spit' treeid1 ,/: treeid2 ,/: x	
			z: new,additional
		]
		:z
	]
	if[exact = 1 / only close cousins to same degree will do
		if[unrootedflag = 0
		  x1: cousincount.pair[i1],'cousincount.degree[i1]
		  x2: cousincount.pair[i2],'cousincount.degree[i2]
		]
		if[unrootedflag = 1
		  x1: cousincount.pair[i1],'_ 2 + (2*cousincount.degree[i1])
		  x2: cousincount.pair[i2],'_ 2 + (2*cousincount.degree[i2])
		]
		x: intersect[x1; x2]
		y: ? x1,x2
		z: spit (treeid1; treeid2; (#x)%(#y)) 
		if[dumpmatchflag
			new: ,z
			new,: ,"Detailed matches: "
			additional: spit' treeid1 ,/: treeid2 ,/: x	
			z: new,additional
		]
		:z
	]
  ]
  if[(setcompare = 0)
	if[exact = 0 / any close cousins will do
		x1p: cousincount.pair[i1]
		x1c: cousincount.count[i1]
		/ if[unrootedflag; x1c: _ x1c % 2]
		part: = x1p / need to do this because we are taking cousins
		x1pair: ? x1p
		x1count: +/' x1c[part]
		x2p: cousincount.pair[i2]
		x2c: cousincount.count[i2]
		/ if[unrootedflag; x2c: _ x2c % 2]
		part: = x2p / need to do this because we are taking cousins
		x2pair: ? x2p
		x2count: +/' x2c[part]
		jj: intersectbothindexes[x1pair; x2pair]
		xtotuniqs: ()
		xtotcounts: ()
		if[0 < #jj
		  xtotuniqs: x1pair[jj[;0]]
		  xtotcount: x1count[jj[;0]] &' x2count[jj[;1]] 
			/ minimum count in the intersection
		]
		ytot: x1pair, x2pair
		ylongcount: x1count, x2count
		part: = ytot
		ytotuniqs: ?ytot
		ytotcount: |/' ylongcount[part]
		z: spit (treeid1; treeid2; 0)
		if[0 < #jj
		 z: spit (treeid1; treeid2; (+/xtotcount)%(+/ytotcount)) 
		]
		if[dumpmatchflag
			new: ,z
			new,: ,"Detailed matches: "
			additional: spit' treeid1 ,/: treeid2 ,/: xtotuniqs ,' xtotcount	
			z: new,additional
		]
		:z
	]
	if[exact = 1 / cousins must match up
		x1pair: cousincount.pair[i1]
		x1count: cousincount.count[i1]
		/ if[unrootedflag; x1count: _ x1count % 2]
		x2pair: cousincount.pair[i2]
		x2count: cousincount.count[i2]
		/ if[unrootedflag; x2count: _ x2count % 2]
		if[unrootedflag = 0
			x1degree: cousincount.degree[i1]
			x2degree: cousincount.degree[i2]
		]
		if[unrootedflag = 1
			x1degree: _ 2 + (2*cousincount.degree[i1])
			x2degree: _ 2 + (2*cousincount.degree[i2])
		]
		jj: intersectbothindexes[x1pair,'x1degree; x2pair,'x2degree]
		xtotuniqs: x1pair[jj[;0]],'x1degree[jj[;0]]
		xtotcount: x1count[jj[;0]] &' x2count[jj[;1]] / minimum count
			/ in the intersection
		ytot: (x1pair,'x1degree), (x2pair,'x2degree)
		ylongcount: x1count, x2count
		part: = ytot
		ytotuniqs: ?ytot
		ytotcount: |/' ylongcount[part]
		/ :(xtotuniqs; xtotcount; ytotuniqs; ytotcount)
		z: spit (treeid1; treeid2; (+/xtotcount)%(+/ytotcount)) 
		if[dumpmatchflag & (0 < #xtotuniqs)
			new: ,z
			new,: ,"Detailed matches: "
			additional: spit' treeid1 ,/: treeid2 ,/: xtotuniqs[;0] ,' xtotuniqs[;1] ,' xtotcount	
			z: new,additional
		]
		:z
	]
  ]
}

/ given a set of specific trees, this finds the commonalities
/ held in most of these.
/ specifictrees has the trees to look for
/ exact = 1 says degree counts should match
/ maxdist tells the cousin distance we care about
/ setcompare = 1 says we don't care about cardinalities
/ cousincount.treeid: ()
/ cousincount[treeid, degree, pair, count]
/ findcommonsimflag if set indicates that we are looking for a threshold
/ simthresh is that threshold
findcommonalities:{[specifictrees; exact; maxdist; setcompare]
  ii: &(cousincount.treeid _in\: specifictrees)&(~ cousincount.degree > maxdist)
  mytrees: cousincount.treeid[ii]
  mytreesnum: #?mytrees
  mymaxdist: mytreesnum * (1 - simthresh) 
		/ for similarity: number of trees that need not match
  if[setcompare = 1
	if[exact = 0
  		part: = mytrees
		mypairs: cousincount.pair[ii]
		mypairsset: ?:' mypairs[part]
		out: :[findcommonsimflag
			threshintersect[mypairsset; mymaxdist]
			multiintersect[mypairsset]]
		z: ,,`common_pair 
		z,: ,:' out
		:z
	]
	if[exact = 1
		mypairs: cousincount.pair[ii]
		mydists: cousincount.degree[ii]
  		part: = mytrees ,' mydists
		uniqcombos: ? mytrees ,' mydists
		mypairsset: mypairs[part]
			/ label pairs for each tree/degree combo
		uniqdists: ?mydists
		uniqtrees: specifictrees
		out: ()
		while[0 < #uniqdists
			onedist: * uniqdists
			uniqdists: 1 _ uniqdists
			jj:  & uniqcombos[;1] = onedist
			if[(#jj) = #specifictrees
				/ otherwise some trees don't even have that
				/ distance. But now we can do a multiintersect.
				/out,:multiintersect[mypairsset[jj]] ,\: onedist
				xp: :[findcommonsimflag
				  threshintersect[mypairsset[jj]; mymaxdist]
				  multiintersect[mypairsset[jj]]]
				out,:  xp ,\: onedist
			]
		]
		z: ,`common_pair `degree
		z,: out
		:z
	]
  ]
  if[setcompare = 0 / we care about cardinalities
	if[exact = 0 / we don't care about exact degree
  		part: = mytrees
		mypairs: cousincount.pair[ii]
		mycounts: cousincount.count[ii]
		mypairsmultiset: mypairs[part]
		mypairsmulticount: mycounts[part]
		mypairs2: ()
		mycounts2: ()
		i: 0
		/ collapses all the pairs and counts for
		/ all these degrees, but then lays them out lengthwise
		/ Therefore, there will be as many copies of a pair
		/ in mypairs2 as there are trees having that pair.
		while[i < #mypairsmultiset
			part: = mypairsmultiset[i]
			mypairs2,: ? mypairsmultiset[i]
			mycounts2,:  +/' mypairsmulticount[i][part]
			i+: 1
		]
		part: = mypairs2 / partition over pairs, one pair per tree
			/ where each pair is represented once per tree
			/ though with a count
		counts: #:'part / how many trees (parititions) a pair 
			/ appears in
		jj: & ~ counts < (#specifictrees) - mymaxdist  
			/ these partitions have representatives in every tree
			/ or at least enough of them
		uniqpairs2: (?mypairs2)[jj]
		mincounts2: (&/' mycounts2[part[jj]]) 
		treecount2: counts[jj]
		z: ,`pair `min_num_within_tree `num_tree
		z,:(uniqpairs2,' mincounts2,' treecount2)
		:z
	]
	if[exact = 1
		mypairs: cousincount.pair[ii]
		mycounts: cousincount.count[ii]
		mydegrees: cousincount.degree[ii]
		part: = mypairs,'mydegrees
		uniqs: ? mypairs,'mydegrees
		counts: #:'part
		/ jj: & counts = #specifictrees
		jj: & ~ counts < (#specifictrees) - mymaxdist  
			/ these partitions have representatives in enough trees
		uniqpairs2: uniqs[jj]
		mincounts2: (&/' mycounts[part[jj]]) 
		treecount2: counts[jj]
		z: ,`pair `degree `min_num_within_tree `num_tree
		z,:(uniqpairs2,' mincounts2,'treecount2)
		:z
	]
  ]
}

/ END GENERIC COUSINS CODE

/ SUBTREE LABELS

subtreenodeid: () / id of parent of subtree
subtreelabels: () / the label


fillsubtreelabel:{[leaves; labelnodeids; labellabels; parent; children; node]
  if[node _in leaves; :labellabels[labelnodeids ? node]] / just return
  i: parent ? node
  x: ,/ fillsubtreelabel[leaves; labelnodeids; labellabels; parent; children]'children[i]
  x@: <x
  subtreenodeid,: node
  subtreelabels,: ` $ spit[x]
  :x
}

/ set a label on each subtree based on the sorted
/ order of its descendant subtrees.
/ This function has a side effect to cousincount.
setsubtreelabel:{[treeid]
  i: & tree.treeid = treeid
  parent: tree.parentid[i]
  children: tree.childrenid[i]
  / get node ids and labels
  j: & treelabel.treeid = treeid
  labelnodeids: treelabel.nodeid[j] 
  labellabels: treelabel.label[j] 
  root: * differ[parent; ?,/children]
  leaves: differ[,/children; parent]
  / have node ids and labels
  subtreenodeid:: () / id of parent of subtree
  subtreelabels:: () / the label
  x: fillsubtreelabel[leaves; labelnodeids; labellabels; parent; children; root]
  subtreelabel.treeid,: treeid
  subtreelabel.labels,: ,(?subtreelabels),()
}
  

/ As of now this is implicitly set compare, because we remove duplicates
/ on construction.
/ This finds the common interior labels of the subtrees
findcommonsubtreelabel:{[specifictrees]
  ii: &(subtreelabel.treeid _in\: specifictrees)
  mytrees: subtreelabel.treeid[ii]
  mytreesnum: #?mytrees
  mymaxdist: mytreesnum * (1 - simthresh) 
  out: :[findcommonsimflag
	threshintersect[subtreelabel.labels[ii]; mymaxdist]
	multiintersect[subtreelabel.labels[ii]]]
  if[0 = #out; :()]
  outsymb: getfieldssymb'out
  counts: #:' outsymb
  ii: < counts
  outsymb@: ii
  indexes: !#outsymb
  pairs: ,/ indexes ,/:\: indexes
  pairs@: & pairs[;0] < pairs[;1]
  tmppairs: pairs
  deleteset: ()
  while[0 < #tmppairs
	flag: propersubset[outsymb[tmppairs[0;0]]; outsymb[tmppairs[0;1]]]
	if[flag
		deleteset,:tmppairs[0;0]
		bads: & tmppairs[0;0] = tmppairs[;0]
		tmppairs:  tmppairs _di bads
	]
	if[~ flag
		tmppairs: 1 _ tmppairs
	]
  ]
  newout: outsymb _di deleteset
  :newout
}

/ find intersection and union and count
subtreelabelcompare:{[qtree; datatree]
  i1: (subtreelabel.treeid ? qtree) 
  i2:  (subtreelabel.treeid ? datatree) 
  if[(i1 | i2) = #subtreelabel.treeid; !-12] / bad tree identifiers
  x: intersect[subtreelabel.labels[i1]; subtreelabel.labels[i2]]
  y: ?subtreelabel.labels[i1], subtreelabel.labels[i2]
  z: spit (qtree; datatree; (#x)%(#y)) 
  if[dumpmatchflag
	new: ,z
	new,: ,"Detailed matches: "
	additional: spit' qtree ,/: datatree ,/: x	
	z: new,additional
  ]
  :z
}

/ END OF SUBTREE

/ PHYLOGENY


/ code to parse phylogenetic tree info.
/ For example, ( (A B) (C D) E) is in outline format
/ blank
/  E
/  blank
/   A
/   B
/  blank
/   C
/   D
/ So, we have the following algorithm: each left paren starts a new
/ node that has label blank. The children are those until the balancing
/ right paren.
phyloparse:{[file]
  lines: 0: file
  bracketmode: 0 / bracket mode is for reading the name of the tree
  foundstring: "bad string"
  bracketstring: "bad treename"
  treenum: 0
  treecurloc: 0 / index in the tree table
  treelabelcurloc: 0 / index in the treelabel table
  stringflag: 0 / not currently processing a string
  i: 0
  level: -1
  while[i < #lines
	myline: lines[i]
	j: 0
	while[j < #myline
		if[myline[j] = ("[")
			bracketmode: 1
			bracketstring: ""
		]
		if[myline[j] = ("]")
			ii: & tree.treeid = treename
			tree.treeid[ii]:: ` $ ("tree"), (bracketstring)
			ii: & treelabel.treeid = treename
			treelabel.treeid[ii]:: ` $ ("tree"), (bracketstring)
			bracketmode: 0
			bracketstring: "bad treename"
		]
		if[myline[j] = ("(")
			newtreeflag: level = -1
			level+: 1 / brings it to level 0 from -1
			if[newtreeflag
				nodenum: 0
  				treenum+: 1
  				treename: ` $ ("tree"),($treenum)
				treecurloc: ,#tree.treeid
				tree.treeid,: treename
				tree.parentid,: nodenum
				tree.childrenid,: ,()
				treelabelcurloc: ,#treelabel.treeid
				treelabel.treeid,: treename
				treelabel.nodeid,: nodenum
				treelabel.label,: `
				nodenum+: 1
			]
			if[~ newtreeflag
			        tree.childrenid[treecurloc[(#treecurloc)-1]],: nodenum
  				treecurloc,: #tree.treeid	
				treelabelcurloc,: #treelabel.treeid
				tree.treeid,: treename
				tree.parentid,: nodenum
				tree.childrenid,: ,()
				treelabel.treeid,: treename
				treelabel.nodeid,: nodenum
				treelabel.label,: `
				nodenum+: 1
			]
		]
		if[stringflag & (myline[j] _in (") "))
			/ either right paren or blank
			tree.childrenid[treecurloc[(#treecurloc)-1]],: nodenum
			treelabel.treeid,: treename
			treelabel.nodeid,: nodenum
			treelabel.label,: ` $ foundstring
			nodenum+: 1
			stringflag: 0 / no longer processing a string
			foundstring: "bad string" / reinitialize
		]
		if[myline[j] = (")") / important that it goes after string
			level-: 1
			treecurloc: (-1) _ treecurloc
			treelabelcurloc: (-1) _ treelabelcurloc
		]
		x: _ic[myline[j]]
		if[(((x > 47) & (x < 58)) | ((x > 64) & (x < 91)) | ((x > 96) & (x < 123))) 
			
			/ we have a character
			if[stringflag = 1
				foundstring,: myline[j]
			]
			if[(stringflag = 0) & (bracketmode = 0)
				foundstring: ,myline[j]
				stringflag: 1
			]
			if[(bracketmode = 1)
				bracketstring,: myline[j]
			]
		]
		j+: 1
	]
	i+: 1
  ]
}


/ END OF PHYLOGENY


/ GENERATE DATA

/ generate kids and fill in data structures
genkids:{[treeid; labels; nodeids; rootlabel; root]
  treelabel.treeid,: treeid 
  treelabel.nodeid,: root 
  treelabel.label,: rootlabel 
  numlabels: #labels
  choices: (numlabels) & (#nodeids)
  if[0 = choices; :()]
  kids: nodeids[choices _draw -#nodeids]
  x: *1 _draw #kids / take only some of those
  kids@: !x
  if[0 = #kids; :nodeids]
  kidlabels: labels[(#kids) _draw -numlabels]
  tree.treeid,: treeid
  tree.parentid,: root
  tree.childrenid,: ,kids
  remaining: differ[nodeids; kids] / not available to kids
  i: 0
  while[i < #kids
	remaining: genkids[treeid; labels; remaining; kidlabels[i]; kids[i]]
	i+: 1
  ]
  :remaining
}

/ generate a tree by putting data into
/ tree(`childrenid `treeid `parentid)
/ and treelabel(`treeid `nodeid `label).
gentree:{[size; treeid]
  labels: `a1 `b1 `c1 `d1 `e1
  numlabels: #labels
  nodeids: !size
  root: nodeids[*1 _draw #nodeids]
  rootlabel: labels[* 1 _draw #labels]
  nodeids: nodeids _dv root
  genkids[treeid; labels; nodeids; rootlabel; root]
}

/ generate kids of a subtree
genkidssubtree:{[treeid; labels; nodes; rootlabel; root; parents; children]
  treelabel.treeid,: treeid 
  treelabel.nodeid,: root 
  treelabel.label,: rootlabel 
  i: parents ? root
  if[i = #parents; :()]
  kids: children[i]
  x: 1 + *1 _draw #kids / take only some of those, but at least 1
  kids@: !(x)
  if[0 = #kids; :()]
  ii: intersectleftindexes[nodes; kids]
  kidlabels: labels[ii]
  tree.treeid,: treeid
  tree.parentid,: root
  tree.childrenid,: ,kids
  i: 0
  while[i < #kids
	genkidssubtree[treeid; labels; nodes; kidlabels[i]; kids[i]; parents; children]
	i+: 1
  ]
  :()
}

/ gen an interior subtree of a tree
/ tree(`childrenid `treeid `parentid)
/ and treelabel(`treeid `nodeid `label).
gensubtree:{[treeid]
  j: & tree.treeid = treeid
  if[0 = #j; :()]
  parents: tree.parentid[j]
  children: tree.childrenid[j]
  k: & treelabel.treeid = treeid
  labelnodes: treelabel.nodeid[k]
  labellabels: treelabel.label[k]
  j1: * 1 _draw #parents
  n: parents[j1]
  j: labelnodes ? n
  root: labelnodes[j]
  rootlabel: labellabels[j]
  newtreeid: ` $ ($treeid), ("sub")
  genkidssubtree[newtreeid; labellabels; labelnodes; rootlabel; root; parents; children]
}


/ DATA

treelabel.treeid: ()
treelabel.nodeid: ()
treelabel.label: ()

tree.treeid: ()
tree.parentid: ()
tree.childrenid: ()

cousincount.treeid: ()
cousincount.degree: () / 0th degree for siblings, 0.5 for uncles, 1 for cousins
cousincount.pair: () / pair of labels in alphabetical order with a _ separator
cousincount.count: () / how many of that pair

subtreelabel.treeid: ()
subtreelabel.labels: () / the whole set of labels for the subtree

unknownlabels: ` `unknown


/ EXECUTION

/ TIME TESTING HARNESS (REMOVE / if you want this)
/ .time.set`.k
/ start: _t
/ END OF TIME TESTING HARNESS (REMOVE / if you want this)   

text:()
starttime: _t
processargs[_i]
/ Generate trees if random
if[(opcode = `randtree) 
	numtrees: 10
	size: 50
	treeids: ` $ ("rtree") ,/: ($!numtrees)
	gentree[size]' treeids
	/ gensubtree'treeids
	text,: ,"Input from random tree generation. "
]
/ Create tree files if from text
if[opcode = `"+b" 
   if[phyloflag = 0
	inputfromfile[filenamefordb]
	text,: ,("Input from build file "), filenamefordb
   ]
   if[phyloflag = 1
	phyloparse[filenamefordb]
	text,: ,("Input from build file is a phylogeny tree "), filenamefordb
   ]
   if[(0 = #tree.treeid) | (0 = #treelabel.treeid)
		x: " The input file for the database seems to have no trees.\n"
		x,: " Please check format.\n"
		` 0: x
		. "\\\\"
   ]
]
/ If query against database, then create tree file for that
if[(opcode = `"+d" )
   text,: ,("Input from database file "), dbfilename
   if[~ queryfilename ~ "no queryfile"
      text,: ,("  Input from query file "), queryfilename
      if[phyloflag = 0
         inputfromfile[queryfilename] / we now have the query file
      ]
      if[phyloflag = 1
         phyloparse[queryfilename] / we now have the query file
      ]
      if[(0 = #tree.treeid) | (0 = #treelabel.treeid)
		x: " The query file seems to have no trees.\n"
		x,: " Please check format.\n"
		` 0: x
		. "\\\\"
      ]
   ]
]
/ If set of trees, then create tree file.
if[opcode = `"+f" 
   if[phyloflag = 0 
	inputfromfile[filename]
   	text,: ,("Input from file "), filename, (" whose first tree is the query")
   ]
   if[phyloflag = 1 
	phyloparse[filename]
   	text,: ,("Input from phylogeny file "), filename, (" whose first tree is the query")
   ]
   if[(0 = #tree.treeid) | (0 = #treelabel.treeid)
		x: " The input file seems to have no trees.\n"
		x,: " Please check format.\n"
		` 0: x
		. "\\\\"
   ]
]
/ figure out what to dump and dump it
if[dumpdbflag
    if[opcode = `"+b"
     "dumpeddb" 0: ,/dumptree'?treelabel.treeid
    ]
    if[opcode _in `"+f" `"+phylo"
	:[findcommon
     		"dumpeddb" 0: ,/dumptree'(?treelabel.treeid)
     		"dumpeddb" 0: ,/dumptree'(1 _ ?treelabel.treeid)]
    ]
]

/ These are cases where we have the tree table
if[(opcode _in `"+b" `"+f" `randtree `"+phylo")
	if[opcode _in `"+b" `randtree
	  uniqtrees: ? tree.treeid
	  / this part is for the +b case
    	  xx: ` $ filenamefordb, ("db")
 	  if[specifictreeflag = 0; specifictrees: uniqtrees]
	]
	if[opcode _in `"+f" `"+phylo"
	  uniqtrees: :[findcommon; ?tree.treeid; 1 _ ? tree.treeid]
 	  if[specifictreeflag = 0; specifictrees: uniqtrees]
	]
  jj: 0
  while[jj < #uniqtrees
    mytree: uniqtrees[jj]
    x: :[subtreelabelflag
		setsubtreelabel[mytree]
		setcousincount[mytree]] 
	/ for each tree produce cousincount or subtree labels
    jj+: 1
  ]
  if[opcode = `"+b" 
	xx: filenamefordb, ("db")
    	x: :[subtreelabelflag
		xx 1: subtreelabel
		xx 1: cousincount]
  ]
]

/ query part
/ Here we perform the work using the cousincount or subtreelabel data structure
doquery:{[]
  matched: ()
  dbbuild: _t
  / trying to find commonalities
  if[findcommon
     if[dumpqueryflag
 	outtmp: ()
	i: 0
	tmplabs: ?treelabel.treeid
	while[i < #tmplabs
		outtmp,: dumptree[tmplabs[i]]
		i+: 1
	]
        "dumpedquery" 0: outtmp
     ]
     x: :[subtreelabelflag
		findcommonsubtreelabel[specifictrees]
		findcommonalities[specifictrees; exact; maxdist; setcompare]]
     text,: spit'x
     :()
  ]
  if[opcode _in `"+f" `"+phylo"
	/ query part
	querytrees: ,*? tree.treeid
  ]
  if[opcode _in `"no-op" `randtree / must make sure you get all the database
	querytrees: ? tree.treeid
  ]
  if[~ queryfilename ~ "no queryfile"
	querytrees: ? tree.treeid
  ]
  / so we are doing a query
 x: :[subtreelabelflag
		setsubtreelabel'querytrees
 		setcousincount'querytrees]
 / comparetrees[treeid1; treeid2; exact; maxdist; setcompare]
 outdumpmatch: ()
 mm: 0
 while[mm < (#querytrees) 
  qtree: querytrees[mm]
  / iterate over the trees
  x: :[subtreelabelflag
    subtreelabelcompare[qtree]'specifictrees
    comparetrees[qtree; ;exact;maxdist; setcompare;dumpmatchflag]'specifictrees]
  if[dumpmatchflag
      outdumpmatch,: x
  ]
  if[~ dumpmatchflag
	out: x
  ]
  if[dumpqueryflag
     "dumpedquery" 0: dumptree[*?treelabel.treeid]
  ]
  if[~ (opcode = `"no-op")
    text,: ,("Total time is "), ( $ _t - starttime), (" seconds.")
    if[opcode _in `"+b" `"+f" `randtree `"+phylo"
      text,: ,("Database construction time is "), ( $ dbbuild - starttime), (" seconds.")
    ]
    if[opcode = `randtree
      text,: ,("Number of trees is "), ($numtrees)
      text,: ,("Size of trees is "), ($size)
    ]
    if[0 = #out
  	  text,: , ("There are no matches for the query tree: "), ($qtree)
    ]
    if[0 < #out
  	  text,: , ("Results for "), ($qtree), (" are: ")
	  / text,: spit[out @ < out]
    	  text,: :[dumpmatchflag 
		  ,/outdumpmatch 
		  out]
    ]
  ]
  mm+: 1 / to skip to next tree 
 ]
 if[opcode = `"no-op"; :matched]
}

text: ()
if[opcode _in `randtree `"+f" `"+phylo"
  doquery[]
]
output 0: text
if[50 < #text
  text@: !50
  text,: ," ...continued..."
]
/ If running as a web server, comment out the following line:
text
/ TIME TESTING HARNESS (REMOVE / if you want this)
/ .time.sum[]
/ xf: .TIME.f
/ xt: .TIME.t
/ xx: xf,'xt
/ zz: xx @ <xx[;1]
/ zz             
/ Joao, comment from here down.
/ !-1 / ??? REMOVE
if[~ opcode = `"+d"; . "\\\\"]

 / now take care of +d case
 aax: 1: dbfilename
 if[subtreelabelflag
	subtreelabel.treeid: aax.treeid
	/ subtreelabel.labels: aax.labels
	/ We had a problem with incompatible columns even when
	/ this branch wasn't taken, so we have done this:
	. "subtreelabel.labels: aax.labels"
 	if[specifictreeflag = 0; specifictrees: ?subtreelabel.treeid]
 ]
 if[~ subtreelabelflag
 	/ cousincount[treeid, degree, pair, count]
 	cousincount.treeid: aax.treeid
 	/ cousincount.degree: aax.degree
 	/ cousincount.pair: aax.pair
 	/ cousincount.count: aax.count
 	. "cousincount.degree: aax.degree"
 	. "cousincount.pair: aax.pair"
 	. "cousincount.count: aax.count"
 	if[specifictreeflag = 0; specifictrees: ?cousincount.treeid]
  ]
 

 doquery[]
output 0: text


/ If running as a web server, comment out the following line:
if[50 < #text
  text@: !50
  text,: ," ...continued..."
]
text
\\