/ Code author: Dennis Shasha, May 2001 - February 2002
/ Joint work with Tsong-Li Wang and Kaizhong Zhang.
/ pathfilter.k
/  k pathfilter +q treequery [+m maxdist] [+o outputfile] +t
/ treequery may have multiple trees in it and should be different
/ for concurrent input trees. outputfile should also be different in that case.
/ Final result will be in data.out as usual.
/ Looks at treecount and treegroup to do filtering.
/ Those are assumed to be fixed for a database.
/ Do this one query tree at a time.
/ If no possible match, then say so, else find the databases to check out
/ and go for them.
/ Put the result in data.out as usual.
/ To trouble-shoot the performance, check to see whether candidatetrees
/ is larger than you think.
/ Also consider interprocess communication to pathfix.
/ March 2002: hash function changed.

/ Intended use:
/ 1) In a directory, partition the database into files of trees (the sets
/ should not overlap and more than four sets will probably reduce performance).
/ Let's say they are files treeset1, treeset2, ...
/ each in the format with the tree table and then the treelabel table.
/ 2) Now, delete the files (if they exist) treecount.l and treegroup.l.
/ or, if Unix, treecount.K and treegroup.K
/ 3) For each treeseti, call k pathfix +b treeseti
/   That sets up databases and filters.
/ 4) Now you are ready to query. Take your query file and instead of
/ saying k pathfix ... 
/ you say k pathfilter +q queryfile
/ 5) The filter will do filtering and then call pathfix on appropriate
/ databases.


/ Spit out something of the form 
/ k pathfix +q queryfile +m maxdist +o output +d databasefile +dumpmatch
/ That goes to data.out and then is appended to finalout.


/ TIME TESTING
/ \l time


treecount: 1: "treecount" / if on unix, make this "treecount.K"
treegroup: 1: "treegroup" / if on unix, make this "treegroup.K"

/ TABLE INPUT

/ a list is numeric if each member is either empty, a period
/ or a digit
isnumeric:{[list]
  s: ,/list
  nums: "1234567890"
  x: nums ?/: s
  y: (#nums) > |/x / if greater then everything in s met its match
  if[1 = y; :2] / 2 means integer
  nums: "1234567890."
  x: nums ?/: s
  :(#nums) > |/x / if greater then everything in s met its match
	/ 1 means float and 0 means not a number
}

/ parses a field based on vertical bars
getfields:{[line]
  i: line = "|"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  :(0,(j1 - !size)) _ line
}

/ get rid of blanks at either end of the string
delendblanks:{[string]
  if[0 = #string; :""]
  if[string ~ ,"" ; :""]
  i: & ~ string = " "
  if[(#string) = (#i); :string]
  if[0 = (#i); :""]
  string: (- ((#string)  - (1 + *|i))) _ string
  :(*i) _ string
}
  

/ Handles one line of input at a time according to table schema above.
/ A little more space efficient than the previous version.
processline:{[line]
 emptyorblank:{[line] (0 = #delendblanks[line])}
 emptyflag: emptyorblank[line]
 if[~emptyflag
  justempty:: 0
  if[line[0] = "#"
    newline: delendblanks'getfields[1 _ line]
    if[(1 < #newline) 
	globaltable:: *newline
	alltables,: ` $ globaltable
	globalatt:: 1 _ newline
	globalvals:: () / initialize
    ] / newline
  ] / table declaration
  if[~ line[0] = "#"
	newline: delendblanks'getfields[line]
	if[(1 < #newline) | (0 < #newline[0]) 
		globalvals,: ,newline
	]
  ] / end of test on data line
 ] / end of test on line is non-empty
 if[emptyflag & (0 = justempty)
	justempty:: 1
	/ if[ ((#globalatt) > 1) | (~ (#globalatt) = (^globalvals)[0])
		/ globalvals:: + globalvals / take transpose
	/ ]
	if[~ (#globalatt) = (#globalvals[0]); !-1]
	numericflag:()
	hh: 0
	while[hh < #globalatt
		numericflag,: isnumeric[globalvals[;hh]] 
		hh+: 1
	]
	inindex:: 0
	while[inindex < #globalatt
	  / table.att[2] :: ` $ vals[3]
		string1: globaltable, (".")
		string1,: globalatt[inindex]
		teststring: ("0 = #"), string1
		/ For big sizes, uncomment following two statements
		/ User will get an error message, but it will work.
		/ x: @[.: ; teststring; :]
		/ currentlyempty: *x / error means empty
		/ For big sizes, comment following
		currentlyempty: 1
		/ if empty, then assign, else append
		string2: :[currentlyempty; (":: "); (",: ")] 
		string3: :[0 = numericflag[inindex]
			"` $ "
			:[1 = numericflag[inindex]; "0.0 $ "; "0 $ "]]
		string3,: "globalvals[;inindex]"
		. string1, string2, string3
		if[1 = numericflag[inindex]
			globalindexes:: & 0N = . string1
			string3: string1, ("[globalindexes] :: `")
			. string3
		]
		inindex+: 1
	]
 ]
}


inputfromfile:{[filename]
  a: 0: filename
  globalvals:: ()
  oldglobalvals:: () / should not be necessary
  justempty:: 1 / as if we've just seen an empty line. should process table
  a,: ,,""
  processline'a
  makeintlist:{[line] (. $line), ()}
  tree.childrenid:: makeintlist'tree.childrenid
}


/ DUMP TABLE dumptable


/ formstring takes a list and makes a string
formstring:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: (" ")
}

formstringvertbar:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: ("|")
}

/ Output a table (a variable) to a text file outfile (string)
/ e.g.   output[`guide; guide; "foobar"]
dumptable:{[tablename; table; outfile; count]
  out: ,("# "), ($tablename), ("|"), formstringvertbar[!table]
  first: *!table
  numofelements: count
  i: 0
  while[i < numofelements
	list: table[;i]
	x: formstring'list
	out,: , (-1) _ ,/x ,\: ("|")
	i+: 1
  ]
  out,: ,"  " / extra newline
  outfile 0: out
}


/ FILTER SPECIFIC

/ This uses the filter for each tree.

/ candidatetrees are the trees in the database that look promising.
/ This finds the groups (i.e. the databases where these trees can be found)
/ Algorithm: for each candidate tree, find the groups that correspond.
/ Then solve the hitting set and find a set of groups such that every
/ tree belongs to one group in the set.
findgroups:{[candidatetrees]
    out: () / groups to try
    mmm: treegroup.tree ?/: candidatetrees
    g: ,/treegroup.groups[mmm] / each element is a list of groups
	/ that may contain this tree.
    counts: #:' g
    i: & counts = 1 / these must be in
    while[0 < #i
	out,: ,/g[i]
	out?:
	g: differ[;out]'g
    	counts: #:' g
    	i: & counts = 1 / these must be in
    ]
    :out, ,//g
}
    

/ generate queries for this tree
genqueries:{[mytree]
    xtime: _t
    / Get elements in filter.
    j: & treelabel.treeid = mytree
    labelnodeids: treelabel.nodeid[j] 
    labellabels: treelabel.label[j] 
    minitreelabel.treeid:: (#j) # mytree
    minitreelabel.nodeid:: labelnodeids
    minitreelabel.label:: labellabels
    minitreelabelcount: #j
    i: & tree.treeid = mytree
    parent: tree.parentid[i]
    parentlab: labellabels[labelnodeids ?/: parent]
    children: tree.childrenid[i]
    minitree.treeid:: (#i) # mytree
    minitree.childrenid:: children
    minitree.parentid:: parent
    minitreecount: #i
    counts: #:' children
    newpars: ,/counts #' parentlab
    newchil: labellabels[labelnodeids ?/: ,/children]
	/ Since we are counting all children here, if there is
	/ any distance we have to ask for twice the distance
    parchil: ($newpars) ,' ("_") ,/: ($newchil)
    / jj: & (~ "*" _in/: parchil) & (~ "?" _in/: parchil)
    / parchil@: jj
    / add in more parent children pairs if they don't intersect with
    / the first ones and dist > 0.
    / make sure not to double-count those either.
    if[maxdist > 0
        / to make it so that only leaves and their parents are recorded
        / when distance > 0
        myleaves: differ[,/children; parent]
        leafchildren: intersect[myleaves]'children
        counts: #:' leafchildren
        newchil: labellabels[labelnodeids ?/: ,/leafchildren]
        newpars: ,/counts #' parentlab
        parchil_leafonly: ($newpars) ,' ("_") ,/: ($newchil)
	/ Here we need to ask for maxdist only
        / jj: & (~ "*" _in/: parchil_leafonly) & (~ "?" _in/: parchil_leafonly)
	/ parchil_leafonly@: jj
    ]
    / x: parchil, $labellabels
    / First do single node
    / NOW DO SINGLE NODES 
    / only the initial assignment to x
    / and the assignment to candidatessingle differs
    x: $labellabels
    part: = x
    counts: #:' part
    uniqs: ?x
    jj: & ~ (("*") _in' uniqs)  | (("?") _in' uniqs)
    counts@: jj
    uniqs@: jj
    / Have all elements in the filter
    i: 0
    treelists: ()
    mydataout: ("data.out"), queryfilename
    myfinalout: ("finalout"), queryfilename
    countbad: 0
    while[i < #uniqs
      x: hashquery[uniqs[i]; counts[i]]
	/ So for each parent-child, we find the trees
	/ where it is present. Now if we have a path a -- b -- c
	/ and the data tree is a -- d -- c, then we will miss this if
	/ we use every pair. Instead, we will use only the ones near the leaves.
      countbad+: 0 = #x / don't match any data trees
      if[countbad >  maxdist / one bad kills us
	mydataout 0: ,("No matches for "),($mytree)
        . ("\\cat "), mydataout, (" >> "), myfinalout
	filtertime+: (_t) - xtime
	:()
      ]
      treelists,: ,x
      i+: 1
    ]
    candidatessingle: threshintersect[treelists;maxdist]
    / NOW DO PARENT-CHILD
    / only the initial assignment to x
    / and the assignment to candidatespairs differs
    treelists: ()
    if[0 < #candidatessingle
      x: parchil
      part: = x
      counts: #:' part
      uniqs: ?x
      jj: & ~ (("*") _in/: uniqs)  | (("?") _in/: uniqs)
      counts@: jj
      uniqs@: jj
      / Have all elements in the filter
      i: 0
      mydataout: ("data.out"), queryfilename
      myfinalout: ("finalout"), queryfilename
      countbad: 0
      while[(i < #uniqs) 
        x: hashquery[uniqs[i]; counts[i]]
	  / So for each parent-child, we find the trees
	  / where it is present. Now if we have a path a -- b -- c
	  / and the data tree is a -- d -- c, then we will count this twice
	  / so we compensate
        countbad+: 0 = #x / don't match any data trees
        if[countbad >  2*maxdist / because of double-counting
	  mydataout 0: ,("No matches for "),($mytree)
          . ("\\cat "), mydataout, (" >> "), myfinalout
	  filtertime+: (_t) - xtime
	  :()
        ]
        treelists,: ,x
        i+: 1
      ] / end while
    ] / end on candidatessingle
    candidatespairs: threshintersect[treelists;2*maxdist]
		/ could miss in twice max dist locations, because of 
		/ double-counting
    if[(maxdist > 0) & (0 < #candidatepairs)
      / NOW DO LEAVES ONLY; that is parent-child that include leaves only
      / only applicable if there is a distance
      / don't need to worry about double-counting because only doing
      / leaves and their parents.
      x: parchil_leafonly
      part: = x
      counts: #:' part
      uniqs: ?x
      jj: & ~ (("*") _in/: uniqs)  | (("?") _in/: uniqs)
      counts@: jj
      uniqs@: jj
      / Have all elements in the filter
      i: 0
      treelists: ()
      mydataout: ("data.out"), queryfilename
      myfinalout: ("finalout"), queryfilename
      countbad: 0
      while[i < #uniqs
        x: hashquery[uniqs[i]; counts[i]]
	  / So for each parent-child, we find the trees
	  / where it is present. Now if we have a path a -- b -- c
	  / and the data tree is a -- d -- c, then we will miss this if
	  / we use every pair. 
	  / Instead, we will use only the ones near the leaves.
        countbad+: 0 = #x / don't match any data trees
        if[countbad >  maxdist / one bad kills us
	  mydataout 0: ,("No matches for "),($mytree)
          . ("\\cat "), mydataout, (" >> "), myfinalout
	  filtertime+: (_t) - xtime
	  :()
        ]
        treelists,: ,x
        i+: 1
      ]
      candidates_leafonly: threshintersect[treelists;maxdist]
		/ could miss in twice max dist locations
      candidatespairs: intersect[candidatespairs; candidates_leafonly]
    ]
    / NOW COMBINE; Only interested in those that pass both
    candidates: intersect[candidatespairs; candidatessingle]
    / Now figure out groups
    groups: findgroups[candidates]
    / Now dump the proper portion of the tree
    mytmp1: ("tmp1"), queryfilename
    mytmp2: ("tmp2"), queryfilename
    dumptable[`tree; minitree; mytmp1; minitreecount]
    dumptable[`treelabel; minitreelabel; mytmp2; minitreelabelcount]
    mytmpquery: ("tmpquery"), queryfilename
    . ("\\cat "), mytmp1, (" "), mytmp2, (" > "), mytmpquery
    / Now issue the queries
    / Spit out something of the form k pathfix +q queryfile +d group +dumpmatch
    queryoutfile: ("data.out"), queryfilename
    i: 0
    string: ("\\ k pathfix +q "), mytmpquery, (" +d ") 
    mycandtrees: ("candtrees"), queryfilename
    mycandtrees 1: candidates
    filtertime+: (_t) - xtime
    xtime: _t
    while[i < #groups
	string1: (" +dumpmatch +m "), ($maxdist)
	string2: (" +s "), mycandtrees, (" +o "), (queryoutfile)
	/ string2: ("  ")
	. string, ($groups[i]), string1, string2 
        . ("\\cat "),  queryoutfile , (" >> "), myfinalout
	i+: 1
    ]
    pathfixtime+: (_t) - xtime
    :[outputflag
		. ("\\cp "), myfinalout, (" "), outputfile
		. ("\\cp "), myfinalout, (" data.out")]
}

/ find terms for these queries
/ Here we look only at the multiset of labels, disregarding the structure.
findterm:{[mytree]
    xtime: _t
    / Get elements in filter.
    j: & treelabel.treeid = mytree
    labellabels: treelabel.label[j] 
    / First do single node
    / NOW DO SINGLE NODES 
    / only the initial assignment to x
    / and the assignment to candidatessingle differs
    x: $labellabels
    part: = x
    counts: #:' part
    uniqs: ?x
    jj: & ~ (("*") _in' uniqs)  | (("?") _in' uniqs)
    counts@: jj
    uniqs@: jj
    / Have all elements in the filter
    i: 0
    treelists: ()
    mydataout: ("data.out"), queryfilename
    myfinalout: ("finalout"), queryfilename
    countbad: 0
    while[i < #uniqs
      x: hashquery[uniqs[i]; counts[i]]
	/ So for each parent-child, we find the trees
	/ where it is present. Now if we have a path a -- b -- c
	/ and the data tree is a -- d -- c, then we will miss this if
	/ we use every pair. Instead, we will use only the ones near the leaves.
      countbad+: 0 = #x / don't match any data trees
      if[countbad >  maxdist / one bad kills us
	mydataout 0: ,("No matches for "),($mytree)
        . ("\\cat "), mydataout, (" >> "), myfinalout
	filtertime+: (_t) - xtime
	:()
      ]
      treelists,: ,x
      i+: 1
    ]
    candidatessingle: threshintersect[treelists;maxdist]
    myfinalout 0: $candidatessingle
    pathfixtime+: (_t) - xtime
    :[outputflag
		. ("\\cp "), myfinalout, (" "), outputfile
		. ("\\cp "), myfinalout, (" data.out")]
}

/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  i: x ?/: y
  :x[(?i) _dv #x]
}        

/ intersect many lists
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 jj: ,/ ?:' first (?/:)/: lists[1+ !(size-1)] / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}

/ this is a set intersection so we remove duplicates
/ just like multiintersect except that the maxdist allows
/ a result if it is present in (#lists) - maxdist
/ of the input lists
threshintersect:{[lists; maxdist]
 size: #lists
 / ii: & 0 < #:'lists
 / diff: (#lists) - (#ii) / number that are empty
 / lists@: ii / non-empty only
 / maxdist-: diff / tolerance goes down
 if[2 > size; :lists]
 / first: lists[0],() / no good because might be empty
 first: (?,/lists) ,() / union
 / jj:  first ?/: (,/ ?:' lists[1+ !(size-1)]) / find indexes in first
 jj:  first ?/: (,/ ?:' lists) / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 / kk: & x > size - (2 + maxdist)
 kk: & x > size - (1 + maxdist)
 :first[kk]
}

/ A faster difference
differ:{[x;y]
  x,: ()
  y,: ()
  i: y ?/: x
  j: & i = #y
  :?x[j]
}

    
/ see which trees this particular string/count combination is in
/ OTHER BIG CHANGE
hashquery:{[string; count]
    x:  _ic'string
    internalstate: 0
    while[4 < #x
	internalstate+: */x[!4]
    	internalstate: internalstate ! modval
	x: 4 _ x
    ]
    internalstate+: */x
    i: internalstate ! modval
    xxx: treecount.trees[i]
    yyy: treecount.counts[i]
    jj: & ~ (count > yyy)
    :xxx[jj]
}
    
    
hashquery:{[string; count]
    x:  31 _sv _ic string
    i: x ! modval
    xxx: treecount.trees[i]
    yyy: treecount.counts[i]
    jj: & ~ (count > yyy)
    :xxx[jj]
}
    
    
/ END OF FILTER FUNCTIONS

/ DATA

treelabel.treeid: ()
treelabel.nodeid: ()
treelabel.label: ()

tree.treeid: ()
tree.parentid: ()
tree.childrenid: ()


modval: 997


/ A generic argument handler.
/  k pathfilter +q treequery [+m maxdist] [+o outputfile] +t
queryfilename: ""
maxdist: 0
outputflag: 0
termonly: 0
outputfile: ""
okflag: 1
processargs:{[args]
  i: 0
  numinputs: 0
  while[(i < #args) & okflag
	okflag:: 0
	addextra: 0
	if[args[i] ~ "+q"
	    if[(i+1) < #args
		queryfilename:: args[i+1]
		addextra: 1
		numinputs+: 0.5
		opcode:: `"+d"
		okflag:: 1
	    ]
	]
	if[args[i] ~ "+o"
	    if[(i+1) < #args
		outputfile:: args[i+1]
		outputflag::1
		addextra: 1
		numinputs+: 1
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+m"
	    if[(i+1) < #args
		maxdist:: 0 $ args[i+1]
		addextra: 1
		okflag:: 1
	    ]
	] 
	if[args[i] ~ "+t"
	    termonly:: 1
	    okflag:: 1
	]
	i+: 1 + addextra
	if[0 = okflag
		x: "format is k pathfilter \n "
		x,: " +q <queryfilename>  \n "
		x,: " [+m <maxdist> (0)] [+o <outputfile>] [+t] \n "
		x,: " e.g. k pathfilter +q query +m 2 \n "
		x,: " Note: +t says to look for labels anywhere in tree \n "
		` 0: x
		. "\\\\"
	]
  ]
}

/ EXECUTION
/ TIME TESTING HARNESS (REMOVE / if you want this)
/ .time.set`.k
/ start: _t
/ END OF TIME TESTING HARNESS (REMOVE / if you want this)        

processargs[_i]
inputfromfile[queryfilename]
myfinalout: ("finalout"), queryfilename
myfinalout 0: ""

specifictrees:: ()
mytimes: ("times"), queryfilename
filtertime: 0
pathfixtime: 0
x: :[termonly; findterm'?tree.treeid; genqueries'? tree.treeid]
x
out: ,("Time in filter is: "), ($filtertime)
out,: ,("Time in pathfix is: "), ($pathfixtime)
mytimes 0: out

/ TIME TESTING HARNESS (REMOVE / if you want this)
/ .time.sum[]
/ xf: .TIME.f
/ xt: .TIME.t
/ xx: xf,'xt
/ zz: xx @ <xx[;1]             
/ zz

\\