/ This prints the main statistics computed from tmpout and cleanin:
/ Number of dirty records that are DF -- found a high-score match.
/ Number of dirty records that are DP -- found a possible match.
/ Number of dirty records that are ND -- found no good match.
/ And a few others.
/ It also generates the following files from tmpout and cleanin:
/ tmpforeric -- all matches, in the format clean, dirty, statutdoublons, score
/ unfoundcleans -- records in clean that no dirty record has matched
/ dupsinclean -- records in clean that might be duplicates because
        / they are DF-matched to the same dirty record.
/ dirtyunmatched -- all dirty records that are either ND or have
/ no matching clean records of any sort
/ cleannds -- all matches having statut ND that in fact match clean records
        / (none as of 11 January 2005)
/ missedcleanone -- records that are clean and have the same id as some dirty
/ record, yet that dirty is not matched DF to that clean even though the dirty
/ is matched DF to something (none as of 11 January 2005).


/ intersect[x;y]: distinct items common to the two lists x and y.
/ Atoms are promoted to one-item lists first.
/ The result is taken from whichever argument is strictly shorter (x),
/ otherwise from y; the original author notes this as the fastest variant.
intersect: {[x;y]
  x: x, ()
  y: y, ()
  if[(#y) > #x
    / position in x of every item of y (#x means not found)
    hit: x ?/: y
    :x[? hit[& hit < #x]]
  ]
  / position in y of every item of x (#y means not found)
  hit: y ?/: x
  :y[? hit[& hit < #y]]
}


/ intersectleftindexes[x;y]: for every item of y that occurs in x, the index
/ of its first occurrence in x. Results are in the order of y; items of y
/ that are absent from x contribute nothing.
intersectleftindexes: {[x;y]
  pos: x ?/: y
  :pos[& pos < #x]
}


/ spitline[line]: format the items of line as text and join them with commas.
spitline:{[line] 1 _ ,/ (",") ,/: ($line)}
/ delblanks[word]: word with every blank character removed.
delblanks:{[word] word[& ~ word = " "]}

/ Parallel columns, one entry appended per match line by processline.
iddirty: ()
idclean: ()
statutdoublons: ()
myscore: ()
/ (clean id; raw match line) pairs, appended by processline.
cleanpairs: ()
/ State of the target record most recently seen by processline.
currentdirtyid: `noid
currentstat: `nostat
currentdirtyrec: ""
/ Counters initialised here; nothing in this part of the script updates them.
numclean: 0
numtarget: 0
/ processline[line]: consume one line of tmpout.
/ A line starting with target opens a new dirty record: the text after the
/ first | is its status (DF, DP or ND) and the text after the third | is its id.
/ A line starting with match belongs to the most recent target: the text after
/ the second | is the score and the text after the third | is the clean id.
/ Each match line appends one entry to idclean, iddirty, statutdoublons,
/ myscore and cleanpairs. Targets that are ND, or that were followed by no
/ match line at all, are appended to dirtyunmatched.
/ Fix: the record stored for the no-match case is now the target that
/ actually had no matches (kept in currentdirtyrec). The old code stored
/ the next target line instead.
/ Limitation: a target is known to have no matches only when the next target
/ arrives, so a no-match target at the very end of the input is not recorded.
processline:{[line]
   / if[line _sm ("*GAILLAC*"); :()] / ??? temporary
   if[line _sm ("target*")
      / unmatchedflag is still 1 only if no match line followed the previous
      / target. ND targets were already recorded when they arrived, so skip
      / them here to avoid writing the same record twice.
      if[(unmatchedflag = 1) & ~ currentstat = `ND
         dirtyunmatched,: ,currentdirtyrec
      ]
      ii: & line = "|"
      x: (1+ii[0]) _ line
      j: x ? "|"
      currentstat:: ` $ delblanks x[!j]
      x: (1+ii[2]) _ line
      j: x ? "|"
      currentdirtyid:: ` $ delblanks x[!j]
      currentdirtyrec:: line
      if[currentstat = `ND
         / no good match was found for this target
         dirtyunmatched,: ,line
      ]
      / stays 1 until a match line for this target is seen
      unmatchedflag:: 1
      :()
   ]
   if[line _sm ("match*")
      unmatchedflag:: 0
      ii: & line = "|"
      / ignore match lines that do not carry score and clean id fields
      if[3 > #ii; :()]
      x: (1+ii[1]) _ line
      j: x ? "|"
      myscore,: ` $ delblanks x[!j]
      x: (1+ii[2]) _ line
      j: x ? "|"
      xclean: ` $ delblanks x[!j]
      idclean,: xclean
      iddirty,: currentdirtyid
      statutdoublons,: currentstat
      cleanpairs,: ,(xclean; line)
   ]
}

/ Clean-side accumulators.
/ outincorrect is initialised here; processclean does not touch it.
outincorrect: ()
/ ids of every clean record read from cleanin
allcleanid:()
/ (clean id; raw line) pairs for every clean record
universeclean:()

/ processclean[line]: consume one line of cleanin.
/ Lines of 10 characters or fewer are ignored. The id is the text after the
/ third ; (up to the next ;), with blanks removed.
/ Lines with fewer than three ; are now skipped instead of raising an index
/ error, in line with the guard processline applies to match lines.
processclean:{[line]
   if[10 < #line
      ii: & line = ";"
      if[3 > #ii; :()]
      x: (1+ii[2]) _ line
      / parse the id once and reuse it for both tables
      cid: ` $ delblanks x[!x ? ";"]
      allcleanid,: cid
      universeclean,: ,(cid; line)
   ]
}

/ finderrors[]: consistency checks over the DF matches; takes no arguments.
/ 1. Dirty ids with more than one DF match: all their clean,dirty pairs are
/    written to the file toomanymatches and the distinct clean ids involved
/    are appended to outdupinclean.
/ 2. DF matches whose clean id differs from the dirty id, whose dirty id has
/    no DF match to an identical clean id, and whose dirty id is present in
/    allcleanid: written to the file missedcleanone.
finderrors:{[]
  / x: $?differ[allcleanid; ?idclean]
  / ("missingcleans") 0: x
  dfrows: & statutdoublons = `DF
  myclean: idclean[dfrows]
  mydirty: iddirty[dfrows]
  / group the DF rows by dirty id and keep groups with more than one row
  groups: = mydirty
  multi: groups[& 1 < #:' groups]
  if[0 < #multi
    members: ,/ multi
    report: (,"clean,dirty -- all for DF matches"), spitline' myclean[members],'mydirty[members]
    ("toomanymatches") 0: report
    outdupinclean,: ? myclean[members]
  ]
  / dirty ids that do have a DF match to the clean record with the same id
  same: myclean = mydirty
  gooddirty: ? mydirty[& same]
  cand: & (~ same) & ~ mydirty _lin gooddirty
  / keep only candidates whose dirty id exists among the clean ids
  cand: cand[intersectleftindexes[mydirty[cand]; allcleanid]]
  if[0 < #cand
    report: (,"clean,dirty -- where a clean exists"), spitline' myclean[cand],'mydirty[cand]
    ("missedcleanone") 0: report
  ]
}

/ printdups[cleanids; pairs]: pairs is a list of (id; text) entries.
/ Returns, for each id in cleanids, the text of the first entry in pairs
/ carrying that id.
printdups:{[cleanids; pairs]
  :pairs[(pairs[;0]) ?/: cleanids; 1]
}

/ differ[x;y]: distinct items of x that do not occur in y.
/ Atoms are promoted to one-item lists first.
differ:{[x;y]
  x: x, ()
  y: y, ()
  / y ?/: x yields #y for every item of x that is missing from y
  :? x[& (#y) = y ?/: x]
}

/ Driver, part 1: read both input files, fill the accumulators, and write
/ the files dirtyunmatched and tmpforeric.
/ dirtyunmatched starts with one line of blanks; processline appends to it.
dirtyunmatched: ,"          "
/ 1 while the current target has had no match line yet (see processline)
unmatchedflag: 0
/ cleanin must be parsed first so allcleanid and universeclean are complete
a: 0: "cleanin"
x: processclean'a
a: 0: "tmpout"
x: processline'a
("dirtyunmatched") 0: dirtyunmatched
/ tmpforeric: header line, then one clean,dirty,statutdoublons,score row per match
out: ,"clean,dirty,statutdoublons,score"

out,: spitline' idclean,'iddirty,'statutdoublons,'myscore
/ then one row per clean id that was never matched, with the other fields empty
/ NOTE(review): these rows end in two commas (3 fields) while the header has
/ 4 columns -- confirm the consumer of tmpforeric expects this
out,: ($differ[allcleanid;?idclean]) ,\: ",,"

"tmpforeric" 0: out

/ Driver, part 2: compute summary counts and print them to standard output.
/ iddirty only holds dirty ids that had at least one match line, so totcount
/ and the ratios below are relative to dirty records with some match.
/ A DP or ND row counts as correct (a hit) when its dirty id equals its clean id.
/ cleannds is kept for the cleannds file written at the end of the script.
/ NOTE(review): countdirtnotclean compares the ND dirty ids against idclean
/ (clean ids that appear in some match), not against allcleanid -- confirm
/ that this is what the printed label "not in clean" means.
if[1
  totcount: #?iddirty
  ii: & statutdoublons = `DF
  countdf: #?iddirty[ii] 
  ii: & statutdoublons = `DP
  countdp: #?(iddirty[ii])
  countdphit: +/ (iddirty[ii]) = (idclean[ii])
  ii: & statutdoublons = `ND
  countnd: #?(iddirty[ii])
  countndhit: +/ (iddirty[ii]) = (idclean[ii])
  countdirtnotclean: #? differ[iddirty[ii]; idclean]
  x: & (iddirty[ii]) = (idclean[ii])
  cleannds: idclean[ii[x]] / clean records that are found but with a low score
  ` 0: , (" Total dirty that have found any match at all:"), ($totcount)
  ` 0: , (" Total dirty altogether get by grep target tmpout |wc")
  ` 0: , (" Dirty that are DF:"), ($countdf), (" Ratio:"), ($countdf%totcount)
  ` 0: , (" Dirty that are DP:"), ($countdp), (" Ratio:"), ($countdp%totcount)
  ` 0: , (" Number of DP that are correct:"), ($countdphit)
  ` 0: , (" Dirty that are ND:"), ($countnd), (" Ratio:"), ($countnd%totcount)
  ` 0: , (" Number of ND that are correct:"), ($countndhit)
  ` 0: , (" Dirty that are ND and not in clean:"), ($countdirtnotclean)
]
   
/ Driver, part 3: run the consistency checks and write the remaining files.
/ outdupinclean must exist before finderrors runs, because finderrors appends to it.
outdupinclean: ()
finderrors[]
/ dupsinclean: the match lines of clean ids that share a DF-matched dirty id
("dupsinclean") 0: printdups[outdupinclean; cleanpairs]
/ unfoundcleans: the cleanin lines of clean ids that appear in no match
x: ?differ[allcleanid;?idclean]
("unfoundcleans") 0: printdups[x; universeclean]

/ cleannds: the cleanin lines of clean ids matched to their own dirty id with status ND
("cleannds") 0: printdups[?cleannds; universeclean]