/ This spits out the main stats based on tmpout and cleanin
/ Number in dirty that are DF -- found a high score match.
/ Number in dirty that are DP -- found a possible match
/ Number in dirty that are ND -- found no good match
/ and a few others.
/ This also generates the following files based on tmpout and cleanin
/ tmpforeric -- all matches in format clean, dirty, statutdoublons, score
/ unfoundcleans -- records that are in clean that nobody has found
/ dupsinclean -- records in clean that might be duplicated because
/ they are DFed matched to the same dirty.
/ toomanymatches -- clean,dirty pairs for every dirty id that has more
/ than one DF match
/ dirtyunmatched -- all dirty records that are either ND or have
/ no matching cleans of any sort
/ cleannds -- all matches having statut nd that in fact match clean records
/ (none as of 11 January 2005)
/ missedcleanone -- records that are clean and have same id as some dirty
/ yet the dirty is not matched DF to that clean even though the dirty
/ is matched DF to something. (none as of 11 January 2005).

/ finds intersection of two lists
/ fastest of all
/ Searches the longer list in the shorter one and returns the distinct hits.
intersect: {[x;y]
    x,: ()
    y,: ()
    if[(#x) < (#y)
        i: x ?/: y
        j: & i < #x
        :x[?i[j]]
    ]
    i: y ?/: x
    j: & i < #y
    :y[?i[j]]
}

/ For every element of y that occurs in x, returns the index in x of its
/ first occurrence. Elements of y that are absent from x contribute nothing.
intersectleftindexes: {[x;y]
    i: x ?/: y / where each y hits
    j: & i < #x / those ys that hit
    :i[j]
}

/ Formats a list of fields as one comma separated text line.
spitline:{[line] (-1) _ ,/ ($line) ,\: (",")}

/ Removes every blank character from a string.
delblanks:{[word] ii: & ~ (word = " "); word[ii]}

/ Parallel lists, one entry per match line read from tmpout.
idclean: ()
myscore: ()
iddirty: ()
statutdoublons: ()
numtarget: 0
numclean: 0
/ State of the most recent target line seen by processline.
currentdirtyrec: ""
currentdirtyid: `noid
currentstat: `nostat
/ Pairs of clean id and the match line it came from.
cleanpairs: ()

/ Consumes one line of tmpout.
/ A target line is pipe delimited. The status is the field after the first
/ pipe and the dirty id is the field after the third pipe. It becomes the
/ current dirty record.
/ A match line is pipe delimited. The score is the field after the second
/ pipe and the clean id is the field after the third pipe. It is recorded
/ against the current dirty record.
/ Any other line is ignored.
/ NOTE(review): the target branch has no field count check, so a target
/ line with fewer than three pipes is presumably an index error -- confirm.
processline:{[line]
    / if[line _sm ("*GAILLAC*"); :()] / ??? temporary
    if[line _sm ("target*")
        / The previous target received no match lines. Report that previous
        / record and not the line just read. ND targets were already
        / reported when first seen, so they are skipped here.
        if[(unmatchedflag = 1) & (~ currentstat = `ND)
            dirtyunmatched,: ,currentdirtyrec
        ]
        ii: & line = "|"
        x: (1+ii[0]) _ line
        j: x ? "|"
        currentstat:: ` $ delblanks x[!j]
        x: (1+ii[2]) _ line
        j: x ? "|"
        currentdirtyid:: ` $ delblanks x[!j]
        currentdirtyrec:: line
        if[currentstat = `ND
            / ND targets are always reported
            dirtyunmatched,: ,line
        ]
        unmatchedflag:: 1
        :()
    ]
    if[line _sm ("match*")
        unmatchedflag:: 0
        ii: & line = "|"
        if[3 > #ii; :()]
        x: (1+ii[1]) _ line
        j: x ? "|"
        myscore,: ` $ delblanks x[!j]
        x: (1+ii[2]) _ line
        j: x ? "|"
        xclean: ` $ delblanks x[!j]
        idclean,: xclean
        xdirty: currentdirtyid
        iddirty,: xdirty
        xdoublons: currentstat
        statutdoublons,: xdoublons
        cleanpairs,: ,(xclean; line)
    ]
}

outincorrect: ()
/ Ids of all clean records, and pairs of clean id and full clean line.
allcleanid:()
universeclean:()

/ Consumes one line of cleanin. Lines of ten characters or fewer are
/ ignored. The clean id is the field after the third semicolon.
processclean:{[line]
    if[10 < #line
        ii: & line = ";"
        / skip lines that do not have an id field, same guard as for match lines
        if[3 > #ii; :()]
        x: (1+ii[2]) _ line
        j: x ? ";"
        cid: ` $ delblanks x[!j]
        allcleanid,: cid
        universeclean,: ,(cid; line)
    ]
}

/ Writes toomanymatches and missedcleanone from the DF matches and
/ appends to outdupinclean the clean ids that share a dirty id.
finderrors:{[]
    / x: $?differ[allcleanid; ?idclean]
    / ("missingcleans") 0: x
    ii: & statutdoublons = `DF
    myclean: idclean[ii]
    mydirty: iddirty[ii]
    / group the DF matches by dirty id and keep groups with more than one match
    part: = mydirty
    counts: #:' part
    jj: & 1 < counts
    if[0 < #part[jj]
        k1: ,/part[jj]
        x: ,"clean,dirty -- all for DF matches"
        x,: spitline' myclean[k1],'mydirty[k1]
        ("toomanymatches") 0: x
        outdupinclean,: ? myclean[k1]
    ]
    / dirty ids that are DF matched to the clean record with the same id
    jj2: & myclean = mydirty
    gooddirty: ?mydirty[jj2]
    / DF matches to a different id where the dirty never hit its own id
    jj: & (~ myclean = mydirty) & (~ mydirty _lin gooddirty)
    / keep only those whose dirty id exists as a clean id
    kk: intersectleftindexes[mydirty[jj]; allcleanid]
    jj@: kk
    if[0 <#jj
        x: ,"clean,dirty -- where a clean exists"
        x,: spitline'myclean[jj],'mydirty[jj]
        ("missedcleanone") 0: x
    ]
}

/ print out some cleanids
/ Looks up each id in column 0 of pairs and returns the matching column 1.
printdups:{[cleanids; pairs]
    jj: (pairs[;0]) ?/: cleanids
    :pairs[jj;1]
}

/ Distinct elements of x that do not occur in y.
differ:{[x;y]
    x,: ()
    y,: ()
    i: y ?/: x
    j: & i = #y
    :?x[j]
}

dirtyunmatched: ," "
unmatchedflag: 0

a: 0: "cleanin"
x: processclean'a
a: 0: "tmpout"
x: processline'a
/ the last target of the file has no following target to trigger its report
if[(unmatchedflag = 1) & (~ currentstat = `ND)
    dirtyunmatched,: ,currentdirtyrec
]
("dirtyunmatched") 0: dirtyunmatched

out: ,"clean,dirty,statutdoublons,score"
out,: spitline' idclean,'iddirty,'statutdoublons,'myscore
/ clean ids that were never matched get a line with empty trailing fields
out,: ($differ[allcleanid;?idclean]) ,\: ",,"
"tmpforeric" 0: out

if[1
    totcount: #?iddirty
    ii: & statutdoublons = `DF
    countdf: #?iddirty[ii]
    ii: & statutdoublons = `DP
    countdp: #?(iddirty[ii])
    countdphit: +/ (iddirty[ii]) = (idclean[ii])
    ii: & statutdoublons = `ND
    countnd: #?(iddirty[ii])
    countndhit: +/ (iddirty[ii]) = (idclean[ii])
    / NOTE(review): compares against idclean, the clean ids seen in match
    / lines, and not against allcleanid -- confirm this is intended
    countdirtnotclean: #? differ[iddirty[ii]; idclean]
    x: & (iddirty[ii]) = (idclean[ii])
    cleannds: idclean[ii[x]] / clean records that are found but with a low score
    ` 0: , (" Total dirty that have found any match at all:"), ($totcount)
    ` 0: , (" Total dirty altogether get by grep target tmpout |wc")
    ` 0: , (" Dirty that are DF:"), ($countdf), (" Ratio:"), ($countdf%totcount)
    ` 0: , (" Dirty that are DP:"), ($countdp), (" Ratio:"), ($countdp%totcount)
    ` 0: , (" Number of DP that are correct:"), ($countdphit)
    ` 0: , (" Dirty that are ND:"), ($countnd), (" Ratio:"), ($countnd%totcount)
    ` 0: , (" Number of ND that are correct:"), ($countndhit)
    ` 0: , (" Dirty that are ND and not in clean:"), ($countdirtnotclean)
]

outdupinclean: ()
finderrors[]
("dupsinclean") 0: printdups[outdupinclean; cleanpairs]
x: ?differ[allcleanid;?idclean]
("unfoundcleans") 0: printdups[x; universeclean]
("cleannds") 0: printdups[?cleannds; universeclean]