/ We are trying to do searching.
/ We first import a csv file.
/ We parse it into 15 fields, suppressing final quotes and stuff.
/ Index by dept and write it out.
/ Order of fields:
/ sound1: ()
/ sound2: ()
/ sound3: ()
/ sound4: ()
/ phonexname: ()
/ codepostale: ()
/ ville: ()
/ dept: ()
/ nom: ()
/ prenom: ()
/ adresse: ()
/ teldom: ()
/ telport: ()
/ telbur: ()
/ nature: ()
/ Steps: first do a split -lines=100000 on the input file.
/ This should give you the files xaa, xab, ...
/ Then run this program which gives you dbrecs.

/ Algorithms:
/ Expect the input in the same format as the initial client sample.
/ 1. For each input record, divide it into 15 fields.
/ 2. find the appropriate code postale region by appending the
/ first three numbers to the word post and bringing in the data.
/ 3. do a match.
/
/ Some notes on the matches: 
/ 1. The sample gives some misleading soundexes
/ inasmuch as the first soundex field in the testbed spreadsheet
/ has numerals but the data doesn't have numerals in the soundex fields.
/ So cutting out that field will probably help.
/ We should also ask why they did that.
/ 2. Using the entire code postale doesn't make that much difference
/ in accuracy compared to using the first three numbers.
/ We can play with this in a second version.
/ 3. Things to change are marked CHANGE
/


/ Author: Dennis Shasha and Eric Simon


/ BASIC ROUTINES



/ find difference between list[0] and list[1]
listdiff:{[list]
  :differ[list[0]; list[1]]
}

/ returns one if x is a subset of y
subset:{[x; y]
  i: y ?/: x
  : ~ (#y) _in i
}

/ returns one if x is a subset of y
subset:{[x;y] (#y) > |/ y ?/: x}        


differ:{[x;y]
  x,: ()
  y,: ()
  i: y ?/: x
  j: & i = #y
  :?x[j]
}

/ A faster difference, yielding indexes in x that differ from y
differindexes:{[x;y]
  i: y ?/: x
  j: & i = #y
  :j
}


/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  :x[(?i) _dv #x]
}

/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  x,: ()
  y,: ()
  if[(#x) < (#y)
    i: x ?/: y
    j: & i < #x
    :x[?i[j]]
  ]
  i: y ?/: x
  j: & i < #y
  :y[?i[j]]
}

/finds intersection of two lists
/ fastest of all
hasintersect: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  : (&/i) < #x
}

/ x is a proper subset of y
propersubset:{[x;y]
  x,: ()
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :(#x) = (#intersect[x;y])
}

/ x is a proper subset of y
propersubset:{[x;y]
  x,: ()
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :subset[x;y]
}

/finds indexes in x and y that intersect
/ If x and y are both sets, then the results will be of the same length
/ fastest of all
intersectindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :(i[j];j)
}

/finds indexes in x that intersect with y
intersectleftindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :i[j]
}

/finds intersection of two lists
/ and returns index pairs of matches. Assumes no duplicates
/ in either list
intersectbothindexes: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  pairs: (i ,' (!#y))
  k: & pairs[;0] < #x
  :pairs[k]
}

/ intersect many lists
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 jj: ,/ ?:' first (?/:)/: lists[1+ !(size-1)] / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}

/ this is a set intersection so we remove duplicates
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 jj:  first ?/: (,/ ?:' lists[1+ !(size-1)]) / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}

avg:{(+/ x) % # x}

var:{avg[_sqr x] - _sqr avg[x]}
std:{_sqrt var[x]}
cov:{avg[x * y] - avg[x] * avg[y]}
corr:{ (cov[x;y])%((std[x]) * (std[y]))}
/ delay based search
corrdelay:{[delay;x;y] 
  x: (-delay) _ x
  y: delay _ y
  (cov[x;y])%((std[x]) * (std[y]))}


spitline:{[line] (-1) _ ,/ ($line),\: ("|")}

/ END BASICS


/ FILE INPUT

/ transform lower case to upper case and strip out ending quotes
stripquotes:{[item]
  c: _ic'item
  ii: & (c > 96) & (c < 123)
  if[0 < #ii
  	c[ii]-: 32
	item: _ci' c
  ]
  if[2 > #item
	:item
  ]
  if[((item[0]) = "\"") & ((*|item) = "\"")
	: (-1) _ (1 _ item)
  ]
  :item
}

/ parses a field based on tabs
/ Strip away final quotes.
/ Accept records only if it has 15 fields.
getfieldsvert:{[line]
  i: line = "|"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  x:(0,(j1 - !size)) _ line
  y:` $ stripquotes'x
  :y
}

/ get rid of blanks at either end of the string
delendblanks:{[string]
  if[0 = #string; :""]
  if[string ~ ,"" ; :""]
  i: & ~ string = " "
  if[(#string) = (#i); :string]
  if[0 = (#i); :""]
  string: (- ((#string)  - (1 + *|i))) _ string
  :(*i) _ string
}

/ get rid of blanks and trailing S
delblankandS:{[string]
   s: delendblanks[string]
   if[s ~ ,"S"; :s]
   if[(*|s) = "S"; :(-1) _ s]
   :s
}
  

/ MATCHING LOGIC

/ distance function without provisions for gaps or transposition
basicdist:{[x;y] res: {y(1+&)\(1_ x)&(-1_ x)-z}\[!1+#y;1+!#x;x=\:y];res[#x][#y]}

/ similarity based on edit distance
matchdist:{[x;y]
  len: (#x) | (#y)
  if[len = 0; :0]
  :(len - basicdist[x;y]) % len
}

/ if exact match then return 2
/ else fraction of intersect over union
/ Also give extra credit if first letters are the same.
/ CHANGE the credit given to first letters
match:{[symb1; symb2]
  if[symb1 ~ symb2; :2]
  s1: $symb1
  s2: $symb2
  if[1 > (#s1) & (#s2); :0]
  c1: #intersect[s1;s2]
  min: (#s1) & (#s2)
  c1+: (s1[!min] = s2[!min]) ? 0 / give extra weight if you match at position
  :c1 % #?s1,s2
}

/ if exact match then return 2
/ else fraction of intersect over union
minimatch:{[symb1; symb2]
  if[symb1 ~ symb2; :2]
  s1: $symb1
  s2: $symb2
  if[1 > (#s1) & (#s2); :0]
  c1: #intersect[s1;s2]
  :c1 % #?s1,s2
}


/ match records
/ CHANGE the scoring function at will.
vecmatches:{[rec1; rec2]
  / if[~ (rec1[7]) ~ (rec2[7]); :0] / departments must be equal
  x: 0
  x1: rec1[!3] / matching soundex numbers
  x2: rec2[!3]
  z: intersect[x1;x2] / entire number is a match; may be empty
  / y: :[(2 < #,/$z); 10; 2* match[(,/$x1); (,/$x2)]]
  y: :[(2 < #,/$z); 10; 7* matchdist[(,/$x1); (,/$x2)]]
  / if[y = 0; :0]
  x+: y
  / x+: 2* match[($rec1[4]);($rec2[4])] / phonexname
  x+: 5* matchdist[($rec1[4]);($rec2[4])] / phonexname
  x+: 0 < match[($rec1[5]);($rec2[5])] / code postale
  x+: 5 * matchdist[($rec1[6]);($rec2[6])] / ville
	/ dept is assumed
  / y: 5 * match[,/($rec1[8+!2]);(,/$rec2[8+!2])] / on nom and prenom
  y: 8 * matchdist[,/($rec1[8+!2]);(,/$rec2[8+!2])] / on nom and prenom
  if[y = 0; :0]
  x+:  y
  x1: rec1[11+!3] / matching telephone numbers
  x2: rec2[11+!3]
  z: intersect[x1;x2] / entire number is a match; may be empty
  / x+: :[(5 < #,/$z); 10; 2* match[(,/$x1); (,/$x2)]]
  x+: :[(5 < #,/$z); 10; 3* matchdist[(,/$x1); (,/$x2)]]
  :x
}

/ find the best matching records based on the score.
/ CHANGE how to deal with postal code
findbestmatch:{[rec]
  codepostale: $ rec[5]
  firstthree: (codepostale[!3])
  file: firstthree,("post")
  pair: @[1:: ; file; :]
  if[0 < pair[0]; :,"First three code postale symbols are not in database"]
  datarecs: ? pair[1] 
  jj: & ((datarecs[;14]) = rec[14]) & ((datarecs[;5]) = (rec[5]))
	/ I assume professionel/particulier is reliable
  / jj: & ((datarecs[;14]) = rec[14]) 
  datarecs@: jj
  scores: vecmatches[rec]'datarecs
  jj: > scores
  jj@: !((#jj) & 20)
  :spitline' (`candidate),/: datarecs[jj]
}

/ DUMP TABLE dumptable


/ formstring takes a list and makes a string
formstring:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: (" ")
}

formstringvertbar:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: ("|")
}

formstringcomma:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: (",")
}

/ Output a table (a variable) to a text file outfile (string)
/ e.g.   output[`guide; guide; "foobar"]
dumptable:{[tablename; table; outfile]
  out: ,("# "), ($tablename), ("|"), formstringvertbar[!table]
  first: *!table
  numofelements: . ("#"), ($tablename), ("."), ($first)
  i: 0
  while[i < numofelements
	list: table[;i]
	x: formstring'list
	out,: , (-1) _ ,/x ,\: ("|")
	i+: 1
  ]
  outfile 0: out
}

dumptablecsv:{[tablename; table; outfile]
  out: , formstringcomma[!table]
  first: *!table
  numofelements: . ("#"), ($tablename), ("."), ($first)
  i: 0
  while[i < numofelements
	list: table[;i]
	x: formstring'list
	out,: , (-1) _ ,/x ,\: (",")
	i+: 1
  ]
  outfile 0: out
}

/ APPLICATION


makeupper:{[let]
  x: _ic let
  if[(x < 97) | (x > 122); :let]
  x-: 32
  :_ci x
}

/ parses a field based on white space and other stuff
/ CHANGE HERE: other things to ignore
getfieldswhite:{[line]
  i: line _lin " .-*/;'"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  x:(0,(j1 - !size)) _ line
  counts: #:'x
  ii: & 0 < counts
  if[0 = #ii; :`]
  :` $ delblankandS'x[ii]
}

canon:{[item]
  i: translatesource ? item
  if[i = #translatesource; :item]
  :translatetarget[i]
}
  
splittokenize:{[line]
	x: * tokenize[line]
	if[3 > #$x; :x]
	:(` $ (($x)[!2])), x
}

/ create a new member of this line if the first two symbols are singletons
/ Prepends some guys together if they have one letter.
conglomtokenize:{[line]
	s1: tokenize[line] / eliminates all manner of delimiter
		/ blanks, ;, etc. and eliminates S.
	if[2 > #s1; :s1]
	if[2 = #s1
	  if[(1 = #$s1[0]) & (1 = #$s1[1]); s1: (` $ ($s1[0]),($s1[1])),s1] 
	]
	if[2 < #s1
	  if[(1 = #$s1[0]) & (1 = #$s1[1]) & (1 = #$s1[2])
		s1: (` $ ($s1[0]),($s1[1]),($s1[2])),s1
	  ]
	]
	:s1
}

splitnumeric:{[line]
	s1: tokenize[line]
	num1: -3
	ii: & isnumeric's1
	if[0 < #ii
		num1: 0 $ $ s1[*ii]
	]
	if[0 > num1; :s1]
	first:(` $ ($num1))
	:first,s1
}


/ tokenize each list
/ This picks strings out from the middle.
tokenize:{[line]
   my: makeupper'$line
   f1: getfieldswhite my
   f2: canon'f1
   f3: differ[f2; suppress]
   :f3
}

/ tokenize each list
/ This picks strings out from the middle.
teltokenize:{[line]
   my: $line
   ii: & ~my = "."
   my@: ii / get rid of .
   ii: & ~my = ","
   my@: ii / get rid of .
   countwhite: +/ my = " "
   if[countwhite > 3 / probably between pairs of digits
      ii: & ~my = " "
      my@: ii / get rid of .
   ]
   f1: getfieldswhite my
   if[2 > #,/$f1; :` $ $ * 1 _draw 10000000] / generate a random number 
   :f1
}

/ Above does parsing
/ Now process

sort:{[list] : list[<list]}
  
/ Use the tokenized table new to find things
/ Algorithm: partition based on code (postal)
/ Then filter each group by using the first word of nom
/ (unless it's an acronym in which case use the letters) 
/ After that, try the address to get a better sense of the match.

/ This tries to unify two partitions. Untried so far.
process:{[]
  / part: = (*:' new.code),'(*:' new.address)
  / part: = (*:' new.code),'(*:' new.nom),'(*:'new.address)
  part: = (*:' new.code),'(*:' new.nom)
  x: getbest'part
}

processrollsroyce:{[]
  / part: = (*:' new.code),'(*:' new.address)
  / part: = (*:' new.code),'(*:' new.nom),'(*:'new.address)
  part: = (*:' new.code)
  x: processgroup'part
}

/ a bunch of indexes with the same code postale
/ Only from processrollsroyce
processgroupold:{[indexes]
  mynames: *:' new.nom[indexes]
  part: = mynames
  x: getbest'[indexes[part]]
}

/ This tries to unify two partitions. Untried so far.
processgroup:{[indexes]
  part: = (*:' new.nom[indexes])
  part2: = (*:' new.address[indexes])
  d12: differ[part; part2]
  d21: differ[part2; part1]
  subflag: ()
  j: 0
  while[j < #d12
  	subflag,: |/ propersubset[d12[j]]'d21
	j+: 1
  ]
  / Any index in subflag should be deleted
  d12: d12 _di &subflag
  subflag: ()
  j: 0
  while[j < #d21
  	subflag,: |/ propersubset[d21[j]]'d12
	j+: 1
  ]
  / Any index in subflag should be deleted
  d21: d21 _di &subflag
  both: intersect[part; part2]
  / x: processgroup'part
  newpart: both,d12,d21
  x: getbest'[indexes[newpart]]
}


/ These ones have the same code postale and the same first part of nom 
/ Try to find the best matches.
getbest:{[indexes]
  thresh: 4
  if[1 < #indexes
	mynames: new.nom[indexes]
	myadds:  new.address[indexes]
	myville: new.ville[indexes]
	mytel: new.tel[indexes]
	mycode: new.code[indexes]
	/ if[(`JPC _in ,/mynames) ; !-1]
	ind: !#mynames
	pairs: ,/ ind ,/:\: ind
	pairs@: & (pairs[;0]) < (pairs[;1])
	xname: findvalnames'(mynames[pairs])
	xadd: findvaladdresses'(myadds[pairs])
	xville: findvalvilles'(myville[pairs])
	xtel: findvaltel'(mytel[pairs])
	xcode:  findvalcodes'(mycode[pairs])
	/ CHANGE HERE: also see thresh above.
	/ Weighted add approach combined with logic.
	/ The logic says the name is the same and either the code
	/ city or address.
	/ xtel by the same is conclusive
	/ following was eliminated because a similarity in the name,
	/ same city and same code and anything in address was enough.
	/ xtot: (2*xname) + (0.5* xadd) + xville + (2 * xcode) + (5*xtel)
	xtot: (2*xname) + (0.5* xadd) + xville + (xcode) + (5*xtel)
	ii: & (xtot > thresh) | (1=(xname & (xcode|xville|xadd) & (xadd > 0.3)))
	goodpairs: indexes[pairs[ii]]
	/ CHANGE HERE: if it's a transitive closure but not a clique
	/ then it's probable but not certain. For later.
	if[0 < #goodpairs
	  mymat: findcycles[goodpairs]
	  nodes: ?(goodpairs[;0]),(goodpairs[;1])
	  left: nodes
	  yy: ()
	  while[0 < #left
		i: nodes ? *left
		/ CHANGE HERE: We take the transitive closure as a group.
		vv: & mymat[i]
		jj: ?nodes[vv]
		if[0 < #jj
		  yy,: ,"                "
		  globalcount+: 1
		  yy,: ,$globalcount
		  k: 0
		  while[k < #jj
			v: jj[k]
			yy,: ,spitline (lapeyre.Clientnum[v]),(lapeyre.Nom[v]),(lapeyre.Adresse1[v]),(lapeyre.CodePostal[v]),(lapeyre.Ville[v]),(lapeyre.CANetHTfacture[v]),(lapeyre.NumeroTellineaire[v])
			k+: 1
		  ]
		]
		left: differ[left; jj]
		/ if[0 < #left; !-1]
	  ]
	  globalresult,: yy
	]
  ]
}

findvalnames:{[namepair]
	:(#intersect[namepair[0]; namepair[1]])%(#?(namepair[0]),(namepair[1]))
}

findvalcodes:{[codepair]
	:(#intersect[codepair[0]; codepair[1]])%(#?(codepair[0]),(codepair[1]))
}

/ CHANGE HERE: if there is any intersection then it is one
findvaltel:{[telpair]
	:0 < (#intersect[telpair[0]; telpair[1]])
}

findvaladdresses:{[addpair]
	y:(#intersect[addpair[0]; addpair[1]]) % (#?(addpair[0]),(addpair[1]))
	:(y)
}

findvalvilles:{[vpair]
	:(#intersect[vpair[0]; vpair[1]]) % (#?(vpair[0]),(vpair[1]))
}

/ build edge matrix
buildmatrix:{[edges]
  els: ?edges[;0],edges[;1]
  num: #els
  mat: (num; num) # 0
  i: 0
  while[i < #mat
	mat[i;i]: 1
	i+: 1
  ]
  i: 0
  while[i < #edges
        pair: edges[i]
        j1: els ? pair[0]
        j2: els ? pair[1]
        mat[j1;j2]: 1
        mat[j2;j1]: 1
        i+: 1
  ]
  :mat
}

/ find transitive closure of adjacency matrix
transclos:{[mat]
  flag: 1
  while[flag
        newmat: ()
        tmat: +mat
        i: 0
        while[i < #mat
                yy: |/'mat[i] */: tmat
                newmat,: ,yy |' mat[i]
                i+: 1
        ]
        flag: ~ mat ~ newmat
        mat: newmat
  ]
  :mat
}

/ transitive closure
/ build matrix and then do multiplication as needed
findcycles:{[edges]
  mat: buildmatrix[edges]
  tmpmat:: mat
  :transclos[mat]
}

/ DATA



suppress: (`"DE"
`"LE"
`"LA"
`"DU"
`"-"
`"DES"
`".")

translatesource:(`AV `RTE `"&")
translatetarget:(`AVENUE `ROUTE `ET)



/ EXECUTION


` 0: ,"Test file is called test.csv by default and should have 15 fields "
` 0: ,"per record. Some of those may be blank."
` 0: ,"If you want to use a different file, let it be the first argument."

file: "test.csv"
if[0 < #_i
	file: _i[0]
]

sound1: ()
sound2: ()
sound3: ()
sound4: ()
phonexname: ()
codepostale: ()
ville: ()
dept: ()
nom: ()
prenom: ()
adresse: ()
teldom: ()
telport: ()
telbur: ()
nature: ()

testrecs: getfieldsvert'0: file
/ special case we are trying 52
/ testrecs: ,testrecs[52]


out: ()
/ tentatively, I'm going to store these as 15 element records
i: 0
while[i < #testrecs
	/ out,: ,"                 "
	/ out,: ,"================="
	out,: ,"                 "
	out,: ,("QUERY|"),spitline testrecs[i]
	/ out,: ,("Best matches from best to worst:")
	out,: findbestmatch[testrecs[i]]
	i+: 1
]
"tmpout" 0: out


` 0: ,"search is done. Thanks for using it. Output in tmpout. Bye."
\\