/ This parses dirtyin and cleanin from the single file databases into
/ output files.
/ 


/ Authors: Dennis Shasha and Eric Simon

/ TIME TESTING HARNESS (REMOVE / if you want this)
/ \l time

/ BASIC ROUTINES


/ find difference between list[0] and list[1]
listdiff:{[list]
  :differ[list[0]; list[1]]
}

/ returns one if x is a subset of y
subset:{[x; y]
  i: y ?/: x
  : ~ (#y) _in i
}

/ returns one if x is a subset of y
subset:{[x;y] (#y) > |/ y ?/: x}        


differ:{[x;y]
  x,: ()
  y,: ()
  i: y ?/: x
  j: & i = #y
  :?x[j]
}

/ A faster difference, yielding indexes in x that differ from y
differindexes:{[x;y]
  i: y ?/: x
  j: & i = #y
  :j
}


/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  :x[(?i) _dv #x]
}

/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  x,: ()
  y,: ()
  if[(#x) < (#y)
    i: x ?/: y
    j: & i < #x
    :x[?i[j]]
  ]
  i: y ?/: x
  j: & i < #y
  :y[?i[j]]
}

/finds intersection of two lists
/ fastest of all
hasintersect: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  : (&/i) < #x
}

/ x is a proper subset of y
propersubset:{[x;y]
  x,: ()
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :(#x) = (#intersect[x;y])
}

/ x is a proper subset of y
propersubset:{[x;y]
  x,: ()
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :subset[x;y]
}

/finds indexes in x and y that intersect
/ If x and y are both sets, then the results will be of the same length
/ fastest of all
intersectindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :(i[j];j)
}

/finds indexes in x that intersect with y
intersectleftindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :i[j]
}

/finds intersection of two lists
/ and returns index pairs of matches. Assumes no duplicates
/ in either list
intersectbothindexes: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  pairs: (i ,' (!#y))
  k: & pairs[;0] < #x
  :pairs[k]
}

/ intersect many lists
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 jj: ,/ ?:' first (?/:)/: lists[1+ !(size-1)] / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}

/ this is a set intersection so we remove duplicates
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 jj:  first ?/: (,/ ?:' lists[1+ !(size-1)]) / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}

avg:{(+/ x) % # x}

var:{avg[_sqr x] - _sqr avg[x]}
std:{_sqrt var[x]}
cov:{avg[x * y] - avg[x] * avg[y]}
corr:{ (cov[x;y])%((std[x]) * (std[y]))}
/ delay based search
corrdelay:{[delay;x;y] 
  x: (-delay) _ x
  y: delay _ y
  (cov[x;y])%((std[x]) * (std[y]))}


spitline:{[line] (-3) _ ,/ ($line),\: (" | ")}

/ END BASICS


/ FILE INPUT

/ transform lower case to upper case 
uppercase:{[item]
  c: _ic'item
  ii: & (c > 96) & (c < 123)
  if[0 < #ii
  	c[ii]-: 32
	item: _ci' c
  ]
  if[2 > #item
	:item
  ]
  :item
}

/ parses a field based on semicolons
/ get as strings
getfieldssemi:{[line]
  i: line = ";"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  x:(0,(j1 - !size)) _ line
  y:uppercase'x
  :y
}

/ parses a field based on blanks
/ then eliminates suppressed terms, then returns concatenated
suppressconcat:{[line]
  i: line = " "
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  x:(0,(j1 - !size)) _ line
  y:` $ x
  ii: & ~ y _lin suppress
  z:,/$y[ii]
  jj: & ~ z _lin punctuation
  :z[jj]
}

/ get rid of punctuation and then take off the first two digits if long.
suppresscompress:{[line]
  i: line _lin punctuation
  j2: &~i
  line @:j2
  sz: #line
  : (0 | (sz - 8)) _ line
}

suppressonly:{[line]
  i: line _lin punctuation
  j2: &~i
  line @:j2
  :line
}

/ This is built for incomplete lines in appendix3bad.csv
extendline:{[line]
  if[31 < (#line); :line]
  x: 32 - #line
  yy: x # ,""
  line,: yy
  :line
}

/ get rid of blanks at either end of the string
delendblanks:{[string]
  if[0 = #string; :""]
  if[string ~ ,"" ; :""]
  i: & ~ string = " "
  if[(#string) = (#i); :string]
  if[0 = (#i); :""]
  string: (- ((#string)  - (1 + *|i))) _ string
  :(*i) _ string
}

/ get rid of blanks and trailing S
delblankandS:{[string]
   s: delendblanks[string]
   if[s ~ ,"S"; :s]
   if[(*|s) = "S"; :(-1) _ s]
   :s
}
  

/ MATCHING LOGIC

/ distance function without provisions for gaps or transposition
basicdist:{[x;y] 
  res: {y(1+&)\(1_ x)&(-1_ x)-z}\[!1+#y;1+!#x;x=\:y];res[#x][#y]}

/ similarity based on edit distance
matchdist:{[x;y]
  len: (#x) | (#y)
  if[len = 0; :0]
  :(len - basicdist[x;y]) % len
}

/ if exact match then return 2
/ else fraction of intersect over union
/ Also give extra credit if first letters are the same.
/ CHANGE the credit given to first letters
match:{[symb1; symb2]
  if[symb1 ~ symb2; :2]
  s1: $symb1
  s2: $symb2
  if[1 > (#s1) & (#s2); :0]
  c1: #intersect[s1;s2]
  min: (#s1) & (#s2)
  c1+: (s1[!min] = s2[!min]) ? 0 / give extra weight if you match at position
  :c1 % #?s1,s2
}

/ if exact match then return 2
/ else fraction of intersect over union
minimatch:{[symb1; symb2]
  if[symb1 ~ symb2; :2]
  s1: $symb1
  s2: $symb2
  if[1 > (#s1) & (#s2); :0]
  c1: #intersect[s1;s2]
  :c1 % #?s1,s2
}


/ match records
/ CHANGE the scoring function at will.
/ I think it's pretty good though. Also the highest score should be in
/ the range 12 to 14.
vecmatches:{[index; rec2]
  / if[~ (rec1[7]) ~ (rec2[7]); :0] / departments must be equal
  x: 0
  x+: 2* matchdist[(ville[index]);($rec2[6])] / ville
	/ dept is assumed
  / y: 5 * matchdist[(nom[index]);(,/$rec2[8+!2])] / on nom and prenom
  y: 5 * matchdist[(nom[index]);,/$(rec2[8])] / on nom alone
  if[y < 1; :0]
  x+:  y
  x1: tellist[index] / matching telephone numbers
  x2: rec2[11+!3]
  z: intersect[x1;x2] / entire number is a match; may be empty
  x+: :[(5 < #,/$z); 10; 3* matchdist[(,/$x1); (,/$x2)]]
  x+: 5* matchdist[address1[index]; suppressconcat ($rec2[10])]
  :x
}

/ find the best matching records based on the score.
/ nom
/ address1
/ codepostal
/ ville
/ tellist
findbestmatch:{[index]
  if[5 > #codepostal[index] 
	:(`bad;,"Code postal is incomplete.")]
  firstthree: (codepostal[index; !3])
  file: firstthree,("post")
  if[ file ~ lastfileread
	datarecs: globaldatarecs
  ]
  if[~ file ~ lastfileread
    pair: @[1:: ; file; :]
    if[0 < pair[0]; 
	:(`bad;,"First three code postal symbols are not in database")]
    lastfileread:: file
    datarecs: ? pair[1] 
    globaldatarecs:: datarecs
  ]
  if[0 = #datarecs; :(`bad; ,"   ")] 
  jj: & (datarecs[;5]) = ` $ codepostal[index]
  datarecs@: jj
  scores: vecmatches[index]'datarecs
  if[0 = #scores; :(`bad; ,"   ")] 
  jj: > scores
  jj@: !((#jj) & 20)
  datarecs@: jj
  scores@: jj
  if[scores[0] > 10.99
	jj: & scores > 10.99
  	x: ,spitline scores[jj]
  	x,:spitline'datarecs[jj]
	:(`good; x)
  ]
  if[scores[0] > 9
	jj: & scores > 9
  	x: ,spitline scores[jj]
  	x,:spitline'datarecs[jj]
	:(`possible; x)
  ]
  / scores are less than 9
  jj: & scores > 5
  jj@: !(#jj) & 20
  x: ,spitline scores[jj]
  x,:spitline'datarecs[jj]
  :(`bad; x)
}

/ DUMP TABLE dumptable


/ formstring takes a list and makes a string
formstring:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: (" ")
}

formstringvertbar:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: ("|")
}

formstringcomma:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: (",")
}

/ Output a table (a variable) to a text file outfile (string)
/ e.g.   output[`guide; guide; "foobar"]
dumptable:{[tablename; table; outfile]
  out: ,("# "), ($tablename), ("|"), formstringvertbar[!table]
  first: *!table
  numofelements: . ("#"), ($tablename), ("."), ($first)
  i: 0
  while[i < numofelements
	list: table[;i]
	x: formstring'list
	out,: , (-1) _ ,/x ,\: ("|")
	i+: 1
  ]
  outfile 0: out
}

dumptablecsv:{[tablename; table; outfile]
  out: , formstringcomma[!table]
  first: *!table
  numofelements: . ("#"), ($tablename), ("."), ($first)
  i: 0
  while[i < numofelements
	list: table[;i]
	x: formstring'list
	out,: , (-1) _ ,/x ,\: (",")
	i+: 1
  ]
  outfile 0: out
}

/ APPLICATION


makeupper:{[let]
  x: _ic let
  if[(x < 97) | (x > 122); :let]
  x-: 32
  :_ci x
}

/ parses a field based on white space and other stuff
/ CHANGE HERE: other things to ignore
getfieldswhite:{[line]
  i: line _lin " .-*/;'"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  x:(0,(j1 - !size)) _ line
  counts: #:'x
  ii: & 0 < counts
  if[0 = #ii; :`]
  :` $ delblankandS'x[ii]
}

canon:{[item]
  i: translatesource ? item
  if[i = #translatesource; :item]
  :translatetarget[i]
}
  
splittokenize:{[line]
	x: * tokenize[line]
	if[3 > #$x; :x]
	:(` $ (($x)[!2])), x
}

/ create a new member of this line if the first two symbols are singletons
/ Prepends some guys together if they have one letter.
conglomtokenize:{[line]
	s1: tokenize[line] / eliminates all manner of delimiter
		/ blanks, ;, etc. and eliminates S.
	if[2 > #s1; :s1]
	if[2 = #s1
	  if[(1 = #$s1[0]) & (1 = #$s1[1]); s1: (` $ ($s1[0]),($s1[1])),s1] 
	]
	if[2 < #s1
	  if[(1 = #$s1[0]) & (1 = #$s1[1]) & (1 = #$s1[2])
		s1: (` $ ($s1[0]),($s1[1]),($s1[2])),s1
	  ]
	]
	:s1
}

splitnumeric:{[line]
	s1: tokenize[line]
	num1: -3
	ii: & isnumeric's1
	if[0 < #ii
		num1: 0 $ $ s1[*ii]
	]
	if[0 > num1; :s1]
	first:(` $ ($num1))
	:first,s1
}


/ tokenize each list
/ This picks strings out from the middle.
tokenize:{[line]
   my: makeupper'$line
   f1: getfieldswhite my
   f2: canon'f1
   f3: differ[f2; suppress]
   :f3
}

/ tokenize each list
/ This picks strings out from the middle.
teltokenize:{[line]
   my: $line
   ii: & ~my = "."
   my@: ii / get rid of .
   ii: & ~my = ","
   my@: ii / get rid of .
   countwhite: +/ my = " "
   if[countwhite > 3 / probably between pairs of digits
      ii: & ~my = " "
      my@: ii / get rid of .
   ]
   f1: getfieldswhite my
   if[2 > #,/$f1; :` $ $ * 1 _draw 10000000] / generate a random number 
   :f1
}

/ Above does parsing
/ Now process

sort:{[list] : list[<list]}
  
/ Use the tokenized table new to find things
/ Algorithm: partition based on code (postal)
/ Then filter each group by using the first word of nom
/ (unless it's an acronym in which case use the letters) 
/ After that, try the address to get a better sense of the match.

/ This tries to unify two partitions. Untried so far.
process:{[]
  / part: = (*:' new.code),'(*:' new.address)
  / part: = (*:' new.code),'(*:' new.nom),'(*:'new.address)
  part: = (*:' new.code),'(*:' new.nom)
  x: getbest'part
}

processrollsroyce:{[]
  / part: = (*:' new.code),'(*:' new.address)
  / part: = (*:' new.code),'(*:' new.nom),'(*:'new.address)
  part: = (*:' new.code)
  x: processgroup'part
}

/ a bunch of indexes with the same code postale
/ Only from processrollsroyce
processgroupold:{[indexes]
  mynames: *:' new.nom[indexes]
  part: = mynames
  x: getbest'[indexes[part]]
}

/ This tries to unify two partitions. Untried so far.
processgroup:{[indexes]
  part: = (*:' new.nom[indexes])
  part2: = (*:' new.address[indexes])
  d12: differ[part; part2]
  d21: differ[part2; part1]
  subflag: ()
  j: 0
  while[j < #d12
  	subflag,: |/ propersubset[d12[j]]'d21
	j+: 1
  ]
  / Any index in subflag should be deleted
  d12: d12 _di &subflag
  subflag: ()
  j: 0
  while[j < #d21
  	subflag,: |/ propersubset[d21[j]]'d12
	j+: 1
  ]
  / Any index in subflag should be deleted
  d21: d21 _di &subflag
  both: intersect[part; part2]
  / x: processgroup'part
  newpart: both,d12,d21
  x: getbest'[indexes[newpart]]
}


/ These ones have the same code postale and the same first part of nom 
/ Try to find the best matches.
getbest:{[indexes]
  thresh: 4
  if[1 < #indexes
	mynames: new.nom[indexes]
	myadds:  new.address[indexes]
	myville: new.ville[indexes]
	mytel: new.tel[indexes]
	mycode: new.code[indexes]
	/ if[(`JPC _in ,/mynames) ; !-1]
	ind: !#mynames
	pairs: ,/ ind ,/:\: ind
	pairs@: & (pairs[;0]) < (pairs[;1])
	xname: findvalnames'(mynames[pairs])
	xadd: findvaladdresses'(myadds[pairs])
	xville: findvalvilles'(myville[pairs])
	xtel: findvaltel'(mytel[pairs])
	xcode:  findvalcodes'(mycode[pairs])
	/ CHANGE HERE: also see thresh above.
	/ Weighted add approach combined with logic.
	/ The logic says the name is the same and either the code
	/ city or address.
	/ xtel by the same is conclusive
	/ following was eliminated because a similarity in the name,
	/ same city and same code and anything in address was enough.
	/ xtot: (2*xname) + (0.5* xadd) + xville + (2 * xcode) + (5*xtel)
	xtot: (2*xname) + (0.5* xadd) + xville + (xcode) + (5*xtel)
	ii: & (xtot > thresh) | (1=(xname & (xcode|xville|xadd) & (xadd > 0.3)))
	goodpairs: indexes[pairs[ii]]
	/ CHANGE HERE: if it's a transitive closure but not a clique
	/ then it's probable but not certain. For later.
	if[0 < #goodpairs
	  mymat: findcycles[goodpairs]
	  nodes: ?(goodpairs[;0]),(goodpairs[;1])
	  left: nodes
	  yy: ()
	  while[0 < #left
		i: nodes ? *left
		/ CHANGE HERE: We take the transitive closure as a group.
		vv: & mymat[i]
		jj: ?nodes[vv]
		if[0 < #jj
		  yy,: ,"                "
		  globalcount+: 1
		  yy,: ,$globalcount
		  k: 0
		  while[k < #jj
			v: jj[k]
			yy,: ,spitline (lapeyre.Clientnum[v]),(lapeyre.Nom[v]),(lapeyre.Adresse1[v]),(lapeyre.CodePostal[v]),(lapeyre.Ville[v]),(lapeyre.CANetHTfacture[v]),(lapeyre.NumeroTellineaire[v])
			k+: 1
		  ]
		]
		left: differ[left; jj]
		/ if[0 < #left; !-1]
	  ]
	  globalresult,: yy
	]
  ]
}

findvalnames:{[namepair]
	:(#intersect[namepair[0]; namepair[1]])%(#?(namepair[0]),(namepair[1]))
}

findvalcodes:{[codepair]
	:(#intersect[codepair[0]; codepair[1]])%(#?(codepair[0]),(codepair[1]))
}

/ CHANGE HERE: if there is any intersection then it is one
findvaltel:{[telpair]
	:0 < (#intersect[telpair[0]; telpair[1]])
}

findvaladdresses:{[addpair]
	y:(#intersect[addpair[0]; addpair[1]]) % (#?(addpair[0]),(addpair[1]))
	:(y)
}

findvalvilles:{[vpair]
	:(#intersect[vpair[0]; vpair[1]]) % (#?(vpair[0]),(vpair[1]))
}

/ build edge matrix
buildmatrix:{[edges]
  els: ?edges[;0],edges[;1]
  num: #els
  mat: (num; num) # 0
  i: 0
  while[i < #mat
	mat[i;i]: 1
	i+: 1
  ]
  i: 0
  while[i < #edges
        pair: edges[i]
        j1: els ? pair[0]
        j2: els ? pair[1]
        mat[j1;j2]: 1
        mat[j2;j1]: 1
        i+: 1
  ]
  :mat
}

/ find transitive closure of adjacency matrix
transclos:{[mat]
  flag: 1
  while[flag
        newmat: ()
        tmat: +mat
        i: 0
        while[i < #mat
                yy: |/'mat[i] */: tmat
                newmat,: ,yy |' mat[i]
                i+: 1
        ]
        flag: ~ mat ~ newmat
        mat: newmat
  ]
  :mat
}

/ transitive closure
/ build matrix and then do multiplication as needed
findcycles:{[edges]
  mat: buildmatrix[edges]
  tmpmat:: mat
  :transclos[mat]
}

/ DATA

punctuation: " ,.;-_*"

suppress: (`"DE"
`ST
`SAINT
`"LE"
`"LA"
`"DU"
`"-"
`"DES"
`"AV"
`"AVENUE"
`"ROUTE"
`"RTE"
`"&"
`ET
`RUE
`"R."
`"BIS"
`"B."
`"BOULEVARD"
`"BLVD"
`"BD"
`"AVE"
`BLD
`SARL
`SA
`ENT
`SAS
`SNC
`STE
`ADMIN)


translatesource:(`AV `RTE `"&")
translatetarget:(`AVENUE `ROUTE `ET)


/ EXECUTION

/ TIME TESTING HARNESS (REMOVE / if you want this)
/ .time.set`.k
/ start: _t
/ END OF TIME TESTING HARNESS (REMOVE / if you want this)


` 0: ,"Test files are assumed: dirtyin and cleanin and should have 28 fields "
` 0: ,"per record. Some of those may be blank."

file: "appendix3bad.csv"
if[0 < #_i
	file: _i[0]
]

/ This is the schema of appendix3bad.csv
enseigne: ()
typeclient: ()
id: ()
codedepot: ()
nom:()
numclient: ()
patronyme: ()
datecreation: ()
tauxderemise: ()
addresse1: ()
addresse2: ()
codepostal: ()
ville: ()
numerotel: ()
numerotelport: ()
fax: ()
contact: ()
numerotelprof: ()
titreclientchantier: ()
nomclientchantier: ()
addresschantier: ()
cpltaddress: ()
cpclientchantier: ()
villeclientchantier: ()
etageclientchantier: ()
telclientchantier: ()
telportclientchantier: ()
telprofclientchantier: ()

/ This is the only dependency on the input file.
/ Have changed , to semicolon to get dirtyin file.
/  sed 's/,/;/g' app4dirty.csv > dirtyin
/ Have eliminated " and "NULL" in clean file.
/ cat app5clean.csv | sed 's/"NULL"//g' | sed 's/"//g' > cleanin
/ Have changed getfieldssemi to get rid of NULL.
/ Have gotten rid of first line of dirtyin and cleanin.
/ extendline has also changed to go to 32
/ CHANGE FOR FORMAT
testrecs: ? getfieldssemi'0: "dirtyin" / independent of file type
/ special case we are trying 52
/ testrecs: ,testrecs[52]

testrecs: extendline'testrecs
nom: suppressonly'testrecs[;4]
address1: suppressconcat'testrecs[;9],'testrecs[;10]
codepostal: testrecs[;11]
ville: testrecs[;12]
x1: ` $ suppresscompress'testrecs[;13]
x2: ` $ suppresscompress'testrecs[;14]
x3: ` $ suppresscompress'testrecs[;15]
x4: ` $ suppresscompress'testrecs[;17]
tellist: x1,'x2,'x3,'x4
siret: testrecs[;30]
contact:  suppressonly'testrecs[;16]
jj: < codepostal
testrecs@: jj
nom@: jj
address1@: jj
codepostal@: jj
ville@: jj
tellist@: jj
siret@: jj
contact@: jj

"testrecs" 1: testrecs
"nom" 1: nom
"address1" 1: address1
"codepostal" 1: codepostal
"ville" 1: ville
"tellist" 1: tellist
"siret" 1: siret
"contact" 1: contact

testrecs: ? getfieldssemi'0: "cleanin"
testrecs: extendline'testrecs / May not be enough
siretclean: testrecs[;0]
nomclean: suppressonly'testrecs[;5]
address1clean: suppressconcat'testrecs[;10],'testrecs[;11]
codepostalclean: testrecs[;12]
villeclean: testrecs[;13]
x1: ` $ suppresscompress'testrecs[;14]
x2: ` $ suppresscompress'testrecs[;15]
x3: ` $ suppresscompress'testrecs[;16]
x4: ` $ suppresscompress'testrecs[;18]
tellistclean: x1,'x2,'x3,'x4
contactclean:  suppressonly'testrecs[;17]
jj: < codepostalclean
testrecs@: jj
siretclean@: jj
contactclean@: jj
nomclean@: jj
address1clean@: jj
codepostalclean@: jj
villeclean@: jj
tellistclean@: jj

"testrecsclean" 1: testrecs
"nomclean" 1: nomclean
"address1clean" 1: address1clean
"codepostalclean" 1: codepostalclean
"villeclean" 1: villeclean
"tellistclean" 1: tellistclean
"siretclean" 1: siretclean
"contactclean" 1: contactclean

\\