/ August 2005.
/ ajouts2 rows have RNVP statuses of `"1" or `"0".
/ If the status is `"1" we expect more precision than if it is `"0".
/ See finalreport for algorithm.
/ August 25, 2005
/ If we have to change this, make the change in findbestmatch
/ that a match to name goes to possible automatically.
/ If still no match, then try a desperate measure, which is to
/ look for some common word in the name and add in address, Ville and CP.
/ This version includes that code.
/ Uses an index on NomRaisonsocialewords

/ August 30:
/ Allow an ajouts row whose RNVP status is yes to be compared with a no (refRNVPno) row.

/ September 6:
/ In vecmatchyes and vecmatchno, if the first letter of the name differs
/ or the similarity is less than 90%, then the max score is 9.6
/ (the code in those two functions currently caps the score at 9.5).

/ If NatureClient is 1 for one of them and not 1 for the other, then
/ reject from double. This is in vecmatchyes and vecmatchno.

/ If SIREN is the same, then at least possible

/ If telephone is the same, then at least possible

/ If first two digits of CP are different then can't be even uncertain
/ unless SIRET is the same.

/ If ville and CP are different then can't be even uncertain.

/ September 9:
/ In vecmatchyes and vecmatchno postprocessing
/ if score > 10
/   If address is empty or not substantially the same then reduce score to 9.6
/   If no name then reduce score to 9.6
/   If numdevoie is present and different then reduce score to 9.6

/ If Siret the same then return S.
/ If name, prenom and address are identical then S
/ If name prenom and telephone are identical then S.

/ September 11: all the steps that begin with verify. See the spec
/ of 11 September in the mail in eric.d/clean.d.

/ Sept 12: There is one more thing we could do and that is marked newverify
/ on second thought this should not be necessary.


/ Load the three working tables from their saved K data files.
/ ajouts holds the rows that findbestmatch compares against the two
/ reference tables refRNVPyes and refRNVPno.
ajouts: 1: "ajouts2"
refRNVPyes: 1: "refRNVPyes"
refRNVPno: 1: "refRNVPno"

/ Authors: Dennis Shasha and Eric Simon

/ TIME TESTING HARNESS (REMOVE / if you want this)
/ \l time

/ BASIC ROUTINES


/ Elements of list[0] that do not occur in list[1], as computed by differ.
listdiff:{[list]
  left: list[0]
  right: list[1]
  :differ[left; right]
}

/ returns one if x is a subset of y
/ NOTE: this definition is superseded by the one-line subset that is
/ assigned to the same name right after it.
subset:{[x; y]
  i: y ?/: x / position in y of each element of x; a value of #y means not found
  : ~ (#y) _in i / subset exactly when no lookup came back as not found
}

/ 1 if every element of x occurs in y, else 0.
subset:{[x;y]
  pos: y ?/: x / position of each x element in y; #y marks a miss
  :(|/ pos) < #y
}


/ Distinct elements of x that do not occur in y.
/ Atoms are first turned into one-element lists.
differ:{[x;y]
  x,: ()
  y,: ()
  pos: y ?/: x / where each x element sits in y; #y marks a miss
  :? x[& pos = #y]
}

/ Indexes into x of the elements that do not occur in y.
/ Unlike differ, no duplicates are removed and the arguments are not enlisted.
differindexes:{[x;y]
  :& (#y) = y ?/: x
}


/finds intersection of two lists
/ fastest of all
/ NOTE: superseded by the second intersect definition that follows in this file.
intersect: {[x;y]
  x,: () / force both arguments to be lists
  y,: ()
  i: x ?/: y / position in x of each element of y; a value of #x means not found
  :x[(?i) _dv #x] / distinct hit positions with the not-found marker removed
}

/ Set intersection of two lists; atoms are treated as one-element lists.
/ The shorter list (y when the lengths are equal) is the one searched and
/ the result is drawn from it, without duplicates.
intersect: {[x;y]
  x,: ()
  y,: ()
  / pick looks up every element of probe in base and returns the distinct hits
  pick: {[base;probe] hits: base ?/: probe; base[? hits[& hits < #base]]}
  res: :[(#x) < (#y); pick[x;y]; pick[y;x]]
  :res
}

/ 1 if the two lists share at least one element, else 0.
/ Atoms are treated as one-element lists.
hasintersect: {[x;y]
  x,: ()
  y,: ()
  :(#x) > &/ x ?/: y / smallest lookup position is inside x when anything matched
}

/ 1 if some non-null symbol of x also occurs in y, else 0.
intersectnonnull:{[x;y]
  nonnull: x _dv ` / drop null symbols from x before testing membership
  :|/ nonnull _lin y
}

/ x is a proper subset of y
/ NOTE: superseded by the second propersubset definition that follows in this file.
propersubset:{[x;y]
  x,: () / force both arguments to be lists
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :(#x) = (#intersect[x;y]) / every element of x has to appear in the intersection
}

/ 1 if x has fewer elements than y and every element of x occurs in y.
propersubset:{[x;y]
  x,: ()
  y,: ()
  res: :[(#x) < (#y); subset[x;y]; 0]
  :res
}

/ Index pairs for the elements common to x and y.
/ Returns a two-item list: positions in x, and the matching positions in y.
intersectindexes: {[x;y]
  hit: x ?/: y / where each element of y lands in x; #x marks a miss
  found: & hit < #x / positions in y that matched
  :(hit[found]; found)
}

/ Positions in x of the elements of y that occur in x, in y order.
intersectleftindexes: {[x;y]
  hit: x ?/: y / where each element of y lands in x; #x marks a miss
  :hit[& hit < #x]
}

/ Matching index pairs between two lists, one pair per element of y found in x.
/ Each pair is (position in x; position in y). Written for lists without duplicates.
intersectbothindexes: {[x;y]
  x,: ()
  y,: ()
  pos: x ?/: y
  matched: pos ,' !#y / pair every y position with its lookup result
  :matched[& matched[;0] < #x] / keep the pairs whose lookup succeeded
}

/ intersect many lists
/ NOTE: superseded by the second multiintersect definition that follows in this file.
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists] / nothing to intersect: the argument is returned unchanged
 first: lists[0],()
 jj: ,/ ?:' first (?/:)/: lists[1+ !(size-1)] / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1] / tally of hits per position; the extra last slot collects misses
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1 / positions hit by every one of the other lists
 :first[kk]
}

/ this is a set intersection so we remove duplicates
/ Returns the elements of lists[0] that occur in every other list.
/ With fewer than two lists the argument itself is returned unchanged.
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 / each remaining list is de-duplicated, then all are joined and looked up in first
 jj:  first ?/: (,/ ?:' lists[1+ !(size-1)]) / find indexes in first
 / tally the hits per position of first; the extra last slot collects the misses
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1 / positions hit once by each of the other lists
 :first[kk]
}

/ Arithmetic mean of a numeric list.
avg:{[x]
  total: +/ x
  :total % #x
}

/ Population variance: mean of the squares minus the square of the mean.
var:{[x]
  meansq: avg[_sqr x]
  :meansq - _sqr avg[x]
}
/ Population standard deviation: square root of var.
std:{[x]
  :_sqrt var[x]
}
/ Population covariance of two equal-length numeric lists.
cov:{[x;y]
  prodmean: avg[x * y]
  meanprod: avg[x] * avg[y]
  :prodmean - meanprod
}
/ Correlation coefficient: covariance divided by the product of the standard deviations.
corr:{[x;y]
  denom: (std[x]) * (std[y])
  :(cov[x;y]) % denom
}
/ Correlation of x against y shifted by delay positions:
/ the last delay items of x and the first delay items of y are dropped
/ before the same covariance over standard deviations formula is applied.
corrdelay:{[delay;x;y]
  head: (-delay) _ x
  tail: delay _ y
  denom: (std[head]) * (std[tail])
  (cov[head;tail]) % denom}


/ Format the items of line as one string with a vertical bar between fields.
spitline:{[line]
  cells: ($line) ,\: "|" / every field followed by a bar
  :(-1) _ ,/ cells / join and drop the final bar
}

/ END BASICS


/ FILE INPUT

/ transform lower case to upper case
/ item is a string. Letters a to z are shifted to A to Z by subtracting 32
/ from their character codes; every other character is passed through.
/ A redundant early return for strings shorter than two characters was
/ removed: it returned the same value as the final line.
uppercase:{[item]
  c: _ic'item / character codes
  ii: & (c > 96) & (c < 123) / positions holding a lower case letter
  if[0 < #ii
    c[ii]-: 32
    item: _ci' c
  ]
  :item
}

/ Split a line on semicolons and return the fields as upper-cased strings.
/ Empty fields are kept, so the result has one more item than there are semicolons.
getfieldssemi:{[line]
  issep: line = ";"
  seppos: & issep
  line: line[& ~ issep] / the text with the separators removed
  / once the separators are gone, the k-th one sat k places further left
  cuts: 0, seppos - !#seppos
  :uppercase' cuts _ line
}

/ Split a line on blanks, drop every word listed in the global suppress,
/ concatenate what is left and strip the characters . ; - _ and *
suppressconcat:{[line]
  isblank: line = " "
  blankpos: & isblank
  line: line[& ~ isblank]
  words: ` $ (0, blankpos - !#blankpos) _ line / words as symbols
  kept: ,/ $ words[& ~ words _lin suppress] / surviving words joined into one string
  :kept[& ~ kept _lin ".;-_*"]
}

/ Pad a list of fields with empty strings up to 28 items.
/ Lists that already have 28 or more items are returned unchanged.
/ Built for the incomplete lines in appendix3bad.csv.
extendline:{[line]
  missing: 28 - #line
  res: :[missing > 0; line, missing # ,""; line]
  :res
}

/ Strip leading and trailing blanks from a string; inner blanks are kept.
/ An empty or all-blank string gives the empty string.
delendblanks:{[string]
  if[0 = #string; :""]
  if[string ~ ,"" ; :""]
  solid: & ~ string = " " / positions of the non-blank characters
  if[(#string) = (#solid); :string]
  if[0 = (#solid); :""]
  lo: *solid
  hi: *|solid
  :string[lo + !1 + hi - lo] / slice from first to last non-blank
}

/ Strip the end blanks, then drop one trailing S.
/ A string that is just the single letter S is left as it is.
delblankandS:{[string]
   trimmed: delendblanks[string]
   plural: (~ trimmed ~ ,"S") & ("S" = *|trimmed)
   res: :[plural; (-1) _ trimmed; trimmed]
   :res
}
  
/ Remove every blank and every double quote from a string.
delblanks:{[string]
   :string[& ~ string _lin " \""]
}
  

/ MATCHING LOGIC

/ distance function without provisions for gaps or transposition
/ Edit distance between sequences x and y, computed row by row.
/ The outer scan starts from the row 0 1 .. #y and consumes one element of x
/ per step together with its equality mask against y. Inside a step each cell
/ becomes 1 plus the minimum of the cell to its left, the cell above and the
/ diagonal cell less 1 on a match. The answer is the last cell of the last row.
basicdist:{[x;y] 
  res: {y(1+&)\(1_ x)&(-1_ x)-z}\[!1+#y;1+!#x;x=\:y];res[#x][#y]}

/ Similarity derived from basicdist: 1 for identical sequences, lower as more edits are needed.
/ Returns 0 when either argument has fewer than two items.
matchdist:{[x;y]
  nx: #x
  ny: #y
  if[2 > nx & ny; :0]
  longest: nx | ny
  :(longest - basicdist[x;y]) % longest
}

/ Similarity of two symbols.
/ Identical arguments score 2. Otherwise the score is the number of shared
/ characters plus the length of the common prefix, divided by the number of
/ distinct characters in both. An empty text on either side scores 0.
/ CHANGE the credit given to first letters
match:{[symb1; symb2]
  if[symb1 ~ symb2; :2]
  a: $symb1
  b: $symb2
  shorter: (#a) & (#b)
  if[1 > shorter; :0]
  shared: #intersect[a;b]
  prefix: (a[!shorter] = b[!shorter]) ? 0 / position of the first mismatch
  :(shared + prefix) % #?a,b
}

/ Like match but without the prefix bonus: 2 for identical arguments,
/ otherwise shared characters over distinct characters, 0 for an empty text.
minimatch:{[symb1; symb2]
  if[symb1 ~ symb2; :2]
  a: $symb1
  b: $symb2
  if[1 > (#a) & (#b); :0]
  :(#intersect[a;b]) % #?a,b
}


/ match records
/ CHANGE the scoring function at will.
/ This is with the one that has a good scoring function.
/ Scores row index of ajouts against row indexclean of refRNVPyes.
/ Returns (score; indexclean), or (0; indexclean; reason) when the pair is
/ rejected outright; reason is one of ProPart, SIREN, Contact, NoNameMatch.
/ Reads the globals ajouts, refRNVPyes, telsajouts and telsyes.
vecmatchyes:{[index; indexclean]
  score: 0
  / reject when exactly one of the two rows has NatureClient 1
  if[((ajouts.NatureClient[index]) = `"1") & (~ refRNVPyes.NatureClient[indexclean] = `"1"); :(0; indexclean; `ProPart)] 
  if[(~ (ajouts.NatureClient[index]) = `"1") & (refRNVPyes.NatureClient[indexclean] = `"1"); :(0; indexclean; `ProPart)]
  / reject when both SIREN values are longer than 7 characters and differ
  if[(~(ajouts.SIREN[index]) = (refRNVPyes.SIREN[indexclean])) & (7 < ((#$ajouts.SIREN[index]) & (#$refRNVPyes.SIREN[indexclean])))
	:(0; indexclean; `SIREN)
  ]
  / reject when both Contact values are longer than 2 characters and differ
  if[(~( ajouts.Contact[index]) = (refRNVPyes.Contact[indexclean])) & (2 < ((#$ajouts.Contact[index]) & (#$refRNVPyes.Contact[indexclean])))
	:(0; indexclean;`Contact)
  ]
  / penalties: Codecivil 1 against 2, 3 or 4; different Numdevoie; different CP
  if[((ajouts.Codecivil[index]) = `"1") & (refRNVPyes.Codecivil[indexclean] _in `"2" `"3" `"4"); score-: 2]
  if[((ajouts.Codecivil[index]) _in `"2" `"3" `"4") & (refRNVPyes.Codecivil[indexclean] = `"1"); score-: 2]
  if[~ (ajouts.Numdevoie[index]) = (refRNVPyes.Numdevoie[indexclean]) 
	score-: 4]
  if[~ (ajouts.CP[index]) = (refRNVPyes.CP[indexclean]) 
	score-: 2]
  / a shared non-null telephone number is worth 11 points
  if[intersectnonnull[telsajouts[index]; telsyes[indexclean]]
	score+: 11
  ]
  / namescore1: ajouts name against reference name; stays at -4 if either is empty
  namescore1: -4
  name: ajouts.NomRaisonsociale[index] 
  nameclean: refRNVPyes.NomRaisonsociale[indexclean] 
  if[0 < (#name) & (#nameclean)
    if[(*name) = (*nameclean)
	namescore1 : 10 * matchdist[name;nameclean]
    ]
    if[~ (*name) = (*nameclean)
        namescore1 : -2 / penalty for mismatch
	namescore1 +: 2 * matchdist[name;nameclean] / give some chance
    ]
  ]
  / Now must look at alternate possibility: name is in first two fields of
  / address of refRNVPyes
  namescore2: -4
  nameclean: :[0 < # refRNVPyes.LigneAdresse1[indexclean]
			refRNVPyes.LigneAdresse1[indexclean]
			refRNVPyes.LigneAdresse2[indexclean]]
  if[0 < (#name) & (#nameclean)
    if[(*name) = (*nameclean)
	namescore2: 10 * matchdist[name;nameclean]
    ]
    if[~ (*name) = (*nameclean)
        namescore2: -2 / penalty for mismatch
	namescore2+: 2 * matchdist[name;nameclean] / give some chance
    ]
  ]
  / namescore3: the reverse case, first address line of ajouts against the reference name
  namescore3: -4
  name: :[0 < # ajouts.LigneAdresse1[index]
			ajouts.LigneAdresse1[index]
			ajouts.LigneAdresse2[index]]
  nameclean: refRNVPyes.NomRaisonsociale[indexclean] 
  if[0 < (#name) & (#nameclean)
    if[(*name) = (*nameclean)
	namescore3: 10 * matchdist[name;nameclean]
    ]
    if[~ (*name) = (*nameclean)
        namescore3: -2 / penalty for mismatch
	namescore3+: 2 * matchdist[name;nameclean] / give some chance
    ]
  ]
  score+: namescore1 | namescore2 | namescore3 / take maximum
  / give up early when the running score is below 3
  if[score < 3; :(0; indexclean; `NoNameMatch)]
  / Prenom, Ville and LigneAdresse3 each add up to 3 when the first characters
  / agree, otherwise they cost 2 and earn back up to 2
  firstname: ajouts.Prenom[index] 
  firstnameclean: refRNVPyes.Prenom[indexclean] 
  if[0 < (#firstname) & (#firstnameclean)
    if[(*firstname) = (*firstnameclean)
	score+: 3 * matchdist[firstname;firstnameclean]
    ]
    if[~ (*firstname) = (*firstnameclean)
    	score-: 2 / penalty for mismatch
	score+: 2 * matchdist[firstname;firstnameclean] / give some chance
    ]
  ]
  ville: ajouts.Ville[index] 
  villeclean: refRNVPyes.Ville[indexclean] 
  if[0 < (#ville) & (#villeclean)
    if[(*ville) = (*villeclean)
	score+: 3 * matchdist[ville;villeclean]
    ]
    if[~ (*ville) = (*villeclean)
    	score-: 2 / penalty for mismatch
	score+: 2 * matchdist[ville;villeclean] / give some chance
    ]
  ]
  ad: ajouts.LigneAdresse3[index] 
  adclean: refRNVPyes.LigneAdresse3[indexclean] 
  if[0 < (#ad) & (#adclean)
    if[(*ad) = (*adclean)
	score+: 3 * matchdist[ad;adclean]
    ]
    if[~ (*ad) = (*adclean)
    	score-: 2 / penalty for mismatch
	score+: 2 * matchdist[ad;adclean] / give some chance
    ]
  ]
  / Post-processing: a score above 10 has to survive every check below,
  / each failed check caps it at the value shown
  if[score > 10 / confirm that scores should be this high
	x: namescore1 | namescore2 | namescore3
	if[x < 9; score: 9.5] / reduce it if not 90% the same
	if[score > 10
  	  if[~ (ajouts.Numdevoie[index]) = (refRNVPyes.Numdevoie[indexclean]) 
		score&: 9.5
	  ]
	]
	if[score > 10
		acp: $ ajouts.CP[index]
		ryescp: $ refRNVPyes.CP[indexclean]
		if[2 < (#acp) & (#ryescp)
			if[~ (acp[!2]) ~ (ryescp[!2]); score&: 6] 
			/ first two CP must match
		]
	]
	if[score > 10
  		if[0 = (#ad) & (#adclean); score&: 9.4]
	]
	if[score > 10
  		if[0.8 > matchdist[ad;adclean]; score&: 9.4]
	]
	if[score > 10
  		name: ajouts.NomRaisonsociale[index] 
  		nameclean: refRNVPyes.NomRaisonsociale[indexclean] 
		if[2 > (#name) & (#nameclean); score&: 9.3]
	]
  ]
  :(score; indexclean)
}

/ Scores row index of ajouts against row indexclean of refRNVPno.
/ Same scheme as vecmatchyes, plus one extra post-processing check on Ville.
/ Returns (score; indexclean), or (0; indexclean; reason) when the pair is
/ rejected outright; reason is one of ProPart, SIREN, Contact, NoNameMatch.
/ Reads the globals ajouts, refRNVPno, telsajouts and telsno.
vecmatchno:{[index; indexclean]
  score: 0
  / reject when exactly one of the two rows has NatureClient 1
  if[((ajouts.NatureClient[index]) = `"1") & (~ refRNVPno.NatureClient[indexclean] = `"1"); :(0; indexclean; `ProPart)] 
  if[(~ (ajouts.NatureClient[index]) = `"1") & (refRNVPno.NatureClient[indexclean] = `"1"); :(0; indexclean; `ProPart)]
  / reject when both SIREN values are longer than 7 characters and differ
  if[(~(ajouts.SIREN[index]) = (refRNVPno.SIREN[indexclean])) & (7 < ((#$ajouts.SIREN[index]) & (#$refRNVPno.SIREN[indexclean])))
	:(0; indexclean; `SIREN)
  ]
  / penalties for a different CP and a different Numdevoie
  if[(~(ajouts.CP[index]) = (refRNVPno.CP[indexclean])) 
	score-: 2
  ]
  if[(~(ajouts.Numdevoie[index]) = (refRNVPno.Numdevoie[indexclean])) 
	score-: 4
  ]
  / reject when both Contact values are longer than 2 characters and differ
  if[(~(ajouts.Contact[index]) = (refRNVPno.Contact[indexclean])) & (2 < ((#$ajouts.Contact[index]) & (#$refRNVPno.Contact[indexclean])))
	:(0; indexclean;`Contact)
  ]
  / penalty for Codecivil 1 against 2, 3 or 4
  if[((ajouts.Codecivil[index]) = `"1") & (refRNVPno.Codecivil[indexclean] _in `"2" `"3" `"4"); score-: 2]
  if[((ajouts.Codecivil[index]) _in `"2" `"3" `"4") & (refRNVPno.Codecivil[indexclean] = `"1"); score-: 2]
  / a shared non-null telephone number is worth 11 points
  if[intersectnonnull[telsajouts[index]; telsno[indexclean]]
	score+: 11
  ]
  / namescore1: ajouts name against reference name; stays at -4 if either is empty
  namescore1: -4
  name: ajouts.NomRaisonsociale[index] 
  nameclean: refRNVPno.NomRaisonsociale[indexclean] 
  if[0 < (#name) & (#nameclean)
    if[(*name) = (*nameclean)
	namescore1 : 10 * matchdist[name;nameclean]
    ]
    if[~ (*name) = (*nameclean)
        namescore1 : -2 / penalty for mismatch
	namescore1 +: 2 * matchdist[name;nameclean] / give some chance
    ]
  ]
  / Now must look at other possibility: name is in first two fields of
  / address of refRNVPno
  namescore2: -4
  nameclean: :[0 < # refRNVPno.LigneAdresse1[indexclean]
			refRNVPno.LigneAdresse1[indexclean]
			refRNVPno.LigneAdresse2[indexclean]]
  if[0 < (#name) & (#nameclean)
    if[(*name) = (*nameclean)
	namescore2: 10 * matchdist[name;nameclean]
    ]
    if[~ (*name) = (*nameclean)
        namescore2: -2 / penalty for mismatch
	namescore2+: 2 * matchdist[name;nameclean] / give some chance
    ]
  ]
  / namescore3: the reverse case, first address line of ajouts against the reference name
  namescore3: -4
  name: :[0 < # ajouts.LigneAdresse1[index]
			ajouts.LigneAdresse1[index]
			ajouts.LigneAdresse2[index]]
  nameclean: refRNVPno.NomRaisonsociale[indexclean] 
  if[0 < (#name) & (#nameclean)
    if[(*name) = (*nameclean)
	namescore3: 10 * matchdist[name;nameclean]
    ]
    if[~ (*name) = (*nameclean)
        namescore3: -2 / penalty for mismatch
	namescore3+: 2 * matchdist[name;nameclean] / give some chance
    ]
  ]
  score+: namescore1 | namescore2 | namescore3 / take maximum
  / give up early when the running score is below 3
  if[score < 3; :(0; indexclean; `NoNameMatch)]
  / Prenom, Ville and LigneAdresse3 each add up to 3 when the first characters
  / agree, otherwise they cost 2 and earn back up to 2
  firstname: ajouts.Prenom[index] 
  firstnameclean: refRNVPno.Prenom[indexclean] 
  if[0 < (#firstname) & (#firstnameclean)
    if[(*firstname) = (*firstnameclean)
	score+: 3 * matchdist[firstname;firstnameclean]
    ]
    if[~ (*firstname) = (*firstnameclean)
    	score-: 2 / penalty for mismatch
	score+: 2 * matchdist[firstname;firstnameclean] / give some chance
    ]
  ]
  ville: ajouts.Ville[index] 
  villeclean: refRNVPno.Ville[indexclean] 
  if[0 < (#ville) & (#villeclean)
    if[(*ville) = (*villeclean)
	score+: 3 * matchdist[ville;villeclean]
    ]
    if[~ (*ville) = (*villeclean)
    	score-: 2 / penalty for mismatch
	score+: 2 * matchdist[ville;villeclean] / give some chance
    ]
  ]
  ad: ajouts.LigneAdresse3[index] 
  adclean: refRNVPno.LigneAdresse3[indexclean] 
  if[0 < (#ad) & (#adclean)
    if[(*ad) = (*adclean)
	score+: 3 * matchdist[ad;adclean]
    ]
    if[~ (*ad) = (*adclean)
    	score-: 2 / penalty for mismatch
	score+: 2 * matchdist[ad;adclean] / give some chance
    ]
  ]
  / Post-processing: a score above 10 has to survive every check below,
  / each failed check caps it at the value shown
  if[score > 10 / confirm that scores should be this high
	x: namescore1 | namescore2 | namescore3
	if[x < 9; score: 9.5] / reduce it if not 90% the same
	if[score > 10
  	  if[~ (ajouts.Numdevoie[index]) = (refRNVPno.Numdevoie[indexclean]) 
		score&: 9.5
	  ]
	]
	if[score > 10  / reduce if ville not nearly the same
		if[0.9 > matchdist[ville;villeclean]; score: 9.5]
	]
	if[score > 10
		acp: $ ajouts.CP[index]
		rnocp: $ refRNVPno.CP[indexclean]
		if[2 < (#acp) & (#rnocp)
			if[~ (acp[!2]) ~ (rnocp[!2]); score&: 6] 
			/ first two CP must match
		]
	]
	if[score > 10
  		if[0 = (#ad) & (#adclean); score&: 9.4]
	]
	if[score > 10
  		if[0.8 > matchdist[ad;adclean]; score&: 9.4]
	]
	if[score > 10
  		name: ajouts.NomRaisonsociale[index] 
  		nameclean: refRNVPno.NomRaisonsociale[indexclean] 
		if[2 > (#name) & (#nameclean); score&: 9.3]
	]
  ]
  :(score; indexclean)
}


/ find the best matching records based on the score.
/ nom
/ address1
/ codepostal
/ ville
/ tellist
/ index is a row number of ajouts. The result is a list of rows
/ (ajouts Id; reference Id; tag; score) where tag is S_yes or S_no for a
/ certain double and I_yes or I_no for a possible one; the suffix says which
/ reference table the row came from. An empty list means nothing was found.
/ Stages, each tried only while the earlier ones produced nothing suitable:
/   1. candidates sharing CP+Pays+Numdevoie, SIREN or exact name, scored by
/      vecmatchyes and vecmatchno
/   2. identical SIREN_SIRET
/   3. exact name plus prenom and address, or prenom and telephone
/   4. desperate measure: any shared word in the name
/ Reads many globals built elsewhere: uniqsyes, partyes, namesyes, telsyes,
/ nameswordsyes, nameswordsindexesyes, their no counterparts, telsajouts and
/ getfirstthree.
/ Fixes relative to the previous version:
/   - the refRNVPyes address test of stage 3 compared against the
/     concatenation of all candidate addresses; it now compares each
/     candidate separately, as the refRNVPno test always did
/   - the refRNVPyes candidates of stage 4 are now de-duplicated, as the
/     refRNVPno ones always were
findbestmatch:{[index]
  scoresyes: () / scores compard with refRNVPyes
		/ These will be pairs of scores and ids
  scoresno: () / scores compared with refRNVPno
  if[ajouts.Statut_RNVP[index] _in `"0" `"1" / in either case
	x: (ajouts.CP[index]),(ajouts.Pays[index]),(ajouts.Numdevoie[index]) 
	x: ` $ ,/$x / concatenate the whole thing
	i: uniqsyes ? x
	correctcp: ()
	if[i < #uniqsyes
	  correctcp,: partyes[i] / get proper ones
 	]
	/ also choose those with correct SIREN
	if[(0 < # $ajouts.SIREN[index])
          correctcp,: & (refRNVPyes.SIREN) = ajouts.SIREN[index]
	]
	/ and those with exactly the same name
	if[(0 < # $ajouts.NomRaisonsociale[index])
	 correctcp,: & namesyes = ` $ ajouts.NomRaisonsociale[index]
	]
	correctcp?:
 	if[0 < #correctcp
	 scoresyes: vecmatchyes[index]'correctcp / compare against refRNVPyes
	]
  ]
  if[ajouts.Statut_RNVP[index] _in `"0" `"1" / compare both against refRNVPno
	/ because we don't believe it makes sense for an ajouts row
	/ having a good RNVP_statut to be close to a reference row having
	/ a poor RNVP_statut.
	correctcp: ()
	x: getfirstthree (ajouts.CP[index])
	i: uniqsno ? x
	if[i < #uniqsno
	  correctcp,: partno[i] / get proper ones
	]
	if[(0 < # $ajouts.SIREN[index])
          correctcp,: & (refRNVPno.SIREN) = ajouts.SIREN[index]
	]
	if[(0 < # $ajouts.NomRaisonsociale[index])
	  correctcp,: & namesno = ` $ ajouts.NomRaisonsociale[index]
	]
	correctcp?:
 	if[0 < #correctcp
	 scoresno: vecmatchno[index]'correctcp
	]
  ]
  out: ()
  foundmatch: 0
  foundposs: 0
  yeslimit: 10 / a score above this is a certain double
  posslimit: 5 / a score above this is a possible double
  if[0 < (#scoresyes) 
     jj:  & yeslimit < scoresyes[;0]
     if[0 < #jj
        goodones: scoresyes[jj;1]
	/ verify that we don't have different non-null sirens
	xx: ajouts.SIREN_SIRET[index]
	if[(~ xx = `) & (0 < #goodones)
		kk: & (refRNVPyes.SIREN_SIRET[goodones]) _lin (`; xx)
		jj@: kk
		goodones@: kk
	]
	if[0 < #goodones
         out,: (ajouts.Id[index]) ,/: (refRNVPyes.Id[goodones]) ,' (,`S_yes) ,/: scoresyes[jj;0]
	 foundmatch: 1
	]
     ]
     if[foundmatch = 0
      jj: & (~ yeslimit < scoresyes[;0]) & (posslimit < scoresyes[;0]) 
      if[0 < #jj
       goodones: scoresyes[jj;1]
	/ newverify: SIREN is ok is done in vecmachyes
       out,: (ajouts.Id[index]) ,/: (refRNVPyes.Id[goodones]) ,' (,`I_yes) ,/: scoresyes[jj;0]
       foundposs: 1
      ]
     ]
  ]
  / do something simlilar for refRNVPno
  nolimit: 10
  posslimit: 5
  if[0 < (#scoresno) 
     jj:  & nolimit < scoresno[;0]
     if[0 < #jj
        goodones: scoresno[jj;1]
	/ verify that we don't have different non-null sirens
	xx: ajouts.SIREN_SIRET[index]
	if[(~ xx = `) & (0 < #goodones)
		kk: & (refRNVPno.SIREN_SIRET[goodones])  _lin (`; xx)
		jj@: kk
		goodones@: kk
	]
	if[0 < #goodones
         out,: (ajouts.Id[index]) ,/: (refRNVPno.Id[goodones]) ,' (,`S_no) ,/: scoresno[jj;0]
	 foundmatch: 1
	]
     ]
     if[foundmatch = 0
      jj: & (~ nolimit < scoresno[;0]) & (posslimit < scoresno[;0]) 
      if[0 < #jj
       goodones: scoresno[jj;1]
	/ newverify: SIREN is ok is done in vecmachno
       out,: (ajouts.Id[index]) ,/: (refRNVPno.Id[goodones]) ,' (,`I_no) ,/: scoresno[jj;0]
       foundposs: 1
      ]
     ]
  ]
  extrasfound: 0
  / if siren_siret is the same, even if there are other problems,
  / then indicate that this is possible
  / Sept 9: make it certain
  if[(foundmatch=0) & (10 < # $ajouts.SIREN_SIRET[index])
    goodones: & (refRNVPyes.SIREN_SIRET) = ajouts.SIREN_SIRET[index]
    if[0 < #goodones
       out,:(ajouts.Id[index]),/:(refRNVPyes.Id[goodones]) ,\: (,`S_yes) , ,10.2
	foundmatch: 1
	extrasfound: 1
    ]
    goodones: & (refRNVPno.SIREN_SIRET) = ajouts.SIREN_SIRET[index]
    if[0 < #goodones
       out,:(ajouts.Id[index]),/: (refRNVPno.Id[goodones]) ,\: (,`S_no) , ,10.2
	foundmatch: 1
	extrasfound: 1
    ]
  ]
  if[(foundmatch=0) & (0 < # $ajouts.NomRaisonsociale[index])
	goodones: & namesyes = ` $ ajouts.NomRaisonsociale[index]
	if[(0 < #goodones) & (0 < #ajouts.LigneAdresse3[index]) & (0 < #ajouts.Prenom[index])
		/ Sufficient condition for certain doubles Sept 9 2005
		xa:  ajouts.Prenom[index]
		xr:  refRNVPyes.Prenom[goodones]
		jj1: & xa = xr
		rem: goodones[jj1]
		xa:  ` $ ajouts.LigneAdresse3[index]
		/ one symbol per remaining candidate, so that each address is
		/ compared on its own
		xr: ` $  refRNVPyes.LigneAdresse3[rem]
		jj: & xa = xr
		rem@: jj
		if[0 < #rem
		 xa: ajouts.Ville[index]
		 xr: refRNVPyes.Ville[rem]
		 jj: & xa = xr
		 rem@: jj
		]
		if[0 < #rem
		 xa:  ajouts.CP[index]
		 xr:  refRNVPyes.CP[rem]
		 jj: & xa = xr
		 rem@: jj
		]
		/ verify that we don't have different non-null siret-sirens
		xx: (ajouts.SIREN_SIRET[index])
		if[(~ xx = `) & (0 < #rem)
			jj: & (refRNVPyes.SIREN_SIRET[rem]) _lin (`; xx)
			rem@: jj
		]
		c: rem
		if[0 < #c
		 out,:(ajouts.Id[index]),/:(refRNVPyes.Id[c]),\:(,`S_yes),,10.1
		 foundmatch: 1
		]
	]
	if[(foundmatch=0) & (0 < #goodones) & (2 < #,/$telsajouts[index]) & (0 < #ajouts.Prenom[index])
		/ Sufficient condition for certain doubles Sept 9 2005
		/ might consider dropping the prenom condition in this case
		xa: ajouts.Prenom[index]
		xr:  refRNVPyes.Prenom[goodones]
		jj1: & xa = xr
		rem: goodones[jj1]
		if[0 < #rem
		 xa: telsajouts[index]
		 xr: telsyes[rem]
		 jj: & intersectnonnull[xa]'xr	
		 rem@: jj
		]
		/ verify that we don't have different non-null siret-sirens
		xx: (ajouts.SIREN_SIRET[index])
		if[(~ xx = `) & (0 < #rem)
			jj: & (refRNVPyes.SIREN_SIRET[rem]) _lin (`; xx)
			rem@: jj
		]
		c: rem
		if[0 < #c
		  out,:(ajouts.Id[index]),/:(refRNVPyes.Id[c]),\:(,`S_yes),,10.1
		  foundmatch: 1
		]
	]
	/ nothing at all so far: exact-name candidates that score above
	/ posslimit are reported as possible with a fixed score
	if[(0=#out) &(0 < #goodones)
	 scoresyes: vecmatchyes[index]'goodones
         jj:  & posslimit < scoresyes[;0]
         if[0 < #jj
          goodones: scoresyes[jj;1]
	  / newverify: SIREN is ok is done in vecmachyes
          out,: (ajouts.Id[index]) ,/: (refRNVPyes.Id[goodones]) ,\: (,`I_yes) ,  ,5.1
	  extrasfound: 1
         ]
	]
	goodones: & namesno = ` $ ajouts.NomRaisonsociale[index]
	if[(foundmatch=0)&(0 < #goodones) & (0 < #ajouts.LigneAdresse3[index]) & (0 < #ajouts.Prenom[index])
		/ Sufficient condition for certain doubles Sept 9 2005
		xa:  ajouts.Prenom[index]
		xr:  refRNVPno.Prenom[goodones]
		jj1: & xa = xr
		rem: goodones[jj1]
		xa:  ` $ ajouts.LigneAdresse3[index]
		xr: ` $  refRNVPno.LigneAdresse3[rem]
		jj: & xa = xr
		rem@: jj
		if[0 < #rem
		 xa:  ajouts.Ville[index]
		 xr: refRNVPno.Ville[rem]
		 jj: & xa = xr
		 rem@: jj
		]
		if[0 < #rem
		 xa:  ajouts.CP[index]
		 xr:  refRNVPno.CP[rem]
		 jj: & xa = xr
		 rem@: jj
		]
		/ verify that we don't have different non-null siret-sirens
		xx: (ajouts.SIREN_SIRET[index])
		if[(~ xx = `) & (0 < #rem)
			jj: & (refRNVPno.SIREN_SIRET[rem]) _lin (`; xx)
			rem@: jj
		]
		c: rem
		if[0 < #c
		 out,:(ajouts.Id[index]),/:(refRNVPno.Id[c]),\:(,`S_no),,10.1
		 foundmatch: 1
		]
	]
	if[(foundmatch=0) & (0 < #goodones) & (2 < #,/$telsajouts[index]) & (0 < #ajouts.Prenom[index])
		/ Sufficient condition for certain doubles Sept 9 2005
		/ might consider dropping the prenom condition in this case
		xa: ajouts.Prenom[index]
		xr:  refRNVPno.Prenom[goodones]
		jj1: & xa = xr
		rem: goodones[jj1]
		if[0 < #rem
		 xa: telsajouts[index]
		 xr: telsno[rem]
		 jj: & intersectnonnull[xa]'xr	
		 rem@: jj
		]
		/ verify that we don't have different non-null siret-sirens
		xx: (ajouts.SIREN_SIRET[index])
		if[(~ xx = `) & (0 < #rem)
			jj: & (refRNVPno.SIREN_SIRET[rem]) _lin (`; xx)
			rem@: jj
		]
		c: rem
		if[0 < #c
		  out,:(ajouts.Id[index]),/:(refRNVPno.Id[c]),\:(,`S_no),,10.1
		  foundmatch: 1	
		]
 	]
	if[(0=#out) &(0 < #goodones)
	 scoresno: vecmatchno[index]'goodones
         jj:  & posslimit < scoresno[;0]
         if[0 < #jj
          goodones: scoresno[jj;1]
	  / newverify: SIREN is ok is done in vecmachno
          out,: (ajouts.Id[index]) ,/: (refRNVPno.Id[goodones]) ,\: (,`I_no) , ,5.1
	  extrasfound: 1
         ]
	]
  ]
  if[(0 = #out) & (0 < # $ajouts.NomRaisonsociale[index])
  	/ desperatemeasures (look at word similarity in name; if any
	/ then rank these based on match of CP and Ville
	/ and address.
	numout: 3 / at most this many candidates are reported per reference table
	ajoutswords: ajouts.NomRaisonsocialewords[index]
	ii: & (nameswordsyes _lin ajoutswords) / find matches to words here
	/ a row sharing several words shows up several times, so keep each row once
	candidates: ? nameswordsindexesyes[ii] / indexes may appear often
	if[0 < #candidates
	 scores: (#candidates) # 0
	 if[0 < #$ajouts.Prenom[index]
	  jj: & (ajouts.Prenom[index]) = refRNVPyes.Prenom[candidates]
	  scores[jj]+: 2
	 ]
	 / NOTE(review): Numdevoie is worth 2 here but 4 in the refRNVPno
	 / ranking below; confirm which weight is intended
	 if[0 < #$ajouts.Numdevoie[index]
	   jj: & (ajouts.Numdevoie[index]) = refRNVPyes.Numdevoie[candidates]
	   scores[jj]+: 2
	 ]
	 if[0 < #$ajouts.Ville[index]
	   jj: & (ajouts.Ville[index]) = refRNVPyes.Ville[candidates]
	   scores[jj]+: 1
	 ]
	 if[0 < #$ajouts.CP[index]
	   jj: & (ajouts.CP[index]) = refRNVPyes.CP[candidates]
	   scores[jj]+: 1
	 ]
	 if[0 < #$ajouts.Pays[index]
	  jj: & (ajouts.Pays[index]) = refRNVPyes.Pays[candidates]
	  scores[jj]+: 1
	 ]
  	if[0 < #,/$telsajouts[index] / new in sept 05
		jj: & intersectnonnull[telsajouts[index]]' telsyes[candidates]
		scores[jj]+: 10
  	]
	 kk: > scores / best ranked first
	 candidates@: kk
	 if[numout < #candidates; candidates@: !numout]
	 goodones: candidates
	 / verify that we don't have different non-null sirens
	 xx: (ajouts.SIREN[index])
	 if[(~ xx = `) & (0 < #goodones)
		jj: & (refRNVPyes.SIREN[goodones]) _lin (`; xx)
		goodones@: jj
	 ]
         out,: (ajouts.Id[index]) ,/: (refRNVPyes.Id[goodones]) ,\: (,`I_yes),,4
	]
	ii: & (nameswordsno _lin ajoutswords) / find matches to words here
	candidates: ? nameswordsindexesno[ii] / indexes may appear often
	if[0 < #candidates
	 scores: (#candidates) # 0
	 if[0 < #$ajouts.Prenom[index]
	  jj: & (ajouts.Prenom[index]) = refRNVPno.Prenom[candidates]
	  scores[jj]+: 2
	 ]
	 if[0 < #$ajouts.Numdevoie[index]
	   jj: & (ajouts.Numdevoie[index]) = refRNVPno.Numdevoie[candidates]
	   scores[jj]+: 4
	 ]
	 if[0 < #$ajouts.Ville[index]
	   jj: & (ajouts.Ville[index]) = refRNVPno.Ville[candidates]
	   scores[jj]+: 1
	 ]
	 if[0 < #$ajouts.Pays[index]
	  jj: & (ajouts.Pays[index]) = refRNVPno.Pays[candidates]
	  scores[jj]+: 1
	 ]
	 if[0 < #$ajouts.CP[index]
	   jj: & (ajouts.CP[index]) = refRNVPno.CP[candidates]
	   scores[jj]+: 1
	 ]
  	if[0 < #,/$telsajouts[index] / new in sept 05
		jj: & intersectnonnull[telsajouts[index]]' telsno[candidates]
		scores[jj]+: 10
  	]
	 kk: > scores
	 candidates@: kk
	 if[numout < #candidates; candidates@: !numout]
	 goodones: candidates
	 / verify that we don't have different non-null sirens
	 xx: (ajouts.SIREN[index])
	 if[(~ xx = `) & (0 < #goodones)
		jj: & (refRNVPno.SIREN[goodones]) _lin (`; xx)
		goodones@: jj
	 ]
         out,: (ajouts.Id[index]) ,/: (refRNVPno.Id[goodones]) ,\: (,`I_no) ,,4
	]
  ]
  :out
}


/ DUMP TABLE dumptable


/ Turn a list (or an atom) into one string with a blank between the items.
formstring:{[list]
  items: $ list, ()
  :(-1) _ ,/ items ,\: " " / join with a blank after every item, then drop the last blank
}

/ Turn a list (or an atom) into one string with a vertical bar between the items.
formstringvertbar:{[list]
  items: $ list, ()
  :(-1) _ ,/ items ,\: "|"
}

/ Turn a list (or an atom) into one string with a comma between the items.
formstringcomma:{[list]
  items: $ list, ()
  :(-1) _ ,/ items ,\: ","
}

/ Output a table to a text file outfile (string), one line per row with
/ the fields separated by vertical bars. The first line is a header made of
/ the table name and the column names.
/ tablename is a symbol used only for that header line; table is expected to
/ be a dictionary of equal-length columns.
/ e.g.   dumptable[`guide; guide; "foobar"]
dumptable:{[tablename; table; outfile]
  out: ,("# "), ($tablename), ("|"), formstringvertbar[!table]
  first: *!table
  / The row count is taken from the table argument itself. It used to be
  / found by evaluating a string built from tablename, which only worked when
  / the argument was also a global variable of exactly that name.
  numofelements: #table[first]
  i: 0
  while[i < numofelements
	list: table[;i] / row i: the i-th item of every column
	x: formstring'list
	out,: , (-1) _ ,/x ,\: ("|")
	i+: 1
  ]
  outfile 0: out
}

/ Output a table to a text file outfile (string) in comma separated form:
/ a header line with the column names, then one line per row.
/ tablename is kept for compatibility with dumptable and is no longer used;
/ table is expected to be a dictionary of equal-length columns.
dumptablecsv:{[tablename; table; outfile]
  out: , formstringcomma[!table]
  first: *!table
  / The row count is taken from the table argument itself. It used to be
  / found by evaluating a string built from tablename, which only worked when
  / the argument was also a global variable of exactly that name.
  numofelements: #table[first]
  i: 0
  while[i < numofelements
	list: table[;i] / row i: the i-th item of every column
	x: formstring'list
	out,: , (-1) _ ,/x ,\: (",")
	i+: 1
  ]
  outfile 0: out
}

/ APPLICATION


/ Upper-case a single character; anything other than a to z is returned unchanged.
makeupper:{[let]
  code: _ic let
  res: :[(code < 97) | (code > 122); let; _ci code - 32]
  :res
}

/ Split a line on blank . - * / ; and ' and return the non-empty pieces
/ as symbols, each trimmed and stripped of one trailing S by delblankandS.
/ A line made only of delimiters gives a one-item list holding the symbol of a blank.
/ CHANGE HERE: other things to ignore
getfieldswhite:{[line]
  isdelim: line _lin " .-*/;'"
  delimpos: & isdelim
  line: line[& ~ isdelim]
  if[0 = #line; :,`" "]
  pieces: (0, delimpos - !#delimpos) _ line
  nonempty: & 0 < #:' pieces
  if[0 = #nonempty; :`]
  :` $ delblankandS' pieces[nonempty]
}

/ Map item through the translation table held in the globals
/ translatesource and translatetarget; unknown items are returned unchanged.
canon:{[item]
  pos: translatesource ? item
  res: :[pos < #translatesource; translatetarget[pos]; item]
  :res
}
  
/ First token of the line. When that token has three or more characters,
/ its first two characters are returned in front of it as an extra symbol.
splittokenize:{[line]
	head: * tokenize[line]
	text: $head
	res: :[3 > #text; head; (` $ text[!2]), head]
	:res
}

/ Tokenize the line and, when it starts with a run of one-letter tokens,
/ put their concatenation in front as an extra token:
/ with exactly two tokens both must be single letters,
/ with more than two tokens the first three must be single letters.
conglomtokenize:{[line]
	toks: tokenize[line] / delimiters, suppressed words and a trailing S are already gone
	n: #toks
	if[2 > n; :toks]
	k: :[2 = n; 2; 3] / how many leading tokens have to be single letters
	head: $ toks[!k]
	if[&/ 1 = #:' head; toks: (` $ ,/ head), toks]
	:toks
}

/ Tokenize the line; when it has a numeric token, the first one is parsed as
/ an integer and, unless negative, put in front of the tokens as a symbol.
splitnumeric:{[line]
	toks: tokenize[line]
	numpos: & isnumeric' toks
	if[0 = #numpos; :toks]
	val: 0 $ $ toks[*numpos]
	if[0 > val; :toks]
	:(` $ ($val)), toks
}


/ Turn a line into a list of distinct canonical tokens:
/ upper-case it, split it with getfieldswhite, translate every piece with
/ canon and drop the words listed in the global suppress.
tokenize:{[line]
   pieces: getfieldswhite makeupper' $line
   :differ[canon' pieces; suppress]
}

/ Tokenize a telephone field.
/ Periods and commas are removed; when the text has more than three blanks
/ the blanks are removed too. The rest is split by getfieldswhite.
/ If fewer than two characters survive, a random number is returned as the
/ only token.
teltokenize:{[line]
   chars: $line
   chars: chars[& ~ chars _lin ".,"] / drop periods and commas
   if[3 < +/ chars = " " / probably between pairs of digits
      chars: chars[& ~ chars = " "] / so close the gaps
   ]
   fields: getfieldswhite chars
   if[2 > #,/$fields; :` $ $ * 1 _draw 10000000] / generate a random number
   :fields
}

/ Above does parsing
/ Now process

/ The list in ascending order.
sort:{[list]
  order: < list
  :list @ order
}
  
/ Use the tokenized table new to find things
/ Algorithm: partition based on code (postal)
/ Then filter each group by using the first word of nom
/ (unless it's an acronym in which case use the letters) 
/ After that, try the address to get a better sense of the match.

/ Groups the rows of the global table new by the first item of code joined
/ with the first item of nom, and runs getbest on every group.
/ The commented-out lines are earlier choices of grouping key.
process:{[]
  / part: = (*:' new.code),'(*:' new.address)
  / part: = (*:' new.code),'(*:' new.nom),'(*:'new.address)
  part: = (*:' new.code),'(*:' new.nom)
  x: getbest'part
}

/ Groups the rows of the global table new by the first item of code only
/ and runs processgroup on every group.
processrollsroyce:{[]
  / part: = (*:' new.code),'(*:' new.address)
  / part: = (*:' new.code),'(*:' new.nom),'(*:'new.address)
  part: = (*:' new.code)
  x: processgroup'part
}

/ a bunch of indexes with the same code postale
/ Only from processrollsroyce
/ Older variant of processgroup: splits the indexes by the first item of nom
/ and runs getbest on every subgroup.
processgroupold:{[indexes]
  mynames: *:' new.nom[indexes]
  part: = mynames / groups of positions within indexes that share a first word
  x: getbest'[indexes[part]]
}

/ This tries to unify two partitions. Untried so far.
/ indexes are rows of the global table new that share a code postale.
/ They are grouped once by the first item of nom and once by the first item
/ of address. Groups found by both groupings are kept once; a group found by
/ only one grouping is dropped when it is a proper subset of a group that
/ only the other grouping found. getbest is then run on every surviving group.
/ Fix: the second difference used the undefined name part1; it now uses part.
processgroup:{[indexes]
  part: = (*:' new.nom[indexes])
  part2: = (*:' new.address[indexes])
  d12: differ[part; part2] / groups only the nom grouping produced
  d21: differ[part2; part] / groups only the address grouping produced
  subflag: ()
  j: 0
  while[j < #d12
  	subflag,: |/ propersubset[d12[j]]'d21
	j+: 1
  ]
  / Any index in subflag should be deleted
  d12: d12 _di &subflag
  subflag: ()
  j: 0
  while[j < #d21
  	subflag,: |/ propersubset[d21[j]]'d12
	j+: 1
  ]
  / Any index in subflag should be deleted
  d21: d21 _di &subflag
  both: intersect[part; part2]
  / x: processgroup'part
  newpart: both,d12,d21
  x: getbest'[indexes[newpart]]
}


/ These ones have the same code postale and the same first part of nom 
/ Try to find the best matches.
/ indexes are rows of the global table new. Every pair of them is scored on
/ name, address, ville, telephone and code; the pairs that pass are grouped
/ and each group is appended, as text lines built from the global table
/ lapeyre, to the global globalresult. globalcount is incremented per group.
/ Nothing is returned explicitly; groups of fewer than two rows do nothing.
getbest:{[indexes]
  thresh: 4
  if[1 < #indexes
	mynames: new.nom[indexes]
	myadds:  new.address[indexes]
	myville: new.ville[indexes]
	mytel: new.tel[indexes]
	mycode: new.code[indexes]
	/ if[(`JPC _in ,/mynames) ; !-1]
	ind: !#mynames
	/ all ordered pairs of positions, then only those with first below second
	pairs: ,/ ind ,/:\: ind
	pairs@: & (pairs[;0]) < (pairs[;1])
	xname: findvalnames'(mynames[pairs])
	xadd: findvaladdresses'(myadds[pairs])
	xville: findvalvilles'(myville[pairs])
	xtel: findvaltel'(mytel[pairs])
	xcode:  findvalcodes'(mycode[pairs])
	/ CHANGE HERE: also see thresh above.
	/ Weighted add approach combined with logic.
	/ The logic says the name is the same and either the code
	/ city or address.
	/ xtel by the same is conclusive
	/ following was eliminated because a similarity in the name,
	/ same city and same code and anything in address was enough.
	/ xtot: (2*xname) + (0.5* xadd) + xville + (2 * xcode) + (5*xtel)
	xtot: (2*xname) + (0.5* xadd) + xville + (xcode) + (5*xtel)
	ii: & (xtot > thresh) | (1=(xname & (xcode|xville|xadd) & (xadd > 0.3)))
	goodpairs: indexes[pairs[ii]] / accepted pairs as row numbers of new
	/ CHANGE HERE: if it's a transitive closure but not a clique
	/ then it's probable but not certain. For later.
	if[0 < #goodpairs
	  / findcycles is defined outside this part of the file; presumably it
	  / returns the reachability matrix over the nodes listed below
	  mymat: findcycles[goodpairs]
	  nodes: ?(goodpairs[;0]),(goodpairs[;1])
	  left: nodes
	  yy: ()
	  while[0 < #left
		i: nodes ? *left
		/ CHANGE HERE: We take the transitive closure as a group.
		vv: & mymat[i]
		jj: ?nodes[vv]
		if[0 < #jj
		  / every group starts with a blank separator line and its number
		  yy,: ,"                "
		  globalcount+: 1
		  yy,: ,$globalcount
		  k: 0
		  while[k < #jj
			v: jj[k]
			yy,: ,spitline (lapeyre.Clientnum[v]),(lapeyre.Nom[v]),(lapeyre.Adresse1[v]),(lapeyre.CodePostal[v]),(lapeyre.Ville[v]),(lapeyre.CANetHTfacture[v]),(lapeyre.NumeroTellineaire[v])
			k+: 1
		  ]
		]
		left: differ[left; jj] / the members of this group are done
		/ if[0 < #left; !-1]
	  ]
	  globalresult,: yy
	]
  ]
}

/ Name similarity for one pair.
/ namepair: two-item list (left name; right name).
/ Returns (count of intersect[left;right]) divided by (count of the distinct
/ items of left,right).
findvalnames:{[namepair]
  left: namepair[0]
  right: namepair[1]
  shared: #intersect[left; right]
  total: #?left,right
  :shared % total
}

/ Postal-code similarity for one pair.
/ codepair: two-item list (left code; right code).
/ Returns (count of intersect[left;right]) divided by (count of the distinct
/ items of left,right).
findvalcodes:{[codepair]
  left: codepair[0]
  right: codepair[1]
  shared: #intersect[left; right]
  total: #?left,right
  :shared % total
}

/ Telephone similarity for one pair.
/ CHANGE HERE: any overlap at all counts as a full match.
/ telpair: two-item list (left phones; right phones).
/ Returns 1 when intersect[left;right] is non-empty, otherwise 0.
findvaltel:{[telpair]
  shared: intersect[telpair[0]; telpair[1]]
  :0 < #shared
}

/ Address similarity for one pair.
/ addpair: two-item list (left address; right address).
/ Returns (count of intersect[left;right]) divided by (count of the distinct
/ items of left,right).
findvaladdresses:{[addpair]
  left: addpair[0]
  right: addpair[1]
  shared: #intersect[left; right]
  total: #?left,right
  :shared % total
}

/ City similarity for one pair.
/ vpair: two-item list (left ville; right ville).
/ Returns (count of intersect[left;right]) divided by (count of the distinct
/ items of left,right).
findvalvilles:{[vpair]
  left: vpair[0]
  right: vpair[1]
  shared: #intersect[left; right]
  total: #?left,right
  :shared % total
}

/ Build the adjacency matrix of an undirected graph.
/ edges: list of two-item lists (node; node).
/ Row/column k stands for the k-th distinct endpoint, taken in order of first
/ appearance in edges[;0],edges[;1].
/ Returns a square 0/1 matrix with 1 on the diagonal and 1 in both
/ directions for every edge.
buildmatrix:{[edges]
  nodes: ?edges[;0],edges[;1]
  n: #nodes
  adj: (n; n) # 0
  / every node is connected to itself
  d: 0
  while[d < n
    adj[d;d]: 1
    d+: 1
  ]
  / mark each edge in both directions
  e: 0
  while[e < #edges
    src: nodes ? edges[e;0]
    dst: nodes ? edges[e;1]
    adj[src;dst]: 1
    adj[dst;src]: 1
    e+: 1
  ]
  :adj
}

/ Transitive closure of a square 0/1 adjacency matrix.
/ Each pass adds, for every row, the columns reachable through one
/ intermediate node; passes repeat until the matrix stops changing.
/ mat: square 0/1 matrix. Returns the closed matrix.
transclos:{[mat]
  changed: 1
  while[changed
    cols: +mat
    grown: ()
    r: 0
    while[r < #mat
      row: mat[r]
      / reach[c] is 1 when some k has both row[k] and mat[k;c] set
      reach: |/'row */: cols
      grown,: ,reach |' row
      r+: 1
    ]
    changed: ~ mat ~ grown
    mat: grown
  ]
  :mat
}

/ Reachability between the endpoints of a list of edges.
/ Builds the adjacency matrix with buildmatrix, keeps a copy of it in the
/ global tmpmat, and returns its transitive closure from transclos.
/ edges: list of two-item lists (node; node).
findcycles:{[edges]
  adj: buildmatrix[edges]
  tmpmat:: adj
  :transclos[adj]
}

/ DATA


/ suppress: symbols for articles, street-type words and company-form
/ abbreviations. Presumably the stop-word list used when normalising names
/ and addresses; its use is outside this section -- TODO confirm.
/ Note that `"&" is listed twice.
suppress: (`"DE"
`AU
`ST
`SAINT
`"LE"
`"LA"
`"DU"
`"-"
`"DES"
`"AV"
`"AVENUE"
`"ROUTE"
`"RTE"
`"&"
`ET
`RUE
`"R."
`"BIS"
`"B."
`"BOULEVARD"
`"BLVD"
`"BD"
`"AVE"
`BLD
`SARL
`CEDEX
`SA
`"&"
`ENT
`SAS
`SNC
`STE
`ADMIN)


/ Parallel lists of equal length: presumably item k of translatesource is
/ rewritten to item k of translatetarget (AV to AVENUE, RTE to ROUTE,
/ & to ET) -- the code applying them is outside this section.
translatesource:(`AV `RTE `"&")
translatetarget:(`AVENUE `ROUTE `ET)


/ EXECUTION

/ TIME TESTING HARNESS (REMOVE / if you want this)
/ .time.set`.k
/ start: _t
/ END OF TIME TESTING HARNESS (REMOVE / if you want this)

/ Codecivil|NomRaisonsociale|Prenom|LigneAdresse1|LigneAdresse2|
/ LigneAdresse3|Numdevoie|LigneAdresse4|CP|Ville|Pays|TelDom|
/ TelPortLap|TelBuroLap|Fax|email|NatureClient|Contact|Id|
/ SIREN_SIRET|Statut_RNVP|Enseigne

/ Start-up banner written to the console: names the input files this run
/ expects and what it will produce.
` 0: ,"Test files are ajouts2.l, refRNVPyes.l and refRNVPno.l "
` 0: ,"Will put out candidate matches."

/ update Statut_RNVP so if a Ville has very few customers and RNVP
/ is 2, then considered non-fiable (so goes to 0)
/ Otherwise it is 1.

/ Eliminate stop words from address. Make those fields be a string without
/ blanks  or other punctuation.

/ For Nomraisonsociale, eliminate punctuation, then obtain a collection
/ of words which we first rearrange alphabetically and then sort into a string.


/ data entry at the very beginning
/ ajouts: 1: "ajouts2"
/ refRNVPyes: 1: "refRNVPyes"
/ refRNVPno: 1: "refRNVPno"

/ Partition refRNVPyes on the triple (CP; Pays; Numdevoie),
/ because these are reliable.
/ (Ville is not part of the key built below.)
/ partyes: groups of row positions sharing one triple.
/ uniqsyes: one symbol per distinct triple, made by formatting the three
/ fields and joining them.
partyes: = (refRNVPyes.CP) ,' (refRNVPyes.Pays) ,' (refRNVPyes.Numdevoie)
uniqsyes: ` $ ,/' $ ? (refRNVPyes.CP),'(refRNVPyes.Pays),'(refRNVPyes.Numdevoie)

/ First three characters of word, as a symbol.
/ word is formatted to text first; when that text is shorter than three
/ characters the empty symbol is returned instead.
getfirstthree:{[word]
  chars: $word
  if[(#chars) < 3; :`]
  :` $ 3 # chars
}

/ Index refRNVPno by the first three characters of CP.
firstthree: getfirstthree' $ refRNVPno.CP
/ partno: groups of row positions sharing one key; uniqsno: the distinct keys
/ (assumes group and unique list keys in the same order -- the deletion
/ below relies on it)
partno: = firstthree
uniqsno: ? firstthree
/ getfirstthree gives the empty symbol for a CP shorter than three
/ characters; when such a bucket exists, drop it from both lists
i: uniqsno ? `
if[i < #uniqsno
  partno: partno _di i
  uniqsno: uniqsno _di i
]


/ Per-row concatenation of the four address lines of ajouts.
addressesajouts: (ajouts.LigneAdresse1),'(ajouts.LigneAdresse2),'(ajouts.LigneAdresse3),'(ajouts.LigneAdresse4)

/ addressesyes: (refRNVPyes.LigneAdresse1),'(refRNVPyes.LigneAdresse2),'(refRNVPyes.LigneAdresse3),'(refRNVPyes.LigneAdresse4)

/ addressesno: (refRNVPno.LigneAdresse1),'(refRNVPno.LigneAdresse2),'(refRNVPno.LigneAdresse3),'(refRNVPno.LigneAdresse4)

/ Per-row concatenation of the three telephone fields, for each table.
telsajouts: (ajouts.TelDom),'(ajouts.TelPortLap),'(ajouts.TelBuroLap)
telsyes: (refRNVPyes.TelDom),'(refRNVPyes.TelPortLap),'(refRNVPyes.TelBuroLap)
telsno: (refRNVPno.TelDom),'(refRNVPno.TelPortLap),'(refRNVPno.TelBuroLap)

/ Reference names converted to symbols.
namesyes: ` $ refRNVPyes.NomRaisonsociale
namesno: ` $ refRNVPno.NomRaisonsociale

/ new as of aug 25
/ Word index on NomRaisonsocialewords: nameswordsyes is every word of every
/ row flattened into one list, and nameswordsindexesyes repeats each row
/ number once per word of that row, so the two lists line up item by item.
/ The global indexes is a scratch variable, reused for the "no" table below.
indexes: !#refRNVPyes.NomRaisonsocialewords
nameswordsyes: ,/ refRNVPyes.NomRaisonsocialewords
nameswordsindexesyes: ,/(#:' refRNVPyes.NomRaisonsocialewords) #' indexes

/ same word index for refRNVPno
indexes: !#refRNVPno.NomRaisonsocialewords
nameswordsno: ,/ refRNVPno.NomRaisonsocialewords
nameswordsindexesno: ,/(#:' refRNVPno.NomRaisonsocialewords) #' indexes
/ end of new as of aug 25

/ Create an index on refRNVP rows with statut = 1
/ based on code postal and pays and ville and numerodevoie
/ Create another one on all refRNVP rows based on SIREN
/ Create a third based on NomRaisonsociale.

/ Describe how we get RNVP of 1 and 0.
/ Any RNVP of 2 becomes 1 if there are at least 20 lapeyre customers
/ in that Ville. Any RNVP of 1 also becomes 1.
/ All others become 0.

/ We assume that a row in ajouts can potentially map to any reference
/ row having Statut_RNVP of 1.
/ However, if a row in ajouts has a Statut_RNVP of 1, we assume it will NOT
/ map to a reference row having Statut_RNVP of 0,
/ because we don't think it makes sense for an ajouts
/ row to pass the RNVP processing whereas a matching reference row doesn't.
/ (The August 30 note in the file header allows an RNVP-yes ajouts to be
/ compared with a no, so this assumption may have been relaxed since.)

/ We divide the processing into comparisons with reference rows having
/ Statut_RNVP of 1 (we call these refRNVPyes rows)
/ and those having Statut_RNVP of 0 (we call these refRNVPno rows).

/ For each ajouts row r, the following are candidate sets in refRNVPyes:
/ any row having the same CP, Pays, Numdevoie, 
/ or any row having the same non-empty NomRaisonsociale
/ or any row having the same non-empty SIREN
/ (which is reduced to the first 9 digits)
/ Then we follow vecmatchyes rules.

/ For each ajouts row r having RNVP_Statut of 0, an additional set of
/ candidates from refRNVPno is defined as follows:
/ any row having the same first three digits of the CP
/ or any row having the same non-empty NomRaisonsociale
/ or any row having the same non-empty SIREN
/ (which is reduced to the first 9 digits).
/ Then we follow vecmatchno rules.
/ At the end, if we haven't found anything, we recover possibles
/ that have the same SIREN_SIRET number; ditto for the name.

/ Performance: Using a Sun UltraSPARC-IIi 333 Mhz
/ and  Solaris 9 s9_58shwpl3 
/ with 256 megs of ram, the program
/ went through 28,000 ajouts rows per hour
/ about 8 per second.
/ (11.5 hours for 326,258 rows)

/ If NatureClient is 1 in one case and 2 in the other, then reject.
/ If both have Sirets (first 9 digits of that field) and different, then reject
/ If both have contacts and different, then reject.
/ If both have telephones and they are at least 8 long and the same then accept.
/ If both have code civile and one is 1 and the other is 2, 3, or 4,
/ then add negative value
/ Compare names: if first letter different then reduce score.
/ Similarly for prenoms and for ville.

/ Now look at non-reliable RNVP. 
/ These will be compared with both reliable RNVP and non-reliable RNVP.
/ Partition (index) based on first digits of CP.
/ If both have telephones and they are the same then accept.
/ If names and ville and CP and address are same, then accept.
/ Otherwise at best possible.
/ If NatureClient is 1 in one case and 2 in the other, then reject.
/ If both have Sirets (first 9 digits of that field) and different, then reject
/ If both have contacts and different, then reject.
/ Compare names: if first letter different then reject.
/ If name is very different then reject
/ If Ville is very different then reject.
/ If both have code civile and one is 1 and the other is 2, 3, or 4,
/ then reject.
/ Else give measure.
/ Look also at prenoms.
/ Look at addresses.
/ Look at rest of CP.

/ Mode switch. findmatches stays 1 for a normal matching run and becomes 0
/ when _i is non-empty; per the original note that happens when the script is
/ started with an extra word, e.g. k cleanmini doodad.
/ With findmatches = 0 the false-negative diagnostics run instead.
findmatches: 1
if[0 < #_i / to get this just type k cleanmini doodad
  findmatches: 0
]

/ False-negative diagnostics.
/ The idea is that we look at each pair that we missed and work out
/ why we missed it.
/ falsenegs is read from the file "falsenegs"; each item is a pair
/ (ajouts Id; reference Id). For each pair the reference row is looked up in
/ refRNVPyes and in refRNVPno. Every mismatch among CP, Pays and Numdevoie is
/ recorded as (reason; ajouts position; reference position). When all three
/ agree, the result of vecmatchyes / vecmatchno is recorded instead, followed
/ by `yes or `no and the two positions.
/ The findings are written to the file "outmissing".
if[findmatches = 0 / we want to check on false negatives
   outmissing: ()
   falsenegs: 1: "falsenegs"
   i: 0
   while[i < #falsenegs
	pair: falsenegs[i]
	/ position(s) of the ajouts row carrying the first Id of the pair
	jj: & (ajouts.Id) = pair[0]
	if[0 = #jj; ` 0: , ("No chance for "), $pair[0]]
	if[0 < #jj
		index: *jj
		/ NOTE(review): yesflag compares the whole Statut_RNVP column, not
		/ row index, and is not used anywhere in this block
		yesflag: ajouts.Statut_RNVP = `"1"
		/ reference row among the reliable (yes) rows
		jjyes: & (refRNVPyes.Id = pair[1])
		if[(0 < #jjyes)
		  indexc: *jjyes
 		  x1: (ajouts.CP[index]) = (refRNVPyes.CP[indexc])
 		  x2: (ajouts.Pays[index]) = (refRNVPyes.Pays[indexc])
 		  x3: (ajouts.Numdevoie[index]) = (refRNVPyes.Numdevoie[indexc])
		  if[x1 = 0; outmissing,: ,(`CPdifferent_yes; index; indexc)]
		  if[x2 = 0; outmissing,: ,(`Paysdifferent_yes; index; indexc)]
		  if[x3 = 0; outmissing,: ,(`Numdevoiedifferent_yes; index; indexc)]
		  if[x1 & x2 & x3
		  	y: vecmatchyes[index;indexc]
			y,: (`yes; index; indexc)
			outmissing,: ,y
		  ]
		]
		/ reference row among the non-reliable (no) rows
		jjno: & (refRNVPno.Id = pair[1])
		if[(0 < #jjno)
		  indexc: *jjno
 		  x1: (ajouts.CP[index]) = (refRNVPno.CP[indexc])
 		  x2: (ajouts.Pays[index]) = (refRNVPno.Pays[indexc])
 		  x3: (ajouts.Numdevoie[index]) = (refRNVPno.Numdevoie[indexc])
			/ also look at yes flag
		  if[x1 = 0; outmissing,: ,(`CPdifferent_no; index; indexc)]
		  if[x2 = 0; outmissing,: ,(`Paysdifferent_no; index; indexc)]
		  if[x3 = 0; outmissing,: ,(`Numdevoiedifferent_no; index; indexc)]
		  if[x1 & x2 & x3
		  	y: vecmatchno[index;indexc]
			y,: (`no; index; indexc)
			outmissing,: ,y
		  ]
		]
			
	]
	i+: 1
   ]
   "outmissing" 1: outmissing
!-11 / finished processing false negatives 
]

  
/ Main matching run.
/ For experiments, add in tmpminiaddon
/ Calls findbestmatch for every ajouts row position and collects the results
/ in quads. After every 1000th row (never at row 0) a progress report is
/ printed and quads is saved to the file "tmpquads". At the end quads is
/ saved to "quads" and one spitline text row per quad goes to "tmpout".
outmatch: ()
limit:  #ajouts.Codecivil
quads: ()
i: 0
while[i < limit
  quads,: findbestmatch[i]
  / ` 0: ($i), ("  ")
  / true when i is a positive multiple of 1000
  if[((i % 1000) = _ (i % 1000)) & (i > 0)
	` 0: ,("Number of quads: "),$#quads
	` 0: ,("Number of ajouts: "),$i
	` 0: ,("  ")
	"tmpquads" 1: quads
  ]
  i+: 1
]
"quads" 1: quads
"tmpout" 0: spitline'quads


/ TIME TESTING HARNESS (REMOVE / if you want this)
/ .time.sum[]
/ xf: .TIME.f
/ xt: .TIME.t
/ xx: xf,'xt
/ zz: xx @ <xx[;1]
/ `show `.TIME
/ TIME TESTING HARNESS (ADD / if you want this)

/ The \\ below leaves the K interpreter once the script has finished.
/ TIME: comment out the \\ if you want the timing harness, so that the
/ session stays open.
\\