/ This file generates tables in file format for loading.
/ Vertical bar delimited

/ New July 20, 2004: ability to put in any number of distributions.
/ Default is uniform. When non-uniform, we specify as normal, fractal
/ etc. The way we do these is as follows: we get the range of values
/ and then we apply the distribution to that range. 
/ This means that we take all the possible values and we choose an
/ arbitrary permutation of the numbers. The most frequent go in the
/ first member of the permutation, then the next, the next etc.
/ Then we permute the constructed vectors at the end.
/ When putting this up do a diff with the current version of gentable.k
/ to see what was already there. The biggest issue is foreign keys.
/ My best belief is that it is unnecessary to do anything special for them
/ but the version at nyu may have nevertheless done something.
/ Finally, a good example of a specification is 
/ sale 1.0
/ item 0.01 normal
/ customer 0.01 fractal
/ store 100.0
/ n 100.0 fractal
/ n 50.0 normal
/ With a call of the form k gentable 100000 sales salesspec 1

/ Input as follows:
/ k gentable numrows outfile specificationfile numkeyfields
/ For example, suppose we want to produce data to go into
/ roomtype(hotelid, roomtypeid, description)
/ where the first two fields are the keys.
/ If the file roomtypespec has the data
/ hotel 0.25
/ n 4.0
/ desc 16.0
/
/ Then we can produce this as follows:
/ k gentable 30 roomtype roomtypespec 2
/
/ and we will produce a 30 row, 3 column output
/ with the first two fields constituting a key.
/ The second field of each spec is either 1, in which case
/ the number of different values equals the number of rows;
/ or less than 1, in which case it is the fraction of the number
/ of rows that gives the number of different values; or greater
/ than 1, in which case the choice is from that many values.
/ In the above example, there are 1/4 as many distinct
/ values of hotel as there are rows, there are only 4 room
/ types and only 16 descriptions.


/ APPLICATION

/ Format every item of list as a string and join them into one line,
/ with a vertical bar between consecutive items (no trailing bar).
spitvert:{[list] 1 _ ,/ (,"|") ,/: $list}

/ Split a line into its space-separated fields.
/ line: a character vector.
/ Returns a list of character vectors, one per field.
/ Leading spaces, trailing spaces and runs of several spaces never
/ produce empty fields; an empty or all-blank line yields ().
getfields:{[line]
  blank: line = " "
  chars: &~blank
  / nothing but spaces (or nothing at all): no fields
  if[0 = #chars; :()]
  gaps: &blank
  / ignore the spaces that follow the last non-space character
  gaps: gaps[& gaps < *|chars]
  packed: line[chars]
  / Position of each space within the space-free text is its original
  / position minus the number of spaces before it. Adjacent or leading
  / spaces map to the same cut point, so keep each cut point only once.
  :(?0, gaps - !#gaps) _ packed
}


/ generate a table (earlier version)
/ NOTE: a later definition in this file assigns gentable again, so this
/ version is not the one the script at the end of the file invokes.
/ numrows:   number of values generated per column.
/ numcols:   number of entries of specpairs to process.
/ specpairs: list of specs; item 0 is the column name (`n for a numeric
/            column), item 1 controls the number of distinct values.
/            Item 2 is read into basevalue but is not used in this version.
/ numkeys:   number of leading columns forming the key; when positive only
/            the first row for each distinct key combination is kept.
/ For each column: a spec value of 1.0 deals numrows distinct values, a
/ value below 1 draws from floor(value * numrows) values, a value above 1
/ draws from floor(value) values. Values are shifted to start at 1 and,
/ for non-numeric columns, prefixed with the column name.
/ The first output row holds a type tag per column: numeric, or
/ varchar(n) where n is the longest generated string in that column.
/ Returns a list of strings, one per row, fields separated by "|".
gentable:{[numrows; numcols; specpairs; numkeys]
  out: ()
  i: 0
  while[i < numcols
	pair: specpairs[i]
        basevalue: pair[2]
	if[pair[1] = 1.0
		x: 1  + numrows _draw -numrows
	]
	if[1 > pair[1]
		x: 1 + numrows _draw _ (pair[1]) * numrows
	]
	if[1 < pair[1]
		x: 1 + numrows _draw  (_ pair[1]) 
	]
	base: :[pair[0] = `n
	 	`	
		pair[0]]
	y: $ ($base) ,/: $x
	maxlen: |/ #:'y
	numericflag: pair[0] = `n
	first: :[numericflag; `numeric; ` $ ("varchar("),($maxlen),(")")]
	y: ` $ y
	out,: ,first, y
	i+: 1
  ]
  x: +out
  if[0 < numkeys
  	keys: +out[!numkeys]
  	part: = keys
  	y: x[*:'part]
  ]
  if[0 = numkeys
	y: x
  ]
  z: spitvert'y
  :z
}

/ July 2004 changes below

/ remove comments in spec file
/ file: list of lines (character vectors) as read from the spec file.
/ Returns the lines that carry a spec: empty lines, lines starting
/ with "/" and lines starting with a space are all dropped.
removecomments:{[file]
  / drop zero-length lines first; they have no first character to inspect
  file: file[& 0 < #:'file]
  lead: file[;0]
  :file[&~ (lead = "/") | (lead = " ")]
}

/ Parse one spec line into (name; count; basevalue) or
/ (name; count; basevalue; distribution).
/ Accepted forms, fields separated by spaces:
/   hotel 0.8            -> (`hotel; 0.8; 0.0)
/   hotel 0.8 100        -> (`hotel; 0.8; 100.0)           base value given
/   hotel 0.8 fractal    -> (`hotel; 0.8; 0.0; `fractal)   distribution given
/   hotel 0.8 100 normal -> (`hotel; 0.8; 100.0; `normal)  both given
/ In the three-field form the third field is taken as a distribution
/ only when it is exactly "fractal" or "normal"; anything else is parsed
/ as a numeric base value.
/ Lines with any other number of fields match no branch.
getspec:{[line]
  x: getfields[line]
  if[2 = #x
  	:(` $ x[0]; 0.0 $ x[1]; 0.0)
  ]
  if[3 = #x
  	if[(x[2] ~ "fractal") | (x[2] ~ "normal")
  		:(` $ x[0]; 0.0 $ x[1]; 0.0; ` $ x[2])
  	]
  	:(` $ x[0]; 0.0 $ x[1]; 0.0 $ x[2])
  ]
  if[4 = #x
  	:(` $ x[0]; 0.0 $ x[1]; 0.0 $ x[2]; ` $ x[3])
  ]
}

/ generate a table
/ numrows:   number of values generated per column.
/ numcols:   number of entries of specpairs to process.
/ specpairs: list of specs of the form (name; count; basevalue) or
/            (name; count; basevalue; distribution). name `n marks a
/            numeric column; any other name is used as a prefix of
/            every value. distribution is fractal or normal, given
/            either as a symbol or as a string; anything else, or no
/            fourth item, means uniform.
/ numkeys:   number of leading columns forming the key; when positive only
/            the first row for each distinct key combination is kept.
/ count = 1.0 deals numrows distinct values regardless of distribution.
/ count < 1 uses floor(count * numrows) distinct values, but at least one.
/ count > 1 uses floor(count) distinct values.
/ basevalue is added to every generated number.
/ The first output row holds a type tag per column: numeric, or
/ varchar(n) where n is the longest generated string in that column.
/ Returns a list of strings, one per row, fields separated by "|".
gentable:{[numrows; numcols; specpairs; numkeys]
  out: ()
  i: 0
  while[i < numcols
	pair: specpairs[i]
	basevalue: pair[2]
	function: `uniform
	/ the distribution tag is matched both as a symbol and as a string
	if[4 = #pair
	  if[(pair[3] ~ `fractal) | (pair[3] ~ "fractal")
		function: `fractal
	  ]
	  if[(pair[3] ~ `normal) | (pair[3] ~ "normal")
		function: `normal
	  ]
	]
	/ number of distinct values; a small fraction of a small table must
	/ not round down to zero values
	nd: :[1 > pair[1]; 1 | _ (pair[1]) * numrows; _ pair[1]]
	if[pair[1] = 1.0
		x: 1 + numrows _draw -numrows
	]
	if[(~ pair[1] = 1.0) & (function = `uniform)
		x: 1 + numrows _draw nd
	]
	if[(~ pair[1] = 1.0) & (function = `fractal)
		x: fractal[numrows; 1 + !nd]
	]
	if[(~ pair[1] = 1.0) & (function = `normal)
		x: normal[numrows; 1 + !nd]
	]
	/ NOTE(review): getspec parses basevalue as a float, so x is float
	/ from here on and may be formatted with a decimal point - confirm
	/ that this is the intended output format.
	x: x + basevalue
	base: :[pair[0] = `n; `; pair[0]]
	y: $ ($base) ,/: $x
	maxlen: |/ #:'y
	numericflag: pair[0] = `n
	first: :[numericflag; `numeric; ` $ ("varchar("),($maxlen),(")")]
	y: ` $ y
	out,: ,first, y
	i+: 1
  ]
  x: +out
  if[0 < numkeys
	keys: +out[!numkeys]
	part: = keys
	y: x[*:'part]
  ]
  if[0 = numkeys
	y: x
  ]
  z: spitvert'y
  :z
}


/ Return the items of vec in a random order (a deal of all its indices).
perm:{[vec] n: #vec; vec[n _draw -n]}

/ Round x to the nearest integer by flooring x plus one half.
round:{[x] _ 0.5 + x}

/ Draw count items from minivec at random, with repetition allowed.
assemble:{[count; minivec] picks: count _draw #minivec; minivec[picks]}

/ We want to take n elements of vec according to a fractal (80/20) rule.
/ Top 20% gets 80% of the hits. Top 20% of that gets 80% of those hits etc.
/ The vec has no duplicates.
/ n:   number of elements to produce.
/ vec: the distinct candidate values.
/ vec is shuffled, then repeatedly split: the first 80% of the values
/ still in play share 20% of the hits still to be handed out, and the
/ remaining 20% of the values go on to the next round. The last value
/ left receives whatever hits remain, so the result has exactly n
/ elements for any vec with at least one value.
/ Returns the n elements in random order.
fractal:{[n; vec]
  tmp: perm[vec]
  hi: 0.8
  lo: 1 - hi
  tot: n
  out: ()
  while[1 < #tmp
	mynum: _ hi * #tmp
	/ share of the hits still outstanding, not of the original n;
	/ otherwise long vectors hand out more than n hits in total
	mytot: _ lo * tot
	out,: assemble[mytot; tmp[!mynum]]
	tot-: mytot
	tmp: mynum _ tmp
  ]
  if[1 = #tmp; out,: tot # tmp[0]]
  :perm[out]
}

/ Item of vec found the fraction frac of the way through it
/ (index floor of frac times the length of vec).
findval:{[vec; frac] vec[_ frac * #vec]}

/ Index found the fraction frac of the way through vec
/ (floor of frac times the length of vec).
findind:{[vec; frac] _ frac * #vec}

/ Arithmetic mean of the items of v.
avg:{[v] (+/ v) % #v}
/ Variance of v: mean of the squares minus the square of the mean.
var:{[v] mu: avg[v]; avg[_sqr v] - _sqr mu}
/ Standard deviation of v: square root of the variance.
std:{[v] _sqrt var[v]}

/ We want to take n elements of vec according to a normal distribution.
/ We take the mean value of vec to be the center of the distribution.
/ We take mean +/- 10% of the values either way from the mean to get
/ 68% of the values. (e.g. if #vec is 100, we take 40-60 as being
/ within one SD).
/ Then mean +/- 20% for the second SD which will have 95% of the values
/ etc.
/ The vec has no duplicates.
/ n:   number of elements to produce.
/ vec: the distinct candidate values.
/ The sorted values are split into three bands by position: the core
/ (40%-60%), the ring around it (30%-40% and 60%-70%) and the two tails.
/ The core supplies floor(0.68 n) elements, the ring brings the total to
/ floor(0.95 n), the tails supply the rest.
/ When vec is so short that a band holds no values, that band's share
/ is drawn from the whole of vec instead, so the result still has n
/ elements.
/ Returns the n elements in random order.
normal:{[n; vec]
  tmp: vec[<vec]
  low1: findind[tmp; 0.4]
  high1: findind[tmp; 0.6]
  low2: findind[tmp; 0.3]
  high2: findind[tmp; 0.7]
  core: tmp[low1 + !(high1 - low1)]
  ring: (tmp[low2 + !(low1 - low2)]), (tmp[high1 + !(high2 - high1)])
  tail: (tmp[!low2]), (tmp[high2 + !((#tmp) - high2)])
  / an empty band cannot be sampled; fall back to every value
  if[0 = #core; core: tmp]
  if[0 = #ring; ring: tmp]
  if[0 = #tail; tail: tmp]
  ncore: _ 0.68 * n
  nring: (_ 0.95 * n) - ncore
  ntail: n - (ncore + nring)
  out: ()
  out,: assemble[ncore; core]
  out,: assemble[nring; ring]
  out,: assemble[ntail; tail]
  :perm[out]
}


/ EXECUTION

/ Defaults, used when the script is started without arguments.
numrows: 20
specpairs: 4 # ,(`n; 1.0; 0.0)
	/ Each spec is (name; count; basevalue).
	/ `n means numeric and 1 means uniqueness
	/ Otherwise can give a symbol which will be the
	/ basic string of the table.
	/ The third item is the base value that gentable reads from every spec.
specpairs[2]: (`hotel; 0.5; 0.0)
/ gentable needs the column count in the default case as well
numcols: # specpairs
targtable: "tmpout"
numkeys: 1
/ With exactly four arguments: numrows outfile specificationfile numkeyfields
if[4 = #_i
  numrows: 0 $ _i[0]
  targtable: _i[1]
  a: 0: _i[2]
  b: removecomments[a]
  specpairs: getspec'b
  numcols: # specpairs
  numkeys: 0 $ _i[3]
]


` 0: ,"k gentable numrows targtable specpairfile numkeys "
` 0: ,("Look in file: "), targtable, (" for results.")
/ July 2004 added
` 0: ,("Type k gentable H for help")
/ Any other non-zero number of arguments: print the help text and exit.
if[(0 < #_i) & (~ 4 = #_i)
` 0:,("Specfile entry is of form n 500 meaning choose from 1 to 500, uniformly")
` 0:,("or of form n 500 fractal, where we draw based on a fractal distribution")
` 0:,("or foobar 500, where we draw from foobar1 to foobar500 uniformly")
` 0:,("or foobar 500 fractal: we draw from foobar1 to foobar500 fractally")
` 0:,("or foobar 0.01, or foobar 0.01 fractal or n 0.01 or n 0.01 fractal")
` 0:,("These last signify that we draw from 1 to 0.01*numrows.")
` 0:,("Can also choose a close to normal distribution (but with wider tails)")
` 0:,("by using the word normal as a qualifier rather than fractal.")
` 0:,("Example spec: ")
` 0:,("sale 1.0")
` 0:,("item 0.01 normal")
` 0:,("customer 0.01 fractal")
` 0:,("store 100.0")
` 0:,("n 100.0 fractal")
` 0:,("n 50.0 normal")
. "\\\\"
]

/ Generate the table and write it, one row per line, to targtable.
targtable 0: gentable[numrows;numcols; specpairs; numkeys]
\\