/ From notes of Alex Rubinsteyn
/ The eta term has to be computed using leave-one-out.
/ Regularization tries to reduce the number of inputs, or perhaps
/ to reduce the size of each contribution.
/ In an active learning scenario, we must vary the experiments
/ (we could use a combinatorial design).
/ So the assumption is that we get a bunch of labeled examples
/ (all inputs plus a labeled output).
/ Then we find the best eta. Then we find the best weights.
/ We have a very primitive regularization as of November 15, 2008.
/ For our time series, for each target gene, we will run this
/ procedure over all time points.

/ given input vector x (the value of each attribute for this example)
/ current weights w
/ score s (the label for that input)
/ eta (the learning rate; a fudge factor)
/ (regularization, the number of non-zero weights, is applied by
/ the caller through the global numnonzero)
/ returns a new set of weights
stocgraddescent:{[x; w; s; eta]
  diff: s - sum x*w;   / error of the current prediction
  grads: x*diff;
  / change each weight in proportion to the size of its xi
  :w + eta*grads;
 }

/ call stochastic gradient descent once per labeled experiment,
/ regularizing as we go; return a set of weights
graddescent:{[X; w; S; eta]
  i: 0;
  while[i < count X;
    w: stocgraddescent[X[i]; w; S[i]; eta];
    w: regularize[w; numnonzero];
    i+: 1;
  ];
  :regularize[w; numnonzero];
 }

/ if numnonzero < the number of non-zeros in w, then zero the
/ weights that are lowest in absolute value until at most
/ numnonzero non-zeros remain
regularize:{[w; numnonzero]
  if[numnonzero < 0; :w];               / unbounded: don't do anything
  if[numnonzero > ((count w)-1); :w];   / budget covers all weights: don't do anything
  a: abs each w;
  ii: idesc a;                 / indices ordered by descending |weight|
  indtozero: numnonzero _ ii;  / all but the numnonzero largest
  / zero the dropped weights; amending with *0 preserves w's type
  :@[w; indtozero; *; 0];
 }

/ discover the best eta from the list of etaposs
leaveoutone:{[X; w; S; etaposs]
  besteta: neg 1;
  bestcost: 1000 * sum abs each S;   / a large initial cost to beat
  i: 0;
  while[i < count etaposs;
    myeta: etaposs[i];
    c: myleaveoutone[X; w; S; myeta];
    if[c < bestcost;
      besteta: myeta;
      bestcost: c;
    ];
    i+: 1;
  ];
  :besteta;
 }

/ This one does a gradient descent on all but one example each time
/ and sums up the squared error on each held-out example
myleaveoutone:{[X; winitial; S; myeta]
  cost: 0;
  i: 0;
  while[i < count X;
    w: graddescent[X _ i; winitial; S _ i; myeta];
    / squared prediction error on the held-out example
    cost+: ((sum X[i]*w) - S[i]) xexp 2;
    i+: 1;
  ];
  :cost;
 }

/ DATA
X:(0 0 0 0; 0 0 0 1; 0 0 1 0; 0 0 1 1;
   0 1 0 0; 0 1 0 1; 0 1 1 0; 0 1 1 1;
   1 0 0 0; 1 0 0 1; 1 0 1 0; 1 0 1 1;
   1 1 0 0; 1 1 0 1; 1 1 1 0; 1 1 1 1)
/ a noise-free label set (equal to the second attribute), superseded below:
/ S: 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
S: 0 0 0.7 1 0 0 1 0.4 0 0 1 1 0 0 1 1
winitial: 0 0 0 0

/ EXECUTE
numnonzero: -1;   / if -1 then allow an unbounded number of non-zero weights
possible_eta: 0.00001 0.00005 0.0001 0.0005 0.001 0.005 0.01 0.05 0.1 0.5 1.0;
besteta: leaveoutone[X; winitial; S; possible_eta];
show besteta;
w: graddescent[X; winitial; S; besteta];
show w;
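
/ ---------------------------------------------------------------
/ A worked example of the update rule inside stocgraddescent, on
/ made-up values (xeg and weg are names invented for this sketch;
/ they are not part of the data above):
/   prediction = sum x*w   = 0.2 + 0.1 = 0.3
/   diff       = s - 0.3   = 1 - 0.3   = 0.7
/   grads      = x*diff    = 0.7 0 0.7 0.7
/   new w      = w + eta*grads, about 0.27 0 0.17 0.07 for eta = 0.1
xeg: 1 0 1 1f;
weg: 0.2 0 0.1 0;
show stocgraddescent[xeg; weg; 1f; 0.1];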
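
/ ---------------------------------------------------------------
/ A worked example of regularize on made-up weights: keep the 2
/ weights largest in absolute value and zero the rest.
/ Here |w| = 0.1 0.5 0.02 0.3, so indices 1 and 3 survive:
show regularize[0.1 -0.5 0.02 0.3; 2];   / 0 -0.5 0 0.3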
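
/ ---------------------------------------------------------------
/ An equivalent, more idiomatic formulation of graddescent that
/ folds over the examples with over (/) instead of an explicit
/ while loop. graddescent2 is a name introduced for this sketch,
/ not part of the original notes; under the same global numnonzero
/ it should produce the same weights as graddescent.
graddescent2:{[X; w; S; eta]
  step: {[w; x; s; eta] regularize[stocgraddescent[x; w; s; eta]; numnonzero]}[;;;eta];
  :step/[w; X; S];
 }
show graddescent2[X; winitial; S; besteta];   / should match w above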
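
/ ---------------------------------------------------------------
/ The run above leaves the regularizer off (numnonzero = -1). For
/ comparison, a second fit under a budget of 2 non-zero weights,
/ which actually exercises the zeroing path in regularize; w2 is
/ a name introduced for this sketch.
numnonzero: 2;
w2: graddescent[X; winitial; S; besteta];
show w2;
numnonzero: -1;   / restore the unbounded setting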