/ Always go back to most recent NAT.
/ convertnum why does the partition. 


/ parse the genes and the traits (species in the same order)
/ then try to predict traits directly from genes
/ If you want to change how question marks are treated (as 0s or as
/ a middle value between 0 and 1), then see the routine "convertquest"

/ Look at changed to go from taking the median to not doing so.


/ DECISION TREE

log2:{[x] (_log[x]) % (_log[2])}

/ First, if we have lists of raw data:

/ how much outlist entropy is reduced by knowing inlist-outlist relationship
infogain:{[inlist; outlist]
  :(entropy[outlist]) - (condentropy[inlist;outlist])
}

/ compute conditional entropy of output on a single input
condentropy:{[inlist; outlist]
   part: = inlist
   counts: #:' part
   probs: counts % #inlist
   uniqs: ?inlist
   sum: 0
   i: 0
   while[i < #uniqs
	sum+: (probs[i]) * entropy[outlist[part[i]]]
	i+: 1
   ]
   :sum
}

numcorrect: 0 / for training
numwrong: 0
trainwrongrcs: ()
conditionouts: ()

/ build entire tree including cross-validation
buildtreeall:{[innames; inlists; outlist] 
  outdectree:: ,"          "
  globalcount:: 1
  numcorrect:: 0
  numwrong:: 0
  trainwrongrecs:: ()
  numtrainfalsepos:: 0
  numtrainfalseneg:: 0
  numtestfalsepos:: 0
  numtestfalseneg:: 0
  buildtree[(); (); 1; 0; innames; inlists; outlist] 
	/ conditionouts are () 
	/ excluded list is (), label is 1; outdent is 0
  outdectree,: ,("Training set error: "), ($numwrong % (numcorrect + numwrong))
  if[0 < numwrong
      part: = trainwrongrecs
      counts: #:'part
      uniqs: ?trainwrongrecs
      outdectree,: ,("Output value | number of times missed in training")
      outdectree,: spit'uniqs,'(`"|"),/: counts
  ]
  do[10 / cross-validation
    outdectree,: ,"          "
    count: #outlist
    jj: (_ 0.9 * count) _draw -count
    globalcount:: 1
    numcorrect:: 0 / for training
    numwrong:: 0
    trainwrongrecs:: ()
    numtrainfalsepos:: 0
    numtrainfalseneg:: 0
    numtestfalsepos:: 0
    numtestfalseneg:: 0
    conditionouts:: () / condition-outcome pairs
    buildtree[(); (); 1;0;innames;inlists[;jj]; outlist[jj]]
    outdectree,:,("Training set error: "), ($numwrong % (numcorrect + numwrong))
    if[0 < numwrong
      part: = trainwrongrecs
      counts: #:'part
      uniqs: ?trainwrongrecs
      outdectree,: ,("Output value | number of times missed in training")
      outdectree,: spit'uniqs,'(`"|"),/: counts
    ]
    jjrem: differ[(!#outlist); jj]
    outdectree,: ,spitvert innames
    x: spitvert' (+inlists[;jjrem]) ,' outlist[jjrem]
    outdectree,: x
    correctvec: evaltest[conditionouts]' (+inlists[;jjrem]) ,' outlist[jjrem]
    testwrong: #& correctvec = 0
    testwrongrecs: outlist[jjrem[& correctvec = 0]] / output values bad
    testwrongin: (+inlists[;jjrem[& correctvec = 0]])
    testall: #correctvec
    outdectree,:,("Test set error: "), ($testwrong % testall)
    if[0 < #testwrongrecs
      part: = testwrongrecs
      counts: #:'part
      uniqs: ?testwrongrecs
      outdectree,: ,("Output value | number of times missed in test|input")
      z: spit'uniqs,'(`"|"),/: counts,' (`"|"),/:(spit'testwrongin[part])
      outdectree,: z
    ]
  ]
}

/ This builds the decision tree. If using excluded, then never see
/ the same variable more than once.
/ This kind of makes sense if all variables are ordered.
/ co stands for the conditions and the output majority level.
/ This is for test set error calculations.
buildtree:{[co; excluded; label; outdent; innames; inlists; outlist]
 / (bestsplit; bestval; majoutabove; iname)
 x: () / output of choosenext
 ii: & ~ innames _lin excluded
 if[0 < #ii
   pair: choosenext[innames[ii]; inlists[ii]; outlist]
   type: pair[0] 
	/ If the type is `ordered then greater than or less than or equal
	/ If type is `unordered, then unordered type e.g. manufacturelocation
   x: pair[1]
 ]
 if[(0 = #x) | (outdent = maxoutdent)
	part: = outlist
	uniqs: ? outlist
	counts: #:'part
	yy: ,/ uniqs ,' (`"=") ,/: counts
	outdectree,: ,(outdent # (" ")), ("L"), ($label),(": "), spit yy
	maxcount: |/counts
	numcorrect+: maxcount
	i: counts ? maxcount
    	conditionouts,:,(` $ ("L"),$label; co; uniqs[i] ) 
		/ conditions and majority output
	if[1 < #uniqs
		numwrong+: (#outlist) - maxcount
		trainwrongrecs,: outlist _dv (uniqs[i])
	]
	:()
 ]
 z: (outdent # (" ")), ("L"), ($label),(": ")
 if[type = `ordered
   sx: $x[3]
   sx: (1+sx ? "_") _ sx
   allbestgenes,: 0 $ sx
   z,: ("Is "), ($x[3]), (" > "), ($x[0]), ("? (yes=L") 
   z,: ($globalcount+1), (" and no=L"), ($globalcount+2), (")")
   outdectree,: ,z
   i: innames ? x[3]
   inlistall: inlists[i]
   jup: & inlistall > x[0]
   jdown: & ~ inlistall > x[0]
   tmp: globalcount
   globalcount+: 2
   newex: excluded, ` $ $x[3]
   c1: co, ,(x[3];i; `">"; x[0]) / name, index, comparative, value
   c2: co, ,(x[3];i; `"<="; x[0])
   z:buildtree[c1;newex; tmp+1; outdent+2; innames; inlists[;jup]; outlist[jup]]
   z:buildtree[c2;newex;tmp+2;outdent+2;innames;inlists[;jdown]; outlist[jdown]]
 ]
 if[type = `unordered
   i: innames ? x[3]
   inlistall: inlists[i]
   newex: excluded, ` $ $x[3]
   tmpg: globalcount + 1 + !#x[0]
   globalcount:: globalcount +#x[0]
   j: 0
   while[j < #x[0]
      z: ("Is "), ($x[3]), (" = "), ($x[0;j]), ("? (yes=L") 
      z,: ($tmpg[j]), (")")
      outdectree,: ,z
      jeq: & inlistall = x[0;j]
      c: co, ,(x[3];i; `"="; x[0;j])
      z:buildtree[c;newex;tmpg[j];outdent+2;innames;inlists[;jeq];outlist[jeq]]
      j+: 1
   ]
 ]
}

/ evaluate the test set against the tree
/ condouts are all the conditions and inout contains conditions
/ and a predominant output value.
evaltest:{[condouts; inout]
  val: (-1) _ inout
  output: *|inout
  i: 0
  while[i < #condouts
	cout: condouts[i;2]
	cconds: condouts[i;1]
	flag: &/ cmatch[val]'cconds
	if[flag;  :cout = output]
	i+: 1
  ]
  !-11 / no match to any condition?
}

/ cmatch[val; conds]
/ val is a vector of values and conds are conditions in decision tree
/ e.g. val is 4 0 2 5 and cond is (`modelyear;3;`">";0)
/ meaning that ocation 3 should be greater than 0. (True in this case.)
cmatch:{[val;cond]
  x: val[cond[1]]
  comp: cond[2]
  if[comp = `">"
	:x > cond[3]
  ]
  if[comp = `"="
	:x = cond[3]
  ]
  if[comp = `"<="
	:~ x > cond[3]
  ]
}
/ choose the next attribute in the decision tree
/ and its split-point as well as its name
/ If the table is unordered then it will return an unordered answer
choosenext:{[innames; inlists; outlist]
  if[1 = #? outlist; :(`ordered; ())] / output value already homogeneous
  if[1 = #? +inlists; :(`ordered; ())] / input values all the same
  out: ()
  i: 0
  while[i < #innames
	takeflag: 0
	inlist: inlists[i]
	mytype: 4: inlist[0]
	if[mytype _in 1 2
		x: condentropyordered[inlist; outlist]
   		/ (bestsplit; bestval; majoutabove)
	]
	if[mytype = 4 / for symbols
		part: = inlist
		majoutabove: ()
		j: 0
		while[j < #part
			zz: outlist[part[j]]
			part2: = zz
			myc: #:' part2
			myuniqs: ?zz
			k: myc ? |/myc
			majoutabove,: myuniqs[k]
			j+: 1
		]
		x:  (?inlist; (condentropy[inlist; outlist]); majoutabove)
	]
	if[(0 = #out); takeflag: 1] 
	if[0 < #out; if[(x[1]) < (out[1]); takeflag: 1]]
	if[takeflag = 1; out: x, innames[i]]
	i+: 1
  ]
  if[mytype _in 1 2
    if[9999 = out[1]; :(`ordered; ())] / haven't found anything good
    :(`ordered; out)
  ]
  if[mytype = 4 / for symbols
    :(`unordered; out)
  ]
}

/ compute conditional entropy where the inlist is an ordered
/ Finds the best split point among the inlist numbers
/ to give the most entropy gain.
/ Technique: sort inlist. 
/ Divide at each point and compute the condentropy.
/ Return the min condentropy, the split point and majority outlist value
/ for that split point.
condentropyordered:{[inlist; outlist]
   jj: < inlist
   myin: ? inlist[jj]
   myin: (-1) _ myin
   bestval: 9999
   bestsplit: 8888
   majoutabove: 7777 / what the majority outlist value above split point is
   i: 0
   while[i < #myin
	split: myin[i]
	testin: (#inlist) # 0
	jj: & inlist > split
	testin[jj]: 1
	val: condentropy[testin; outlist]
	if[val < bestval
		bestval: val
		bestsplit: split
	]
	i+: 1
   ]
   jj: &inlist > bestsplit
   myouts: outlist[jj]
   part: = myouts
   counts: #:' part
   uniqs: ? myouts
   kk: > counts
   majoutabove: uniqs[*kk]
   :(bestsplit; bestval; majoutabove)
}

/ compute the entropy of a set
entropy:{[list]
  part: = list
  probs: (#:' part) % #list
  terms: probs * log2[1%probs]
  :+/terms
}

/ Second: if we have the tables of the marginals in the format
/ inlist: `female `female `male `male
/ outlist: `poor `rich `poor `rich 
/ numbers: 14423 1769 22732 9918

infogainnum:{[inlist; outlist; numbers]
  part: = outlist
  numpart: +/'numbers[part]
  x:(entropynum[numpart]) - (condentropynum[inlist;outlist; numbers])
  :x
}
   

/ compute conditional entropy of output on a single input
condentropynum:{[inlist; outlist; numbers]
   part: = inlist
   counts: +/' numbers[part]
   probs: counts % +/numbers
   uniqs: ?inlist
   sum: 0
   i: 0
   while[i < #uniqs
	sum+: (probs[i]) * entropynum[numbers[part[i]]]
	i+: 1
   ]
   :sum
}


/ entropy of a list of numbers, e.g. 1 1 2 4
/ one corresponding to each partition
entropynum:{[numlist]
  tot: +/numlist
  probs: numlist % tot
  terms: probs * log2[1%probs]
  :+/terms
}


/ BASIC ROUTINES
dot:{[x;y] +/x*y}

isnumeric:{[string] &/ string _lin "0123456789.-"}
  

/ parses a field based on vertical bars
getfields:{[line]
  i: line = " "
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  :(0,(j1 - !size)) _ line
}

spit:{[line] (-1) _ ,/ ($line) ,\: (" ")}
spitvert:{[line] (-1) _ ,/ ($line) ,\: ("|")}
spitcomma:{[line] (-1) _ ,/ ($line) ,\: (",")}

/ find difference between list[0] and list[1]
listdiff:{[list]
  :differ[list[0]; list[1]]
}

/ returns one if x is a subset of y
subset:{[x; y]
  i: y ?/: x
  : ~ (#y) _in i
}

/ returns one if x is a subset of y
subset:{[x;y] (#y) > |/ y ?/: x}        

differ:{[x;y]
  x,: ()
  y,: ()
  i: y ?/: x
  j: & i = #y
  :?x[j]
}

/ A faster difference, yielding indexes in x that differ from y
differindexes:{[x;y]
  i: y ?/: x
  j: & i = #y
  :j
}


/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  :x[(?i) _dv #x]
}

/finds intersection of two lists
/ fastest of all
intersect: {[x;y]
  x,: ()
  y,: ()
  if[(#x) < (#y)
    i: x ?/: y
    j: & i < #x
    :x[?i[j]]
  ]
  i: y ?/: x
  j: & i < #y
  :y[?i[j]]
}

/finds intersection of two lists
/ fastest of all
hasintersect: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  : (&/i) < #x
}

/ x is a proper subset of y
propersubset:{[x;y]
  x,: ()
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :(#x) = (#intersect[x;y])
}

/ x is a proper subset of y
propersubset:{[x;y]
  x,: ()
  y,: ()
  if[~ (#x) < (#y); :0] / must be smaller
  :subset[x;y]
}

/finds indexes in x and y that intersect
/ If x and y are both sets, then the results will be of the same length
/ fastest of all
intersectindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :(i[j];j)
}

/finds indexes in x that intersect with y
intersectleftindexes: {[x;y]
  i: x ?/: y / where each y hits
  j: & i < #x / those ys that hit
  :i[j]
}

/finds intersection of two lists
/ and returns index pairs of matches. Assumes no duplicates
/ in either list
intersectbothindexes: {[x;y]
  x,: ()
  y,: ()
  i: x ?/: y
  pairs: (i ,' (!#y))
  k: & pairs[;0] < #x
  :pairs[k]
}

/ finds intersection of two lists that may have duplicates
bagintersectbothindexes:{[x;y]
  alreadyused: out: ()
  i: 0
  while[i < #x
	my: x[i]
	jj: & y ~\: my
	jj: differ[jj; alreadyused]
	if[0 < #jj
		out,: ,(i;*jj)
		alreadyused,: *jj
	]
	i+: 1
  ]
  :out
}

/ intersect many lists
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 jj: ,/ ?:' first (?/:)/: lists[1+ !(size-1)] / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}

/ this is a set intersection so we remove duplicates
multiintersect:{[lists]
 size: #lists
 if[2 > size; :lists]
 first: lists[0],()
 jj:  first ?/: (,/ ?:' lists[1+ !(size-1)]) / find indexes in first
 x: @[(1+#first) # 0; jj; + ; 1]
 x: (-1) _ x / delete missing entry
 kk: & x = size - 1
 :first[kk]
}


crossprod:{[listoflists]
  out: listoflists[0],()
  i: 1
  while[i < #listoflists
	out: ,/ out ,/:\: (listoflists[i],())
	i+: 1
  ]
  :out
}


median:{[x]
  y: x[<x]
  len: #x
  :y[_ (len % 2)]
}

avg:{(+/ x) % # x}

var:{avg[_sqr x] - _sqr avg[x]}
std:{_sqrt var[x]}
cov:{avg[x * y] - avg[x] * avg[y]}
corr:{ (cov[x;y])%((std[x]) * (std[y]))}
/ delay based search
corrdelay:{[delay;x;y] 
  x: (-delay) _ x
  y: delay _ y
  (cov[x;y])%((std[x]) * (std[y]))}


/ END BASICS


/ DUMP TABLE dumptable


/ formstring takes a list and makes a string
formstring:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: (" ")
}

formstringvertbar:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: ("|")
}

formstringcomma:{[list]
  list,: ()
  : (-1) _ ,/ ($list) ,\: (",")
}


/ Output a table (a variable) to a text file outfile (string)
/ e.g.   output[`guide; guide; "foobar"]
dumptable:{[tablename; table; outfile]
  out: ,("# "), ($tablename), ("|"), formstringvertbar[!table]
  first: *!table
  numofelements: . ("#"), ($tablename), ("."), ($first)
  i: 0
  while[i < numofelements
	list: table[;i]
	x: formstring'list
	out,: , ($x[0]), (":	"), (-1) _ ,/(1 _ x) ,\: (" ")
	i+: 1
  ]
  outfile 0: out
}

dumpmatcsv:{[mat; outfile]
	if[`disregard _in mat[0]
	  mat: (-1) _' mat
	]
	othernums: 1 _ !#mat[;0]
	i: 1
	while[i < #mat[;0]
	  mat[i]: (` $ $ i),mat[i]
	  i+: 1
	]
	mat[0]: (`num), mat[0]
  out: , formstringcomma[mat[0]]
  / first: *!table
  numofelements: #mat[;0]
  i: 1
  while[i < numofelements
	list: mat[i]
	x: formstring'list
	out,: , (-1) _ ,/x ,\: (",")
	i+: 1
  ]
  outfile 0: out
}


permdetail:{[list]
  if[1 = #list; :list[0]]
  out: ()
  i: 0
  while[i < #list
	out,: ,list[i],/: permall[list _di i]
	i+: 1
  ]
  :out
}

/ get all permutations of a list
permall:{[list]
  x: permdetail[list]
  :,/x
}

perm:{[list] :list[(#list) _draw -#list]}
permindex:{[list] (#list) _draw -#list}

/ APPLICATION SPECIFIC
/ converts 1 to 2, ? to 1, 0 to 0
/ So 2 is definite present, ? is uncertain, 0 is absent
convertquest:{[c]
  if[c = "1"
	:"2"
  ]
  if[c = "?"
	:"1"
  ]
  :"0"
}

/ parses genes
parsegene:{[line]
  if[(line[0]) _in "?10"
	curvec,: 0 $' convertquest'line[!line ? " "]
  ]
  if[~ (line[0]) _in "?10"
	if[0 < #curvec
		genevals,: ,curvec
	]
	curvec:: ()
  	genespecies,: ` $ line[!line ? "	"]
  	myline: (1 + (line ? " ")) _ line
  	ii: & ~ myline = " "
  	myline@: ii
	curvec,: 0 $' convertquest'myline
  ]
}
  

/ parse 17DWS.ss into name and vector
parsetrait:{[line]
  name: ` $ line[!line ? " "]
  myline: (1 + (line ? " ")) _ line
  ii: & ~ myline = " "
  myline@: ii
  :(name; myline)
}

/ create a file for weka analysis
/ ??? Assumes binary
assembleforweka:{[classname; outlist; innames; inlists]
  out: ,spitcomma innames,classname
  x: inlists
  outlist: converttosymb'outlist
  out,: spitcomma'(+(x)),'outlist
  filename: ($classname),(".csv")
  filename 0: out
}

converttosymb:{[el] 
  if[el = 0; :`absent]
  if[el = 1; :`present]
}


/ FILE INPUT
/ Table schemas are written:
/ with a leading number sign, so # R| A |B|  Car |Salary
/ for a table R(A,B,Car,Salary).
/ The values are written without a number sign
/ Interior blanks are significant, but leading and trailing blanks in
/ a field are not.
/ Null values should be represented by having only blanks in a field.
/ Tables are separated by a single blank line


/ special assuming we have mostly just numbers
inputfromfile:{[filename]
  a: 0: filename
  globalvals:: ()
  oldglobalvals:: () / should not be necessary
  justempty:: 1 / as if we've just seen an empty line. should process table
  a,: ,,""
  processline'a
}

/ special assuming we have mostly just numbers
inputfromfilereg:{[filename]
  / a: ,("#"),(filename),(",source,type,dest")
  a: ,("#tmpedge,source,type,dest")
  a,: 0: filename
  globalvals:: ()
  oldglobalvals:: () / should not be necessary
  justempty:: 1 / as if we've just seen an empty line. should process table
  a,: ,,""
  processline'a
  (filename,("bin")) 1: tmpedge
}

/ a list is numeric if each member is either empty, a period
/ or a digit
isnumeric:{[list]
  :0
  s: ,/list
  nums: "-1234567890."
  ii: & s = "."
  if[1 < #ii; :0]
  x: nums ?/: s
  :(#nums) > |/x / if greater then everything in s met its match
}

/ parses a field based on vertical bars
getfields:{[line]
  i: line = ","
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  x:(0,(j1 - !size)) _ line
  :x
}

/ parses a field based on vertical bars
getfieldssemi:{[line]
  line: $line
  i: line = ";"
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  x:(0,(j1 - !size)) _ line
  :` $' x
}

/ get rid of blanks at either end of the string
delendblanks:{[string]
  if[0 = #string; :""]
  if[string ~ ,"" ; :""]
  i: & ~ string = " "
  if[(#string) = (#i); :string]
  if[0 = (#i); :""]
  string: (- ((#string)  - (1 + *|i))) _ string
  :(*i) _ string
}
  
/ convert all characters to lower case
lower:{[let]
 if[0 ~ #let
        :let]
 v: _ic let
 if[(64 < v) & (v < 91)
	v+: (97-65)
	:_ci v
 ]
 :let
}
  

/ convert all characters to upper case
upper:{[let]
 if[0 ~ #let
        :let]
 v: _ic let
 if[(96 < v) & (v < 123)
	v-: (97-65)
	:_ci v
 ]
 :let
}
  

/ Handles one line of input at a time according to table schema above.
/ A little more space efficient than the previous version.
processline:{[line]
 emptyorblank:{[line] (0 = #delendblanks[line])}
 emptyflag: emptyorblank[line]
 if[~emptyflag
  justempty:: 0
  if[line[0] = "#"
    newline: delendblanks'getfields[1 _ line]
    if[(1 < #newline) 
	globaltable:: *newline
	globalatt:: 1 _ newline
	globalvals:: () / initialize
    ] / newline
  ] / table declaration
  if[~ line[0] = "#"
	newline: delendblanks'getfields[line]
	if[(1 < #newline) | (0 < #newline[0]) 
		globalvals,: ,newline
	]
  ] / end of test on data line
 ] / end of test on line is non-empty
 if[emptyflag & (0 = justempty)
	justempty:: 1
	/ if[ ((#globalatt) > 1) | (~ (#globalatt) = (^globalvals)[0])
		/ globalvals:: + globalvals / take transpose
	/ ]
	if[~ (#globalatt) = (#globalvals[0]); !-1]
	numericflag:()
	hh: 0
	while[hh < #globalatt
		numericflag,: isnumeric[globalvals[;hh]] 
		hh+: 1
	]
	inindex:: 0
	while[inindex < #globalatt
	  / table.att[2] :: ` $ vals[3]
		string1: globaltable, (".")
		string1,: globalatt[inindex]
		teststring: ("0 = #"), string1
		/ For big sizes, uncomment following two statements
		/ User will get an error message, but it will work.
		/ x: @[.: ; teststring; :]
		/ currentlyempty: *x / error means empty
		/ For big sizes, comment following
		currentlyempty: 1
		/ if empty, then assign, else append
		string2: :[currentlyempty; (":: "); (",: ")] 
		string3: :[0 = numericflag[inindex]
			"` $ "
			:[1 = numericflag[inindex]; "0.0 $ "; "0 $ "]]
		string3,: "globalvals[;inindex]"
		. string1, string2, string3
		if[1 = numericflag[inindex]
			globalindexes:: & 0N = . string1
			string3: string1, ("[globalindexes] :: `")
			. string3
		]
		inindex+: 1
	]
 ]
}

converttonum:{[part; list] 
  ii: & list = `NA
  list[ii]: `"-1"
  x: 0.0 $ $ list
  /  :x // !changed; uncommenting this would get all data
  y: median'x[part]
  :y
}

/ subtract out native value and then compute standard deviation
/ and z score.
converttozscore:{[indnative; list]
  x: list[indnative]
  /  x: avg list / changed!!!
  mylist: list - x
  if[(0 = |/mylist) & (0 = &/mylist); :mylist] / all zscores are 0
  s: std[mylist]
  if[s = 0; s+: 0.0001] / to avoid undefined
  :mylist % s
}

/ convert an amino acid to its properties
converttoprop:{[amino]
  if[amino _in `GLY `ALA; amino: `SmallHydrophobic]
  if[amino _in `VAL `LEU `ILE `MET `PRO `PHE `TRP; amino: `LargeHydrophobic]
  if[amino _in `SER `THR `ASN `GLN `CYS `TYR ; amino: `Polar]
  if[amino _in `LYS `ARG `HIS `ASP `GLU ; amino: `Charged]
  :amino
}

/ convert an amino acid to its properties
/ using a different grouping by Glenn
converttopropglenn:{[amino]
  if[amino _in `GLY `PRO; amino: `Glenn1]
  if[amino _in `VAL `ALA `ILE `LEU; amino: `Glenn2]
  if[amino _in `MET `PHE `TRP `TYR ; amino: `Glenn3]
  if[amino _in `LYS `ARG `HIS ; amino: `Glenn4]
  if[amino _in `GLN `ASN ; amino: `Glenn5]
  if[amino _in `ASP `GLU ; amino: `Glenn6]
  if[amino _in `SER `THR `CYS ; amino: `Glenn7]
  :amino
}


/ change to amino acid
/ if native amino acid then return 0
/ else if same class then return 1
/ else return 2
aminochange:{[wtamino; otheramino]
  if[(otheramino = wtamino) | (otheramino = `""); :`"0"]
  wtprop: converttoprop[wtamino]
  otherprop: converttoprop[otheramino]
  if[wtprop = otherprop; :`"1"]
  :`"2"
}

/ uses Glenn's groupings
aminochangeglenn:{[wtamino; otheramino]
  if[(otheramino = wtamino) | (otheramino = `""); :`"0"]
  wtprop: converttopropglenn[wtamino]
  otherprop: converttopropglenn[otheramino]
  if[wtprop = otherprop; :`"1"]
  :`"2"
}

/ take care of one csv file corresponding to a position on the protein
/ Notice that this changes one of the inlists to reflect whether
/ an amino acid has changed class or not
processposition:{[indnative; innames; inlists; aminopart; wtamino; aminos ]
  dontconvert: ()  / don't always want to convert to z-score
  / replace description by type of amino acid transformation
  i: innames ? `description / put aminotransform into description field
  innames[i]: `aminotransform / ` means apply to each
  inlists[i]: aminochange[wtamino]'aminos
  dontconvert,: i / don't convert these to z scores
  / replace aminotransformglenn by glenn values
  i: innames ? `aminotransformglenn
  inlists[i]: aminochangeglenn[wtamino]'aminos / comment this out to ignore Glenn class
  dontconvert,: i / don't convert these to z scores
  / change to eliminate filename
  i: innames ? `filename
  if[i < #innames
	inlists[i]: (#inlists[i]) # `"0"	
  ]
  / end of change to elminate filename field
  nums: #:' aminopart / number of rows for each amino acid
  inlists: converttonum[aminopart]'inlists / aminopart partitions per 
	/ candidate replacing amino acid
  / ?? change to eliminate ara
  i: innames ? `ara
  if[i < #innames
  	inlists[i]*: 0 / change zero it out so arabinose doesn't matter
/  	inlists[i]: _:' (nums % 50) / number of times arabinose appears
/ 	dontconvert,: i / don't convert ara to z-scores
  ]
  / dontconvert hack for ACC stuff
  i: innames ? `ACCF
/  dontconvert,: i
  i: innames ? `ACCFSA
/  dontconvert,: i
  i: innames ? `ACCP
/  dontconvert,: i
  i: innames ? `ACCPSA
/  dontconvert,: i
  i: innames ? `ACCPR
/  dontconvert,: i
  i: innames ? `ACCPRSA
/  dontconvert,: i
  / ?? end of change to eliminate ara
  / remove low res terms
  is: innames ? `env
  ie: innames ? `cb
  if[(is < (#innames)) & (ie < (#innames))
    j: is
    while[j < ie+1
/      inlists[j]*: 0
      j+: 1
    ]
  ]
  / ?? end of change to remove low res terms
  / remove unknown terms
  i: innames ? `maxrms
  i,: innames ? `srms
  i,: innames ? `mxn
  j: 0
  while[j < (#i)
    if[i[j] < #innames
/      inlists[i[j]]*: 0
    ]
    j+: 1
  ]
  / ?? end of change to remove unknown terms
  i: (#inlists)-1 / last one is the target variable
  x: _:' inlists[i]
  ii0: & x = 0
  ii1: & x = 1
  ii2: & x = 2
  inlists[i;ii0]: `minus
  inlists[i;ii1]: `plus
  inlists[i;ii2]: `ts
  j: 0
  while[j < ((#inlists) - 1)
	if[~ j _in dontconvert
	  / if[inlists[j;0] = 22; !-12]
	  inlists[j]: converttozscore[indnative; inlists[j]]
	]
	j+: 1
  ]
  outinlists,: +inlists
  out: spitcomma'+inlists
  :out
}

/ finds indexes of rows corresponding to each residue mutation.
/ nat is row 0.
preprocessOLD:{[filename]
  indexes: ()
  counts: ()
  out: ()
  currentlabel: "nolabel"
  a: 0: filename
  i: 2 / start at first non-natural row
  startresidueindex: ()
  while[i < #a
	firstcomma: a[i] ? ","
	firstlab: a[i;!firstcomma]
	if[ ~ firstlab ~ currentlabel 
		currentlabel: firstlab
		startresidueindex,: i
	]
	i+: 1
  ]
  counts:  -':(startresidueindex,#a) / differences
  starts: 1, 1 + +\counts / start at row 1 (0 is for NAT) and then prefix sum
  i: 0
  while[i < #counts
	indexes,: , 0,(starts[i] + !(counts[i]))
		/ 0 is for NAT
	i+: 1
  ]
  :indexes
}


/ finds indexes of rows corresponding to each residue mutation.
/ nat is the first row of each new position. 
preprocess:{[filename]
  indexes: ()
  counts: ()
  out: ()
  currentlabel: "nolabel"
  a: 0: filename
  i: 1 / start at first non-natural row; May 6, 2009. First row is the change
  startresidueindex: ()
  while[i < #a
	firstcomma: a[i] ? ","
	firstlab: a[i;!firstcomma]
	if[ ~ firstlab ~ currentlabel 
		currentlabel: firstlab
		startresidueindex,: i
	]
	i+: 1
  ]
  counts:  -':(startresidueindex,#a) / differences
  starts: 0, + +\counts / start at row 0 and then prefix sum, May 6, 2009
  i: 0
  while[i < #counts
	/ indexes,: , 0,(starts[i] + !(counts[i]))
	indexes,: , (starts[i] + !(counts[i]))
		/ May 6, 2009: new idea is that each time label changes,
		/ first row is a NAT 
	i+: 1
  ]
  :indexes
}


/ !-1
/ :1

/ EXECUTION

outinlists: ()
allout: ()
allnums: ()
filename: _i[0]
indexes: preprocess[filename]
inputfromfile[filename]
/ This puts everything into the table sub
/ Now we take chunks of sub based on residue.
innames: 3 _ !sub
innames: `aminotransformglenn , innames / adds new col name at beginning
my: innames
i: my ? `description / replacing description field (again?)
my[i]: `aminotransform
allout,: ,spitcomma my
i: 0
while[i < #indexes
 mysub: sub[;indexes[i]]
 aminos: mysub[;][1]
 aminopart: = aminos
 x: *aminopart[0]
 aminopart[0]: 1 _ aminopart[0]
 aminopart: (,,x),aminopart
 types: mysub[;][2]
 indnative: types ? `NAT
 inlists: 3 _ mysub[;]
 inlists: (,(#inlists[0]) # `"0"), inlists
 indwt: types ? `WT
 wtamino: aminos[indwt]
 allout,:processposition[indnative; innames; inlists; aminopart; wtamino;aminos]
 allnums,: #allout
 i+: 1
]

"allnums" 1: allnums
` 0: allout

\\