/ Here is how we process the cog file
/ First we take care of abbreviations.
/ Notes, each gene appears in exactly one cog.
/ There are sometimes duplicate genes within a genome that are all in the 
/ same cog.
/ Those are called repeats.
/ Algorithm: Find repeats and create a single gene with =
/ between them but record the individual gene in the repeats variable.
/ When we get a list of all genes, subtract the genes we find (including
/ those linked with underbars and add in those from repeats):
/ i.e. differ[allgenes; (,/genes),(,/repeats)]
/ Those genes that are singletons and are associated with
/ one genome should be linked with # signs.
/ They are an equivalence class.
/ All traits that are equivalence classes are also linked with =.

/ Needed inputs: cogtxt for cogs (have this), 
/ all genes Now we have this as genenames.

/ outputs:
/ NO LONGER: singlegenes, singlegenomes (for elements not in cogs)
/ We use = for multiple genes within a single genome in a single cog.
/ We use # for multiple genes within a single genome that are all singleton.
/ We use # for multiple genes within a single genome that all belong
/  to cogs having the same genomes. 
/  For example suppose two cogs have genomes X, Y, and Z.
/  cog1: has gene x1 in X, gene y1 in Y, and gene z1 in Z.
/  cog2: has gene x2 in X, gene y2 in Y, and gene z2 in Z.
/  Then there is no way for us to distinguish the effect x1 from x2,
/  since any trait that correlates with cog1 also correlates with cog2.
/  Singletons are just a special case of this.
/ nomes.number and nomes.name translates between genome numbers and names
/ enes.number and enes.name translates between gene numbers and gene names
/ genecomps takes each gene connected component and translates it to gene
/   numbers with no two gene numbers in a component appearing twice
/ genomecomps takes each genome connected component and translates it to
/   genome numbers.
/ 

differ:{[x;y]
  x,: ()
  y,: ()
  i: y ?/: x
  j: & i = #y
  :?x[j]
}

if[0 / done in the bacteria file
ab.symb: ()
ab.long: ()

processabbrev:{[line]
  i: & ~ line = " "
  first: *i
  i: *1 _ i
  ab.symb,: ` $ line[first]
  ab.long,: ` $ line[i+!((#line) - i)]
}

b: ("Afu Archaeoglobus fulgidus"
"Hbs Halobacterium sp. NRC-1"
"Mac Methanosarcina acetivorans"
"Mth Methanothermobacter thermautotrophicus"
"Mja Methanococcus jannaschii"
"Mka Methanopyrus kandleri AV19"
"Tac Thermoplasma acidophilum"
"Tvo Thermoplasma volcanium"
"Pho Pyrococcus horikoshii"
"Pab Pyrococcus abyssi"
"Pya Pyrobaculum aerophilum"
"Sso Sulfolobus solfataricus"
"Ape Aeropyrum pernix"
"Sce Saccharomyces cerevisiae"
"Spo Schizosaccharomyces pombe"
"Ecu Encephalitozoon cuniculi"
"Aae Aquifex aeolicus"
"Tma Thermotoga maritima"
"Ctr Chlamydia trachomatis"
"Cpn Chlamydophila pneumoniae CWL029"
"Tpa Treponema pallidum"
"Bbu Borrelia burgdorferi"
"Syn Synechocystis"
"Nos Nostoc sp. PCC 7120"
"Fnu Fusobacterium nucleatum"
"Dra Deinococcus radiodurans"
"Cgl Corynebacterium glutamicum"
"Mtu Mycobacterium tuberculosis H37Rv"
"MtC Mycobacterium tuberculosis CDC1551"
"Mle Mycobacterium leprae"
"Cac Clostridium acetobutylicum"
"Lla Lactococcus lactis"
"Spy Streptococcus pyogenes M1 GAS"
"Spn Streptococcus pneumoniae TIGR4"
"Sau Staphylococcus aureus N315"
"Lin Listeria innocua"
"Bsu Bacillus subtilis"
"Bha Bacillus halodurans"
"Uur Ureaplasma urealyticum"
"Mpu Mycoplasma pulmonis"
"Mpn Mycoplasma pneumoniae"
"Mge Mycoplasma genitalium"
"Eco Escherichia coli K12"
"EcZ Escherichia coli O157:H7 EDL933"
"Ecs Escherichia coli O157:H7"
"Ype Yersinia pestis"
"Sty Salmonella typhimurium LT2"
"Buc Buchnera sp. APS"
"Vch Vibrio cholerae"
"Pae Pseudomonas aeruginosa"
"Hin Haemophilus influenzae"
"Pmu Pasteurella multocida"
"Xfa Xylella fastidiosa 9a5c"
"Nme Neisseria meningitidis MC58"
"NmA Neisseria meningitidis Z2491"
"Rso Ralstonia solanacearum"
"Hpy Helicobacter pylori 26695"
"jHp Helicobacter pylori J99"
"Cje Campylobacter jejuni"
"Atu Agrobacterium tumefaciens strain C58 (Cereon)"
"Sme Sinorhizobium meliloti"
"Bme Brucella melitensis"
"Mlo Mesorhizobium loti"
"Ccr Caulobacter crescentus CB15"
"Rpr Rickettsia prowazekii"
"Rco Rickettsia conorii")
processabbrev'b
] / end of if 0


cog: ()
genomes: ()
genes: ()
currentgenomes: ()
currentgenes: ()
linenum: 0

/ parses a field based on spaces 
getfields:{[line]
  i: line = " "
  j1: &i
  j2: &~i
  line @:j2
  size: #j1
  x:(0,(j1 - !size)) _ line
  counts: #:'x
  :x[& 0 < counts]
}

/ findgenes takes a line with blanks and finds all the genes
/ this line has gene names only
findgenes:{[line]
  :` $ getfields[line]
}


/ process a line of cogtxt but keep only those symbols that are
/ in Mitch's data.
/ produce genomes and genes for cogs
processline:{[allowedgenomes; line]
  linenum+: 1
  if[0 = #line; :()]
  if[line[0] = "_"
	jj: < currentgenomes
	genomes,: ,currentgenomes[jj] / order by genome.
	genes,: ,currentgenes[jj]
	currentgenomes:: ()
	currentgenes:: ()
	:()
  ]
  if[ line[0] = "["
	yy: & line = " "
	top: 3 & ((#yy)-1)
	/ cog,: ` $ line[!yy[top]] / short version of cogs
	cog,: ` $ line / long version of cogs
	:()
  ]
  if[line[0] = " "
      i: & ~ line = " "
      first: *i
      if[first = 2
        i: line ? ":"
        x: ` $ line[first + !(i-first)] / genome
        if[x _in allowedgenomes
	  y: findgenes[(i+1) _ line]
	  / old way: represent each of these genes from the same genome
	  / as individuals. No good because these are an equivalence class.
	  / currentgenomes,: (#y) # x
	  / currentgenes,: y
	  if[1 = #y
		currentgenomes,: x
		currentgenes,: y
	  ]
	  if[1 < #y 
		repeats,: ,y
		currentgenomes,: x
		currentgenes,: ` $ (-1) _ ,/ ($y) ,\: ("=")
	  ]
      ]
     ] / first = 2
     if[first > 4 / have gene only on a line
	  y: findgenes[line]
	  ww: currentgenes[(#currentgenes)-1]
	  repeats[(#repeats)-1],: y
	  currentgenes[(#currentgenes)-1]:` $ (-1) _ ,/ ($ww,y) ,\: ("=") 
     ]
     :()
  ]
  !-11
}

/ take all cogs that range over the same genomes and put the genes together
/ in each genome.
/ Genes thus put together are linked by #.
findequivcogs:{[]
  part: = genomes
  foo:{[list] ` $ (-1) _ ,/ ($list) ,\: ("#")}
  joinup:{[genes; linkedindexes]
	if[1 = #linkedindexes; :genes[* linkedindexes]]
	x: + genes[linkedindexes]
	:foo'x
  }
  x: joinup[genes]'part
  genomes?:
  genes:: x
  foo2:{[list] ` $ ("{"), ((-1) _ ,/ ($list) ,\: ("#")), ("}")}
  cog:: foo2'cog[part]
}

nomes.number:()
nomes.name: ()
enes.number: ()
enes.name: ()

/ number the genomes
/ nomes.number and nomes.name translates between genome numbers and names
/ enes.number and enes.name translates between gene numbers and gene names
/ genecomps takes each gene connected component and translates it to gene
/   numbers with no two gene numbers in a component appearing twice
/ genomecomps takes each genome connected component and translates it to
/   genome numbers.
numbereverything:{[]
  x: ,/genomes
  y: ,/genes
  / i: < x
  / x@: i / genomes
  / y@: i
  z: ?x
  z@: < z
  nomes.number:: !#z / it doesn't matter which order
  nomes.name:: z
  y@: <y
  enes.number:: !#y
  enes.name:: y
  findcomponents:{[genelist] enes.number[enes.name _bin/: genelist]}
  genecomps:: findcomponents'genes
  findnomes:{[nomelist] nomes.number[nomes.name _bin/: nomelist]}
  genomecomps:: findnomes'genomes
}

/ find all the genes that are not in any cog.
/ Those are called singletons (because they would make singleton cogs).
/ Those are tied to a single genome, so there is no way we can distinguish
/ one of these from any other, so we make an equivalence class of
/ all singleton genes associated with a single genome.
findsingletons:{[allgenes; allgenomes]
  if[0 = #allgenes; :(();())]
  allgenesincogs: (,/genes) ,(,/repeats)
  singletons: differ[allgenes; allgenesincogs]
  i: allgenes ?/: singletons
  singletongenomes: allgenomes[i]
  part: = singletongenomes
  equivclass: singletons[part]
  addpound:{[list] :` $ (-1) _ ,/ ($list) ,\: ("#")}
  compositesinglegenes: addpound'equivclass
  compositesinglegenomes: ? singletongenomes
  :(compositesinglegenes; compositesinglegenomes)
}


repeats: () / remarkably enough, each repeat is seen only once.
	/ the cogs partition the world
a: 0: "cogtxt"
x: processline[genometab.symb]'a / only take the part of the cogs from
	/ relevant genomes

/ now find singleton genes and put them in an equivalence class
/ April 2000: create singlegenes as Aspecific, Bspecific etc
/ because we never distinguish between genes that are singletons.
/ So, Aspecific stands for all genes in A.
allgenes: ` $ ($genometab.symb) ,\: ("_specific")
allgenomes: genometab.symb
pair: findsingletons[allgenes; allgenomes]
singlegenes: ,:' pair[0] / e.g. `"a#b" `"c#d#ef"
singlegenomes: ,:' pair[1] / e.g. `X `Y
genes,: singlegenes
genomes,: singlegenomes
cog,: ` $ ($genometab.symb) ,\: ("_nocog") 
findequivcogs[] / Two cogs are equivalent if they range over the same
	/ genomes. The reason we consider them to be equivalent is that
	/ either one may be responsible for some traits.
numbereverything[] / gives genecomps which are components in terms of genes
	/ and genomecomps which are components in terms of genomes
	/ each component is one cog, so genecomps[0] are the genes
	/ in cog[0] and genomecomps[0] are the genomes in cog[0]

coggenes: genes