#!/usr/bin/env python

import sys, os, re, htmlutils
from htmlutils import getcodec, remove_spaces
from htmlparser3 import HTMLParser3, HTMLHandler
from html2txt import HTMLTextHandler
from tokenizer3 import tokenize


# Leading run of characters allowed inside a name token: ASCII letters,
# digits, hyphen, period and apostrophe.
PAT = re.compile(r"^[-a-zA-Z0-9.']+")
def stem(s):
  """Return the leading part of `s` made of name characters.

  Anything from the first character outside PAT's class onward is dropped
  (e.g. "Smith," -> "Smith").  If `s` does not start with such a character
  at all (empty string, or a leading non-ASCII letter), `s` is returned
  unchanged instead of failing on a missing match object.
  """
  m = PAT.match(s)
  if m is None:
    # Nothing to strip: hand the token back as-is.
    return s
  return m.group(0)


##  NameHandler
##
class NameHandler:
  """Output sink that prints runs of capitalized words, one run per line.

  Text passed to write() is split with tokenize(); consecutive tokens that
  start with an uppercase letter are collected in `self.name` and printed
  to standard output, joined by single spaces, when the run ends.

  Attributes:
    freqwords: dict whose keys are the words loaded from the word-list
      file (values are always 1).  A run may not *start* with a token
      whose lowercase form is one of these keys.
    name: list of the stemmed tokens of the run currently being collected.
  """

  def __init__(self, fname):
    """Load the frequent-word list from the file `fname`.

    Only the first space-separated field of each stripped line is used as
    the word; the rest of the line is ignored.
    """
    self.freqwords = {}
    fp = open(fname)
    try:
      for line in fp:
        w = line.strip().split(' ')[0]
        self.freqwords[w] = 1
    finally:
      # Release the handle even if reading fails part-way through.
      fp.close()
    self.name = []
    return

  def flush(self):
    """Print the pending run (if any) on one line and start a new one."""
    if self.name:
      # Single parenthesized argument: same output whether `print` is the
      # statement or the function.
      print(' '.join(self.name))
      self.name = []
    return

  def close(self):
    """Do nothing; a pending run is NOT flushed here."""
    return

  def write(self, s):
    """Feed the text `s` and extend or terminate the current run.

    For each token produced by tokenize(s):
      * an empty token, or one not starting with an uppercase letter,
        ends the current run;
      * otherwise the token is stemmed and appended if a run is already
        in progress or the lowercased token is not a frequent word.  A
        capitalized frequent word seen while no run is in progress is
        skipped without any other effect.
    """
    for w in tokenize(s):
      if (not w) or (not w[0].isalpha()) or (not w[0].isupper()):
        self.flush()
      elif self.name or (w.lower() not in self.freqwords):
        w1 = stem(w)
        self.name.append(w1)
        if w1 != w:
          # stem() dropped something from the token (e.g. trailing
          # punctuation), so the run ends with this word.
          self.flush()
    return


# main
# Command-line entry point: parse each HTML input (file, URL, or stdin when
# the argument is '-' or no argument is given) and print the capitalized
# word runs found in its text via NameHandler.
if __name__ == "__main__":
  import getopt, urllib
  def usage():
    # Report the name this script was actually invoked as, and list every
    # option accepted by the getopt spec below.
    print('usage: %s [-v] [-c charset_in] [-C charset_out] files ...' % sys.argv[0])
    sys.exit(2)
  try:
    (opts, args) = getopt.getopt(sys.argv[1:], 'vc:C:')
  except getopt.GetoptError:
    usage()
  # NOTE(review): `verbose` is parsed but not used anywhere in this block.
  (verbose, charset_in, charset_out) = (False, 'iso-8859-1', 'iso-8859-1')
  for (k,v) in opts:
    if k == '-v': verbose = True
    elif k == '-c': charset_in = v
    elif k == '-C': charset_out = v
  if not args: args = ['-']
  for url in args:
    if url == '-':
      fp = sys.stdin
    elif url.startswith('http:') or url.startswith('https:') or url.startswith('ftp:'):
      fp = urllib.urlopen(url)
    else:
      fp = open(url)
    try:
      # The word list is read from a file named "freqwords" in the current
      # working directory; a fresh handler is built for every input.
      p = HTMLParser3(HTMLTextHandler(getcodec(charset_out), out=NameHandler("freqwords")), charset=charset_in)
      p.feedfile(fp)
      p.close()
    finally:
      # Always release files/URLs we opened, but leave the process-wide
      # standard input alone.
      if fp is not sys.stdin:
        fp.close()
