#!/usr/bin/env python

import sys, os, re, htmlutils
from htmlutils import getcodec, remove_spaces
from htmlparser3 import HTMLParser3, HTMLHandler


##  HTMLTextHandler
##
class HTMLTextHandler(HTMLHandler):

  CUTSP = re.compile(ur'([\u3000-\u9fff])\n+([\u3000-\u9fff])')
  IGNORED_TAGS = dict.fromkeys(
    '! script style select'.split(' ')
    )
  NEWLINE_TAGS = dict.fromkeys(
    'p br div td th li blockquote pre form hr h1 h2 h3 h4 h5 h6 address'.split(' ')
    )
  
  def __init__(self, codec, out=sys.stdout,
               ignored_tags=IGNORED_TAGS, newline_tags=NEWLINE_TAGS):
    HTMLHandler.__init__(self)
    self.out = out
    self.codec = codec
    self.ignored_tags = ignored_tags
    self.newline_tags = newline_tags
    self.ignore = 0
    self.text = []
    return
  
  def flush(self, newline=False):
    if self.text:
      s = remove_spaces(self.CUTSP.sub(r'\1\2', ''.join(self.text).strip()))
      if s:
        self.out.write(s.encode(self.codec, 'replace'))
        self.out.write('\n')
        self.out.flush()
        self.text = []
    return
  
  def start_unknown(self, tag, attrs):
    if tag in self.ignored_tags:
      self.ignore += 1
    if tag in self.newline_tags:
      self.flush(True)
    return
  
  def end_unknown(self, tag):
    if tag in self.ignored_tags:
      self.ignore -= 1
    return
  
  def handle_data(self, data):
    if not self.ignore:
      self.text.append(data)
    return
  
  def finish(self):
    self.flush()
    self.out.close()
    return


# main
if __name__ == "__main__":
  import getopt, urllib
  def usage():
    print 'usage: html2txt.py [-c charset_in] [-C charset_out] files ...'
    sys.exit(2)
  try:
    (opts, args) = getopt.getopt(sys.argv[1:], 'vc:C:')
  except getopt.GetoptError:
    usage()
  (verbose, charset_in, charset_out) = (False, 'iso-8859-1', 'iso-8859-1')
  for (k,v) in opts:
    if k == '-v': verbose = True
    elif k == '-c': charset_in = v
    elif k == '-C': charset_out = v
  if not args: args = ['-']
  for url in args:
    if url == '-':
      fp = sys.stdin
    elif url.startswith('http:') or url.startswith('ftp:'):
      fp = urllib.urlopen(url)
    else:
      fp = file(url)
    p = HTMLParser3(HTMLTextHandler(htmlutils.getcodec(charset_out)), charset=charset_in)
    p.feedfile(fp).close()
    fp.close()
