#!/usr/bin/env python

import re
from urllib.parse import quote, unquote
from html.entities import codepoint2name


# english, cyrillic, greek, japanese, chinese
ALPHABET = re.compile(r'[a-zA-Z0-9\u0390-\u03cf\u0400-\u045f\u3040-\u30ff\u4e00-\u9fff\uff10-\uff19\uff20-\uff9f]')
def get_alphabets(s):
  """Return a list of every character in s matched by ALPHABET.

  Characters are returned one per list item, in the order they occur;
  an empty list is returned when nothing matches."""
  # The original body referenced an undefined name instead of the
  # parameter, so every call raised NameError.
  return ALPHABET.findall(s)

SPACE = re.compile(r'^\s+$')
def is_space(s):
  """Test whether s consists solely of one or more whitespace characters.

  Returns the regex match object (truthy) on success and None otherwise,
  so an empty string yields None."""
  whitespace_only = SPACE.match(s)
  return whitespace_only

REMOVE_SPACES = re.compile(r'\s+')
def remove_spaces(s):
  """Collapse each run of whitespace in s into a single space character.

  Leading and trailing runs are collapsed too, not removed."""
  collapsed = REMOVE_SPACES.sub(' ', s)
  return collapsed

def concat(r):
  """Join the strings in r into one string and normalize its whitespace.

  The joined text is stripped at both ends first, then passed through
  remove_spaces()."""
  joined = ''.join(r)
  trimmed = joined.strip()
  return remove_spaces(trimmed)


# html tags

# Each table below maps tag name -> None; only key membership is meaningful.

BLOCK = dict.fromkeys((
  "head title body p h1 h2 h3 h4 h5 h6 tr td th dt dd li "
  "ul ol dir menu pre dl div center frameset "
  "blockquote table fieldset address"
).split())

CDATA = dict.fromkeys('script style'.split())

# "button" is listed twice in the string; dict.fromkeys keeps a single key.
INLINE = dict.fromkeys((
  "! tt i b u s strike big small nobr em strong dfn code samp kbd var cite abbr "
  "acronym a applet object button font map q sub sup span bdo layer ilayer iframe "
  "select textarea label button"
).split())

INLINE_IMMED = dict.fromkeys((
  "basefont br area link img param hr input option "
  "colgroup col frame isindex meta base embed"
).split())

NON_NESTED = dict.fromkeys("form noembed noframes noscript nolayer".split())

# VALID_TAGS is the union of all tables above (tag name -> 1).
# The assert fails at import time if a tag is listed in more than one table.
VALID_TAGS = {}
for d in (BLOCK, CDATA, INLINE, INLINE_IMMED, NON_NESTED):
  for t in d:
    assert t not in VALID_TAGS, t
    VALID_TAGS[t] = 1
    

# mappings from html charsets to python codecs.
CODECS = {
  # plain ASCII / Latin-1
  "ascii": "us-ascii",
  "us-ascii": "us-ascii",
  "iso-8859-1": "iso-8859-1",
  # spellings that map to iso-2022-jp
  "jis": "iso-2022-jp",
  "iso-2022-jp": "iso-2022-jp",
  # spellings that map to euc-jp
  "euc": "euc-jp",
  "euc-jp": "euc-jp",
  "euc_jp": "euc-jp",
  "x-euc-jp": "euc-jp",
  # spellings that map to ms932
  "sjis": "ms932",
  "x-sjis": "ms932",
  "shift_jis": "ms932",
  "shift-jis": "ms932",
  # spellings that map to utf-8
  "utf": "utf-8",
  "utf8": "utf-8",
  "utf-8": "utf-8",
}

# getcodec
def getcodec(charset, default='iso-8859-1'):
  """Translate an HTML charset name into the codec name listed in CODECS.

  The lookup is case-insensitive. When charset is not a known key the
  codec registered for default is returned instead; default is only
  looked up in that case, and a KeyError is raised if it is unknown too."""
  key = charset.lower()
  if key in CODECS:
    return CODECS[key]
  return CODECS[default]

# attrquote(s)
QUOTEREFS = { '&':'&amp;', '<':'&lt;', '>':'&gt;', '"':'&quot;' }
def attrquote(s):
  """Escape the attribute string s using HTML entity references.

  Characters with a named entity in codepoint2name become '&name;';
  any other character outside the 32..126 code point range becomes a
  decimal reference '&#N;'. The remaining characters are kept as they are."""
  def escape(ch):
    code = ord(ch)
    name = codepoint2name.get(code)
    if name is not None:
      return '&%s;' % name
    if 32 <= code <= 126:
      return ch
    return '&#%d;' % code
  return ''.join(map(escape, s))

# htmlquote(s, codec)
def htmlquote(s, codec="ascii", quote_special=True):
  """Convert characters of s that need escaping into HTML entity references.

  s             -- the text to quote.
  codec         -- name of the target codec; a character that this codec
                   cannot encode is written as a decimal reference '&#N;'.
  quote_special -- when true, every character that has a named entity in
                   codepoint2name is written as '&name;' (this check comes
                   first, so it also applies to characters the codec could
                   encode).

  Control characters (code point below 32) without a named entity are
  passed through unchanged. Returns a str. An unknown codec name raises
  LookupError from the encode attempt."""
  def conv1(ch):
    c = ord(ch)
    if quote_special and c in codepoint2name:
      return '&%s;' % codepoint2name[c]
    if c < 32:
      return ch
    try:
      # Encode only to probe whether the codec can represent the character.
      # The bytes result must not be returned: mixing bytes with str makes
      # the join below raise TypeError.
      ch.encode(codec)
    except UnicodeError:
      return '&#%d;' % c
    return ch
  return ''.join(conv1(ch) for ch in s)

# urldecode(s)
ARGPAT = re.compile(r'([^=]+)=(.*)')
def urldecode(s):
  """Parse the query string s into a dict of percent-decoded name/value pairs.

  s is split on '&'; a piece that does not have the form name=value with
  a non-empty name is skipped. Repeated names are not collected: when a
  name occurs more than once, the last occurrence wins."""
  matches = (ARGPAT.match(piece) for piece in s.split("&"))
  return {
    unquote(found.group(1)): unquote(found.group(2))
    for found in matches
    if found
  }

# attr2str
def attr2str(attrs):
  """Render the mapping attrs as a string of HTML attributes.

  Every attribute is preceded by a single space. An attribute whose value
  is None is written as a bare name; otherwise it is written as
  name="value". Names and values are escaped with attrquote().
  Returns an empty string for an empty mapping."""
  r = []
  for (k, v) in attrs.items():
    # Identity test: an equality test against None can be fooled by a
    # value type that overrides __eq__.
    if v is None:
      r.append(' ' + attrquote(k))
    else:
      r.append(' %s="%s"' % (attrquote(k), attrquote(v)))
  return ''.join(r)
