#!/usr/bin/env python
##
##  Copyright (c) 2004  Yusuke Shinyama <yusuke at cs dot nyu dot edu>
##
##  Permission is hereby granted, free of charge, to any person
##  obtaining a copy of this software and associated documentation
##  files (the "Software"), to deal in the Software without
##  restriction, including without limitation the rights to use,
##  copy, modify, merge, publish, distribute, sublicense, and/or
##  sell copies of the Software, and to permit persons to whom the
##  Software is furnished to do so, subject to the following
##  conditions:
##
##  The above copyright notice and this permission notice shall be
##  included in all copies or substantial portions of the Software.
##
##  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
##  KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
##  WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
##  PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
##  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
##  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
##  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
##  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
##

import re

# Lowercased abbreviations whose trailing period does not mark a sentence end.
# Only the keys are meaningful (every value is None); the table is used for
# membership tests.
SPECIAL1 = {abbrev: None for abbrev in "dr. mr. ms. mt. ft. jr. no. op. st.".split(" ")}
# One or more "lowercase letter + period" pairs, e.g. "e.g.".
SPECIAL2 = re.compile(r"([a-z]\.)+")

# simple tokenizer

# Token grammar. Alternatives are tried in the order listed, so the more
# specific forms (quotes, dashes, numbers, words with apostrophes) win over
# the single-character fallback. Group 1 is the whole token; group 2 is a
# by-product of the repeated part of the number pattern, which is why
# findall() returns pairs.
TOKENS = re.compile(r"""
        ( '' | `` |             # quotation mark
          -+ |                  # dash
          [!\?]+ |              # "!" and "?"
          ([0-9]+[,.])+[0-9]+ | # number
          [-a-zA-Z0-9.]+'[a-zA-Z0-9.]+'[a-zA-Z0-9.]+ |  # A'B's
          [-a-zA-Z0-9.]+'[a-zA-Z0-9.]+ |                # A's
          [-a-zA-Z0-9.]+ |                              # A
          \S                    # other letters
        )""", re.VERBOSE)

# A token that ends in an apostrophe followed by lowercase letters
# ("quack's" -> "quack" + "'s"). The greedy first group makes the split
# happen at the last apostrophe.
WORD_APOSTROPHE = re.compile(r"^(.+)('[a-z]+)$")

def tokenize(s):
  """Splits string s into a sequence of primary tokens.

  This is a generator of strings. On top of the raw TOKENS matches it
  applies three rewrites:
    * a token ending in "n't" (and longer than that) becomes stem + "n't";
    * a token ending in an apostrophe plus lowercase letters is split
      before that apostrophe;
    * a lone "'" that follows a token ending in "s" is emitted as "'s".
  """
  # Token carried over from the previous match; it is emitted before the
  # current match is examined.
  pending = ''
  for (word, _) in TOKENS.findall(s):
    if pending:
      yield pending
    if word.endswith("n't"):
      if len(word) < 4:
        # The token is exactly "n't": nothing to split off.
        pending = word
      else:
        yield word[:-3]
        pending = word[-3:]
    else:
      split = WORD_APOSTROPHE.match(word)
      if split:
        (stem, suffix) = split.groups()
        yield stem
        pending = suffix
      elif word == "'" and pending.endswith('s'):
        # Plural possessive ("pros'"): normalize the bare apostrophe.
        yield "'s"
        pending = ''
      else:
        pending = word
  if pending:
    yield pending


# sentence splitter

class SentenceSplitter:
  """Splits a stream of tokens into sentences.

  Tokens are supplied through feed(), possibly over several calls; each
  finished sentence is produced as a list of tokens. Deciding whether a
  token ends a sentence requires looking at the token that follows it, so
  the most recent token is always held back until the next one arrives.
  close() flushes whatever is still held.

  Both feed() and close() are generators: nothing is consumed and no state
  changes until the returned iterator is iterated.
  """

  def __init__(self, capita=True):
    """Creates an empty splitter.

    capita: True if the first words of sentences are capitalized. When
      set, a lowercase word after a period is taken as evidence that the
      sentence continues.
    """
    # Token held back until the following token is seen (None if nothing
    # is held).
    self.w1 = None
    # Tokens of the sentence currently being assembled.
    self.words = []
    self.capita = capita
    return

  def _ends_sentence(self, w1, w2):
    """Returns True if w1 is the last token of a sentence.

    w1 is a non-empty token; w2 is the non-empty token that follows it,
    or None when the input is being flushed.
    """
    if w2 is None:
      return True                     # end of input
    if w1[0] in '!?':
      return True                     # punctuation
    if w1[-1] not in '.:;':
      return False                    # normal word
    if self.capita and w2[0].isalpha() and not w2[0].isupper():
      return False                    # followed by uncapitalized word
    if (w1.lower() in SPECIAL1) or SPECIAL2.match(w1) or (w1[0].isupper() and len(w1) <= 4):
      return False                    # "A.", "Mr.", "Sen."
    if w1[0].isupper() and len(w1) <= 5 and w2[0].isdigit():
      return False                    # short capitalized word before a number
    return True

  def feed(self, tokens):
    """Consumes tokens and yields each sentence completed by them.

    tokens: iterable of token strings. A None element forces the held
      token out as the end of a sentence. Empty strings carry no
      information and are skipped (indexing them would otherwise fail).

    Yields lists of tokens. A final token such as "aaa." is split into
    "aaa" and "."; a closing quote mark directly after the sentence end is
    attached to that sentence.
    """
    for w2 in tokens:
      if w2 == '':
        continue
      w1 = self.w1
      if w1:
        self.words.append(w1)
        if self._ends_sentence(w1, w2):
          if w1[-1] in '.:;' and 1 < len(w1):
            # separate the period: "aaa." -> "aaa", "."
            sentwords = self.words[:-1] + [w1[:-1], w1[-1]]
          else:
            sentwords = self.words
          self.words = []
          # Update the held token before yielding so the splitter is in a
          # consistent state even if the caller stops iterating here.
          if w2 == '"' or w2 == "''":
            # include a quote mark after the period, and do not hold it
            # for the next sentence.
            sentwords.append(w2)
            self.w1 = None
          else:
            self.w1 = w2
          yield sentwords
          continue
      self.w1 = w2
    return

  def close(self):
    """Flushes the held token; yields the final sentence, if there is one."""
    return self.feed([None])

  
# testing

def test():
  """Self-check: tokenizes and splits a sample text, then compares the
  result with the expected sentences. Both lists are printed before the
  comparison; a mismatch raises AssertionError."""
  sample = '''ABC -- Foo bar baz. Blah ``foo, bar, zzz'',
  hee hee 123-goop 4920.29 oof 3,123,231,132 doofs moo
  quack's quock pros' stra ff-ww A. B. Cooo E.G.-foov
  moo-mo-f.f.fufu epp E.E.A-Qun Barr sto "daax," floe
  ``pfe fga emo." Faa Baa. Baar Zaar moi. Foera?
  aerf aerf!? O'Neil's belief.'''
  expected = [
    ['ABC', '--', 'Foo', 'bar', 'baz', '.'],
    ['Blah', '``', 'foo', ',', 'bar', ',', 'zzz', "''", ',',
     'hee', 'hee', '123-goop', '4920.29', 'oof', '3,123,231,132',
     'doofs', 'moo', 'quack', "'s", 'quock', 'pros', "'s", 'stra',
     'ff-ww', 'A.', 'B.', 'Cooo', 'E.G.-foov', 'moo-mo-f.f.fufu',
     'epp', 'E.E.A-Qun', 'Barr', 'sto', '"', 'daax', ',', '"',
     'floe', '``', 'pfe', 'fga', 'emo', '.', '"'],
    ['Faa', 'Baa.', 'Baar', 'Zaar', 'moi', '.'],
    ['Foera', '?'],
    ['aerf', 'aerf', '!?'],
    ["O'Neil", "'s", 'belief', '.'],
  ]
  splitter = SentenceSplitter()
  # Sentences completed while feeding, followed by the one flushed by close().
  actual = list(splitter.feed(tokenize(sample)))
  actual = actual + list(splitter.close())
  print(expected)
  print(actual)
  assert expected == actual
#test()


# main
if __name__ == "__main__":
  # Command-line entry point: reads the named files (or standard input when
  # none are given), and prints one sentence per line with tokens separated
  # by single spaces.
  import fileinput
  import getopt
  import sys

  def usage():
    """Prints the command synopsis and exits with status 2."""
    print("usage: tokenize.py [-u] [file ...]")
    sys.exit(2)

  try:
    (opts, args) = getopt.getopt(sys.argv[1:], "u")
  except getopt.GetoptError:
    usage()
  # -u switches off the "sentences start with a capital letter" assumption.
  capita = not any(flag == "-u" for (flag, _) in opts)

  splitter = SentenceSplitter(capita)
  for line in fileinput.input(args):
    # The splitter keeps its state between calls, so a sentence may span
    # several input lines.
    for words in splitter.feed(tokenize(line.strip())):
      print(' '.join(words))
  # Flush the sentence still pending at end of input.
  for words in splitter.close():
    print(' '.join(words))
