//Title:        JET
//Version:      1.16
//Copyright:    Copyright (c) 2003
//Author:       Ralph Grishman
//Description:  A Java-based Information Extraction Tool

package Jet.Zoner;

import Jet.Tipster.*;
import java.util.*;
import AceJet.Ace;

/**
 *  container for the static method <CODE>split</CODE>, which divides a span
 *  of text into sentences.
 */
 
public class SentenceSplitter {

  /**
   *  abbreviations, in their usual capitalization, whose final period does
   *  not by itself mark the end of a sentence.
   */
  static HashSet<String> abbreviations = new HashSet<String>();

  /**
   *  lower-case forms of the entries of <I>abbreviations</I>;  consulted in
   *  place of <I>abbreviations</I> when <CODE>AceJet.Ace.monocase</CODE> is set.
   */
  static HashSet<String> monocaseAbbreviations = new HashSet<String>();

  /** characters which may precede the first letter of a token (opening quotes / brackets). */
  private static final String OPENING_PUNCTUATION = "`'\"([{<";

  /** the initial contents of <I>abbreviations</I>. */
  private static final String[] KNOWN_ABBREVIATIONS = {
    // titles
    "Adm.", "Capt.", "Cmdr.", "Col.", "Dr.", "Gen.", "Gov.", "Lt.", "Maj.",
    "Messrs.", "Mr.", "Mrs.", "Ms.", "Prof.", "Rep.", "Reps.", "Rev.",
    "Sen.", "Sens.", "Sgt.", "Sr.", "St.",
    // abbreviated first names
    "Alex.", "Benj.", "Chas.",
    // other abbreviations
    "a.k.a.", "c.f.", "i.e.", "vs.", "v."
  };

  static {
    for (int i = 0; i < KNOWN_ABBREVIATIONS.length; i++) {
      String abbreviation = KNOWN_ABBREVIATIONS[i];
      abbreviations.add(abbreviation);
      // fixed locale:  with the default locale, lower-casing would depend on
      // the platform settings (e.g. 'I' -> dotless i under a Turkish locale)
      monocaseAbbreviations.add(abbreviation.toLowerCase(java.util.Locale.ENGLISH));
    }
  }

  /**
   *  splits the text in <I>textSpan</I> into sentences, adding <B>sentence</B>
   *  annotations to the document.   We split after a period if the following
   *  token is capitalized, and the preceding token is not a known
   *  not-sentence-ending abbreviation (such as a title) or a single capital
   *  letter.
   *  <P> Tokens are maximal runs of non-whitespace characters.  Each sentence
   *  annotation extends up to the first character of the next sentence (so it
   *  includes trailing whitespace), the first sentence begins at the start of
   *  <I>textSpan</I>, and the last one ends at the end of <I>textSpan</I>.
   *  If the span is empty or all whitespace, no annotation is added.
   *
   *  @param doc       the document containing the text, to which the
   *                   <B>sentence</B> annotations are added
   *  @param textSpan  the portion of <I>doc</I> to be split
   */

  public static void split (Document doc, Span textSpan) {
    int start = textSpan.start();
    int end = textSpan.end();
    String text = doc.text();
    //  advance to first non-blank
    int posn = skipWhitespace(text, start, end);
    //  if all blank (or empty span), exit:  no sentence annotations
    if (posn >= end) return;
    // number of tokens read so far, including 'nextToken'
    int tokenCount = 0;
    int sentenceStart = start;
    String currentToken = null;
    boolean startOfSentence = true;
    while (posn < end) {
      int nextTokenStart = posn;
      while ((posn < end) && !Character.isWhitespace(text.charAt(posn))) posn++;
      String nextToken = text.substring(nextTokenStart, posn);
      tokenCount++;
      // advance to next non-blank
      posn = skipWhitespace(text, posn, end);
      // 'currentToken' is the token just before 'nextToken';  if it closes a
      // sentence (or a dateline), the new sentence starts at 'nextToken'
      if (isSentenceEnd(currentToken, nextToken, startOfSentence) ||
          isDatelineEnd(currentToken, tokenCount)) {
        doc.annotate("sentence", new Span (sentenceStart, nextTokenStart), null);
        sentenceStart = nextTokenStart;
        startOfSentence = true;
      } else {
        startOfSentence = false;
      }
      currentToken = nextToken;
    }
    // if there is text in the textSegment following the last sentence break,
    // record it as an additional sentence
    if (sentenceStart != end) {
      doc.annotate("sentence", new Span (sentenceStart, end), null);
    }
  }

  /**
   *  returns the position of the first non-whitespace character of
   *  <I>text</I> at or after <I>posn</I>, or <I>end</I> if there is none
   *  before <I>end</I>.
   */

  private static int skipWhitespace (String text, int posn, int end) {
    while ((posn < end) && Character.isWhitespace(text.charAt(posn))) posn++;
    return posn;
  }

  /**
   *  returns true if <I>currentToken</I> is the final token of a sentence.
   *  <P> This is a simplified version of the OAK sentence splitter.
   *
   *  @param currentToken     the token being tested, or <CODE>null</CODE> if
   *                          no token has been read yet (result is then false)
   *  @param nextToken        the (non-empty) token following <I>currentToken</I>
   *  @param startOfSentence  true if <I>currentToken</I> is the first token
   *                          of its sentence
   */

  private static boolean isSentenceEnd (String currentToken, String nextToken,
                                        boolean startOfSentence) {
    if (currentToken == null) return false;
    int cTL = currentToken.length();
    // token is a mid-sentence abbreviation (mainly, titles), possibly
    // preceded by an opening quote or bracket --> middle of sent
    if (isAbbreviation(currentToken)) return false;
    if (cTL > 1 &&
        in(currentToken.charAt(0), OPENING_PUNCTUATION) &&
        isAbbreviation(currentToken.substring(1))) return false;
    if (cTL > 2 &&
        ( currentToken.startsWith("''") || currentToken.startsWith("``") ) &&
        isAbbreviation(currentToken.substring(2))) return false;
    // last, next-to-last, and third-from-last characters of currentToken
    // (blank if the token is too short)
    char currentToken0 = currentToken.charAt(cTL-1);
    char currentToken1 = (cTL > 1) ? currentToken.charAt(cTL-2) : ' ';
    char currentToken2 = (cTL > 2) ? currentToken.charAt(cTL-3) : ' ';
    // first, second, and third characters of nextToken
    // (blank if the token is too short)
    int nTL = nextToken.length();
    char nextToken0 = nextToken.charAt(0);
    char nextToken1 = (nTL > 1) ? nextToken.charAt(1) : ' ';
    char nextToken2 = (nTL > 2) ? nextToken.charAt(2) : ' ';
    // nextToken does not begin with an upper case,
    //    [`'"([{<] + upper case, `` + upper case, or < -> middle of sent.
    // (in monocase text capitalization is not informative, so any nextToken
    //  is accepted as a possible sentence start)
    boolean nextMayStartSentence =
          Character.isUpperCase(nextToken0) ||
          AceJet.Ace.monocase ||
          (Character.isUpperCase(nextToken1) &&
           in(nextToken0, OPENING_PUNCTUATION)) ||
          (Character.isUpperCase(nextToken2) &&
           ( (nextToken0 == '`' && nextToken1 == '`') ||
             (nextToken0 == '\'' && nextToken1 == '\'') ) ) ||
          // for ACE, where '_' represents '--'
          nextToken.equals("_") ||
          nextToken0 == '<';
    if (!nextMayStartSentence) return false;
    // ends with ?, !, [!?.]["'}>)], or [?!.]'' -> end of sentence
    if (currentToken0 == '?' ||
        currentToken0 == '!' ||
        (in(currentToken1, "?!.") && in(currentToken0, "\"'}>)")) ||
        (in(currentToken2, "?!.") && currentToken1 == '\'' && currentToken0 == '\'')) {
      return true;
    }
    // last char not "." -> middle of sentence
    if (currentToken0 != '.') return false;
    // -- added to handle Q. / A. in news wire ---------
    // Q. or A. at start of sentence --> end of sentence
    // (so 'Q.' or 'A.' is treated as a 1-word sentence)
    if (startOfSentence &&
        (currentToken.equalsIgnoreCase("Q.") ||
         currentToken.equalsIgnoreCase("A."))) return true;
    // single upper-case alpha + "." -> middle of sentence
    // (any letter, in monocase text)
    if (cTL == 2 && isInitial(currentToken1)) return false;
    // double initial (X.Y.) -> middle of sentence << added for ACE
    if (cTL == 4 &&
        currentToken2 == '.' &&
        isInitial(currentToken1) &&
        isInitial(currentToken.charAt(0))) return false;
    // U.S. or U.N. -> middle of sentence
    // (both are also matched by the double-initial rule above;  kept as an
    //  explicit rule.  The original nested if / else here attached the 'else'
    //  to the inner 'if', so the case-sensitive test was never reached when
    //  monocase was off.)
    boolean usOrUn;
    if (AceJet.Ace.monocase) {
      usOrUn = currentToken.equalsIgnoreCase("U.S.") ||
               currentToken.equalsIgnoreCase("U.N.");
    } else {
      usOrUn = currentToken.equals("U.S.") || currentToken.equals("U.N.");
    }
    if (usOrUn) return false;
    // token ends in a period, is not a recognized abbreviation or initial,
    // and nextToken can start a sentence (including the XML case where
    // nextToken begins with '<') -> end of sentence
    return true;
  }

  /**
   *  returns true if <I>c</I> can be the letter of an initial:  an upper-case
   *  character or, when <CODE>AceJet.Ace.monocase</CODE> is set, any letter.
   */

  private static boolean isInitial (char c) {
    return AceJet.Ace.monocase ? Character.isLetter(c) : Character.isUpperCase(c);
  }

  /**
   *  returns true if character <I>c</I> occurs in string <I>s</I>.
   */

  private static boolean in (char c, String s) {
    return s.indexOf(c) >= 0;
  }

  /**
   *  returns true if the text of <I>currentToken</I> in <I>doc</I>, with
   *  surrounding whitespace removed, is a double or single quote character;
   *  returns false if <I>currentToken</I> is <CODE>null</CODE>.
   *  <P> Not called from within this class at present.
   */

  private static boolean forcesCap (Annotation currentToken, Document doc) {
    if (currentToken == null) return false;
    String word =  doc.text(currentToken).trim();
    return (word.equals("\"") || word.equals("'"));
  }

  /**
   *  returns true if <I>token</I> is a known abbreviation.  The match is
   *  case-sensitive unless <CODE>AceJet.Ace.monocase</CODE> is set, in which
   *  case the lower-cased token is looked up among the lower-cased
   *  abbreviations.
   */

  private static boolean isAbbreviation (String token) {
    if (AceJet.Ace.monocase) {
      // same fixed locale as was used to build monocaseAbbreviations
      return monocaseAbbreviations.contains(token.toLowerCase(java.util.Locale.ENGLISH));
    } else {
      return abbreviations.contains(token);
    }
  }

  /**
   *  returns true if <I>currentToken</I> is a '_' which ends a dateline:
   *  a token consisting of a single '_', seen while <I>tokenCount</I>
   *  (the number of tokens read so far from the span, counting the token
   *  which follows <I>currentToken</I>) is at most 5.
   */

  private static boolean isDatelineEnd (String currentToken, int tokenCount) {
    return currentToken != null && currentToken.equals("_") && tokenCount <= 5;
  }
}