package Jet.Chunk;

import java.io.*;
import java.util.*;

import AceJet.Gazetteer;
import Jet.Lex.Lexicon;
import Jet.Lisp.FeatureSet;
import Jet.Tipster.*;

/**
 *  Onoma provides a dictionary of proper names ('onomasticon').  This dictionary
 *  is used by the 'tagNamesFromOnoma' command and as features by the
 *  (max ent) name tagger.
 */

public class Onoma {

	/**
	 *  Set to true as soon as {@link #read} is invoked (before any line of
	 *  the file has been processed), so it remains true even if the read
	 *  later fails with an exception.
	 */
	public static boolean loaded = false;

	/**
	 *  Read the onomasticon from file 'fileName'.  Each line of the file
	 *  consists of two or three fields, separated by tabs: <br>
	 *  the name, consisting of one or more blank-separated tokens <br>
	 *  the type of the name <br>
	 *  (optionally) the subtype of the name <br>
	 *  Lines with any other number of fields are reported on standard
	 *  output and skipped.  Each valid line is added to the Lexicon under
	 *  the category "onoma", and the count of names read is printed when
	 *  the end of the file is reached.
	 *
	 *  @param  fileName     path of the onomasticon file
	 *  @throws IOException  if the file cannot be opened or read
	 */

	public static void read (String fileName) throws IOException {
		loaded = true;
		int n = 0;
		// NOTE(review): FileReader decodes with the platform default charset;
		// confirm that this matches the encoding of the onomasticon files.
		BufferedReader reader = new BufferedReader (new FileReader (fileName));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				String[] fields = line.split("\t");
				if (fields.length < 2 || fields.length > 3) {
					System.out.println ("Invalid onoma line: " + line);
					continue;
				}
				String name = fields[0];
				String type = fields[1];
				// the subtype is optional and stays null for two-field lines
				String subtype = null;
				if (fields.length == 3)
					subtype = fields[2];
				Lexicon.addEntry (Gazetteer.splitAtWS(name),
				                  new FeatureSet ("TYPE", type, "SUBTYPE", subtype),
				                  "onoma");
				n++;
			}
		} finally {
			// release the file handle on normal completion and on read errors
			reader.close();
		}
		System.out.println ("Onoma:  read " + n + " names.");
	}

	/**
	 *  This is a stub which remains from code that was added at SRI's
	 *  request for Dovetail in order to tag drug names.  It does nothing.
	 *
	 *  @param doc   ignored
	 *  @param span  ignored
	 */

	public static void tagDrugs (Document doc, Span span) {
	}

	/**
	 *  tag names which appear in the onomasticon.  The span is scanned
	 *  left to right;  at each position the first "onoma" annotation
	 *  found there is used (presumably the lexicon lookup retains only the
	 *  longest match at a position -- verify against the Lexicon code).
	 *  If that annotation has a TYPE feature, two annotations are added
	 *  over the same span: <br>
	 *  an ENAMEX annotation with TYPE (and SUBTYPE) attributes as specified
	 *  in the onomasticon <br>
	 *  an isName type=other annotation;  this blocks subsequent name taggers
	 *  from assigning additional ENAMEX annotations to the same tokens. <br>
	 *  Scanning resumes at the end of the "onoma" annotation, or at the end
	 *  of the current token if there is no such annotation.  If neither an
	 *  "onoma" annotation nor a token is found at the current position,
	 *  the method returns.
	 *
	 *  @param doc   the document to be annotated
	 *  @param span  the portion of the document to be scanned
	 */

	public static void tagNames (Document doc, Span span) {
		int posn = span.start();
		Annotation token;
		while (posn < span.end()) {
			Vector<Annotation> nameAnns = doc.annotationsAt(posn, "onoma");
			if (nameAnns != null && nameAnns.size() > 0) {
				Annotation nameAnn = nameAnns.get(0);
				String type = (String) nameAnn.get("TYPE");
				String subtype = (String) nameAnn.get("SUBTYPE");
				if (type != null) {
					doc.annotate ("isName", nameAnn.span(), new FeatureSet("type", "other"));
					doc.annotate ("ENAMEX", nameAnn.span(), 
					              new FeatureSet("TYPE", type, "SUBTYPE", subtype));
				}
				// skip past the whole matched name, even when it had no TYPE
				posn = nameAnn.end();
			} else if ((token = doc.tokenAt(posn)) != null) {
				posn = token.end();
			} else return;
		} 
	}

}
