package Jet.Chunk;

import java.io.*;
import java.util.*;

import AceJet.Gazetteer;
import Jet.Lex.Lexicon;
import Jet.Lisp.FeatureSet;
import Jet.Tipster.*;

/**
 *  Onoma provides a dictionary of proper names ('onomasticon').  This dictionary
 *  is used by the (max ent) name tagger.
 */

public class Onoma {

	/**
	 *  Read the onomasticon from file 'fileName'.  Each line of the file
	 *  consists of two fields, separated by a tab: <br>
	 *  the name, consisting of one or more blank-separated tokens <br>
	 *  the type of the name
	 *  <p>
	 *  Every name is added to the Lexicon with feature 'type' set to the
	 *  name's type.  Names of type DRG are additionally recorded (both as
	 *  written and with the first letter capitalized) for use by
	 *  {@link #tagDrugs}.
	 *  <p>
	 *  A line which does not have exactly two fields is reported on standard
	 *  output.  If it has fewer than two fields it is skipped;  if it has
	 *  more, the first two fields are used and the rest are ignored.
	 *
	 *  @param  fileName     the file containing the onomasticon
	 *  @throws IOException  if the file cannot be opened or read
	 */

	public static void read (String fileName) throws IOException {
		int n = 0;
		BufferedReader reader = new BufferedReader (new FileReader (fileName));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				String[] fields = line.split("\t");
				if (fields.length != 2) {
					System.out.println ("Invalid onoma line: " + line);
					// without a type field there is nothing to enter (this
					// also covers blank lines), so skip rather than fail
					// with an ArrayIndexOutOfBoundsException
					if (fields.length < 2) {
						continue;
					}
				}
				String name = fields[0];
				String type = fields[1];
				if (type.equals("DRG")) {
					drugs.add(name);
					// also accept the sentence-initial (capitalized) form;
					// guard against an empty name, and use a fixed locale so
					// that the result does not depend on the default locale
					if (name.length() > 0) {
						drugs.add(name.substring(0, 1).toUpperCase(Locale.ENGLISH)
						          + name.substring(1));
					}
				}
				Lexicon.addEntry (Gazetteer.splitAtWS(name),
				                  new FeatureSet ("type", type),
				                  "onoma");
				n++;
			}
		} finally {
			// release the file handle even if reading fails part-way
			reader.close();
		}
		System.out.println ("Onoma:  read " + n + " names.");
	}

	/**
	 *  the names of type DRG read from the onomasticon, each stored both as
	 *  written and with its first letter capitalized.
	 */

	static Set<String> drugs = new HashSet<String>();

	/**
	 *  Tags all drug name tokens within Span 'span' of Document 'doc'
	 *  with annotation ENAMEX with TYPE DRG.  (These are terms
	 *  assigned this type in the onomasticon.)  Text already covered by
	 *  an ENAMEX annotation is skipped.  Scanning stops at the end of the
	 *  span, or at the first position which starts neither an ENAMEX
	 *  annotation nor a token.
	 *
	 *  This code was added at SRI's request for Dovetail.
	 *
	 *  @param  doc   the document to be annotated
	 *  @param  span  the portion of the document to be scanned
	 */

	public static void tagDrugs (Document doc, Span span) {
		int posn = span.start();
		Annotation token;
		while (posn < span.end()) {
			Vector<Annotation> nameAnns = doc.annotationsAt(posn, "ENAMEX");
			if (nameAnns != null && nameAnns.size() > 0) {
				// already tagged as a name:  step over the existing annotation
				posn = nameAnns.get(0).end();
			} else if ((token = doc.tokenAt(posn)) != null) {
				String tokenText = doc.text(token).trim();
				if (drugs.contains(tokenText)) {
					doc.annotate ("ENAMEX", token.span(), new FeatureSet("TYPE", "DRG"));
				}
				posn = token.end();
			} else return;
		}
	}

}

