package AceJet;

import java.util.*;
import java.io.*;

import org.w3c.dom.*;
import org.xml.sax.*;
import javax.xml.parsers.*;

/**
 *  analyze a set of ACE APF files for coreference relations
 *  between nominals.
 */

public class APFAnalyzer {

	/** character encoding used by readDocument for the text files (default:  ISO-LATIN-1) */
	static String encoding = "ISO-8859-1";
	// startTag and endTag are declared here but not referenced within this class
	static HashMap startTag;
	static HashSet endTag;
	/** XML parser used for the APF files;  created by main */
	static DocumentBuilder builder;
	/** directory containing the file list, the text files, and the APF files */
	static final String ACEdir =
	    "C:/Documents and Settings/Ralph Grishman/My Documents/ACE/";
	/** file listing, one per line, the documents to be processed (names relative to ACEdir, without extension) */
	static final String fileList =
		// ACEdir + "training all.txt";
		// ACEdir + "feb02 all.txt";
		// ACEdir + "sep02 all.txt";
		// ACEdir + "aug03 all.txt";
		// ACEdir + "files-to-process.txt";
		ACEdir + "training nwire.txt";

	// counts, accumulated over all documents, of nominal mentions whose head
	// is identical to / a synonym of / different from the heads of the prior
	// nominal mentions of the same entity
	static int identityCount = 0;
	static int synonymCount = 0;
	static int differentCount = 0;
	// parallel lists (one entry per mention) for the entity currently being analyzed
	static ArrayList mentionTypes = new ArrayList();
	static ArrayList mentionTexts = new ArrayList();
	static ArrayList headTexts = new ArrayList();

	/**
	 *  process every document named in 'fileList' and then print the
	 *  accumulated head-comparison counts.
	 *
	 *  @param args  not used
	 */

	public static void main (String [] args) throws Exception  {
		// initialize WordNet
		WordNetInterface.initialize();
		// initialize APF reader
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		factory.setValidating(false);
		builder = factory.newDocumentBuilder();

		// the APF file suffix depends only on the (constant) file list name,
		// so decide it once rather than once per document
		boolean newData = fileList.indexOf("03") > 0;
		String apfSuffix = newData ? ".apf.xml" : ".sgm.tmx.rdc.xml";

		// open list of files
		BufferedReader reader = new BufferedReader (new FileReader(fileList));
		try {
			int docCount = 0;
			String currentDoc;
			while ((currentDoc = reader.readLine()) != null) {
				// a blank line (e.g. at the end of the list) is not a document name
				if (currentDoc.trim().length() == 0) continue;
				// process file 'currentDoc'
				docCount++;
				System.out.println ("\nProcessing document " + docCount + ": " + currentDoc);
				String textFileName = ACEdir + currentDoc + ".sgm";
				String APFfileName = ACEdir + currentDoc + apfSuffix;
				analyzeDocument (textFileName, APFfileName);
			}
		} finally {
			reader.close();
		}
		report();
	}

	/**
	 *  parse APF file 'APFfileName', read text file 'textFileName', compute
	 *  the offset maps for the text, and analyze the entity mentions of the
	 *  document.
	 */

	private static void analyzeDocument (String textFileName, String APFfileName)
	    throws SAXException, IOException {
		// DocumentBuilder.parse(String) expects a URI;  passing a File lets the
		// parser build a proper file URI from a plain path (spaces, drive letter)
		Document apfDoc = builder.parse(new File(APFfileName));
		StringBuffer fileText = readDocument(textFileName);
		computeOffsets (fileText);
		findEntityMentions (apfDoc, fileText);
	}

	/**
	 *  read the entities in the APF document.  For each entity which is not
	 *  marked GENERIC="TRUE", record the type, extent text, and head text of
	 *  each of its mentions in the mention tables, and then invoke
	 *  analyzeMentions on that entity.  Assumes computeOffsets has already
	 *  been called on 'fileText'.  Entities without an entity_type element,
	 *  and mentions whose extent or head cannot be located in the text, are
	 *  reported on System.err and skipped.
	 *
	 *  @param apfDoc    the parsed APF file
	 *  @param fileText  the text of the document to which the APF offsets refer
	 */

	static void findEntityMentions (Document apfDoc, StringBuffer fileText) {
		clearMentionTables ();
		NodeList entities = apfDoc.getElementsByTagName("entity");
		for (int i=0; i<entities.getLength(); i++) {
			Element entity = (Element) entities.item(i);
			NodeList entityTypeList = entity.getElementsByTagName("entity_type");
			Element entityType = (Element) entityTypeList.item(0);
			if (entityType == null) {
				System.err.println ("Skipping entity " + entity.getAttribute("ID") +
				                    ":  no entity_type element");
				continue;
			}
			// don't record mentions for generic entities -- not used in relations
			if ("TRUE".equals(entityType.getAttribute("GENERIC"))) continue;
			String type = getElementText (entity, "entity_type");
			if (standardType.containsKey(type))
				type = (String) standardType.get(type);
			NodeList mentions = entity.getElementsByTagName("entity_mention");
			clearMentionTables();
			for (int j=0; j<mentions.getLength(); j++) {
				Element mention = (Element) mentions.item(j);
				String mentionType = mention.getAttribute("TYPE");
				Element extent = (Element) mention.getElementsByTagName("extent").item(0);
				Element head = (Element) mention.getElementsByTagName("head").item(0);
				String text = spanText (extent, fileText);
				String headText = spanText (head, fileText);
				if (text == null || headText == null) {
					System.err.println ("Skipping mention " + mention.getAttribute("ID") +
					                    ":  missing or invalid extent or head");
					continue;
				}
				addMention (mentionType, text, headText);
			}
			analyzeMentions(type);
		}
	}

	/**
	 *  return the portion of 'fileText' designated by the 'start' and 'end'
	 *  elements (ACE offsets, inclusive) found within 'span', using
	 *  JEToffsetMap to convert them to positions in 'fileText'.  Returns
	 *  null if 'span' is null, if either offset is absent or not an integer,
	 *  or if the offsets do not designate a valid region of the text.
	 */

	private static String spanText (Element span, StringBuffer fileText) {
		if (span == null) return null;
		String startS = getElementText (span, "start");
		String endS = getElementText (span, "end");
		if (startS == null || endS == null) return null;
		int start;
		int end;
		try {
			start = Integer.parseInt(startS);
			end = Integer.parseInt(endS);
		} catch (NumberFormatException e) {
			// caller reports the mention as invalid
			return null;
		}
		if (start < 0 || end < 0 || start >= JEToffsetMap.length || end >= JEToffsetMap.length)
			return null;
		int startJet = JEToffsetMap[start];
		int endJet = JEToffsetMap[end];
		// entries of JEToffsetMap past the last text character are never set,
		// so an out-of-range ACE offset can yield an inverted region
		if (startJet > endJet + 1 || endJet + 1 > fileText.length()) return null;
		return fileText.substring(startJet, endJet+1);
	}

	/**
	 *  return the value of the first child node of the first descendant of
	 *  'e' with tag 'elementType', or null if there is no such descendant or
	 *  it has no children.  Assumes elementType is a leaf element type.
	 */

	private static String getElementText (Element e, String elementType) {
		NodeList typeList = e.getElementsByTagName(elementType);
		Node typeElement = typeList.item(0);
		if (typeElement == null) return null;
		Node firstChild = typeElement.getFirstChild();
		if (firstChild == null) return null;
		return firstChild.getNodeValue();
	}

	/** empty the three mention tables */

	private static void clearMentionTables () {
		mentionTypes.clear();
		mentionTexts.clear();
		headTexts.clear();
	}

	/** append one mention (its type, extent text, and head text) to the mention tables */

	private static void addMention (String mentionType, String text, String headText) {
		mentionTypes.add(mentionType);
		mentionTexts.add(text);
		headTexts.add(headText);
	}

	/**
	 *  examine, in order, the mentions currently in the mention tables.  For
	 *  each mention of type NOMINAL after the first such mention, compare its
	 *  (lower-cased) head with the heads of the prior nominal mentions and
	 *  increment identityCount if the same head occurred before, synonymCount
	 *  if WordNetInterface.containsSynonym reports a synonym among the prior
	 *  heads, and differentCount (printing the heads) otherwise.
	 *
	 *  @param entityType  type of the entity (currently not used)
	 */

	private static void analyzeMentions (String entityType) {
		HashSet priorHeads = new HashSet();
		for (int imention = 0; imention < mentionTexts.size(); imention++) {
			String type = (String) mentionTypes.get(imention);
			if (!"NOMINAL".equals(type)) continue;
			// fixed locale, so that the comparison does not depend on the
			// default locale of the machine
			String head = ((String) headTexts.get(imention)).toLowerCase(Locale.ENGLISH);
			if (priorHeads.isEmpty()) {
				// first nominal mention -- ignore
			} else if (priorHeads.contains(head)) {
				identityCount++;
			} else if (WordNetInterface.containsSynonym(priorHeads, head)) {
				synonymCount++;
			} else {
				System.out.println ("different heads: " + head + " and " + priorHeads);
				differentCount++;
			}
			priorHeads.add(head);
		}
	}

	/** print the three accumulated counts */

	private static void report () {
		System.out.println (identityCount + " same head pairs");
		System.out.println (synonymCount + " synonymous head pairs");
		System.out.println (differentCount + " different head pairs");
	}

	/**
	 *  read file 'fileName', using character encoding 'encoding', and return
	 *  its contents as a StringBuffer.  Each line of the file is followed by
	 *  a single "\n" in the result, whatever line terminator the file used.
	 */

	static StringBuffer readDocument (String fileName) throws IOException {
		FileInputStream in = new FileInputStream(new File(fileName));
		try {
			BufferedReader reader = new BufferedReader (new InputStreamReader (in, encoding));
			StringBuffer fileText = new StringBuffer();
			String line;
			while ((line = reader.readLine()) != null) {
				fileText.append(line).append('\n');
			}
			return fileText;
		} finally {
			// closing the underlying stream releases the file even if the
			// encoding is not supported or a read fails
			in.close();
		}
	}

	// ACEoffsetMap:  map from Jet offset to ACE offset
	// JEToffsetMap:  map from ACE offset to Jet offset
	// (both are set by computeOffsets)
	static int[] ACEoffsetMap = null;
	static int[] JEToffsetMap = null;

	/**
	 *  compute JEToffsetMap, a map from ACE offsets (which exclude XML tags)
	 *  to Jet offsets (which include all characters in the file), and
	 *  ACEoffsetMap, the map in the opposite direction.  Both arrays have
	 *  the length of 'fileText';  the entries of JEToffsetMap beyond the
	 *  number of characters outside of tags are not meaningful.
	 */

	static void computeOffsets (StringBuffer fileText) {
		boolean inTag = false;
		int xmlCount = 0;    // number of characters seen so far which are part of a tag
		int length = fileText.length();
		ACEoffsetMap = new int[length];
		JEToffsetMap = new int[length];
		for (int i=0; i<length; i++) {
			char c = fileText.charAt(i);
			if (c == '<') inTag = true;
			// within a tag, i - xmlCount stays fixed, so this entry keeps being
			// overwritten until the next character outside of a tag is reached
			JEToffsetMap[i - xmlCount] = i;
			if (inTag) xmlCount++;
			ACEoffsetMap[i] = i - xmlCount;
			if (c == '>') inTag = false;
		}
	}

	// map from APF type names to 'standard' names

	static HashMap standardType = new HashMap();
	static {standardType.put("GSP", "GPE");
	        standardType.put("PER", "PERSON");
	        standardType.put("ORG", "ORGANIZATION");
	        standardType.put("LOC", "LOCATION");
	        standardType.put("FAC", "FACILITY");
	     }
}