/*
 * Decompiled with CFR 0.152.
 */
package Jet.Lex;

import Jet.Lisp.FeatureSet;
import Jet.Tipster.Annotation;
import Jet.Tipster.Document;
import Jet.Tipster.Span;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Rule-based tokenizer that divides text into whitespace-delimited words and
 * then splits each word into tokens (punctuation, clitic suffixes, numbers with
 * commas, e-mail addresses, URLs).
 *
 * <p>Tokens are either recorded as {@code token} annotations on a
 * {@link Document} or collected as plain strings, depending on the entry point.
 *
 * <p>All working state is kept in static fields ({@link #tokens},
 * {@link #lastToken} and the special-token maps), so overlapping calls from
 * several threads would interfere with each other.
 */
public class Tokenizer {
    /** Tokens collected by the most recent {@link #tokenize(String)} call. */
    static Vector<String> tokens;
    /** Text of the most recently built token; used for the "forcedCap" decision. */
    static String lastToken;
    /** Two-character suffixes that are split off as separate tokens. */
    static HashSet<String> suffixes2;
    /** Three-character suffixes that are split off as separate tokens. */
    static HashSet<String> suffixes3;
    /** Maps the start offset of each e-mail/URL match to its end offset. */
    private static HashMap<Integer, Integer> specialTokenEnd = new HashMap<Integer, Integer>();
    /** Maps the start offset of each e-mail/URL match to "email" or "url". */
    private static HashMap<Integer, String> specialTokenType = new HashMap<Integer, String>();

    private static final String userNamePatStg = "[a-zA-Z0-9_\\.-]+";
    private static final String domainNamePatStg = "([a-zA-Z0-9-]+\\.)+[a-zA-Z0-9]+";
    private static final String emailPatStg = userNamePatStg + "( ?\\.\\.\\. ?)?@" + domainNamePatStg;
    private static final String pathPatStg = "[a-zA-Z0-9_=\\?/-]+";
    private static final String urlPatStg = "http://" + domainNamePatStg + pathPatStg;
    private static final Pattern emailPat = Pattern.compile(emailPatStg);
    private static final Pattern urlPat = Pattern.compile(urlPatStg);

    static {
        suffixes2 = new HashSet<String>();
        suffixes3 = new HashSet<String>();
        suffixes2.add("'s");
        suffixes2.add("'m");
        suffixes2.add("'d");
        suffixes2.add("'S");
        suffixes2.add("'M");
        suffixes2.add("'D");
        suffixes3.add("'re");
        suffixes3.add("'ve");
        suffixes3.add("n't");
        suffixes3.add("'ll");
        suffixes3.add("'RE");
        suffixes3.add("'VE");
        suffixes3.add("N'T");
        suffixes3.add("'LL");
    }

    /**
     * Tokenizes the part of {@code document} covered by {@code span}, adding a
     * {@code token} annotation for every token found.
     *
     * @param document the document to annotate
     * @param span     the region of the document text to tokenize
     */
    public static void tokenize(Document document, Span span) {
        findTokens(document, document.text(), span.start(), span.end());
    }

    /**
     * Tokenizes a plain string.
     *
     * @param text the text to tokenize
     * @return the token strings, in order of appearance
     */
    public static String[] tokenize(String text) {
        // A typed Vector is required here: with a raw Vector, toArray(T[]) is
        // erased to Object[] and cannot be returned as String[].
        tokens = new Vector<String>();
        findTokens(null, text, 0, text.length());
        return tokens.toArray(new String[0]);
    }

    /**
     * Scans {@code text[start, end)} word by word and hands every word to
     * {@link #splitIntoTokens} and {@link #buildTokens}.
     *
     * @param document target document, or {@code null} to collect strings in {@link #tokens}
     * @param text     the text being tokenized
     * @param start    offset of the first character to consider
     * @param end      offset just past the last character to consider
     */
    private static void findTokens(Document document, String text, int start, int end) {
        boolean firstToken = true;
        lastToken = "";
        findTokensByPattern(document, text, start, end);
        int pos = skipWSX(text, start, end);
        while (pos < end) {
            int wordStart = pos;
            // A special token (e-mail/URL) is consumed in one step, even if it
            // contains blanks; any other character advances by one.
            Integer specialEnd = specialTokenEnd.get(pos);
            pos = specialEnd != null ? specialEnd.intValue() : pos + 1;
            while (pos < end && !Character.isWhitespace(text.charAt(pos))) {
                specialEnd = specialTokenEnd.get(pos);
                pos = specialEnd != null ? specialEnd.intValue() : pos + 1;
            }
            String word = text.substring(wordStart, pos);
            // Trailing whitespace is attached to the last token of the word.
            pos = skipWS(text, pos, end);
            boolean lastWordOfDocument = pos >= end && document != null;
            boolean[] boundaries = splitIntoTokens(word, wordStart, lastWordOfDocument);
            buildTokens(document, word, boundaries, wordStart, pos, firstToken);
            firstToken = false;
        }
    }

    /**
     * Rebuilds the special-token maps with every e-mail address and URL found
     * in {@code text[start, end)}.
     *
     * @param document unused; kept for signature compatibility
     * @param text     the text being tokenized
     * @param start    start of the region to search
     * @param end      end of the region to search
     */
    private static void findTokensByPattern(Document document, String text, int start, int end) {
        specialTokenEnd = new HashMap<Integer, Integer>();
        specialTokenType = new HashMap<Integer, String>();
        // E-mail matches are recorded first; a URL starting at the same offset
        // overwrites the e-mail entry.
        recordSpecialTokens(emailPat, "email", text, start, end);
        recordSpecialTokens(urlPat, "url", text, start, end);
    }

    /** Records every match of {@code pattern} in {@code text[start, end)} under {@code type}. */
    private static void recordSpecialTokens(Pattern pattern, String type, String text, int start, int end) {
        Matcher matcher = pattern.matcher(text).region(start, end);
        while (matcher.find()) {
            Integer matchStart = Integer.valueOf(matcher.start());
            specialTokenEnd.put(matchStart, Integer.valueOf(matcher.end()));
            specialTokenType.put(matchStart, type);
        }
    }

    /**
     * Decides where {@code word} is split into tokens.
     *
     * @param word           a whitespace-delimited stretch of text (may contain
     *                       blanks inside an e-mail match)
     * @param offset         offset of {@code word} within the tokenized text
     * @param atEndOfText    true if a final period should be split off
     * @return an array of length {@code word.length() + 1}; element {@code i} is
     *         true if a token starts at character {@code i} (the last element is
     *         always true)
     */
    private static boolean[] splitIntoTokens(String word, int offset, boolean atEndOfText) {
        char[] chars = word.toCharArray();
        int length = chars.length;
        boolean[] startsToken = new boolean[length + 1];
        startsToken[length] = true;

        // Every character other than a letter, digit or period is its own token.
        for (int i = 0; i < length; ++i) {
            char c = chars[i];
            if (!Character.isLetterOrDigit(c) && c != '.') {
                startsToken[i] = true;
                startsToken[i + 1] = true;
            }
        }

        // Keep doubled `` '' -- together.
        for (int i = 0; i < length - 1; ++i) {
            char c = chars[i];
            boolean doubling = c == '`' || c == '\'' || c == '-';
            if (doubling && c == chars[i + 1] && startsToken[i]) {
                startsToken[i + 1] = false;
            }
        }

        // Keep an ellipsis "..." together.
        for (int i = 0; i < length - 2; ++i) {
            if (chars[i] == '.' && chars[i + 1] == '.' && chars[i + 2] == '.' && startsToken[i]) {
                startsToken[i + 1] = false;
                startsToken[i + 2] = false;
            }
        }

        // Do not split at a comma between digits. The loop bound means the comma
        // must be followed by at least two characters.
        for (int i = 1; i < length - 2; ++i) {
            if (chars[i] == ',' && Character.isDigit(chars[i - 1]) && Character.isDigit(chars[i + 1])) {
                startsToken[i] = false;
                startsToken[i + 1] = false;
            }
        }

        // At the end of the text, split off a final period, also when it is
        // followed by one closing character or by ''.
        if (atEndOfText && length > 0) {
            if (chars[length - 1] == '.') {
                startsToken[length - 1] = true;
            } else if (length > 1 && chars[length - 2] == '.' && "\"'}>)".indexOf(chars[length - 1]) >= 0) {
                startsToken[length - 2] = true;
            } else if (length > 2 && chars[length - 3] == '.' && chars[length - 2] == '\''
                    && chars[length - 1] == '\'') {
                startsToken[length - 3] = true;
            }
        }

        // Split off three-character suffixes ('re, n't, ...) that end a token.
        for (int i = 0; i < length - 2; ++i) {
            if (startsToken[i + 3] && suffixes3.contains(word.substring(i, i + 3))) {
                startsToken[i] = true;
                startsToken[i + 1] = false;
                startsToken[i + 2] = false;
            }
        }

        // Split off two-character suffixes ('s, 'm, 'd) that end a token.
        for (int i = 0; i < length - 1; ++i) {
            if (startsToken[i + 2] && suffixes2.contains(word.substring(i, i + 2))) {
                startsToken[i] = true;
                startsToken[i + 1] = false;
            }
        }

        // Keep '&' together with everything up to each following ';'
        // (every ';' after the '&' is considered, not only the first).
        for (int i = 0; i < length - 1; ++i) {
            if (chars[i] != '&') {
                continue;
            }
            for (int j = i + 1; j < length; ++j) {
                if (chars[j] != ';') {
                    continue;
                }
                for (int k = i + 1; k <= j; ++k) {
                    startsToken[k] = false;
                }
            }
        }

        // An e-mail address or URL is a single token, overriding the rules above.
        for (int i = 0; i < length; ++i) {
            Integer specialEnd = specialTokenEnd.get(offset + i);
            if (specialEnd == null) {
                continue;
            }
            startsToken[i] = true;
            for (int k = i + 1; k < length && k + offset < specialEnd.intValue(); ++k) {
                startsToken[k] = false;
            }
        }
        return startsToken;
    }

    /**
     * Creates one token for each stretch of {@code word} between two boundaries.
     *
     * @param document    target document, or {@code null} to collect strings
     * @param word        the word being split
     * @param startsToken boundaries computed by {@link #splitIntoTokens}
     * @param offset      offset of {@code word} within the tokenized text
     * @param nextStart   end offset given to the last token of the word, so that
     *                    it also covers the whitespace following the word
     * @param firstToken  true if {@code word} is the first word of the text
     */
    private static void buildTokens(Document document, String word, boolean[] startsToken, int offset,
            int nextStart, boolean firstToken) {
        int tokenStart = 0;
        for (int tokenEnd = 1; tokenEnd <= word.length(); ++tokenEnd) {
            if (!startsToken[tokenEnd]) {
                continue;
            }
            FeatureSet features = featuresFor(word, tokenStart, tokenEnd, offset, firstToken);
            int spanEnd = tokenEnd == word.length() ? nextStart : tokenEnd + offset;
            String tokenText = word.substring(tokenStart, tokenEnd);
            recordToken(document, tokenText, tokenStart + offset, spanEnd, features);
            tokenStart = tokenEnd;
            lastToken = tokenText;
        }
    }

    /**
     * Chooses the features of the token {@code word[tokenStart, tokenEnd)}:
     * {@code type} for e-mail/URL tokens, else {@code case} for tokens starting
     * with an upper-case letter, else {@code intvalue} for integers, else none.
     */
    private static FeatureSet featuresFor(String word, int tokenStart, int tokenEnd, int offset,
            boolean firstToken) {
        String specialType = specialTokenType.get(tokenStart + offset);
        if (specialType != null) {
            return new FeatureSet("type", specialType);
        }
        if (Character.isUpperCase(word.charAt(tokenStart))) {
            // Capitalization is "forced" at the start of the text and after an
            // underscore or an opening quote.
            boolean forced = firstToken || lastToken.equals("_") || lastToken.equals("\"")
                    || lastToken.equals("``") || lastToken.equals("`");
            return new FeatureSet("case", forced ? "forcedCap" : "cap");
        }
        int intValue = intValueOf(word, tokenStart, tokenEnd);
        if (intValue >= 0) {
            return new FeatureSet("intvalue", Integer.valueOf(intValue));
        }
        return new FeatureSet();
    }

    /**
     * Returns the value of {@code word[from, to)} if it consists of digits
     * (commas are skipped once the value is positive), or -1 otherwise. Values
     * that do not fit in an {@code int} also yield -1 rather than a wrapped,
     * incorrect number.
     */
    private static int intValueOf(String word, int from, int to) {
        long value = 0;
        for (int i = from; i < to; ++i) {
            char c = word.charAt(i);
            if (Character.isDigit(c)) {
                value = value * 10 + Character.digit(c, 10);
                if (value > Integer.MAX_VALUE) {
                    return -1;
                }
            } else if (c == ',' && value > 0) {
                continue;
            } else {
                return -1;
            }
        }
        return (int) value;
    }

    /**
     * Stores one token: appended to {@link #tokens} when {@code document} is
     * null, otherwise added as a {@code token} annotation. Tokens carrying a
     * {@code type} feature additionally get an {@code ENAMEX} annotation whose
     * {@code TYPE} feature has the same value.
     *
     * @param document   target document, or {@code null}
     * @param tokenText  text of the token (only used when {@code document} is null)
     * @param start      start offset of the token
     * @param end        end offset of the token
     * @param featureSet features of the token annotation
     */
    private static void recordToken(Document document, String tokenText, int start, int end,
            FeatureSet featureSet) {
        if (document == null) {
            tokens.addElement(tokenText);
            return;
        }
        document.annotate("token", new Span(start, end), featureSet);
        Object type = featureSet.get("type");
        if (type != null) {
            document.annotate("ENAMEX", new Span(start, end), new FeatureSet("TYPE", type));
        }
    }

    /**
     * Tokenizes {@code span} of {@code document} on whitespace only: each
     * maximal run of non-whitespace characters, together with the whitespace
     * following it, becomes one {@code token} annotation with no features.
     *
     * @param document the document to annotate
     * @param span     the region of the document text to tokenize
     */
    public static void tokenizeOnWS(Document document, Span span) {
        String text = document.text();
        int end = span.end();
        int pos = skipWS(text, span.start(), end);
        while (pos < end) {
            int tokenStart = pos++;
            while (pos < end && !Character.isWhitespace(text.charAt(pos))) {
                ++pos;
            }
            pos = skipWS(text, pos, end);
            // recordToken ignores the text argument when the document is non-null.
            recordToken(document, text, tokenStart, pos, new FeatureSet());
        }
    }

    /**
     * Skips whitespace in a document.
     *
     * @param document the document to read
     * @param start    offset at which to start
     * @param end      offset at which to stop at the latest
     * @return the offset of the first non-whitespace character at or after
     *         {@code start}, or {@code end} if there is none
     */
    public static int skipWS(Document document, int start, int end) {
        int pos = start;
        while (pos < end && Character.isWhitespace(document.charAt(pos))) {
            ++pos;
        }
        return pos;
    }

    /**
     * Skips whitespace in a string.
     *
     * @param text  the text to read
     * @param start offset at which to start
     * @param end   offset at which to stop at the latest
     * @return the offset of the first non-whitespace character at or after
     *         {@code start}, or {@code end} if there is none
     */
    public static int skipWS(String text, int start, int end) {
        int pos = start;
        while (pos < end && Character.isWhitespace(text.charAt(pos))) {
            ++pos;
        }
        return pos;
    }

    /**
     * Skips whitespace and {@code <...>} tags in a document. An unterminated
     * tag is skipped up to {@code end}.
     *
     * @param document the document to read
     * @param start    offset at which to start
     * @param end      offset at which to stop at the latest
     * @return the offset of the first character that is neither whitespace nor
     *         part of a tag, or {@code end} if there is none
     */
    public static int skipWSX(Document document, int start, int end) {
        int pos = start;
        while (pos < end) {
            char c = document.charAt(pos);
            if (Character.isWhitespace(c)) {
                ++pos;
            } else if (c == '<') {
                ++pos;
                while (pos < end && document.charAt(pos) != '>') {
                    ++pos;
                }
                if (pos < end) {
                    ++pos; // step over the closing '>'
                }
            } else {
                break;
            }
        }
        return pos;
    }

    /**
     * Skips whitespace and {@code <...>} tags in a string. An unterminated tag
     * is skipped up to {@code end}.
     *
     * @param text  the text to read
     * @param start offset at which to start
     * @param end   offset at which to stop at the latest
     * @return the offset of the first character that is neither whitespace nor
     *         part of a tag, or {@code end} if there is none
     */
    public static int skipWSX(String text, int start, int end) {
        int pos = start;
        while (pos < end) {
            char c = text.charAt(pos);
            if (Character.isWhitespace(c)) {
                ++pos;
            } else if (c == '<') {
                ++pos;
                while (pos < end && text.charAt(pos) != '>') {
                    ++pos;
                }
                if (pos < end) {
                    ++pos; // step over the closing '>'
                }
            } else {
                break;
            }
        }
        return pos;
    }

    /**
     * Collects the consecutive token annotations inside {@code span}, starting
     * after any leading whitespace and tags, and stopping at the first position
     * where the document has no token.
     *
     * @param document the tokenized document
     * @param span     the region to read
     * @return the token annotations in order
     */
    public static Annotation[] gatherTokens(Document document, Span span) {
        int end = span.end();
        ArrayList<Annotation> found = new ArrayList<Annotation>();
        int pos = skipWSX(document, span.start(), end);
        while (pos < end) {
            Annotation token = document.tokenAt(pos);
            if (token == null) {
                break;
            }
            found.add(token);
            pos = token.span().end();
        }
        return found.toArray(new Annotation[found.size()]);
    }

    /**
     * Returns the trimmed text of every token gathered by
     * {@link #gatherTokens(Document, Span)}.
     *
     * @param document the tokenized document
     * @param span     the region to read
     * @return the token strings in order
     */
    public static String[] gatherTokenStrings(Document document, Span span) {
        Annotation[] annotations = gatherTokens(document, span);
        String[] strings = new String[annotations.length];
        for (int i = 0; i < annotations.length; ++i) {
            strings[i] = document.text(annotations[i]).trim();
        }
        return strings;
    }

    /**
     * Self-test: tokenizes a sentence whose expected tokens are known, reports
     * whether the result matches, and prints the tokens. A second sample is
     * tokenized and printed without validation.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The validated text must be the sentence the expected tokens come from.
        Document document = new Document("'grishman ... @cs.nyu.edu' sold $3,100 shares.");
        Tokenizer.tokenize(document, document.fullSpan());
        String[] found = Tokenizer.gatherTokenStrings(document, document.fullSpan());
        String[] expected = {"'", "grishman ... @cs.nyu.edu", "'", "sold", "$", "3,100", "shares", "."};
        boolean matches = found.length == expected.length;
        for (int i = 0; matches && i < expected.length; ++i) {
            matches = expected[i].equals(found[i]);
        }
        if (matches) {
            System.out.println("Tokenizer validation succeeds.");
        } else {
            System.out.println("Tokenizer validation fails.");
        }
        printTokens(found);

        Document sample = new Document(", DKo...@hotmail.com (Daniel Kolle)");
        Tokenizer.tokenize(sample, sample.fullSpan());
        System.out.println("Additional sample:");
        printTokens(Tokenizer.gatherTokenStrings(sample, sample.fullSpan()));
    }

    /** Prints each token on its own line, prefixed with its index. */
    private static void printTokens(String[] tokenStrings) {
        for (int i = 0; i < tokenStrings.length; ++i) {
            System.out.println("  tokens[" + i + "] = " + tokenStrings[i]);
        }
    }
}

