package edu.stanford.nlp.ie.machinereading.domains.ace.reader;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.stanford.nlp.ie.machinereading.common.SimpleTokenize;
import edu.stanford.nlp.ie.machinereading.common.StringDictionary;
import edu.stanford.nlp.trees.Span;
import edu.stanford.nlp.util.Generics;
/**
 * A single token of an ACE document, together with the dictionary-encoded
 * features attached to it (word, lemma, POS, chunk and NE labels, case class,
 * suffixes) and its byte offsets.
 *
 * <p>Most string-valued features are stored as integer indices into the shared
 * static dictionaries {@link #WORDS}, {@link #LEMMAS} and {@link #OTHERS}.
 * The class also hosts the static gazetteers and proximity classes, which
 * must be loaded explicitly via {@link #loadGazetteers(String)} and
 * {@link #loadProximityClasses(String)} before they are queried.
 */
public class AceToken {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(AceToken.class);

  /**
   * The actual token bytes.
   * Normally we work with mWord (see below), but mLiteral is needed when
   * we need to check if a sequence of tokens exists in a gazetteer.
   */
  private String mLiteral;

  /** The index of the literal in the WORDS hash (-1 if the word was null) */
  private int mWord;

  /** Case of mWord: one of the CASE_* constants (-1 if the word was null) */
  private int mCase;

  /** Suffixes of mWord, as indices in the WORDS hash (null if the word was null) */
  private int[] mSuffixes;

  /** Index of the lemma in LEMMAS (-1 if absent) */
  private int mLemma;

  /** Index of the POS tag in OTHERS (-1 if absent) */
  private int mPos;

  /** Index of the chunk label in OTHERS (-1 if absent) */
  private int mChunk;

  /** Index of the NE label in OTHERS (-1 if absent) */
  private int mNerc;

  /** Byte offset of the token; rewritten by {@link #adjustPhrasePositions(int, String)} */
  private Span mByteOffset;

  /** Raw byte offset in the SGM doc */
  private Span mRawByteOffset;

  /** Sentence index passed to the constructor */
  private int mSentence;

  /** Entity class from Massi */
  private String mMassiClass;

  /** Entity label from the BBN corpus */
  private String mMassiBbn;

  /** WordNet super-senses detected by Massi */
  private String mMassiWnss;

  /** Dictionary for all words in the corpus */
  public static final StringDictionary WORDS;

  /** Dictionary for all lemmas in the corpus */
  public static final StringDictionary LEMMAS;

  /** Dictionary for all other strings in the corpus */
  public static final StringDictionary OTHERS;

  /** Map of all proximity classes */
  public static final Map<Integer, ArrayList<Integer>> PROX_CLASSES;

  /** How many elements per proximity class */
  private static final int PROXIMITY_CLASS_SIZE = 5;

  /** The location gazetteer */
  private static Map<String, String> LOC_GAZ = null;

  /** The person first name dictionary */
  private static Map<String, String> FIRST_GAZ = null;

  /** The person last name dictionary */
  private static Map<String, String> LAST_GAZ = null;

  /** List of trigger words */
  private static Map<String, String> TRIGGER_GAZ = null;

  /** Matches one SGML-like tag: '<', one or more non-bracket characters, '>' */
  private static final Pattern SGML_PATTERN;

  static {
    WORDS = new StringDictionary("words");
    LEMMAS = new StringDictionary("lemmas");
    OTHERS = new StringDictionary("others");
    WORDS.setMode(true);
    LEMMAS.setMode(true);
    OTHERS.setMode(true);
    PROX_CLASSES = Generics.newHashMap();

    SGML_PATTERN = Pattern.compile("<[^<>]+>");
  }

  /**
   * Loads the location, first-name, last-name and trigger-word gazetteers
   * from fixed file names under the given directory, replacing any
   * previously loaded content.
   *
   * @param dataPath directory containing the gazetteer files
   * @throws java.io.FileNotFoundException if a gazetteer file cannot be opened
   * @throws java.io.IOException if reading a gazetteer file fails
   */
  public static void loadGazetteers(String dataPath) throws java.io.FileNotFoundException, java.io.IOException {

    log.info("Loading location gazetteer... ");
    LOC_GAZ = Generics.newHashMap();
    loadDictionary(LOC_GAZ, dataPath + File.separator + "world_small.gaz.nonambiguous");
    log.info("done.");

    log.info("Loading first-name gazetteer... ");
    FIRST_GAZ = Generics.newHashMap();
    loadDictionary(FIRST_GAZ, dataPath + File.separator + "per_first.gaz");
    log.info("done.");

    log.info("Loading last-name gazetteer... ");
    LAST_GAZ = Generics.newHashMap();
    loadDictionary(LAST_GAZ, dataPath + File.separator + "per_last.gaz");
    log.info("done.");

    log.info("Loading trigger-word gazetteer... ");
    TRIGGER_GAZ = Generics.newHashMap();
    loadDictionary(TRIGGER_GAZ, dataPath + File.separator + "triggers.gaz");
    log.info("done.");
  }

  /**
   * Loads one dictionary from disk.
   *
   * <p>Each non-empty line is tokenized; the lower-cased first token is the
   * key, and the value is the second token, or the string "true" when the
   * line has a single token.
   *
   * @param dict map that receives the entries
   * @param file path of the dictionary file
   * @throws java.io.FileNotFoundException if the file cannot be opened
   * @throws java.io.IOException if reading fails
   */
  private static void loadDictionary(Map<String, String> dict, String file) throws java.io.FileNotFoundException,
      java.io.IOException {

    // try-with-resources: the reader is closed even when readLine() throws
    try (BufferedReader in = new BufferedReader(new FileReader(file))) {
      String line;
      while ((line = in.readLine()) != null) {
        ArrayList<String> tokens = SimpleTokenize.tokenize(line);
        if (tokens.isEmpty()) {
          continue;
        }
        String lower = tokens.get(0).toLowerCase();
        if (tokens.size() == 1) {
          dict.put(lower, "true");
        } else {
          dict.put(lower, tokens.get(1));
        }
      }
    }
  }

  /**
   * Checks the location gazetteer. Throws NullPointerException if
   * {@link #loadGazetteers(String)} has not been called yet.
   */
  public static boolean isLocation(String lower) {
    return exists(LOC_GAZ, lower);
  }

  /**
   * Checks the first-name gazetteer. Throws NullPointerException if
   * {@link #loadGazetteers(String)} has not been called yet.
   */
  public static boolean isFirstName(String lower) {
    return exists(FIRST_GAZ, lower);
  }

  /**
   * Checks the last-name gazetteer. Throws NullPointerException if
   * {@link #loadGazetteers(String)} has not been called yet.
   */
  public static boolean isLastName(String lower) {
    return exists(LAST_GAZ, lower);
  }

  /**
   * Looks up the given string in the trigger-word gazetteer.
   * Throws NullPointerException if {@link #loadGazetteers(String)} has not
   * been called yet.
   *
   * @return the value stored for the word, or null if it is not listed
   */
  public static String isTriggerWord(String lower) {
    return TRIGGER_GAZ.get(lower);
  }

  /**
   * Verifies if the given string exists in the given dictionary
   *
   * @param dict dictionary to search; must not be null
   * @param elem key to look up
   * @return true iff the dictionary maps elem to a non-null value
   */
  public static boolean exists(Map<String, String> dict, String elem) {
    return dict.get(elem) != null;
  }

  /**
   * Loads all proximity classes from the hard disk. The WORDS map must be
   * created before!
   *
   * <p>For every non-empty line, the first token is the key of the class and
   * the first PROXIMITY_CLASS_SIZE tokens (the key included) are its members.
   * If the file cannot be opened, a warning is logged and nothing is loaded.
   *
   * @param proxFileName path of the proximity database
   * @throws java.io.IOException if reading the opened file fails
   */
  public static void loadProximityClasses(String proxFileName) throws java.io.IOException {
    log.info("Loading proximity classes...");

    BufferedReader opened;
    try {
      opened = new BufferedReader(new FileReader(proxFileName));
    } catch (java.io.IOException e) {
      // a missing database is tolerated: the classes simply stay empty
      log.info("Warning: no proximity database found.");
      return;
    }

    // try-with-resources: the reader is closed even when readLine() throws
    try (BufferedReader in = opened) {
      String line;
      while ((line = in.readLine()) != null) {
        ArrayList<String> tokens = SimpleTokenize.tokenize(line);
        if (tokens.size() > 0) {
          Integer key = WORDS.get(tokens.get(0));
          ArrayList<Integer> value = new ArrayList<>();

          for (int i = 0; i < tokens.size() && i < PROXIMITY_CLASS_SIZE; i++) {
            Integer word = WORDS.get(tokens.get(i));
            value.add(word);
          }

          PROX_CLASSES.put(key, value);
        }
      }
    }

    log.info("Finished loading proximity classes.");
  }

  public String getLiteral() {
    return mLiteral;
  }

  public int getWord() {
    return mWord;
  }

  public int getCase() {
    return mCase;
  }

  public int[] getSuffixes() {
    return mSuffixes;
  }

  public int getLemma() {
    return mLemma;
  }

  public int getPos() {
    return mPos;
  }

  public int getChunk() {
    return mChunk;
  }

  public int getNerc() {
    return mNerc;
  }

  /** Returns the byte offset span; null when no offsets were given to the constructor. */
  public Span getByteOffset() {
    return mByteOffset;
  }

  /** Throws NullPointerException when no offsets were given to the constructor. */
  public int getByteStart() {
    return mByteOffset.start();
  }

  /** Throws NullPointerException when no offsets were given to the constructor. */
  public int getByteEnd() {
    return mByteOffset.end();
  }

  public int getSentence() {
    return mSentence;
  }

  /** Returns the raw byte offset span; null when no offsets were given to the constructor. */
  public Span getRawByteOffset() {
    return mRawByteOffset;
  }

  /** Throws NullPointerException when no offsets were given to the constructor. */
  public int getRawByteStart() {
    return mRawByteOffset.start();
  }

  /** Throws NullPointerException when no offsets were given to the constructor. */
  public int getRawByteEnd() {
    return mRawByteOffset.end();
  }

  public void setMassiClass(String i) {
    mMassiClass = i;
  }

  public String getMassiClass() {
    return mMassiClass;
  }

  public void setMassiBbn(String i) {
    mMassiBbn = i;
  }

  public String getMassiBbn() {
    return mMassiBbn;
  }

  public void setMassiWnss(String i) {
    mMassiWnss = i;
  }

  public String getMassiWnss() {
    return mMassiWnss;
  }

  /**
   * Returns true iff the string contains at least one SGML-like tag
   * (the tag may appear anywhere in the string).
   */
  public static boolean isSgml(String s) {
    Matcher match = SGML_PATTERN.matcher(s);
    return match.find();
  }

  /**
   * Replaces every space character with an underscore.
   *
   * @return the converted string, or null if s is null
   */
  public static String removeSpaces(String s) {
    if (s == null) {
      return null;
    }
    // literal replacement; no regular expression is needed for a single char
    return s.replace(' ', '_');
  }

  public static final int CASE_OTHER = 0;
  public static final int CASE_ALLCAPS = 1;
  public static final int CASE_ALLCAPSORDOTS = 2;
  public static final int CASE_CAPINI = 3;
  public static final int CASE_INCAP = 4;
  public static final int CASE_ALLDIGITS = 5;
  public static final int CASE_ALLDIGITSORDOTS = 6;

  /**
   * Classifies the capitalization/digit shape of a word into one of the
   * CASE_* constants. The checks are applied in order and the first match
   * wins. Note that the empty string is reported as CASE_ALLCAPS, because
   * the all-caps check holds vacuously.
   *
   * @param word the word to classify; must not be null
   * @return one of the CASE_* constants
   */
  private static int detectCase(String word) {
    final int length = word.length();

    //
    // is the word all caps? (e.g. IBM)
    //
    boolean isAllCaps = true;
    for (int i = 0; i < length; i++) {
      if (!Character.isUpperCase(word.charAt(i))) {
        isAllCaps = false;
        break;
      }
    }
    if (isAllCaps) {
      return CASE_ALLCAPS;
    }

    // from here on the word is non-empty: an empty word returned above
    final boolean startsWithCap = Character.isUpperCase(word.charAt(0));

    //
    // is the word all caps or dots?(e.g. I.B.M.)
    //
    boolean isAllCapsOrDots = startsWithCap;
    if (startsWithCap) {
      for (int i = 0; i < length; i++) {
        char c = word.charAt(i);
        if (!Character.isUpperCase(c) && c != '.') {
          isAllCapsOrDots = false;
          break;
        }
      }
    }
    if (isAllCapsOrDots) {
      return CASE_ALLCAPSORDOTS;
    }

    //
    // does the word start with a cap?(e.g. Tuesday)
    //
    if (startsWithCap) {
      return CASE_CAPINI;
    }

    //
    // does the word contain a capitalized letter?
    //
    for (int i = 1; i < length; i++) {
      if (Character.isUpperCase(word.charAt(i))) {
        return CASE_INCAP;
      }
    }

    //
    // is the word all digits? (e.g. 123)
    //
    // The flag must start as true and be cleared by the first non-digit;
    // starting it as false made CASE_ALLDIGITS unreachable.
    boolean isAllDigits = true;
    for (int i = 0; i < length; i++) {
      if (!Character.isDigit(word.charAt(i))) {
        isAllDigits = false;
        break;
      }
    }
    if (isAllDigits) {
      return CASE_ALLDIGITS;
    }

    //
    // is the word all digits or . or ,? (e.g. 1.3)
    //
    boolean isAllDigitsOrDots = Character.isDigit(word.charAt(0));
    if (isAllDigitsOrDots) {
      for (int i = 0; i < length; i++) {
        char c = word.charAt(i);
        if (!Character.isDigit(c) && c != '.' && c != ',') {
          isAllDigitsOrDots = false;
          break;
        }
      }
    }
    if (isAllDigitsOrDots) {
      return CASE_ALLDIGITSORDOTS;
    }

    return CASE_OTHER;
  }

  /**
   * Extracts the lower-cased suffixes of length 2, 3 and 4 of the word (as
   * far as the word is long enough) and returns their indices in WORDS.
   *
   * @param word the word to process; must not be null
   * @return the suffix indices, shortest suffix first
   */
  private static int[] extractSuffixes(String word) {
    String lower = word.toLowerCase();
    ArrayList<Integer> suffixes = new ArrayList<>();
    for (int i = 2; i <= 4; i++) {
      if (lower.length() < i) {
        break;
      }
      try {
        String suf = lower.substring(lower.length() - i);
        suffixes.add(WORDS.get(suf));
      } catch (java.lang.RuntimeException ignored) {
        // unknown suffix: deliberately skipped (best effort)
      }
    }

    int[] sufs = new int[suffixes.size()];
    for (int i = 0; i < sufs.length; i++) {
      sufs[i] = suffixes.get(i);
    }

    return sufs;
  }

  /**
   * Constructs an AceToken from a tokenized line generated by Tokey
   *
   * <p>Any of the string arguments may be null; the corresponding feature is
   * then set to -1 (or null for the suffixes). The byte offsets are only set
   * when both start and end are non-null.
   *
   * @param word the token text
   * @param lemma the lemma of the token
   * @param pos the POS tag
   * @param chunk the chunk label
   * @param nerc the NE label
   * @param start start byte offset, as a decimal string
   * @param end end byte offset, as a decimal string
   * @param sentence index of the sentence containing this token
   * @throws NumberFormatException if start or end is not a valid integer
   */
  public AceToken(String word, String lemma, String pos, String chunk, String nerc, String start, String end,
      int sentence) {
    mLiteral = word;
    if (word == null) {
      mWord = -1;
      mCase = -1;
      mSuffixes = null;
    } else {
      mWord = WORDS.get(removeSpaces(word), false);
      mCase = detectCase(word);
      mSuffixes = extractSuffixes(word);
    }

    if (lemma == null) {
      mLemma = -1;
    } else {
      mLemma = LEMMAS.get(removeSpaces(lemma), false);
    }

    if (pos == null) {
      mPos = -1;
    } else {
      mPos = OTHERS.get(pos, false);
    }

    if (chunk == null) {
      mChunk = -1;
    } else {
      mChunk = OTHERS.get(chunk, false);
    }

    if (nerc == null) {
      mNerc = -1;
    } else {
      mNerc = OTHERS.get(nerc, false);
    }

    if (start != null && end != null) {
      int startOffset = Integer.parseInt(start);
      int endOffset = Integer.parseInt(end);
      // two distinct Span objects: mByteOffset is modified later by
      // adjustPhrasePositions, while mRawByteOffset keeps the original values
      mByteOffset = new Span(startOffset, endOffset);
      mRawByteOffset = new Span(startOffset, endOffset);
    }
    mSentence = sentence;

    mMassiClass = "";
    mMassiBbn = "";
    mMassiWnss = "";
  }

  /**
   * Recomputes start/end phrase positions by removing SGML tag strings This is
   * required because ACE annotations skip over SGML tags when computing
   * positions in stream, hence annotations do not match with our preprocessing
   * positions, which count everything
   *
   * <p>If the word is an SGML tag, its byte span is added to the running
   * offset and the token's offsets are set to -1; otherwise the running
   * offset is subtracted from the token's offsets. Throws
   * NullPointerException when the token has no byte offsets.
   *
   * @param offsetToSubtract number of SGML bytes seen so far
   * @param word the text of this token
   * @return the updated running offset
   */
  public int adjustPhrasePositions(int offsetToSubtract, String word) {
    if (isSgml(word)) {
      // offsetToSubtract += word.length();
      // the token length may be different than (end - start)!
      // i.e. QUOTE_PREVIOUSPOST is cleaned in Tokey!
      offsetToSubtract += mByteOffset.end() - mByteOffset.start();
      mByteOffset.setStart(-1);
      mByteOffset.setEnd(-1);
    } else {
      mByteOffset.setStart(mByteOffset.start() - offsetToSubtract);
      mByteOffset.setEnd(mByteOffset.end() - offsetToSubtract);
    }

    return offsetToSubtract;
  }

  /** Pretty display */
  public String display() {
    if (mByteOffset != null) {
      return "['" + WORDS.get(mWord) + "', " + OTHERS.get(mPos) + ", " + mByteOffset.start() + ", "
          + mByteOffset.end() + "]";
    }

    return "['" + WORDS.get(mWord) + "', " + OTHERS.get(mPos) + "]";
  }

  /** Same as {@link #display()}. */
  @Override
  public String toString() {
    return display();
  }
}