/**
* Copyright (c) 2010, Regents of the University of Colorado All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided
* with the distribution. Neither the name of the University of Colorado at
* Boulder nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package clear.morph;
import clear.pos.PosEnLib;
import clear.util.tuple.JObjectObjectTuple;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
/**
 * English morphological analyzer.
 *
 * <p>The analyzer is loaded from a zip/jar archive containing exception
 * lists, base-form lists, detachment rules and abbreviation rules (see the
 * file-name constants below). Any dictionary file that is absent from the
 * archive is treated as empty.
 *
 * @author Jinho D. Choi <b>Last update:</b> 11/4/2010
 */
public class MorphEnAnalyzer {
    /** Delimiter joining an abbreviation and its pos-tag into a lookup key. */
    public static final String FIELD_DELIM = "_";
    /** Archive entry holding noun exceptions. */
    public static final String NOUN_EXC = "noun.exc";
    /** Archive entry holding verb exceptions. */
    public static final String VERB_EXC = "verb.exc";
    /** Archive entry holding adjective exceptions. */
    public static final String ADJ_EXC = "adj.exc";
    /** Archive entry holding adverb exceptions. */
    public static final String ADV_EXC = "adv.exc";
    /** Archive entry holding noun base-forms. */
    public static final String NOUN_BASE = "noun.txt";
    /** Archive entry holding verb base-forms. */
    public static final String VERB_BASE = "verb.txt";
    /** Archive entry holding adjective base-forms. */
    public static final String ADJ_BASE = "adj.txt";
    /** Archive entry holding ordinal forms. */
    public static final String ORD_BASE = "ordinal.txt";
    /** Archive entry holding noun detachment rules. */
    public static final String NOUN_RULE = "noun.rule";
    /** Archive entry holding verb detachment rules. */
    public static final String VERB_RULE = "verb.rule";
    /** Archive entry holding adjective detachment rules. */
    public static final String ADJ_RULE = "adj.rule";
    /** Archive entry holding abbreviation replacement rules. */
    public static final String ABBR_RULE = "abbr.rule";

    /** Value returned by {@link #getNumber} for ordinal forms. */
    private static final String ORDINAL_TAG = "$#ORD#$";

    /**
     * Patterns collapsed to "0" by {@link #getNormalizedNumber}, applied in
     * this order until the string stops changing. Compiled once because
     * normalization runs for every looked-up token.
     */
    private static final Pattern[] NUMBER_AFFIX_PATTERNS = {
        Pattern.compile("\\d%"),
        Pattern.compile("\\$\\d"),
        Pattern.compile("\\.\\d"),
        Pattern.compile(",\\d"),
        Pattern.compile(":\\d"),
        // Disabled in the original implementation:
        // Pattern.compile("-\\d"),
        // Pattern.compile("\\\\/\\d"),
    };

    /** One or more consecutive digits. */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    // All dictionaries start out empty so that an archive lacking some entry
    // yields "no match" during lookup instead of a NullPointerException.

    /**
     * Noun exceptions
     */
    HashMap<String, String> m_noun_exc = new HashMap<>();
    /**
     * Verb exceptions
     */
    HashMap<String, String> m_verb_exc = new HashMap<>();
    /**
     * Adjective exceptions
     */
    HashMap<String, String> m_adj_exc = new HashMap<>();
    /**
     * Adverb exceptions
     */
    HashMap<String, String> m_adv_exc = new HashMap<>();
    /**
     * Noun base-forms
     */
    HashSet<String> s_noun_base = new HashSet<>();
    /**
     * Verb base-forms
     */
    HashSet<String> s_verb_base = new HashSet<>();
    /**
     * Adjective base-forms
     */
    HashSet<String> s_adj_base = new HashSet<>();
    /**
     * Ordinal forms
     */
    HashSet<String> s_ord_base = new HashSet<>();
    /**
     * Noun detachment rules
     */
    ArrayList<JObjectObjectTuple<String, String>> a_noun_rule = new ArrayList<>();
    /**
     * Verb detachment rules
     */
    ArrayList<JObjectObjectTuple<String, String>> a_verb_rule = new ArrayList<>();
    /**
     * Adjective detachment rules
     */
    ArrayList<JObjectObjectTuple<String, String>> a_adj_rule = new ArrayList<>();
    /**
     * Abbreviation replacement rules
     */
    HashMap<String, String> m_abbr_rule = new HashMap<>();

    /**
     * Opens the archive at the given path and calls {@link #init(InputStream)}.
     *
     * @param zipFile path to the dictionary archive, e.g. "en_dict.jar"
     * @throws RuntimeException if the archive cannot be opened or read; the
     *         underlying {@link IOException} is preserved as the cause
     */
    public MorphEnAnalyzer(String zipFile) {
        try (InputStream inputStream = new FileInputStream(zipFile)) {
            this.init(inputStream);
        } catch (IOException e) {
            throw new RuntimeException("Failed to load dictionary archive: " + zipFile, e);
        }
    }

    /**
     * Opens the archive at the given URL and calls {@link #init(InputStream)}.
     *
     * @param zipFileURL location of the dictionary archive
     * @throws IOException if the archive cannot be opened or read
     */
    public MorphEnAnalyzer(URL zipFileURL) throws IOException {
        try (InputStream inputStream = zipFileURL.openStream()) {
            this.init(inputStream);
        }
    }

    /**
     * Calls {@link #init(InputStream)} on the given stream. The stream is
     * closed by {@code init}.
     *
     * @param inputStream stream over the dictionary archive
     * @throws IOException if the archive cannot be read
     */
    public MorphEnAnalyzer(InputStream inputStream) throws IOException {
        this.init(inputStream);
    }

    /**
     * Initializes a morphological analyzer by reading every recognised entry
     * of the archive into the corresponding dictionary field. Entries with
     * other names are ignored. The stream is closed before this method
     * returns, whether or not reading succeeded.
     *
     * @param inputStream stream over the dictionary archive ("en_dict.jar")
     * @throws IOException if the archive cannot be read
     */
    public void init(InputStream inputStream) throws IOException {
        try (ZipInputStream zin = new ZipInputStream(inputStream)) {
            ZipEntry zEntry;
            while ((zEntry = zin.getNextEntry()) != null) {
                switch (zEntry.getName()) {
                    case NOUN_EXC:
                        m_noun_exc = getExceptionMap(zin);
                        break;
                    case VERB_EXC:
                        m_verb_exc = getExceptionMap(zin);
                        break;
                    case ADJ_EXC:
                        m_adj_exc = getExceptionMap(zin);
                        break;
                    case ADV_EXC:
                        m_adv_exc = getExceptionMap(zin);
                        break;
                    case NOUN_BASE:
                        s_noun_base = getBaseSet(zin);
                        break;
                    case VERB_BASE:
                        s_verb_base = getBaseSet(zin);
                        break;
                    case ADJ_BASE:
                        s_adj_base = getBaseSet(zin);
                        break;
                    case ORD_BASE:
                        s_ord_base = getBaseSet(zin);
                        break;
                    case NOUN_RULE:
                        a_noun_rule = getRuleList(zin);
                        break;
                    case VERB_RULE:
                        a_verb_rule = getRuleList(zin);
                        break;
                    case ADJ_RULE:
                        a_adj_rule = getRuleList(zin);
                        break;
                    case ABBR_RULE:
                        m_abbr_rule = getAbbreviationMap(zin);
                        break;
                    default:
                        // Not a dictionary file this analyzer knows about.
                        break;
                }
            }
        }
    }

    /**
     * Wraps the current zip entry in a line reader.
     *
     * <p>The returned reader must NOT be closed by the caller: closing it
     * would close the underlying zip stream and prevent the remaining entries
     * from being read.
     *
     * <p>NOTE(review): text is decoded with the platform default charset, as
     * before; confirm the encoding of the dictionary files and pin it here.
     */
    private BufferedReader newEntryReader(ZipInputStream zin) {
        return new BufferedReader(new InputStreamReader(zin));
    }

    /**
     * Reads an exception file. On each line the first token is an exception
     * and the second is its base-form; any further tokens on the line are
     * also mapped to that base-form. Lines with fewer than two tokens are
     * skipped.
     *
     * @return HashMap taking exceptions as keys and their base-forms as values.
     */
    private HashMap<String, String> getExceptionMap(ZipInputStream zin) throws IOException {
        HashMap<String, String> map = new HashMap<>();
        BufferedReader fin = newEntryReader(zin);
        String line;
        while ((line = fin.readLine()) != null) {
            StringTokenizer tok = new StringTokenizer(line);
            if (tok.countTokens() < 2) {
                continue;
            }
            String exc = tok.nextToken();
            String base = tok.nextToken();
            map.put(exc, base);
            while (tok.hasMoreTokens()) {
                map.put(tok.nextToken(), base);
            }
        }
        return map;
    }

    /**
     * Reads a base-form file, one trimmed entry per line. Blank lines are
     * skipped so that the empty string is never accepted as a base-form.
     *
     * @return HashSet containing base-forms.
     */
    private HashSet<String> getBaseSet(ZipInputStream zin) throws IOException {
        HashSet<String> set = new HashSet<>();
        BufferedReader fin = newEntryReader(zin);
        String line;
        while ((line = fin.readLine()) != null) {
            String entry = line.trim();
            if (!entry.isEmpty()) {
                set.add(entry);
            }
        }
        return set;
    }

    /**
     * Reads a detachment-rule file. On each line the first token is the
     * suffix to detach and the optional second token is its replacement
     * (empty when absent). Blank lines are skipped.
     *
     * @return List containing rules, in file order.
     */
    private ArrayList<JObjectObjectTuple<String, String>> getRuleList(ZipInputStream zin) throws IOException {
        ArrayList<JObjectObjectTuple<String, String>> list = new ArrayList<>();
        BufferedReader fin = newEntryReader(zin);
        String line;
        while ((line = fin.readLine()) != null) {
            StringTokenizer tok = new StringTokenizer(line);
            if (!tok.hasMoreTokens()) {
                continue;
            }
            String suffix = tok.nextToken();
            String replacement = tok.hasMoreTokens() ? tok.nextToken() : "";
            list.add(new JObjectObjectTuple<>(suffix, replacement));
        }
        return list;
    }

    /**
     * Reads an abbreviation-rule file. Each line holds three tokens:
     * abbreviation, pos-tag and base-form. Lines with fewer than three tokens
     * are skipped.
     *
     * @return HashMap taking (abbreviation and pos-tag, joined by
     * {@link #FIELD_DELIM}) as the key and its base-form as the value.
     */
    private HashMap<String, String> getAbbreviationMap(ZipInputStream zin) throws IOException {
        HashMap<String, String> map = new HashMap<>();
        BufferedReader fin = newEntryReader(zin);
        String line;
        while ((line = fin.readLine()) != null) {
            StringTokenizer tok = new StringTokenizer(line);
            if (tok.countTokens() < 3) {
                continue;
            }
            String abbr = tok.nextToken();
            String pos = tok.nextToken();
            String base = tok.nextToken();
            map.put(abbr + FIELD_DELIM + pos, base);
        }
        return map;
    }

    /**
     * Returns the lemma of the form using the pos-tag.
     *
     * <p>The form is lower-cased, then looked up in turn as an exception, a
     * rule-derived base-form, an abbreviation and a number. The first match
     * wins; if nothing matches, the lower-cased form itself is returned.
     *
     * @param form word-form
     * @param pos pos-tag
     */
    public String getLemma(String form, String pos) {
        // Fixed locale: the default locale could lower-case "I" to a dotless
        // i (e.g. Turkish) and make dictionary lookups miss.
        form = form.toLowerCase(Locale.ENGLISH);
        // exceptions
        String morphem = getException(form, pos);
        if (morphem != null) {
            return morphem;
        }
        // base-forms
        morphem = getBase(form, pos);
        if (morphem != null) {
            return morphem;
        }
        // abbreviations
        morphem = getAbbreviation(form, pos);
        if (morphem != null) {
            return morphem;
        }
        // numbers
        morphem = getNumber(form, pos);
        if (morphem != null) {
            return morphem;
        }
        return form;
    }

    /**
     * Returns the base form of the form considered to be an exception. If the
     * form is not an exception, returns null.
     *
     * @param form word-form
     * @param pos pos-tag
     */
    private String getException(String form, String pos) {
        if (PosEnLib.isNoun(pos)) {
            return m_noun_exc.get(form);
        } else if (PosEnLib.isVerb(pos)) {
            return m_verb_exc.get(form);
        } else if (PosEnLib.isAdjective(pos)) {
            return m_adj_exc.get(form);
        } else if (PosEnLib.isAdverb(pos)) {
            return m_adv_exc.get(form);
        }
        return null;
    }

    /**
     * Returns the base-form of the form. If there is no base-form, returns
     * null. Only noun, verb and adjective pos-tags are handled.
     *
     * @param form word-form
     * @param pos pos-tag
     */
    private String getBase(String form, String pos) {
        if (PosEnLib.isNoun(pos)) {
            return getBaseAux(form, s_noun_base, a_noun_rule);
        }
        if (PosEnLib.isVerb(pos)) {
            return getBaseAux(form, s_verb_base, a_verb_rule);
        }
        if (PosEnLib.isAdjective(pos)) {
            return getBaseAux(form, s_adj_base, a_adj_rule);
        }
        return null;
    }

    /**
     * Returns the base-form of the form. If there is no base-form, returns
     * null.
     *
     * <p>Rules are tried in list order: when the form ends with a rule's
     * suffix, the suffix is replaced and the candidate is accepted only if it
     * is a known base-form.
     *
     * @param form word-form
     * @param set set containing base-forms
     * @param rule list containing detachment rules
     */
    private String getBaseAux(String form, HashSet<String> set, ArrayList<JObjectObjectTuple<String, String>> rule) {
        for (JObjectObjectTuple<String, String> tup : rule) {
            if (form.endsWith(tup.o1)) {
                int offset = form.length() - tup.o1.length();
                String base = form.substring(0, offset) + tup.o2;
                if (set.contains(base)) {
                    return base;
                }
            }
        }
        return null;
    }

    /**
     * Returns the base form of the form considered to be an abbreviation. If
     * the form is not an abbreviation, returns null.
     *
     * @param form word-form
     * @param pos pos-tag
     */
    private String getAbbreviation(String form, String pos) {
        String key = form + FIELD_DELIM + pos;
        return m_abbr_rule.get(key);
    }

    /**
     * Returns a simplified form of numbers.
     *
     * <p>Returns "$#ORD#$" when the form is a listed ordinal or normalizes to
     * "0st", "0nd", "0rd" or "0th"; otherwise returns the normalized form, or
     * null when normalization leaves the form unchanged.
     *
     * @param form word-form
     * @param pos pos-tag (not used by the current implementation)
     */
    public String getNumber(String form, String pos) {
        if (s_ord_base.contains(form)) {
            return ORDINAL_TAG;
        }
        String currStr = getNormalizedNumber(form);
        if (currStr.equals("0st") || currStr.equals("0nd") || currStr.equals("0rd") || currStr.equals("0th")) {
            return ORDINAL_TAG;
        }
        return (currStr.equals(form)) ? null : currStr;
    }

    /**
     * Collapses numeric expressions in the form to "0".
     *
     * <p>A digit followed by '%', or a digit preceded by '$', '.', ',' or
     * ':', is repeatedly replaced by "0" until the string no longer changes;
     * finally every run of digits is replaced by a single "0".
     *
     * @param form word-form
     * @return the normalized form (equal to {@code form} if it has no digits)
     */
    static public String getNormalizedNumber(String form) {
        String prevStr = "";
        String currStr = form;
        while (!prevStr.equals(currStr)) {
            prevStr = currStr;
            for (Pattern pattern : NUMBER_AFFIX_PATTERNS) {
                currStr = pattern.matcher(currStr).replaceAll("0");
            }
        }
        return DIGITS.matcher(currStr).replaceAll("0");
    }

    /**
     * Expands a contracted verb form to its full form given the pos-tag.
     * Returns the form unchanged when the (form, pos-tag) pair is not a known
     * contraction.
     *
     * @param form word-form, e.g. "'ll"
     * @param pos pos-tag, e.g. "MD"
     */
    static public String getAbbrVerb(String form, String pos) {
        switch (form) {
            case "'d":
                if ("MD".equals(pos)) {
                    return "would";
                }
                if ("VBD".equals(pos)) {
                    return "have";
                }
                break;
            case "'ll":
                if ("MD".equals(pos)) {
                    return "will";
                }
                break;
            case "'m":
            case "'re":
                if ("VBP".equals(pos)) {
                    return "be";
                }
                break;
            case "'ve":
                if ("VB".equals(pos) || "VBP".equals(pos)) {
                    return "have";
                }
                break;
            default:
                break;
        }
        return form;
    }
}