/**
* Copyright (c) 2009, Regents of the University of Colorado All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided
* with the distribution. Neither the name of the University of Colorado at
* Boulder nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package clear.parse;
import clear.decode.AbstractMultiDecoder;
import clear.dep.DepFeat;
import clear.dep.DepLib;
import clear.dep.DepNode;
import clear.dep.DepTree;
import clear.ftr.map.DepFtrMap;
import clear.ftr.xml.DepFtrXml;
import clear.ftr.xml.FtrToken;
import clear.morph.MorphKr;
import clear.reader.DepReader;
import clear.util.IOUtil;
import clear.util.tuple.JObjectObjectTuple;
import com.carrotsearch.hppc.IntArrayList;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.regex.Matcher;
/**
* Abstract dependency parser.
*
* @author Jinho D. Choi <b>Last update:</b> 4/12/2011
*/
abstract public class AbstractDepParser extends AbstractParser {
/**
 * Name of the shift-eager algorithm.
 */
public static final String ALG_SHIFT_EAGER = "shift-eager";
/**
 * Name of the shift-pop algorithm.
 */
public static final String ALG_SHIFT_POP = "shift-pop";
/**
 * Feature templates.
 */
protected DepFtrXml t_xml;
/**
 * Feature mappings.
 */
protected DepFtrMap t_map;
/**
 * Machine-learning decoder; only assigned by the constructor that
 * receives a decoder.
 */
protected AbstractMultiDecoder c_dec;
/**
 * Stream the transitions are printed to; only assigned when the parser is
 * constructed with {@link AbstractDepParser#FLAG_PRINT_TRANSITION}.
 */
protected PrintStream f_out;
/**
 * Dependency tree currently being processed.
 */
protected DepTree d_tree;
/**
 * Index of lambda_1.
 */
protected int i_lambda;
/**
 * Index of beta.
 */
protected int i_beta;
/**
 * Previous transitions; read from the end when a feature token asks for
 * an earlier transition. Not initialized in this class.
 */
protected ArrayList<String> prev_trans;
/**
 * Index handed to <code>saveInstance</code> together with the label, the
 * features and the feature map.
 * NOTE(review): presumably selects the training array to write to - confirm.
 */
public int i_trainIndex = 0;
// =============================== Constructors ===============================
/**
* {@link AbstractDepParser#FLAG_PRINT_TRANSITION} or {@link AbstractDepParser#FLAG_TRAIN_LEXICON}.
*/
public AbstractDepParser(byte flag, String filename) {
i_flag = flag;
if (flag == FLAG_PRINT_TRANSITION) {
f_out = IOUtil.createPrintFileStream(filename);
} else if (flag == FLAG_TRAIN_LEXICON) {
t_xml = new DepFtrXml(filename);
t_map = new DepFtrMap(t_xml);
}
}
/**
* {@link AbstractDepParser#FLAG_TRAIN_INSTANCE}.
*/
public AbstractDepParser(byte flag, DepFtrXml xml, String lexiconFile) {
i_flag = flag;
t_xml = xml;
t_map = new DepFtrMap(lexiconFile);
initTrainArrays(1);
}
/**
* {@link AbstractDepParser#FLAG_PREDICT} or {@link AbstractDepParser#FLAG_TRAIN_BOOST}.
*/
public AbstractDepParser(byte flag, DepFtrXml xml, DepFtrMap map, AbstractMultiDecoder decoder) {
i_flag = flag;
t_xml = xml;
t_map = map;
c_dec = decoder;
if (flag == FLAG_TRAIN_BOOST) {
initTrainArrays(1);
}
}
// =============================== External methods ===============================
/**
 * @return the feature templates held by this parser
 */
public DepFtrXml getDepFtrXml() {
    return this.t_xml;
}
/**
 * @return the feature mappings held by this parser
 */
public DepFtrMap getDepFtrMap() {
    return this.t_map;
}
/**
 * Writes the contents of {@link AbstractDepParser#t_map} to
 * <code>lexiconFile</code>, passing the feature templates along.
 *
 * @param lexiconFile file to write to
 */
public void saveTags(String lexiconFile) {
    final DepFtrMap map = t_map;
    map.save(t_xml, lexiconFile);
}
/**
 * Closes the transition output stream, if one was opened.
 * <p>
 * {@link AbstractDepParser#f_out} is only assigned when the parser is built
 * with {@link AbstractDepParser#FLAG_PRINT_TRANSITION}; in every other mode
 * this method is a no-op instead of failing with a
 * <code>NullPointerException</code>.
 */
public void closeOutputStream() {
    if (f_out != null) {
        f_out.close();
    }
}
// =============================== Pre-processing ===============================
/**
 * Runs the language-specific pre-processing selected by
 * <code>s_language</code>. Languages other than English, Czech and Korean
 * are left untouched.
 *
 * @param tree tree to pre-process in place
 */
protected void preProcess(DepTree tree) {
    switch (s_language) {
        case DepReader.LANG_EN:
            preProcessEn(tree);
            return;
        case DepReader.LANG_CZ:
            preProcessCz(tree);
            return;
        case DepReader.LANG_KR:
            preProcessKr(tree);
            return;
        default:
            return;
    }
}
/**
 * English pre-processing: for every node whose tag matches <code>IN</code>,
 * scans to the right for the first node tagged <code>NN.*|CD</code> that is
 * not immediately followed by a node tagged <code>NN.*|CD|POS</code>, and
 * records it as the <code>rightMostDep</code> of the <code>IN</code> node.
 * Scanning then resumes after the recorded node.
 *
 * @param tree tree to annotate in place
 */
protected void preProcessEn(DepTree tree) {
    final int size = tree.size();
    int headId = 1;

    while (headId < size) {
        final DepNode head = tree.get(headId);

        if (head.isPosx("IN")) {
            for (int candId = headId + 1; candId < size; candId++) {
                if (!tree.get(candId).isPosx("NN.*|CD")) {
                    continue;
                }

                // the candidate must be the last token of its nominal sequence
                final boolean continues = candId + 1 < size && tree.get(candId + 1).isPosx("NN.*|CD|POS");
                if (continues) {
                    continue;
                }

                head.rightMostDep = tree.get(candId);
                headId = candId; // skip the tokens covered by this head
                break;
            }
        }

        headId++;
    }
}
/**
 * Czech pre-processing: morphological rewriting first, then coordination
 * detection.
 *
 * @param tree tree to pre-process in place
 */
protected void preProcessCz(DepTree tree) {
    this.preProcessCzMorph(tree);
    this.preProcessCzCoord(tree);
}
/**
 * Rewrites part-of-speech tags and lemmas of Czech nodes from their
 * morphological features:
 * <ul>
 * <li><code>CZ_FEAT[2]</code> (degree of comparison), when present, is appended to the tag;</li>
 * <li><code>CZ_FEAT[8]</code> (name), when present, replaces the lemma with <code>$SEM=value$</code>;</li>
 * <li>otherwise <code>CZ_FEAT[9]</code> equal to <code>"n"</code> (number) replaces the lemma with <code>$CRD$</code>.</li>
 * </ul>
 *
 * @param tree tree to rewrite in place
 */
protected void preProcessCzMorph(DepTree tree) {
    for (int id = 1; id < tree.size(); id++) {
        final DepNode node = tree.get(id);

        // degree of comparison
        final String degree = node.getFeat(DepLib.CZ_FEAT[2]);
        if (degree != null) {
            node.pos += degree;
        }

        // name
        final String name = node.getFeat(DepLib.CZ_FEAT[8]);
        if (name != null) {
            node.lemma = "$SEM=" + name + "$";
            continue;
        }

        // number
        final String number = node.getFeat(DepLib.CZ_FEAT[9]);
        if (number != null && number.equals("n")) {
            node.lemma = "$CRD$";
        }
    }
}
/**
 * Czech pre-processing: guesses the two conjuncts of each coordination
 * candidate.
 * <p>
 * A node is a candidate when its <code>CZ_FEAT[9]</code> feature equals
 * <code>"^"</code> or its lemma is one of <code>, : &amp; +</code>.
 * NOTE(review): <code>"^"</code> presumably marks coordinating conjunctions
 * in the tagset - confirm. For each candidate, every (previous, next) pair
 * within 10 tokens on either side that shares the same
 * <code>CZ_FEAT[9]</code> value is scored by the fraction of
 * <code>next</code>'s features that <code>previous</code> carries with an
 * equal value. The best-scoring pair (score &gt; 0) is stored in the
 * candidate's <code>leftMostDep</code>/<code>rightMostDep</code>, and both
 * nodes get the candidate as <code>coordHead</code>.
 * <p>
 * Nodes without a <code>CZ_FEAT[9]</code> feature are tolerated: such a node
 * is not a candidate through that feature and is never used as the right
 * conjunct.
 *
 * @param tree tree to annotate in place
 */
protected void preProcessCzCoord(DepTree tree) {
    final String SubPOS = DepLib.CZ_FEAT[9];
    final int size = tree.size();
    final int gap = 10;
    JObjectObjectTuple<DepNode, DepNode> bestPair = new JObjectObjectTuple<>(null, null);

    for (int coordId = 1; coordId < size; coordId++) {
        DepNode coord = tree.get(coordId);

        // constant-first equals: getFeat() returns null when the feature is absent
        boolean isConjunction = "^".equals(coord.getFeat(SubPOS));
        if (!isConjunction && !coord.lemma.matches(",|:|&|\\+")) {
            continue;
        }

        double bestScore = 0;

        for (int nextId = coordId + 1; nextId <= coordId + gap && nextId < size; nextId++) {
            DepNode next = tree.get(nextId);
            DepFeat nextFeats = next.feats;
            String nextPos = nextFeats.get(SubPOS);

            if (nextPos == null) {
                // nothing to compare against; previously this dereferenced null
                continue;
            }

            // denominator counts every feature of next, although SubPOS itself is skipped below
            int total = nextFeats.size();

            for (int prevId = coordId - 1; prevId >= coordId - gap && prevId > 0; prevId--) {
                DepNode prev = tree.get(prevId);
                DepFeat prevFeats = prev.feats;

                if (!nextPos.equals(prevFeats.get(SubPOS))) {
                    continue;
                }

                int count = 0;
                for (String nextKey : nextFeats.keySet()) {
                    if (nextKey.equals(SubPOS)) {
                        continue;
                    }
                    String prevValue = prevFeats.get(nextKey);
                    if (prevValue != null && prevValue.equals(nextFeats.get(nextKey))) {
                        count++;
                    }
                }

                double score = (double) count / total;
                if (score > bestScore) {
                    bestScore = score;
                    bestPair.set(prev, next);
                }
                if (score >= 0.8) {
                    // close enough: stop looking further left for this right conjunct
                    break;
                }
            }
        }

        // bestScore > 0 guarantees bestPair was set for this candidate
        if (bestScore > 0) {
            coord.leftMostDep = bestPair.o1;
            coord.rightMostDep = bestPair.o2;
            bestPair.o1.coordHead = coord;
            bestPair.o2.coordHead = coord;
        }
    }
}
/**
 * Korean pre-processing: attaches an empty {@link MorphKr} to the root node
 * and a {@link MorphKr} built from the lemma to every other node.
 *
 * @param tree tree to annotate in place
 */
protected void preProcessKr(DepTree tree) {
    final int size = tree.size();

    final MorphKr rootMorph = new MorphKr();
    tree.get(DepLib.ROOT_ID).morphKr = rootMorph;

    for (int id = 1; id < size; id++) {
        final DepNode current = tree.get(id);
        current.morphKr = new MorphKr(current.lemma);
    }
}
// =============================== Instance ===============================
/**
 * Handles one training decision according to the parser mode: in lexicon
 * mode the lexica and the label are collected, in instance mode the label
 * is saved together with the current feature array. Other modes do nothing.
 *
 * @param label label of the current decision
 */
protected void trainInstance(String label) {
    if (i_flag == FLAG_TRAIN_LEXICON) {
        addLexica();
        t_map.addLabel(label);
        return;
    }

    if (i_flag == FLAG_TRAIN_INSTANCE) {
        final IntArrayList features = getFeatureArray();
        saveInstance(label, features);
    }
}
/**
 * Saves one training instance using this parser's feature map and
 * {@link AbstractDepParser#i_trainIndex}.
 *
 * @param label label of the instance
 * @param ftr   feature indices of the instance
 */
protected void saveInstance(String label, IntArrayList ftr) {
    final int trainIndex = i_trainIndex;
    saveInstance(label, ftr, t_map, trainIndex);
}
/**
 * Prints the current transition as one tab-separated line:
 * operation, lambda_1, lambda_2, beta and the arc.
 * <p>
 * Each list is rendered as <code>[]</code> when empty, <code>[i]</code>
 * for a single node and <code>[first:last]</code> otherwise. Beta follows
 * the same convention as the two lambda lists; it previously printed an
 * upper bound even when it was empty or held a single node.
 *
 * @param trans transition
 * @param arc {@code lambda_1[0] <- deprel -> beta[0]}
 */
protected void printTransition(String trans, String arc) {
    final int size = d_tree.size();
    final int lambda2Count = getLambda2Count();
    StringBuilder build = new StringBuilder();

    // operation
    build.append(trans).append('\t');

    // lambda_1: nodes 0..i_lambda
    build.append('[');
    if (i_lambda >= 0) {
        build.append(0);
    }
    if (i_lambda >= 1) {
        build.append(':').append(i_lambda);
    }
    build.append("]\t");

    // lambda_2: nodes i_lambda+1..i_beta-1
    build.append('[');
    if (lambda2Count > 0) {
        build.append(i_lambda + 1);
    }
    if (lambda2Count > 1) {
        build.append(':').append(i_beta - 1);
    }
    build.append("]\t");

    // beta: nodes i_beta..size-1
    build.append('[');
    if (i_beta < size) {
        build.append(i_beta);
    }
    if (i_beta < size - 1) {
        // an upper bound is only meaningful when beta holds more than one node
        build.append(':').append(size - 1);
    }
    build.append("]\t");

    // transition
    build.append(arc);
    f_out.println(build.toString());
}
/**
 * @return number of nodes in lambda_2 (list #2), i.e. the nodes strictly
 * between lambda_1's index and beta's index
 */
protected int getLambda2Count() {
    return i_beta - i_lambda - 1;
}
// =============================== Lexica ===============================
/**
 * Collects lexica for the current parsing state: n-gram lexica first, then
 * the language-specific ones.
 */
protected void addLexica() {
    this.addNgramLexica(t_xml, t_map);
    this.addLanguageSpecificLexica();
}
/**
 * Collects the punctuation lexica of the current language. Only English and
 * Czech contribute; other languages add nothing.
 */
protected void addLanguageSpecificLexica() {
    switch (s_language) {
        case DepReader.LANG_EN:
            addEnPunctuationLexica();
            return;
        case DepReader.LANG_CZ:
            addCzPunctuationLexica();
            return;
        default:
            return;
    }
}
/**
 * English: registers the form of the current beta node as extra lexicon
 * entry of type 0 when its dependency label is
 * <code>DepLib.DEPREL_P</code>.
 */
protected void addEnPunctuationLexica() {
    final DepNode beta = d_tree.get(i_beta);

    if (!beta.isDeprel(DepLib.DEPREL_P)) {
        return;
    }

    t_map.addExtra(0, beta.form);
}
/**
 * Czech: registers the form of the current beta node as extra lexicon entry
 * of type 0 when its part-of-speech tag is <code>Z</code>.
 */
protected void addCzPunctuationLexica() {
    final DepNode beta = d_tree.get(i_beta);

    if (!beta.isPos("Z")) {
        return;
    }

    t_map.addExtra(0, beta.form);
}
// =============================== Feature ===============================
/**
 * Builds the feature indices of the current parsing state: n-gram features
 * followed by language-specific features. The running feature offset starts
 * at 1 and is shared between both steps through a one-element array.
 *
 * @return feature indices of the current state
 */
protected IntArrayList getFeatureArray() {
    final IntArrayList features = new IntArrayList();
    final int[] offset = new int[]{1};

    addNgramFeatures(features, offset, t_xml, t_map);
    addLanguageSpecificFeatures(features, offset);

    return features;
}
/**
 * Appends the features specific to the current language: punctuation
 * features for English, punctuation and coordination features for Czech
 * (Czech case features are not emitted), and case features for Korean.
 *
 * @param arr        feature indices to append to
 * @param beginIndex one-element array holding the running feature offset;
 *                   advanced by the called methods
 */
protected void addLanguageSpecificFeatures(IntArrayList arr, int[] beginIndex) {
    switch (s_language) {
        case DepReader.LANG_EN:
            addEnPunctuationFeatures(arr, beginIndex);
            return;
        case DepReader.LANG_CZ:
            addCzPunctuationFeatures(arr, beginIndex);
            addCzCoordFeatures(arr, beginIndex);
            return;
        case DepReader.LANG_KR:
            addKrCaseFeatures(arr, beginIndex);
            return;
        default:
            return;
    }
}
/**
 * Adds the English punctuation features. Three lookups are made, each
 * owning a slot of <code>t_map.n_extra[0]</code> feature indices:
 * <ol>
 * <li>right-nearest punctuation between lambda and <code>beta - 1</code>;</li>
 * <li>right-nearest punctuation between beta and the last node;</li>
 * <li>left-nearest punctuation between beta and <code>lambda + 1</code>.</li>
 * </ol>
 * A lookup that returns <code>-1</code> adds no feature, but its slot is
 * still consumed.
 *
 * @param arr        feature indices to append to
 * @param beginIndex one-element array holding the running feature offset;
 *                   advanced by three slots
 */
protected void addEnPunctuationFeatures(IntArrayList arr, int[] beginIndex) {
    final int slotWidth = t_map.n_extra[0];

    final int rightOfLambda = d_tree.getRightNearestPunctuation(i_lambda, i_beta - 1, t_map);
    if (rightOfLambda != -1) {
        arr.add(beginIndex[0] + rightOfLambda);
    }
    beginIndex[0] = beginIndex[0] + slotWidth;

    final int rightOfBeta = d_tree.getRightNearestPunctuation(i_beta, d_tree.size() - 1, t_map);
    if (rightOfBeta != -1) {
        arr.add(beginIndex[0] + rightOfBeta);
    }
    beginIndex[0] = beginIndex[0] + slotWidth;

    final int leftOfBeta = d_tree.getLeftNearestPunctuation(i_beta, i_lambda + 1, t_map);
    if (leftOfBeta != -1) {
        arr.add(beginIndex[0] + leftOfBeta);
    }
    beginIndex[0] = beginIndex[0] + slotWidth;

    // A fourth slot (left-nearest punctuation from lambda down to node 1) is intentionally not emitted.
}
/**
 * Adds the Czech punctuation feature: the right-nearest punctuation between
 * lambda and <code>beta - 1</code>. The feature owns a slot of
 * <code>t_map.n_extra[0]</code> indices, which is consumed even when the
 * lookup returns <code>-1</code> and no feature is added.
 *
 * @param arr        feature indices to append to
 * @param beginIndex one-element array holding the running feature offset;
 *                   advanced by one slot
 */
protected void addCzPunctuationFeatures(IntArrayList arr, int[] beginIndex) {
    final int slotWidth = t_map.n_extra[0];

    final int rightOfLambda = d_tree.getRightNearestPunctuation(i_lambda, i_beta - 1, t_map);
    if (rightOfLambda != -1) {
        arr.add(beginIndex[0] + rightOfLambda);
    }
    beginIndex[0] = beginIndex[0] + slotWidth;

    // The remaining lookups used for English (right of beta, left of beta, left of lambda) are intentionally not emitted.
}
/**
 * Adds the Czech coordination features, using three consecutive indices:
 * <ul>
 * <li>+0: the coordination head of lambda is beta;</li>
 * <li>+1: lambda has another coordination head whose id is positive;</li>
 * <li>+2: the coordination head of beta is lambda.</li>
 * </ul>
 *
 * @param arr        feature indices to append to
 * @param beginIndex one-element array holding the running feature offset;
 *                   advanced by 3
 */
protected void addCzCoordFeatures(IntArrayList arr, int[] beginIndex) {
    final DepNode lambdaHead = d_tree.get(i_lambda).coordHead;
    final DepNode betaHead = d_tree.get(i_beta).coordHead;
    final int base = beginIndex[0];

    if (lambdaHead != null) {
        final int headId = lambdaHead.id;
        if (headId == i_beta) {
            arr.add(base);
        } else if (headId > 0) {
            arr.add(base + 1);
        }
    }

    if (betaHead != null && betaHead.id == i_lambda) {
        arr.add(base + 2);
    }

    beginIndex[0] = base + 3;
}
/**
 * Adds the Czech case features, using six consecutive indices. Each feature
 * fires when a node tagged <code>V</code> is still missing a dependent:
 * <ul>
 * <li>+0 / +1: lambda has no left <code>Sb</code> / <code>Obj</code> dependent;</li>
 * <li>+2 / +3: lambda has no right <code>Sb</code> / <code>Obj</code> dependent;</li>
 * <li>+4 / +5: beta has no left <code>Sb</code> / <code>Obj</code> dependent.</li>
 * </ul>
 *
 * @param arr        feature indices to append to
 * @param beginIndex one-element array holding the running feature offset;
 *                   advanced by 6
 */
protected void addCzCaseFeatures(IntArrayList arr, int[] beginIndex) {
    final boolean lambdaIsVerb = d_tree.get(i_lambda).isPos("V");

    if (lambdaIsVerb) {
        if (!d_tree.existsLeftDependent(i_lambda, "Sb")) {
            arr.add(beginIndex[0]);
        }
        if (!d_tree.existsLeftDependent(i_lambda, "Obj")) {
            arr.add(beginIndex[0] + 1);
        }
        if (!d_tree.existsRightDependent(i_lambda, "Sb")) {
            arr.add(beginIndex[0] + 2);
        }
        if (!d_tree.existsRightDependent(i_lambda, "Obj")) {
            arr.add(beginIndex[0] + 3);
        }
    }

    final boolean betaIsVerb = d_tree.get(i_beta).isPos("V");

    if (betaIsVerb) {
        if (!d_tree.existsLeftDependent(i_beta, "Sb")) {
            arr.add(beginIndex[0] + 4);
        }
        if (!d_tree.existsLeftDependent(i_beta, "Obj")) {
            arr.add(beginIndex[0] + 5);
        }
    }

    beginIndex[0] = beginIndex[0] + 6;
}
/**
 * Adds the Korean features, using two consecutive indices: +0 fires when
 * the <code>isX</code> flag of lambda's {@link MorphKr} is set, +1 when the
 * flag of beta's {@link MorphKr} is set.
 *
 * @param arr        feature indices to append to
 * @param beginIndex one-element array holding the running feature offset;
 *                   advanced by 2
 */
private void addKrCaseFeatures(IntArrayList arr, int[] beginIndex) {
    final MorphKr lambdaMorph = d_tree.get(i_lambda).morphKr;
    final MorphKr betaMorph = d_tree.get(i_beta).morphKr;

    if (lambdaMorph.isX) {
        arr.add(beginIndex[0]);
    }
    if (betaMorph.isX) {
        arr.add(beginIndex[0] + 1);
    }

    beginIndex[0] = beginIndex[0] + 2;

    // A further noun-noun adjacency feature is intentionally not emitted.
}
/**
 * Resolves a feature token against the current parsing state.
 * <p>
 * The token's source (lambda or beta) and offset select a node index. The
 * result is <code>null</code> when that index is out of range, or when a
 * lambda-based token lands on beta (or a beta-based token on lambda). The
 * optional relation then moves from that node to its head, left-most or
 * right-most dependent, and finally the requested field is read.
 *
 * @param token feature token to resolve
 * @return field retrieved from <code>token</code>, or <code>null</code>
 * when the node or the field is unavailable
 */
@Override
protected String getField(FtrToken token) {
    int index = (token.source == DepFtrXml.LAMBDA) ? i_lambda : i_beta;
    index += token.offset;

    if (!d_tree.isRange(index)) {
        return null;
    }
    if (token.source == DepFtrXml.LAMBDA && index == i_beta) {
        return null;
    }
    if (token.source == DepFtrXml.BETA && index == i_lambda) {
        return null;
    }

    // follow the optional relation; unknown relations resolve to no node
    final DepNode node;
    if (token.relation == null) {
        node = d_tree.get(index);
    } else if (token.isRelation(DepFtrXml.R_HD)) {
        node = d_tree.getHead(index);
    } else if (token.isRelation(DepFtrXml.R_LM)) {
        node = d_tree.getLeftMostDependent(index);
    } else if (token.isRelation(DepFtrXml.R_RM)) {
        node = d_tree.getRightMostDependent(index);
    } else {
        node = null;
    }

    if (node == null) {
        return null;
    }

    // plain fields
    if (token.isField(DepFtrXml.F_FORM)) {
        return node.form;
    }
    if (token.isField(DepFtrXml.F_LEMMA)) {
        return node.lemma;
    }
    if (token.isField(DepFtrXml.F_POS)) {
        return node.pos;
    }
    if (token.isField(DepFtrXml.F_DEPREL)) {
        return node.getDeprel();
    }

    // pattern-based fields, tried in a fixed order
    Matcher matcher = DepFtrXml.P_FEAT.matcher(token.field);
    if (matcher.find()) {
        return node.getFeat(matcher.group(1));
    }

    matcher = DepFtrXml.P_TRANS.matcher(token.field);
    if (matcher.find()) {
        // counts back from the most recent transition
        final int idx = prev_trans.size() - Integer.parseInt(matcher.group(1)) - 1;
        if (idx >= 0) {
            return prev_trans.get(idx);
        }
        return null;
    }

    matcher = DepFtrXml.P_KR.matcher(token.field);
    if (matcher.find()) {
        final String type = matcher.group(1);
        final int loc = Integer.parseInt(matcher.group(2));
        return node.morphKr.getMorphem(loc, type);
    }

    // unspecified field
    return null;
}
/**
 * Parses <code>tree</code>. Implemented by the concrete parsing algorithms.
 *
 * @param tree dependency tree to parse
 */
abstract public void parse(DepTree tree);
}