/**
* Copyright (c) 2011, Regents of the University of Colorado All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided
* with the distribution. Neither the name of the University of Colorado at
* Boulder nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package clear.engine;
import clear.dep.DepNode;
import clear.dep.DepTree;
import clear.dep.srl.SRLHead;
import clear.dep.srl.SRLInfo;
import clear.morph.MorphEnAnalyzer;
import clear.propbank.*;
import clear.treebank.*;
import clear.util.IOUtil;
import clear.util.JSet;
import com.carrotsearch.hppc.IntOpenHashSet;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
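/**
* Merges PropBank instances with their Treebank parse trees and writes the
* resulting dependency trees, annotated with semantic-role information, to
* one output file per parse file.
*
* @author Jinho D. Choi <b>Last update:</b> 4/26/2010
*/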
public class PropToDep {
/** Path of the PropBank instance file ({@code -i}); opened by {@link #readPBInstances()}. */
@Option(name = "-i", usage = "name of a file containing PropBank instances", required = true, metaVar = "REQUIRED")
String s_propFile;
/** Output directory ({@code -o}); {@link #merge()} writes one {@link #SRL_EXT} file per parse file into it. */
@Option(name = "-o", usage = "name of a directory for dependency output", required = true, metaVar = "REQUIRED")
String s_srlDir;
/** Directory ({@code -p}) against which the tree paths of the PropBank instances are resolved. */
@Option(name = "-p", usage = "name of a directory containing parse trees", required = true, metaVar = "REQUIRED")
String s_parseDir;
/** Head-percolation rule file ({@code -h}); handed to {@code TBHeadRules} in {@link #merge()}. */
@Option(name = "-h", usage = "name of a file containing head-percolation rules", required = true, metaVar = "REQUIRED")
String s_headruleFile;
/** Optional dictionary file ({@code -m}); when {@code null}, {@link #merge()} runs without a morphological analyzer. */
@Option(name = "-m", usage = "name of a file containing dictionaries for morphological analyzer", metaVar = "OPTIONAL")
String s_dictFile = null;
/** Length threshold ({@code -n}); {@link #merge()} prints a tree only when {@code dTree.size() > n_length}. */
@Option(name = "-n", usage = "minimum sentence length (inclusive; default = 1)", metaVar = "OPTIONAL")
int n_length = 1;
/** Flag ({@code -f}) passed through to the {@code TBEnConvert} constructor in {@link #merge()}. */
@Option(name = "-f", usage = "if set, include function tags", metaVar = "OPTIONAL")
boolean b_funcTag = false;
/** Flag ({@code -e}) passed through to the {@code TBEnConvert} constructor in {@link #merge()}. */
@Option(name = "-e", usage = "if set, include empty categories", metaVar = "OPTIONAL")
boolean b_ec = false;
/** Flag ({@code -r}) passed through to the {@code TBEnConvert} constructor in {@link #merge()}. */
@Option(name = "-r", usage = "if set, reverse dependencies of auxiliaries and modals", metaVar = "OPTIONAL")
boolean b_reverseVC = false;
/** Flag ({@code -j}); when {@code false}, {@link #merge()} filters instances through {@code removeAdjectivalPredicates}. */
@Option(name = "-j", usage = "if set, consider adjectival predicates as verbal predicates", metaVar = "OPTIONAL")
boolean b_adjPred = false;
// Suffix appended to a tree path to form the parse file name; currently empty
// (the previous ".parse" value is kept below for reference).
// final String PARSE_EXT = ".parse";
final String PARSE_EXT = "";
// Suffix appended to each output file name written by merge().
final String SRL_EXT = ".srl";
/** PropBank instances keyed by {@code PBInstance.getKey()}; (re)built by {@link #readPBInstances()}. */
HashMap<String, PBInstance> m_pbInstances;
public PropToDep(String[] args) {
CmdLineParser cmd = new CmdLineParser(this);
try {
cmd.parseArgument(args);
readPBInstances();
merge();
} catch (CmdLineException e) {
System.err.println(e.getMessage());
cmd.printUsage(System.err);
}
}
/**
 * Loads the PropBank instances from {@link PropToDep#s_propFile} into a fresh
 * {@link PropToDep#m_pbInstances} map, keyed by {@code PBInstance.getKey()}.
 * Instances whose roleset id ends with {@code ".LV"} or whose type does not end
 * with {@code "-v"} are left out.
 */
public void readPBInstances() {
    PBReader source = new PBReader(s_propFile);
    m_pbInstances = new HashMap<>();
    System.out.println("Initialize: " + s_propFile);
    for (PBInstance next = source.nextInstance(); next != null; next = source.nextInstance()) {
        boolean excludedRoleset = next.rolesetId.endsWith(".LV");
        if (!excludedRoleset && next.type.endsWith("-v")) {
            m_pbInstances.put(next.getKey(), next);
        }
    }
}
/**
 * Collects the distinct tree paths of all loaded PropBank instances. The tree
 * path is the part of each instance key before the first
 * {@code PBInstance.KEY_DELIM}.
 *
 * @return the distinct tree paths in ascending natural order
 */
private ArrayList<String> getTreePaths() {
    // A sorted set de-duplicates and orders in one step.
    TreeSet<String> orderedPaths = new TreeSet<>();
    for (String instanceKey : m_pbInstances.keySet()) {
        int delimAt = instanceKey.indexOf(PBInstance.KEY_DELIM);
        orderedPaths.add(instanceKey.substring(0, delimAt));
    }
    return new ArrayList<>(orderedPaths);
}
/**
 * Selects the loaded PropBank instances that belong to one tree, i.e. whose key
 * starts with {@code treePath + KEY_DELIM + treeIndex + KEY_DELIM}.
 *
 * @param treePath  tree path as it appears in the instance keys
 * @param treeIndex index of the tree within that file
 * @return the matching instances, in the map's iteration order (possibly empty)
 */
private ArrayList<PBInstance> getPBInstances(String treePath, int treeIndex) {
    final String keyPrefix = treePath + PBInstance.KEY_DELIM + treeIndex + PBInstance.KEY_DELIM;
    ArrayList<PBInstance> matches = new ArrayList<>();
    for (Map.Entry<String, PBInstance> entry : m_pbInstances.entrySet()) {
        if (entry.getKey().startsWith(keyPrefix)) {
            matches.add(entry.getValue());
        }
    }
    return matches;
}
/**
 * Converts every parse tree referenced by the loaded PropBank instances into a
 * dependency tree and writes it to {@code <s_srlDir>/<file name><SRL_EXT>}.
 *
 * <p>For each tree: the instances for that tree are looked up; unless
 * {@link #b_adjPred} is set they are filtered through
 * {@link #removeAdjectivalPredicates}; a tree without remaining instances is
 * converted with {@code toDepTree} and every node from index 1 on receives an
 * empty {@code SRLInfo}; otherwise the instances are merged into the tree and it
 * is converted with {@code toSRLTree}. A tree is printed only when
 * {@code dTree.size() > n_length}.
 *
 * <p>The output stream is closed even when conversion throws, and the option
 * fields {@link #s_parseDir} / {@link #s_srlDir} are no longer modified, so
 * calling this method repeatedly does not accumulate path separators.
 */
public void merge() {
    TBHeadRules headrules = new TBHeadRules(s_headruleFile);
    MorphEnAnalyzer morph = (s_dictFile != null) ? new MorphEnAnalyzer(s_dictFile) : null;
    TBEnConvert convert = new TBEnConvert(headrules, morph, b_funcTag, b_ec, b_reverseVC);

    // Work on local copies so the parsed option values stay untouched.
    final String parseDir = s_parseDir + File.separator;
    final String srlDir = s_srlDir + File.separator;

    for (String treePath : getTreePaths()) {
        // Output files are flat: only the last path component of the tree path is used.
        String mergeFile = srlDir + treePath.substring(treePath.lastIndexOf(File.separator) + 1) + SRL_EXT;
        TBReader reader = new TBReader(parseDir + treePath + PARSE_EXT);
        // NOTE(review): TBReader is not closed here because its API is not visible from this file.

        try (PrintStream fout = IOUtil.createPrintFileStream(mergeFile)) {
            System.out.println(mergeFile);

            TBTree tree;
            for (int treeIndex = 0; (tree = reader.nextTree()) != null; treeIndex++) {
                ArrayList<PBInstance> list = getPBInstances(treePath, treeIndex);
                if (!b_adjPred) {
                    removeAdjectivalPredicates(tree, list);
                }

                DepTree dTree;
                if (list.isEmpty()) {
                    // No predicates for this tree: plain conversion plus empty SRL info.
                    dTree = convert.toDepTree(tree);
                    for (int i = 1; i < dTree.size(); i++) {
                        DepNode dNode = dTree.get(i);
                        dNode.srlInfo = new SRLInfo();
                    }
                } else {
                    tree.setPBLocs();
                    tree.setAntecedents();
                    for (PBInstance instance : list) {
                        mergeAux(instance, tree);
                    }
                    dTree = convert.toSRLTree(tree);
                }

                // fout.println(";"+treePath+" "+treeIndex);
                if (dTree.size() > n_length) {
                    fout.println(dTree + "\n");
                }
            }
        }
    }
}
/**
 * Removes from {@code instances} every instance whose height-1 predicate node
 * is an NP, or is an ADJP whose parent is an NP and for which
 * {@code isPrior(POS_NP)} is false.
 *
 * <p>An instance whose predicate node cannot be found in {@code tree} is left
 * in the list instead of causing a {@code NullPointerException}; the later
 * merge step reports such instances as a wrong predicate location.
 *
 * @param tree      the parse tree the instances refer to
 * @param instances the instances of that tree; modified in place
 */
protected void removeAdjectivalPredicates(TBTree tree, ArrayList<PBInstance> instances) {
    ArrayList<PBInstance> remove = new ArrayList<>();
    for (PBInstance instance : instances) {
        TBNode node = tree.getNode(instance.predicateId, 1);
        if (node == null) {
            // Unknown location: keep the instance so it is reported downstream.
            continue;
        }
        boolean drop = node.isPos(TBEnLib.POS_NP);
        if (!drop && node.isPos(TBEnLib.POS_ADJP)) {
            TBNode parent = node.getParent();
            // A parentless ADJP cannot be inside an NP.
            drop = parent != null && parent.isPos(TBEnLib.POS_NP) && !node.isPrior(TBEnLib.POS_NP);
        }
        if (drop) {
            remove.add(instance);
        }
    }
    instances.removeAll(remove);
}
/**
 * Merges one PropBank instance into {@code tree}.
 *
 * <p>Steps, in order: locate the predicate terminal (abort with a message if it
 * is missing); resolve empty categories for every argument; handle {@code rel*}
 * arguments via {@link #processRels} and {@code LINK*} arguments via
 * {@link #processLink}, dropping the ones that are consumed; then, if any
 * argument remains, set the predicate's roleset id (unless it ends with
 * {@code ".DP"}) and attach each remaining argument to the tree. Problems are
 * reported on {@code System.err}.
 *
 * @param instance the PropBank instance; its argument list is modified in place
 * @param tree     the parse tree the instance refers to
 */
private void mergeAux(PBInstance instance, TBTree tree) {
    final TBNode predicate = tree.getNode(instance.predicateId, 0);
    if (predicate == null) {
        System.err.println("Wrong location of predicate: " + instance.treePath + " " + instance.treeIndex + " " + instance.predicateId);
        return;
    }

    final ArrayList<PBArg> args = instance.getArgs();

    // Pass 1: empty-category resolution for every argument.
    for (PBArg arg : args) {
        boolean resolved = processEmtpyCategories(arg, tree, predicate);
        if (!resolved) {
            System.err.println("Wrong location in " + arg.label + ": " + instance.toString());
        }
    }

    // Pass 2: rel and LINK arguments; collect the ones to drop.
    final ArrayList<PBArg> consumed = new ArrayList<>();
    for (PBArg arg : args) {
        if (arg.isLabel("rel.*")) {
            boolean emptied = processRels(arg, instance.predicateId);
            if (emptied) {
                consumed.add(arg);
            }
        } else if (arg.isLabel("LINK.*")) {
            boolean anchored = processLink(args, arg, tree);
            if (!anchored) {
                System.err.println("No-achor in " + arg.label + ": " + instance.toString());
            }
            // LINK arguments are always dropped, anchored or not.
            consumed.add(arg);
        }
    }
    args.removeAll(consumed);

    if (!args.isEmpty()) {
        if (!instance.rolesetId.endsWith(".DP")) {
            predicate.rolesetId = instance.rolesetId;
        }
        // Pass 3: attach what is left.
        for (PBArg arg : args) {
            processEmtpyCategoriesSub(arg, tree, predicate);
            boolean attached = addPBArgToTBTree(arg, tree);
            if (!attached) {
                System.err.println("Wrong location in " + arg.label + ": " + instance.toString());
            }
        }
    }
}
/**
 * Removes the first location equal to {@code predId:0} from {@code pbArg},
 * if there is one.
 *
 * @param pbArg  the argument to strip the predicate location from
 * @param predId terminal id of the predicate
 * @return {@code true} if {@code pbArg} has no locations left afterwards
 */
private boolean processRels(PBArg pbArg, int predId) {
    PBLoc predicateLoc = null;
    for (PBLoc candidate : pbArg.getLocs()) {
        if (candidate.equals(predId, 0)) {
            predicateLoc = candidate;
            break;
        }
    }
    // Remove only after the iteration has ended.
    if (predicateLoc != null) {
        pbArg.removeLoc(predicateLoc);
    }
    return pbArg.getLocs().isEmpty();
}
/**
 * Merges a LINK argument into the first non-LINK argument it overlaps with.
 *
 * <p>For {@code LINK-SLC} the method first looks for a location whose node has a
 * {@code WH*} tag to use as anchor, trying each location at its own height and
 * then, if none was found, at height 1 (updating the location's height). For
 * other LINK labels, locations whose terminal is an empty category are reset to
 * height 0. The target argument is then searched with {@code overlapsLocs}, and
 * if nothing matches, with {@code overlapsMildLocs}.
 *
 * @param pbArgs  all arguments of the instance
 * @param linkArg the LINK argument to merge
 * @param tree    the parse tree
 * @return {@code true} if the LINK argument was merged; {@code false} if a
 *         location could not be resolved in the tree or no overlapping argument exists
 */
private boolean processLink(ArrayList<PBArg> pbArgs, PBArg linkArg, TBTree tree) {
    PBLoc anchor = new PBLoc(null, -1, -1);

    if (linkArg.isLabel("LINK-SLC")) {
        // find antecedent
        for (PBLoc loc : linkArg.getLocs()) {
            TBNode atOwnHeight = tree.getNode(loc.terminalId, loc.height);
            if (atOwnHeight == null) {
                return false;
            }
            if (atOwnHeight.isPos("WH.*")) {
                anchor = loc;
                break;
            }
        }
        // find antecedent in height 1
        if (anchor.terminalId == -1) {
            for (PBLoc loc : linkArg.getLocs()) {
                TBNode atHeightOne = tree.getNode(loc.terminalId, 1);
                if (atHeightOne == null || !atHeightOne.isPos("WH.*")) {
                    continue;
                }
                loc.height = 1;
                anchor = loc;
                break;
            }
        }
    } else if (anchor.terminalId == -1) {
        // normalize empty categories
        for (PBLoc loc : linkArg.getLocs()) {
            TBNode terminal = tree.getNode(loc.terminalId, 0);
            if (terminal == null) {
                return false;
            }
            if (terminal.isEmptyCategory()) {
                loc.height = 0;
            }
        }
    }

    // Strict overlap takes precedence over mild overlap.
    PBArg target = null;
    for (PBArg candidate : pbArgs) {
        if (!candidate.isLabel("LINK.*") && candidate.overlapsLocs(linkArg)) {
            target = candidate;
            break;
        }
    }
    if (target == null) {
        for (PBArg candidate : pbArgs) {
            if (!candidate.isLabel("LINK.*") && candidate.overlapsMildLocs(linkArg)) {
                target = candidate;
                break;
            }
        }
    }
    if (target == null) {
        return false;
    }
    processLinkAux(linkArg, target, anchor, tree);
    return true;
}
/**
 * Copies the locations of {@code linkArg} into {@code pbArg} and, when an anchor
 * was found ({@code anchor.terminalId != -1}), connects the anchor's
 * complementizer with an antecedent: if the complementizer already has one, that
 * antecedent's location is added to {@code pbArg}; otherwise the node at the
 * first location of {@code pbArg} (after sorting) becomes its antecedent and the
 * complementizer's location type is set to {@code PBLib.PROP_OP_COMP}.
 *
 * @param linkArg the LINK argument being merged
 * @param pbArg   the argument receiving the LINK locations
 * @param anchor  the anchor location, or a location with terminal id -1 if none
 * @param tree    the parse tree
 */
private void processLinkAux(PBArg linkArg, PBArg pbArg, PBLoc anchor, TBTree tree) {
    pbArg.putLocs(linkArg.getLocs());

    if (anchor.terminalId == -1) {
        return;
    }

    // find antecedents of complementizer
    TBNode anchorNode = tree.getNode(anchor.terminalId, anchor.height);
    TBNode complementizer = anchorNode.getComplementizer();

    if (complementizer.hasAntecedent()) {
        pbArg.putLoc(complementizer.antecedent.pbLoc);
        return;
    }

    Collections.sort(pbArg.getLocs());
    PBLoc firstLoc = pbArg.getLocs().get(0);
    complementizer.pbLoc.type = PBLib.PROP_OP_COMP;
    complementizer.antecedent = tree.getNode(firstLoc.terminalId, firstLoc.height);
}
/**
* Finds empty categories' antecedents and adds their locations to
* <code>pbArg</code>, then calls {@link #trimEmptyCategories}.
*
* @param pbArg the argument whose locations are examined; locations may be
* added and existing locations may have their height reset to 0
* @param tree the parse tree the locations refer to
* @param pred the predicate node; only passed on to {@link #trimEmptyCategories}
* @return <code>false</code> if one of the argument's own locations does not
* resolve to a node in <code>tree</code> (nothing is added in that case),
* <code>true</code> otherwise
*/
private boolean processEmtpyCategories(PBArg pbArg, TBTree tree, TBNode pred) {
// Locations are collected here and added after the loop, so the list
// returned by getLocs() is not modified while it is being iterated.
ArrayList<PBLoc> addLocs = new ArrayList<>();
// ArrayList<PBLoc> delLocs = new ArrayList<PBLoc>();
TBNode curr, node;
for (PBLoc pbLoc : pbArg.getLocs()) {
curr = tree.getNode(pbLoc.terminalId, pbLoc.height);
if (curr == null) {
return false;
}
// *ICH* / *RNR* cases: the antecedent's location is marked with
// PROP_OP_SKIP and queued for addition.
// NOTE(review): getIncludedEmptyCategory presumably returns an empty
// category inside curr whose form matches the regex - confirm in TBNode.
if ((node = curr.getIncludedEmptyCategory("\\*ICH\\*.*")) != null && node.hasAntecedent()) {
node.antecedent.pbLoc.type = PBLib.PROP_OP_SKIP;
addLocs.add(node.antecedent.pbLoc);
} else if ((node = curr.getIncludedEmptyCategory("\\*RNR\\*.*")) != null && node.hasAntecedent()) {
node.antecedent.pbLoc.type = PBLib.PROP_OP_SKIP;
addLocs.add(node.antecedent.pbLoc);
} else if (curr.isEmptyCategoryRec()) {
// Follow the antecedent chain: every antecedent location reached is
// queued, and the walk continues while the antecedent is itself an
// empty category. Note that the loop variable pbLoc is reassigned here.
do {
// Resets the height of the current location to 0.
pbLoc.height = 0;
if (curr.isPhrase()) {
curr = tree.getNode(pbLoc.terminalId, 0);
}
if (curr.hasAntecedent()) {
pbLoc = curr.antecedent.pbLoc;
addLocs.add(pbLoc);
} else {
break;
}
curr = tree.getNode(pbLoc.terminalId, pbLoc.height);
} while (curr.isEmptyCategoryRec());
/*
* if (curr.isForm("\\*T\\*.*")) { // delLocs.add(pbLoc); if
* (curr.hasAntecedent()) addLocs.add(curr.antecedent.pbLoc); }
* else if (curr.isForm("\\*PRO\\*.*|\\*|\\*-\\d")) { if
* (curr.getParent().isFollowedBy("VP")) { if
* (curr.hasAntecedent()) addLocs.add(curr.antecedent.pbLoc); }
* // else // delLocs.add(pbLoc);
}
*/
}
}
for (PBLoc pbLoc : addLocs) {
pbArg.putLoc(pbLoc);
}
// for (PBLoc pbLoc: delLocs)
// pbArg.removeLocs(pbLoc);
trimEmptyCategories(pbArg, tree, pred);
return true;
}
/**
 * Adds at most one extra location to {@code pbArg}: the antecedent of an empty
 * category (form {@code *} or {@code *-<digit>}) found under a child of an
 * {@code S} node of the argument and located after the predicate.
 *
 * <p>The method stops at the first such empty category. Nothing is added when
 * the antecedent's terminal has tag {@code EX} or form "it", or when the
 * antecedent already carries a head equal to {@code pred.terminalId + 1}.
 * Otherwise the antecedent's location type is cleared and the location is added.
 *
 * <p>Locations that do not resolve to a node in {@code tree} are skipped
 * instead of raising a {@code NullPointerException}, so that the subsequent
 * attachment step can report them as wrong locations.
 *
 * @param pbArg the argument to extend
 * @param tree  the parse tree
 * @param pred  the predicate node
 */
private void processEmtpyCategoriesSub(PBArg pbArg, TBTree tree, TBNode pred) {
    for (PBLoc pbLoc : pbArg.getLocs()) {
        TBNode curr = tree.getNode(pbLoc.terminalId, pbLoc.height);
        if (curr == null || !curr.isPos("S")) {
            continue;
        }
        for (TBNode child : curr.getChildren()) {
            if (!child.isEmptyCategoryRec()) {
                continue;
            }
            TBNode emptyCategory = child.getIncludedEmptyCategory("\\*|\\*-\\d");
            if (emptyCategory == null || !emptyCategory.hasAntecedent()
                    || emptyCategory.terminalId <= pred.terminalId) {
                continue;
            }

            TBNode antecedent = emptyCategory.antecedent;
            TBNode term = tree.getTerminalNode(antecedent.pbLoc.terminalId);
            if (term.isPos("EX") || term.form.equalsIgnoreCase("it")) {
                return;
            }

            ArrayList<SRLHead> heads = antecedent.getPBHeads();
            if (heads != null) {
                for (SRLHead head : heads) {
                    if (head.equals(pred.terminalId + 1)) {
                        return;
                    }
                }
            }

            antecedent.pbLoc.type = "";
            // Returning right after putLoc: the location list is not iterated again.
            pbArg.putLoc(antecedent.pbLoc);
            return;
        }
    }
}
/**
 * Sorts the locations of {@code pbArg} and prunes redundant empty categories.
 *
 * <p>The node at the first location (after sorting) is the antecedent
 * candidate: if it is itself an empty category its antecedent link is cleared
 * and no antecedent is used; otherwise its location type is set to
 * {@code PBLib.PROP_OP_ANTE}. The locations are then scanned from last to
 * first; among the empty categories whose form matches
 * {@code \*PRO\*.*|\*|\*-\d}, the first one encountered that precedes the
 * predicate is linked to the antecedent, and all others are removed from the
 * argument, but only if such a linked one was found.
 *
 * <p>An argument without locations is left untouched (previously this raised an
 * {@code IndexOutOfBoundsException}).
 *
 * @param pbArg the argument whose location list is sorted and trimmed in place
 * @param tree  the parse tree
 * @param pred  the predicate node
 */
void trimEmptyCategories(PBArg pbArg, TBTree tree, TBNode pred) {
    ArrayList<PBLoc> pbLocs = pbArg.getLocs();
    if (pbLocs.isEmpty()) {
        return;
    }
    Collections.sort(pbLocs);

    TBNode ante = tree.getNode(pbLocs.get(0));
    if (ante.isEmptyCategoryRec()) {
        ante.antecedent = null;
        ante = null;
    } else {
        ante.pbLoc.type = PBLib.PROP_OP_ANTE;
    }

    ArrayList<PBLoc> delLocs = new ArrayList<>();
    boolean isFound = false;
    for (int i = pbLocs.size() - 1; i >= 0; i--) {
        PBLoc pbLoc = pbLocs.get(i);
        TBNode curr = tree.getNode(pbLoc);
        if (!curr.isEmptyCategoryRec()) {
            continue;
        }
        if (curr.isPhrase()) {
            // Inspect the terminal itself rather than the phrase above it.
            curr = tree.getNode(pbLoc.terminalId, 0);
        }
        if (!curr.isForm("\\*PRO\\*.*|\\*|\\*-\\d")) {
            continue;
        }
        if (!isFound && pbLoc.terminalId < pred.terminalId) {
            isFound = true;
            curr.antecedent = ante;
        } else {
            delLocs.add(pbLoc);
        }
    }

    // Without a linked empty category nothing is removed.
    if (isFound) {
        pbLocs.removeAll(delLocs);
    }
}
/**
 * Checks whether any two instances cover each other's predicate, i.e. whether
 * the terminals spanned by the arguments of one instance contain the predicate
 * of the other and vice versa.
 *
 * @param pbInstances the instances of one tree
 * @param tree        the parse tree
 * @return {@code true} if at least one such mutual pair exists
 */
protected boolean isCyclic(ArrayList<PBInstance> pbInstances, TBTree tree) {
    // One terminal-id set per instance, in the same order as pbInstances.
    ArrayList<IntOpenHashSet> coverage = new ArrayList<>();
    for (PBInstance instance : pbInstances) {
        IntOpenHashSet covered = new IntOpenHashSet();
        for (PBArg arg : instance.getArgs()) {
            for (PBLoc loc : arg.getLocs()) {
                TBNode spanRoot = tree.getNode(loc);
                covered.addAll(spanRoot.getSubTermainlIDs());
            }
        }
        coverage.add(covered);
    }

    final int count = pbInstances.size();
    for (int first = 0; first < count; first++) {
        PBInstance firstInstance = pbInstances.get(first);
        IntOpenHashSet firstCovered = coverage.get(first);
        for (int second = first + 1; second < count; second++) {
            PBInstance secondInstance = pbInstances.get(second);
            IntOpenHashSet secondCovered = coverage.get(second);
            boolean firstCoversSecond = firstCovered.contains(secondInstance.predicateId);
            if (firstCoversSecond && secondCovered.contains(firstInstance.predicateId)) {
                return true;
            }
        }
    }
    return false;
}
/**
* Attaches <code>pbArg</code> to <code>tree</code> by adding
* <code>SRLHead</code>s (head id <code>pbArg.predicateId + 1</code>) to the
* nodes that cover the argument's terminals.
*
* @param pbArg the argument to attach
* @param tree the parse tree
* @return <code>false</code> if one of the argument's locations does not
* resolve to a node in <code>tree</code>; <code>true</code> otherwise,
* including the case where there was nothing to attach
*/
private boolean addPBArgToTBTree(PBArg pbArg, TBTree tree) {
ArrayList<PBLoc> pbLocs = pbArg.getLocs();
// Terminal IDs to attach, and terminal IDs to exclude from attachment.
IntOpenHashSet addIDs = new IntOpenHashSet();
IntOpenHashSet delIDs = new IntOpenHashSet();
// Location of a WH* node, if any; the last one seen wins.
PBLoc rLoc = null;
TBNode node, tmp;
String label;
// Labels matching "rel.*" become "C-V"; any other label has its first three
// characters replaced by "A".
// NOTE(review): presumably those labels start with "ARG" - confirm.
if (pbArg.label.matches("rel.*")) {
label = "C-V";
} else {
label = "A" + pbArg.label.substring(3);
}
// retrieve all terminal IDs
for (PBLoc pbLoc : pbLocs) {
if ((node = tree.getNode(pbLoc)) == null) {
return false;
}
tmp = tree.getTerminalNode(pbLoc.terminalId);
// Terminals under locations typed PROP_OP_SKIP are excluded later.
if (pbLoc.isType(PBLib.PROP_OP_SKIP)) {
delIDs.addAll(node.getSubTermainlIDs());
}
if (node.isPos("WH.*")) {
rLoc = pbLoc;
} else if (node.isEmptyCategoryRec() && tmp.form.startsWith(TBEnLib.EC_PRO) && pbLoc.terminalId > pbArg.predicateId) {
// Empty categories whose terminal form starts with EC_PRO and that
// come after the predicate are not attached.
continue;
} else {
addIDs.addAll(node.getSubTermainlIDs());
}
}
// This check runs before delIDs is subtracted and before the "R-" head is
// added, so an argument consisting only of a WH* location gets no head.
if (addIDs.isEmpty()) {
return true;
}
// add terminal IDs
/*
* int[] ids = addIDs.toArray(); Arrays.sort(ids);
*
* TBNode pred = tree.getTerminalNode(pbArg.predicateId);
* pred.addPBArg(new SRLArg(label, ids));
*/
// add each argument
addIDs.removeAll(delIDs);
int terminalId, height;
// Empty for the first attached chunk; may switch to "C-" for later chunks.
String prefix = "";
int[] ids;
// Repeatedly take the smallest remaining terminal ID and climb from it
// while the node one level up still passes the JSet.isSubset check.
// NOTE(review): JSet.isSubset(addIDs, x) presumably tests that every ID in
// x is contained in addIDs - confirm in JSet.
while (!addIDs.isEmpty()) {
height = 0;
ids = addIDs.toArray();
Arrays.sort(ids);
terminalId = ids[0];
while (true) {
node = tree.getNode(terminalId, height + 1);
if (node == null || !JSet.isSubset(addIDs, node.getSubTermainlIDs())) {
// The node at the current height is the one to label.
node = tree.getNode(terminalId, height);
if (node.isEmptyCategoryRec()) {
// Empty categories always get the bare label, never the prefix.
node.addPBHead(new SRLHead(pbArg.predicateId + 1, label));
} else if (!TBEnLib.isPunctuation(node.pos)) {
node.addPBHead(new SRLHead(pbArg.predicateId + 1, prefix + label));
}
addIDs.removeAll(node.getSubTermainlIDs());
// After the first labelled node that is neither an empty category nor
// punctuation, later chunks get "C-" unless the label starts with "AM".
if (!node.isEmptyCategoryRec() && !TBEnLib.isPunctuation(node.pos) && !label.startsWith("AM")) {
prefix = "C-";
}
break;
}
height++;
}
}
// The WH* location, if any, is labelled with "R-" + label.
prefix = "R-";
if (rLoc != null) {
node = tree.getNode(rLoc);
// ids = node.getSubTermainlIDs().toArray();
// Arrays.sort(ids);
// pred.addPBArg(new SRLArg(prefix+label, ids));
node.addPBHead(new SRLHead(pbArg.predicateId + 1, prefix + label));
}
return true;
}
/**
 * Command-line entry point; all work is done by the constructor.
 *
 * @param args command-line arguments, see the {@code @Option} fields
 */
public static void main(String[] args) {
    new PropToDep(args);
}
}