/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.parser;

import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * Abstract class containing many of the methods used to generate contexts for parsing.
 */
public abstract class AbstractContextGenerator {

  /** Placeholder written into a feature when the node it describes is {@code null}. */
  protected static final String EOS = "eos";

  /**
   * When {@code true}, {@link #cons3} additionally emits features that omit the third node
   * whenever punctuation is present between the second and third node.
   */
  protected boolean zeroBackOff;

  /** Set of punctuation to be used in generating features. */
  protected Set<String> punctSet;

  /**
   * When {@code true}, {@link #cons(Parse, int)} and {@link #consbo(Parse, int)} prefix the
   * node's label to the feature for nodes with a negative index.
   */
  protected boolean useLabel;

  /**
   * Creates punctuation feature for the specified punctuation at the specified index
   * based on the punctuation mark.
   *
   * @param punct The punctuation which is in context.
   * @param i The index of the punctuation relative to the parse.
   * @return A feature of the form {@code <i>=<covered text>}.
   */
  protected String punct(Parse punct, int i) {
    return i + "=" + punct.getCoveredText();
  }

  /**
   * Creates punctuation feature for the specified punctuation at the specified index
   * based on the punctuation's tag.
   *
   * @param punct The punctuation which is in context.
   * @param i The index of the punctuation relative to the parse.
   * @return A feature of the form {@code <i>=<type>}.
   */
  protected String punctbo(Parse punct, int i) {
    return i + "=" + punct.getType();
  }

  /**
   * Creates the lexicalized feature for the specified node at the specified index.
   * The feature is {@code <i>=<type>|<head covered text>}; when {@link #useLabel} is set and
   * {@code i} is negative the type is preceded by {@code <label>|}. For a {@code null} node
   * the feature is {@code <i>=eos}.
   *
   * @param p The node, or {@code null} if there is no node at this position.
   * @param i The index of the node.
   * @return The feature string.
   */
  protected String cons(Parse p, int i) {
    StringBuilder feat = new StringBuilder(20);
    feat.append(i).append("=");
    if (p == null) {
      feat.append(EOS);
    } else {
      if (useLabel && i < 0) {
        feat.append(p.getLabel()).append("|");
      }
      feat.append(p.getType()).append("|").append(p.getHead().getCoveredText());
    }
    return feat.toString();
  }

  /**
   * Creates the back-off (non-lexicalized) feature for the specified node at the specified
   * index. The feature is {@code <i>*=<type>}; when {@link #useLabel} is set and {@code i} is
   * negative the type is preceded by {@code <label>|}. For a {@code null} node the feature is
   * {@code <i>*=eos}.
   *
   * @param p The node, or {@code null} if there is no node at this position.
   * @param i The index of the node.
   * @return The feature string.
   */
  protected String consbo(Parse p, int i) {
    StringBuilder feat = new StringBuilder(20);
    feat.append(i).append("*=");
    if (p == null) {
      feat.append(EOS);
    } else {
      if (useLabel && i < 0) {
        feat.append(p.getLabel()).append("|");
      }
      feat.append(p.getType());
    }
    return feat.toString();
  }

  /**
   * Generates a string representing the grammar rule production that the specified parse
   * is starting. The rule is of the form p.type -> c.children[0..n].type.
   *
   * @param p The parse which starts the production.
   * @param includePunctuation Whether punctuation should be included in the production.
   * @return a string representing the grammar rule production that the specified parse
   *     is starting.
   */
  protected String production(Parse p, boolean includePunctuation) {
    StringBuilder rule = new StringBuilder(20);
    rule.append(p.getType()).append("->");
    Parse[] children = AbstractBottomUpParser.collapsePunctuation(p.getChildren(), punctSet);
    for (int ci = 0; ci < children.length; ci++) {
      rule.append(children[ci].getType());
      // Separator (and optional punctuation) only between children, never after the last one.
      if (ci + 1 != children.length) {
        rule.append(",");
        Collection<Parse> nextPunct = children[ci].getNextPunctuationSet();
        if (includePunctuation && nextPunct != null) {
          //TODO: make sure multiple punctuation comes out the same
          for (Parse punct : nextPunct) {
            rule.append(punct.getType()).append(",");
          }
        }
      }
    }
    return rule.toString();
  }

  /**
   * Creates cons features involving the 2 specified nodes and adds them to the specified
   * feature list. When punctuation is supplied, one group of features is produced per
   * punctuation mark, with the punctuation's tag feature placed between the two nodes.
   *
   * @param features The list of features.
   * @param c0 The first node.
   * @param c1 The second node.
   * @param punct1s The punctuation between the first and second node, or {@code null}.
   * @param bigram Specifies whether lexical bi-gram features between these nodes should be
   *     generated.
   */
  protected void cons2(List<String> features, Cons c0, Cons c1, Collection<Parse> punct1s,
      boolean bigram) {
    if (punct1s != null) {
      for (Parse p : punct1s) {
        //punctbo(1)
        String punctFeat = punctbo(p, punctIndex(c1.index));
        features.add(punctFeat);
        if (c0.index == 0) { //TODO look at removing case
          //cons(0)punctbo(1)
          if (c0.unigram) {
            features.add(c0.cons + "," + punctFeat);
          }
          features.add(c0.consbo + "," + punctFeat);
        }
        if (c1.index == 0) { //TODO look at removing case
          //punctbo(1)cons(1)
          if (c1.unigram) {
            features.add(punctFeat + "," + c1.cons);
          }
          features.add(punctFeat + "," + c1.consbo);
        }
        //cons(0)punctbo(1)cons(1)
        addPairFeatures(features, c0, "," + punctFeat + ",", c1, "", bigram);
      }
    } else {
      //cons(0),cons(1)
      addPairFeatures(features, c0, ",", c1, "", bigram);
    }
  }

  /**
   * Creates cons features involving the 3 specified nodes and adds them to the specified
   * feature list.
   *
   * @param features The list of features.
   * @param c0 The first node.
   * @param c1 The second node.
   * @param c2 The third node.
   * @param punct1s The punctuation between the first and second node, or {@code null}.
   * @param punct2s The punctuation between the second and third node, or {@code null}.
   * @param trigram Specifies whether lexical tri-gram features between these nodes should be
   *     generated.
   * @param bigram1 Specifies whether lexical bi-gram features between the first and second
   *     node should be generated.
   * @param bigram2 Specifies whether lexical bi-gram features between the second and third
   *     node should be generated.
   */
  protected void cons3(List<String> features, Cons c0, Cons c1, Cons c2,
      Collection<Parse> punct1s, Collection<Parse> punct2s,
      boolean trigram, boolean bigram1, boolean bigram2) {
    if (punct1s != null && c0.index == -2) {
      //punctbo(-2)
      //TODO consider changing: the lexicalized punct(-2) feature is currently not generated
      for (Parse p : punct1s) {
        features.add(punctbo(p, punctIndex(c1.index)));
      }
    }
    if (punct2s != null) {
      if (c2.index == 2) {
        //punctbo(2)
        //TODO consider changing: the lexicalized punct(2) feature is currently not generated
        for (Parse p : punct2s) {
          features.add(punctbo(p, punctIndex(c2.index)));
        }
      }
      if (punct1s != null) {
        //cons(0),punctbo(1),cons(1),punctbo(2),cons(2)
        for (Parse p2 : punct2s) {
          String punctbo2 = punctbo(p2, punctIndex(c2.index));
          for (Parse p1 : punct1s) {
            String punctbo1 = punctbo(p1, punctIndex(c1.index));
            String sep1 = "," + punctbo1 + ",";
            addTripleFeatures(features, c0, sep1, c1, "," + punctbo2 + ",", c2,
                trigram, bigram1, bigram2);
            if (zeroBackOff) {
              addPairFeatures(features, c0, sep1, c1, "," + punctbo2, bigram1);
            }
          }
        }
      } else {
        //cons(0),cons(1),punctbo(2),cons(2)
        for (Parse p2 : punct2s) {
          String punctbo2 = punctbo(p2, punctIndex(c2.index));
          addTripleFeatures(features, c0, ",", c1, "," + punctbo2 + ",", c2,
              trigram, bigram1, bigram2);
          if (zeroBackOff) {
            addPairFeatures(features, c0, ",", c1, "," + punctbo2, bigram1);
          }
        }
      }
    } else {
      if (punct1s != null) {
        //cons(0),punctbo(1),cons(1),cons(2)
        for (Parse p1 : punct1s) {
          String punctbo1 = punctbo(p1, punctIndex(c1.index));
          addTripleFeatures(features, c0, "," + punctbo1 + ",", c1, ",", c2,
              trigram, bigram1, bigram2);
          //zero backoff case covered by cons(0)cons(1)
        }
      } else {
        //cons(0),cons(1),cons(2)
        addTripleFeatures(features, c0, ",", c1, ",", c2, trigram, bigram1, bigram2);
      }
    }
  }

  /**
   * Maps the index of a node to the index used for the punctuation feature that is emitted
   * together with that node: non-positive indexes are shifted down by one, positive indexes
   * are used unchanged.
   */
  private static int punctIndex(int consIndex) {
    return consIndex <= 0 ? consIndex - 1 : consIndex;
  }

  /**
   * Adds the lexicalized/back-off combinations for two nodes, joined by {@code sep} and
   * followed by {@code suffix}. The fully backed-off combination is always added; the
   * others depend on {@code bigram} and on the nodes' unigram flags.
   */
  private static void addPairFeatures(List<String> features, Cons c0, String sep, Cons c1,
      String suffix, boolean bigram) {
    if (bigram) {
      features.add(c0.cons + sep + c1.cons + suffix);
    }
    if (c1.unigram) {
      features.add(c0.consbo + sep + c1.cons + suffix);
    }
    if (c0.unigram) {
      features.add(c0.cons + sep + c1.consbo + suffix);
    }
    features.add(c0.consbo + sep + c1.consbo + suffix);
  }

  /**
   * Adds the lexicalized/back-off combinations for three nodes, with {@code sep1} between the
   * first and second node and {@code sep2} between the second and third. The fully backed-off
   * combination is always added; the others depend on the n-gram flags and on the nodes'
   * unigram flags.
   */
  private static void addTripleFeatures(List<String> features, Cons c0, String sep1, Cons c1,
      String sep2, Cons c2, boolean trigram, boolean bigram1, boolean bigram2) {
    if (trigram) {
      features.add(c0.cons + sep1 + c1.cons + sep2 + c2.cons);
    }
    if (bigram2) {
      features.add(c0.consbo + sep1 + c1.cons + sep2 + c2.cons);
    }
    if (c0.unigram && c2.unigram) {
      features.add(c0.cons + sep1 + c1.consbo + sep2 + c2.cons);
    }
    if (bigram1) {
      features.add(c0.cons + sep1 + c1.cons + sep2 + c2.consbo);
    }
    if (c2.unigram) {
      features.add(c0.consbo + sep1 + c1.consbo + sep2 + c2.cons);
    }
    if (c1.unigram) {
      features.add(c0.consbo + sep1 + c1.cons + sep2 + c2.consbo);
    }
    if (c0.unigram) {
      features.add(c0.cons + sep1 + c1.consbo + sep2 + c2.consbo);
    }
    features.add(c0.consbo + sep1 + c1.consbo + sep2 + c2.consbo);
  }

  /**
   * Generates features for nodes surrounding a completed node of the specified type.
   * With punctuation, three features are added per punctuation mark (lexicalized, backed off,
   * and type/punctuation only); without punctuation, a lexicalized and a backed-off feature
   * are added. A {@code null} node is represented by {@link #EOS}.
   *
   * @param node A surrounding node, or {@code null} if there is none.
   * @param i The index of the surrounding node with respect to the completed node.
   * @param type The type of the completed node.
   * @param punctuation The punctuation adjacent and between the specified surrounding node,
   *     or {@code null}.
   * @param features A list to which features are added.
   */
  protected void surround(Parse node, int i, String type, Collection<Parse> punctuation,
      List<String> features) {
    StringBuilder feat = new StringBuilder(20);
    if (punctuation != null) {
      for (Parse punct : punctuation) {
        // Reset on every iteration so that the features for a second or later punctuation
        // mark do not inherit the text of the last feature built for the previous mark.
        feat.setLength(0);
        feat.append("s").append(i).append("=");
        if (node != null) {
          feat.append(node.getHead().getCoveredText()).append("|").append(type)
              .append("|").append(node.getType()).append("|").append(punct.getType());
        } else {
          feat.append(type).append("|").append(EOS).append("|").append(punct.getType());
        }
        features.add(feat.toString());

        feat.setLength(0);
        feat.append("s").append(i).append("*=");
        if (node != null) {
          feat.append(type).append("|").append(node.getType()).append("|")
              .append(punct.getType());
        } else {
          feat.append(type).append("|").append(EOS).append("|").append(punct.getType());
        }
        features.add(feat.toString());

        feat.setLength(0);
        feat.append("s").append(i).append("*=");
        feat.append(type).append("|").append(punct.getType());
        features.add(feat.toString());
      }
    } else {
      feat.append("s").append(i).append("=");
      if (node != null) {
        feat.append(node.getHead().getCoveredText()).append("|").append(type)
            .append("|").append(node.getType());
      } else {
        feat.append(type).append("|").append(EOS);
      }
      features.add(feat.toString());

      feat.setLength(0);
      feat.append("s").append(i).append("*=");
      if (node != null) {
        feat.append(type).append("|").append(node.getType());
      } else {
        feat.append(type).append("|").append(EOS);
      }
      features.add(feat.toString());
    }
  }

  /**
   * Produces features to determine whether the specified child node is part of
   * a complete constituent of the specified type and adds those features to the
   * specified list.
   *
   * @param child The parse node to consider.
   * @param i A string indicating the position of the child node.
   * @param type The type of constituent being built.
   * @param features List to add features to.
   */
  protected void checkcons(Parse child, String i, String type, List<String> features) {
    StringBuilder feat = new StringBuilder(20);
    feat.append("c").append(i).append("=").append(child.getType()).append("|")
        .append(child.getHead().getCoveredText()).append("|").append(type);
    features.add(feat.toString());

    feat.setLength(0);
    feat.append("c").append(i).append("*=").append(child.getType()).append("|").append(type);
    features.add(feat.toString());
  }

  /**
   * Produces features over a pair of nodes and the type of constituent being built, and adds
   * them to the specified list. Four features are added: both nodes lexicalized
   * ({@code cil=}), only the second lexicalized ({@code ci*l=}), only the first lexicalized
   * ({@code cil*=}), and neither lexicalized ({@code ci*l*=}).
   *
   * @param p1 The first parse node.
   * @param p2 The second parse node.
   * @param type The type of constituent being built.
   * @param features List to add features to.
   */
  protected void checkcons(Parse p1, Parse p2, String type, List<String> features) {
    StringBuilder feat = new StringBuilder(20);
    feat.append("cil=").append(type).append(",").append(p1.getType()).append("|")
        .append(p1.getHead().getCoveredText()).append(",").append(p2.getType())
        .append("|").append(p2.getHead().getCoveredText());
    features.add(feat.toString());

    feat.setLength(0);
    feat.append("ci*l=").append(type).append(",").append(p1.getType()).append(",")
        .append(p2.getType()).append("|").append(p2.getHead().getCoveredText());
    features.add(feat.toString());

    feat.setLength(0);
    feat.append("cil*=").append(type).append(",").append(p1.getType()).append("|")
        .append(p1.getHead().getCoveredText()).append(",").append(p2.getType());
    features.add(feat.toString());

    feat.setLength(0);
    feat.append("ci*l*=").append(type).append(",").append(p1.getType())
        .append(",").append(p2.getType());
    features.add(feat.toString());
  }

  /**
   * Populates specified nodes array with left-most right frontier
   * node with a unique head. The frontier is walked in list order and a node is stored
   * whenever its head index differs from the head index of the previously stored node.
   * If the right frontier doesn't contain enough nodes, then nulls are placed in the
   * remaining array elements.
   *
   * @param rf The current right frontier.
   * @param nodes The array to be populated.
   */
  protected void getFrontierNodes(List<Parse> rf, Parse[] nodes) {
    int leftIndex = 0;
    int prevHeadIndex = -1;
    for (Parse fn : rf) {
      // Checked before storing so that a zero-length array is handled without indexing it.
      if (leftIndex == nodes.length) {
        break;
      }
      int headIndex = fn.getHeadIndex();
      if (headIndex != prevHeadIndex) {
        nodes[leftIndex] = fn;
        leftIndex++;
        prevHeadIndex = headIndex;
      }
    }
    for (int ni = leftIndex; ni < nodes.length; ni++) {
      nodes[ni] = null;
    }
  }
}