package edu.stanford.nlp.trees; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.ling.LabelFactory; import edu.stanford.nlp.trees.tregex.TregexPattern; import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon; import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern; import edu.stanford.nlp.util.StringUtils; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.Properties; /** * Coordination transformer transforms a PennTreebank tree containing * a coordination in a flat structure in order to get the dependencies * right. * <br> * The transformer goes through several steps: * <ul> * <li> Removes empty nodes and simplifies many tags (<code>DependencyTreeTransformer</code>) * <li> Relabels UCP phrases to either ADVP or NP depending on their content * <li> Turn flat CC structures into structures with an intervening node * <li> Add extra structure to QP phrases - combine "well over", unflattened structures with CC (<code>QPTreeTransformer</code>) * <li> Flatten SQ structures to get the verb as the head * <li> Rearrange structures that appear to be dates * <li> Flatten X over only X structures * <li> Turn some fixed conjunction phrases into CONJP, such as "and yet", etc * <li> Attach RB such as "not" to the next phrase to get the RB headed by the phrase it modifies * <li> Turn SBAR to PP if parsed as SBAR in phrases such as "The day after the airline was planning ..." * <li> Rearrange "now that" into an SBAR phrase if it was misparsed as ADVP * <li> (Only for universal dependencies) Extracts multi-word expressions and attaches all nodes to a new MWE constituent * </ul> * * @author Marie-Catherine de Marneffe * @author John Bauer * @author Sebastian Schuster */ public class CoordinationTransformer implements TreeTransformer { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(CoordinationTransformer.class); private static final boolean VERBOSE = System.getProperty("CoordinationTransformer", null) != null; private final TreeTransformer tn = new DependencyTreeTransformer(); //to get rid of unwanted nodes and tag private final TreeTransformer dates = new DateTreeTransformer(); //to flatten date patterns private final TreeTransformer qp; //to restructure the QP constituents private final HeadFinder headFinder; private final boolean performMWETransformation; // default constructor public CoordinationTransformer(HeadFinder hf) { this(hf, false); } /** * Constructor * * @param hf the headfinder * @param performMWETransformation Parameter for backwards compatibility. * If set to false, multi-word expressions won't be attached to a new "MWE" node */ public CoordinationTransformer(HeadFinder hf, boolean performMWETransformation) { this.headFinder = hf; this.performMWETransformation = performMWETransformation; qp = new QPTreeTransformer(performMWETransformation); } /** * Transforms t if it contains a coordination in a flat structure (CCtransform) * and transforms UCP (UCPtransform). * * @param t a tree to be transformed * @return t transformed */ @Override public Tree transformTree(Tree t) { if (VERBOSE) { log.info("Input to CoordinationTransformer: " + t); } t = tn.transformTree(t); if (VERBOSE) { log.info("After DependencyTreeTransformer: " + t); } if (t == null) { return t; } if (performMWETransformation) { t = MWETransform(t); if (VERBOSE) { log.info("After MWETransform: " + t); } t = prepCCTransform(t); if (VERBOSE) { log.info("After prepCCTransform: " + t); } } t = UCPtransform(t); if (VERBOSE) { log.info("After UCPTransformer: " + t); } t = CCtransform(t); if (VERBOSE) { log.info("After CCTransformer: " + t); } t = qp.transformTree(t); if (VERBOSE) { log.info("After QPTreeTransformer: " + t); } t = SQflatten(t); if (VERBOSE) { log.info("After SQ flattening: " + t); } t = dates.transformTree(t); if (VERBOSE) { log.info("After DateTreeTransformer: " + t); } t = removeXOverX(t); if (VERBOSE) { log.info("After removeXoverX: " + t); } t = combineConjp(t); if (VERBOSE) { log.info("After combineConjp: " + t); } t = moveRB(t); if (VERBOSE) { log.info("After moveRB: " + t); } t = changeSbarToPP(t); if (VERBOSE) { log.info("After changeSbarToPP: " + t); } t = rearrangeNowThat(t); if (VERBOSE) { log.info("After rearrangeNowThat: " + t); } return t; } private static TregexPattern rearrangeNowThatTregex = TregexPattern.compile("ADVP=advp <1 (RB < /^(?i:now)$/) <2 (SBAR=sbar <1 (IN < /^(?i:that)$/))"); private static TsurgeonPattern rearrangeNowThatTsurgeon = Tsurgeon.parseOperation("[relabel advp SBAR] [excise sbar sbar]"); private static Tree rearrangeNowThat(Tree t) { if (t == null) { return t; } return Tsurgeon.processPattern(rearrangeNowThatTregex, rearrangeNowThatTsurgeon, t); } private static TregexPattern changeSbarToPPTregex = TregexPattern.compile("NP < (NP $++ (SBAR=sbar < (IN < /^(?i:after|before|until|since|during)$/ $++ S)))"); private static TsurgeonPattern changeSbarToPPTsurgeon = Tsurgeon.parseOperation("relabel sbar PP"); /** * For certain phrases, we change the SBAR to a PP to get prep/pcomp * dependencies. For example, in "The day after the airline was * planning...", we want prep(day, after) and pcomp(after, * planning). If "after the airline was planning" was parsed as an * SBAR, either by the parser or in the treebank, we fix that here. */ private static Tree changeSbarToPP(Tree t) { if (t == null) { return null; } return Tsurgeon.processPattern(changeSbarToPPTregex, changeSbarToPPTsurgeon, t); } private static TregexPattern findFlatConjpTregex = // TODO: add more patterns, perhaps ignore case // for example, what should we do with "and not"? Is it right to // generally add the "not" to the following tree with moveRB, or // should we make "and not" a CONJP? // also, perhaps look at ADVP TregexPattern.compile("/^(S|PP|VP)/ < (/^(S(?!YM)|PP|VP)/ $++ (CC=start $+ (RB|ADVP $+ /^(S(?!YM)|PP|VP)/) " + "[ (< and $+ (RB=end < yet)) | " + // TODO: what should be the head of "and yet"? " (< and $+ (RB=end < so)) | " + " (< and $+ (ADVP=end < (RB|IN < so))) ] ))"); // TODO: this structure needs a dependency private static TsurgeonPattern addConjpTsurgeon = Tsurgeon.parseOperation("createSubtree CONJP start end"); private static Tree combineConjp(Tree t) { if (t == null) { return null; } return Tsurgeon.processPattern(findFlatConjpTregex, addConjpTsurgeon, t); } private static TregexPattern[] moveRBTregex = { TregexPattern.compile("/^S|PP|VP|NP/ < (/^(S|PP|VP|NP)/ $++ (/^(,|CC|CONJP)$/ [ $+ (RB=adv [ < not | < then ]) | $+ (ADVP=adv <: RB) ])) : (=adv $+ /^(S(?!YM)|PP|VP|NP)/=dest) "), TregexPattern.compile("/^ADVP/ < (/^ADVP/ $++ (/^(,|CC|CONJP)$/ [$+ (RB=adv [ < not | < then ]) | $+ (ADVP=adv <: RB)])) : (=adv $+ /^NP-ADV|ADVP|PP/=dest)"), TregexPattern.compile("/^FRAG/ < (ADVP|RB=adv $+ VP=dest)"), }; private static TsurgeonPattern moveRBTsurgeon = Tsurgeon.parseOperation("move adv >0 dest"); static Tree moveRB(Tree t) { if (t == null) { return null; } for (TregexPattern pattern : moveRBTregex) { t = Tsurgeon.processPattern(pattern, moveRBTsurgeon, t); } return t; } // Matches to be questions if the question starts with WHNP, such as // Who, What, if there is an SQ after the WH question. // // TODO: maybe we want to catch more complicated tree structures // with something in between the WH and the actual question. private static TregexPattern flattenSQTregex = TregexPattern.compile("SBARQ < ((WHNP=what < WP) $+ (SQ=sq < (/^VB/=verb < " + EnglishPatterns.copularWordRegex + ") " + // match against "is running" if the verb is under just a VBG " !< (/^VB/ < !" + EnglishPatterns.copularWordRegex + ") " + // match against "is running" if the verb is under a VP - VBG " !< (/^V/ < /^VB/ < !" + EnglishPatterns.copularWordRegex + ") " + // match against "What is on the test?" " !< (PP $- =verb) " + // match against "is there" " !<, (/^VB/ < " + EnglishPatterns.copularWordRegex + " $+ (NP < (EX < there)))" + // match against "good at" " !< (ADJP < (PP <: IN|TO))))"); private static TsurgeonPattern flattenSQTsurgeon = Tsurgeon.parseOperation("excise sq sq"); /** * Removes the SQ structure under a WHNP question, such as "Who am I * to judge?". We do this so that it is easier to pick out the head * and then easier to connect that head to all of the other words in * the question in this situation. In the specific case of making * the copula head, we don't do this so that the existing headfinder * code can easily find the "am" or other copula verb. */ public Tree SQflatten(Tree t) { if (headFinder != null && (headFinder instanceof CopulaHeadFinder)) { if (((CopulaHeadFinder) headFinder).makesCopulaHead()) { return t; } } if (t == null) { return null; } return Tsurgeon.processPattern(flattenSQTregex, flattenSQTsurgeon, t); } private static TregexPattern removeXOverXTregex = TregexPattern.compile("__=repeat <: (~repeat < __)"); private static TsurgeonPattern removeXOverXTsurgeon = Tsurgeon.parseOperation("excise repeat repeat"); public static Tree removeXOverX(Tree t) { return Tsurgeon.processPattern(removeXOverXTregex, removeXOverXTsurgeon, t); } // UCP (JJ ...) -> ADJP // UCP (DT JJ ...) -> ADJP // UCP (... (ADJP (JJR older|younger))) -> ADJP // UCP (N ...) -> NP // UCP ADVP -> ADVP // Might want to look for ways to include RB for flatter structures, // but then we have to watch out for (RB not) for example // Note that the order of OR expressions means the older|younger // pattern takes precedence // By searching for everything at once, then using one tsurgeon // which fixes everything at once, we can save quite a bit of time private static final TregexPattern ucpRenameTregex = TregexPattern.compile("/^UCP/=ucp [ <, /^JJ|ADJP/=adjp | ( <1 DT <2 /^JJ|ADJP/=adjp ) |" + " <- (ADJP=adjp < (JJR < /^(?i:younger|older)$/)) |" + " <, /^N/=np | ( <1 DT <2 /^N/=np ) | " + " <, /^ADVP/=advp ]"); // TODO: this turns UCP-TMP into ADVP instead of ADVP-TMP. What do we actually want? private static final TsurgeonPattern ucpRenameTsurgeon = Tsurgeon.parseOperation("[if exists adjp relabel ucp /^UCP(.*)$/ADJP$1/] [if exists np relabel ucp /^UCP(.*)$/NP$1/] [if exists advp relabel ucp /^UCP(.*)$/ADVP/]"); /** * Transforms t if it contains an UCP, it will change the UCP tag * into the phrasal tag of the first word of the UCP * (UCP (JJ electronic) (, ,) (NN computer) (CC and) (NN building)) * will become * (ADJP (JJ electronic) (, ,) (NN computer) (CC and) (NN building)) * * @param t a tree to be transformed * @return t transformed */ public static Tree UCPtransform(Tree t) { if (t == null) { return null; } return Tsurgeon.processPattern(ucpRenameTregex, ucpRenameTsurgeon, t); } /** * Transforms t if it contains a coordination in a flat structure * * @param t a tree to be transformed * @return t transformed (give t not null, return will not be null) */ public static Tree CCtransform(Tree t) { boolean notDone = true; while (notDone) { Tree cc = findCCparent(t, t); if (cc != null) { t = cc; } else { notDone = false; } } return t; } private static String getHeadTag(Tree t) { if (t.value().startsWith("NN")) { return "NP"; } else if (t.value().startsWith("JJ")) { return "ADJP"; } else { return "NP"; } } /** If things match, this method destructively changes the children list * of the tree t. When this method is called, t is an NP and there must * be at least two children to the right of ccIndex. * * @param t The tree to transform a conjunction in * @param ccIndex The index of the CC child * @return t */ private static Tree transformCC(Tree t, int ccIndex) { if (VERBOSE) { log.info("transformCC in: " + t); } //System.out.println(ccIndex); // use the factories of t to create new nodes TreeFactory tf = t.treeFactory(); LabelFactory lf = t.label().labelFactory(); Tree[] ccSiblings = t.children(); //check if other CC List<Integer> ccPositions = new ArrayList<>(); for (int i = ccIndex + 1; i < ccSiblings.length; i++) { if (ccSiblings[i].value().startsWith("CC") && i < ccSiblings.length - 1) { // second conjunct to ensure that a CC we add isn't the last child ccPositions.add(Integer.valueOf(i)); } } // a CC b c ... -> (a CC b) c ... with b not a DT String beforeSibling = ccSiblings[ccIndex - 1].value(); if (ccIndex == 1 && (beforeSibling.equals("DT") || beforeSibling.equals("JJ") || beforeSibling.equals("RB") || ! (ccSiblings[ccIndex + 1].value().equals("DT"))) && ! (beforeSibling.startsWith("NP") || beforeSibling.equals("ADJP") || beforeSibling.equals("NNS"))) { // && (ccSiblings.length == ccIndex + 3 || !ccPositions.isEmpty())) { // something like "soya or maize oil" String leftHead = getHeadTag(ccSiblings[ccIndex - 1]); //create a new tree to be inserted as first child of t Tree left = tf.newTreeNode(lf.newLabel(leftHead), null); for (int i = 0; i < ccIndex + 2; i++) { left.addChild(ccSiblings[i]); } if (VERBOSE) { System.out.println("print left tree"); left.pennPrint(); System.out.println(); } // remove all the children of t before ccIndex+2 for (int i = 0; i < ccIndex + 2; i++) { t.removeChild(0); } if (VERBOSE) { if (t.numChildren() == 0) { System.out.println("Youch! No t children"); } } // if stuff after (like "soya or maize oil and vegetables") // we need to put the tree in another tree if (!ccPositions.isEmpty()) { boolean comma = false; int index = ccPositions.get(0); if (VERBOSE) {log.info("more CC index " + index);} if (ccSiblings[index - 1].value().equals(",")) {//to handle the case of a comma ("soya and maize oil, and vegetables") index = index - 1; comma = true; } if (VERBOSE) {log.info("more CC index " + index);} String head = getHeadTag(ccSiblings[index - 1]); if (ccIndex + 2 < index) { Tree tree = tf.newTreeNode(lf.newLabel(head), null); tree.addChild(0, left); int k = 1; for (int j = ccIndex+2; j<index; j++) { if (VERBOSE) ccSiblings[j].pennPrint(); t.removeChild(0); tree.addChild(k, ccSiblings[j]); k++; } if (VERBOSE) { System.out.println("print t"); t.pennPrint(); System.out.println("print tree"); tree.pennPrint(); System.out.println(); } t.addChild(0, tree); } else { t.addChild(0, left); } Tree rightTree = tf.newTreeNode(lf.newLabel("NP"), null); int start = 2; if (comma) { start++; } while (start < t.numChildren()) { Tree sib = t.getChild(start); t.removeChild(start); rightTree.addChild(sib); } t.addChild(rightTree); } else { t.addChild(0, left); } } // DT a CC b c -> DT (a CC b) c else if (ccIndex == 2 && ccSiblings[0].value().startsWith("DT") && !ccSiblings[ccIndex - 1].value().equals("NNS") && (ccSiblings.length == 5 || (!ccPositions.isEmpty() && ccPositions.get(0) == 5))) { String head = getHeadTag(ccSiblings[ccIndex - 1]); //create a new tree to be inserted as second child of t (after the determiner Tree child = tf.newTreeNode(lf.newLabel(head), null); for (int i = 1; i < ccIndex + 2; i++) { child.addChild(ccSiblings[i]); } if (VERBOSE) { if (child.numChildren() == 0) { System.out.println("Youch! No child children"); } } // remove all the children of t between the determiner and ccIndex+2 //System.out.println("print left tree"); //child.pennPrint(); for (int i = 1; i < ccIndex + 2; i++) { t.removeChild(1); } t.addChild(1, child); } // ... a, b CC c ... -> ... (a, b CC c) ... else if (ccIndex > 2 && ccSiblings[ccIndex - 2].value().equals(",") && !ccSiblings[ccIndex - 1].value().equals("NNS")) { String head = getHeadTag(ccSiblings[ccIndex - 1]); Tree child = tf.newTreeNode(lf.newLabel(head), null); for (int i = ccIndex - 3; i < ccIndex + 2; i++) { child.addChild(ccSiblings[i]); } if (VERBOSE) { if (child.numChildren() == 0) { System.out.println("Youch! No child children"); } } int i = ccIndex - 4; while (i > 0 && ccSiblings[i].value().equals(",")) { child.addChild(0, ccSiblings[i]); // add the comma child.addChild(0, ccSiblings[i - 1]); // add the word before the comma i = i - 2; } if (i < 0) { i = -1; } // remove the old children for (int j = i + 1; j < ccIndex + 2; j++) { t.removeChild(i + 1); } // put the new tree t.addChild(i + 1, child); } // something like "the new phone book and tour guide" -> multiple heads // we want (NP the new phone book) (CC and) (NP tour guide) else { boolean commaLeft = false; boolean commaRight = false; boolean preconj = false; int indexBegin = 0; Tree conjT = tf.newTreeNode(lf.newLabel("CC"), null); // create the left tree String leftHead = getHeadTag(ccSiblings[ccIndex - 1]); Tree left = tf.newTreeNode(lf.newLabel(leftHead), null); // handle the case of a preconjunct (either, both, neither) Tree first = ccSiblings[0]; String leaf = first.firstChild().value().toLowerCase(); if (leaf.equals("either") || leaf.equals("neither") || leaf.equals("both")) { preconj = true; indexBegin = 1; conjT.addChild(first.firstChild()); } for (int i = indexBegin; i < ccIndex - 1; i++) { left.addChild(ccSiblings[i]); } // handle the case of a comma ("GM soya and maize, and food ingredients") if (ccSiblings[ccIndex - 1].value().equals(",")) { commaLeft = true; } else { left.addChild(ccSiblings[ccIndex - 1]); } // create the CC tree Tree cc = ccSiblings[ccIndex]; // create the right tree int nextCC; if (ccPositions.isEmpty()) { nextCC = ccSiblings.length; } else { nextCC = ccPositions.get(0); } String rightHead = getHeadTag(ccSiblings[nextCC - 1]); Tree right = tf.newTreeNode(lf.newLabel(rightHead), null); for (int i = ccIndex + 1; i < nextCC - 1; i++) { right.addChild(ccSiblings[i]); } // handle the case of a comma ("GM soya and maize, and food ingredients") if (ccSiblings[nextCC - 1].value().equals(",")) { commaRight = true; } else { right.addChild(ccSiblings[nextCC - 1]); } if (VERBOSE) { if (left.numChildren() == 0) { System.out.println("Youch! No left children"); } if (right.numChildren() == 0) { System.out.println("Youch! No right children"); } } // put trees together in old t, first we remove the old nodes for (int i = 0; i < nextCC; i++) { t.removeChild(0); } if (!ccPositions.isEmpty()) { // need an extra level Tree tree = tf.newTreeNode(lf.newLabel("NP"), null); if (preconj) { tree.addChild(conjT); } if (left.numChildren() > 0) { tree.addChild(left); } if (commaLeft) { tree.addChild(ccSiblings[ccIndex - 1]); } tree.addChild(cc); if (right.numChildren() > 0) { tree.addChild(right); } if (commaRight) { t.addChild(0, ccSiblings[nextCC - 1]); } t.addChild(0, tree); } else { if (preconj) { t.addChild(conjT); } if (left.numChildren() > 0) { t.addChild(left); } if (commaLeft) { t.addChild(ccSiblings[ccIndex - 1]); } t.addChild(cc); if (right.numChildren() > 0) { t.addChild(right); } if (commaRight) { t.addChild(ccSiblings[nextCC - 1]); } } } if (VERBOSE) { log.info("transformCC out: " + t); } return t; } private static boolean notNP(List<Tree> children, int ccIndex) { for (int i = ccIndex, sz = children.size(); i < sz; i++) { if (children.get(i).value().startsWith("NP")) { return false; } } return true; } /* * Given a tree t, if this tree contains a CC inside a NP followed by 2 nodes * (i.e. we have a flat structure that will not work for the dependencies), * it will call transform CC on the NP containing the CC and the index of the * CC, and then return the root of the whole transformed tree. * If it finds no such tree, this method returns null. */ private static Tree findCCparent(Tree t, Tree root) { if (t.isPreTerminal()) { if (t.value().startsWith("CC")) { Tree parent = t.parent(root); if (parent != null && parent.value().startsWith("NP")) { List<Tree> children = parent.getChildrenAsList(); //System.out.println(children); int ccIndex = children.indexOf(t); if (children.size() > ccIndex + 2 && notNP(children, ccIndex) && ccIndex != 0 && (ccIndex == children.size() - 1 || !children.get(ccIndex+1).value().startsWith("CC"))) { transformCC(parent, ccIndex); if (VERBOSE) { log.info("After transformCC: " + root); } return root; } } } } else { for (Tree child : t.getChildrenAsList()) { Tree cur = findCCparent(child, root); if (cur != null) { return cur; } } } return null; } /** * Multi-word expression patterns */ private static TregexPattern[] MWE_PATTERNS = { TregexPattern.compile("@CONJP <1 (RB=node1 < /^(?i)as$/) <2 (RB=node2 < /^(?i)well$/) <- (IN=node3 < /^(?i)as$/)"), //as well as TregexPattern.compile("@ADVP|CONJP <1 (RB=node1 < /^(?i)as$/) <- (IN|RB=node2 < /^(?i)well$/)"), //as well TregexPattern.compile("@PP < ((JJ=node1 < /^(?i)such$/) $+ (IN=node2 < /^(?i)as$/))"), //such as TregexPattern.compile("@PP < ((JJ|IN=node1 < /^(?i)due$/) $+ (IN|TO=node2 < /^(?i)to$/))"), //due to TregexPattern.compile("@PP|CONJP < ((IN|RB=node1 < /^(?i)(because|instead)$/) $+ (IN=node2 < of))"), //because of/instead of TregexPattern.compile("@ADVP|SBAR < ((IN|RB=node1 < /^(?i)in$/) $+ (NN=node2 < /^(?i)case$/))"), //in case TregexPattern.compile("@ADVP|PP < ((IN|RB=node1 < /^(?i)of$/) $+ (NN|RB=node2 < /^(?i)course$/))"), //of course TregexPattern.compile("@SBAR|PP < ((IN|RB=node1 < /^(?i)in$/) $+ (NN|NP|RB=node2 [< /^(?i)order$/ | <: (NN < /^(?i)order$/)]))"), //in order TregexPattern.compile("@PP|CONJP|SBAR < ((IN|RB=node1 < /^(?i)rather$/) $+ (IN=node2 < /^(?i)than$/))"), //rather than TregexPattern.compile("@CONJP < ((IN|RB=node1 < /^(?i)not$/) $+ (TO=node2 < /^(?i)to$/ $+ (VB|RB=node3 < /^(?i)mention$/)))"), //not to mention TregexPattern.compile("@PP|SBAR < ((JJ|IN|RB=node1 < /^(?i)so$/) $+ (IN|TO=node2 < /^(?i)that$/))"), //so that TregexPattern.compile("@SBAR < ((IN|RB=node1 < /^(?i)as$/) $+ (IN=node2 < /^(?i)if$/))"), //as if TregexPattern.compile("@PP < ((JJ|RB=node1 < /^(?i)prior$/) $+ (TO|IN=node2 < /^(?i)to$/))"), //prior to TregexPattern.compile("@PP < ((IN=node1 < /^(?i)as$/) $+ (TO|IN=node2 < /^(?i)to$/))"), //as to TregexPattern.compile("@ADVP < ((RB|NN=node1 < /^(?i)kind$/) $+ (IN|RB=node2 < /^(?i)of$/))"), //kind of TregexPattern.compile("@SBAR < ((IN|RB=node1 < /^(?i)whether$/) $+ (CC=node2 < /^(?i)or$/ $+ (RB=node3 < /^(?i)not$/)))"), //whether or not TregexPattern.compile("@CONJP < ((IN=node1 < /^(?i)as$/) $+ (VBN=node2 < /^(?i)opposed$/ $+ (TO|IN=node3 < /^(?i)to$/)))"), //as opposed to TregexPattern.compile("@ADVP|CONJP < ((VB|RB|VBD=node1 < /^(?i)let$/) $+ (RB|JJ=node2 < /^(?i)alone$/))"), //let alone //TODO: "so as to" TregexPattern.compile("@ADVP|PP < ((IN|RB=node1 < /^(?i)in$/) $+ (IN|NP|PP|RB|ADVP=node2 [< /^(?i)between$/ | <: (IN|RB < /^(?i)between$/)]))"), //in between TregexPattern.compile("@ADVP|QP|ADJP < ((DT|RB=node1 < /^(?i)all$/) $+ (CC|RB|IN=node2 < /^(?i)but$/))"), //all but TregexPattern.compile("@ADVP|INTJ < ((NN|DT|RB=node1 < /^(?i)that$/) $+ (VBZ|RB=node2 < /^(?i)is$/))"), //that is TregexPattern.compile("@WHADVP < ((WRB=node1 < /^(?i:how)$/) $+ (VB=node2 < /^(?i)come$/))"), //how come TregexPattern.compile("@VP < ((VBD=node1 < had|'d) $+ (@PRT|ADVP=node2 <: (RBR < /^(?i)better$/)))"), //had better TregexPattern.compile("@QP|XS < ((JJR|RBR|IN=node1 < /^(?i)(more|less)$/) $+ (IN=node2 < /^(?i)than$/))"), //more/less than TregexPattern.compile("@QP < ((JJR|RBR|IN=node1 < /^(?i)up$/) $+ (IN|TO=node2 < /^(?i)to$/))"), //up to TregexPattern.compile("@S|SQ|VP|ADVP|PP < (@ADVP < ((IN|RB=node1 < /^(?i)at$/) $+ (JJS|RBS=node2 < /^(?i)least$/)) !$+ (RB < /(?i)(once|twice)/))"), //at least }; private static TsurgeonPattern MWE_OPERATION = Tsurgeon.parseOperation("[createSubtree MWE node1 node2] [if exists node3 move node3 $- node2]"); private static TregexPattern ACCORDING_TO_PATTERN = TregexPattern.compile("PP=pp1 < (VBG=node1 < /^(?i)according$/ $+ (PP=pp2 < (TO|IN=node2 < to)))"); private static TsurgeonPattern ACCORDING_TO_OPERATION = Tsurgeon.parseOperation("[createSubtree MWE node1] [move node2 $- node1] [excise pp2 pp2]"); /* "but also" is not a MWE, so break up the CONJP. */ private static TregexPattern BUT_ALSO_PATTERN = TregexPattern.compile("CONJP=conjp < (CC=cc < but) < (RB=rb < also) ?$+ (__=nextNode < (__ < __))"); private static TsurgeonPattern BUT_ALSO_OPERATION = Tsurgeon.parseOperation("[move cc $- conjp] [move rb $- cc] [if exists nextNode move rb >1 nextNode] [createSubtree ADVP rb] [delete conjp]"); /* at least / at most / at best / at worst / ... should be treated as if "at" was a preposition and the RBS was a noun. Assumes that the MWE "at least" has already been extracted. */ private static TregexPattern AT_RBS_PATTERN = TregexPattern.compile("@ADVP|QP < ((IN|RB=node1 < /^(?i)at$/) $+ (JJS|RBS=node2))"); private static TsurgeonPattern AT_RBS_OPERATION = Tsurgeon.parseOperation("[relabel node1 IN] [createSubtree ADVP node1] [move node2 $- node1] [createSubtree NP node2]"); /* at all should be treated like a PP. */ private static TregexPattern AT_ALL_PATTERN = TregexPattern.compile("@ADVP=head < (RB|IN=node1 < /^(?i)at$/ $+ (RB|DT=node2 < /^(?i)all$/))"); private static TsurgeonPattern AT_ALL_OPERATION = Tsurgeon.parseOperation("[relabel head PP] [relabel node1 IN] [createSubtree NP node2]"); /** * Puts all multi-word expressions below a single constituent labeled "MWE". * Patterns for multi-word expressions are defined in MWE_PATTERNS. */ public static Tree MWETransform(Tree t) { for (TregexPattern p: MWE_PATTERNS) { Tsurgeon.processPattern(p, MWE_OPERATION, t); } Tsurgeon.processPattern(ACCORDING_TO_PATTERN, ACCORDING_TO_OPERATION, t); Tsurgeon.processPattern(BUT_ALSO_PATTERN, BUT_ALSO_OPERATION, t); Tsurgeon.processPattern(AT_RBS_PATTERN, AT_RBS_OPERATION, t); Tsurgeon.processPattern(AT_ALL_PATTERN, AT_ALL_OPERATION, t); return t; } private static TregexPattern FLAT_PREP_CC_PATTERN = TregexPattern.compile("PP <, (/^(IN|TO)$/=p1 $+ (CC=cc $+ /^(IN|TO)$/=p2))"); private static TsurgeonPattern FLAT_PREP_CC_OPERATION = Tsurgeon.parseOperation("[createSubtree PCONJP p1 cc] [move p2 $- cc]"); public static Tree prepCCTransform(Tree t) { Tsurgeon.processPattern(FLAT_PREP_CC_PATTERN, FLAT_PREP_CC_OPERATION, t); return t; } public static void main(String[] args) { CoordinationTransformer transformer = new CoordinationTransformer(null); Treebank tb = new MemoryTreebank(); Properties props = StringUtils.argsToProperties(args); String treeFileName = props.getProperty("treeFile"); if (treeFileName != null) { try { TreeReader tr = new PennTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFileName))), new LabeledScoredTreeFactory()); for (Tree t ; (t = tr.readTree()) != null; ) { tb.add(t); } } catch (IOException e) { throw new RuntimeException("File problem: " + e); } } for (Tree t : tb) { System.out.println("Original tree"); t.pennPrint(); System.out.println(); System.out.println("Tree transformed"); Tree tree = transformer.transformTree(t); tree.pennPrint(); System.out.println(); System.out.println("----------------------------"); } } }