// Tsurgeon // Copyright (c) 2004-2010 The Board of Trustees of // The Leland Stanford Junior University. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // // For more information, bug reports, fixes, contact: // Christopher Manning // Dept of Computer Science, Gates 1A // Stanford CA 94305-9010 // USA // Support/Questions: parser-user@lists.stanford.edu // Licensing: parser-support@lists.stanford.edu // http://www-nlp.stanford.edu/software/tregex.shtml package edu.stanford.nlp.trees.tregex.tsurgeon; import edu.stanford.nlp.trees.*; import edu.stanford.nlp.trees.tregex.TregexPattern; import edu.stanford.nlp.trees.tregex.TregexMatcher; import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.Pair; import java.util.*; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.io.*; /** Tsurgeon provides a way of editing trees based on a set of operations that * are applied to tree locations matching a tregex pattern. * A simple example from the command-line: * <blockquote> * java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon -treeFile atree * exciseNP renameVerb * </blockquote> * The file <code>atree</code> has Penn Treebank (S-expression) format trees. * The other (here, two) files have Tsurgeon operations. These consist of * a list of pairs of a tregex expression on one or more * lines, a blank line, and then some number of lines of Tsurgeon operations and then * another blank line. * <p> * Tsurgeon uses the Tregex engine to match tree patterns on trees; * for more information on Tregex's tree-matching functionality, * syntax, and semantics, please see the documentation for the * {@link TregexPattern} class. * <p> * If you want to use Tsurgeon as an API, the relevant method is * {@link #processPattern}. You will also need to look at the * {@link TsurgeonPattern} class and the {@link Tsurgeon#parseOperation} method. * <p> * Here's the simplest form of invocation on a single Tree: * <pre> * Tree t = Tree.valueOf("(ROOT (S (NP (NP (NNP Bank)) (PP (IN of) (NP (NNP America)))) (VP (VBD called)) (. .)))"); * TregexPattern pat = TregexPattern.compile("NP <1 (NP << Bank) <2 PP=remove"); * TsurgeonPattern surgery = Tsurgeon.parseOperation("excise remove remove"); * Tsurgeon.processPattern(pat, surgery, t).pennPrint(); * </pre> * <p> * Here is another sample invocation: * <pre> * TregexPattern matchPattern = TregexPattern.compile("SQ=sq < (/^WH/ $++ VP)"); * List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); * * TsurgeonPattern p = Tsurgeon.parseOperation("relabel sq S"); * * ps.add(p); * * Treebank lTrees; * List<Tree> result = Tsurgeon.processPatternOnTrees(matchPattern,Tsurgeon.collectOperations(ps),lTrees); * </pre> * <p> * <i>Note:</i> If you want to apply multiple surgery patterns, you will not want to call * processPatternOnTrees, for each individual pattern. Rather, you should either call * processPatternsOnTree and loop through the trees yourself, or, as above, collect all the * surgery patterns into one TsurgeonPattern, and then to call processPatternOnTrees. * Either of these latter methods is much faster. * <p> * For more information on using Tsurgeon from the command line, * see the {@link #main} method and the package Javadoc. * * @author Roger Levy */ public class Tsurgeon { private static final boolean DEBUG = false; static boolean verbose; // = false; private static final Pattern emptyLinePattern = Pattern.compile("^\\s*$"); private static final String commentIntroducingCharacter = "%"; private static final Pattern commentPattern = Pattern.compile("(?<!\\\\)%.*$"); private static final Pattern escapedCommentCharacterPattern = Pattern.compile("\\\\" + commentIntroducingCharacter); private Tsurgeon() {} // not an instantiable class /** Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n * * <h4>Arguments:</h4> * * Each argument should be the name of a transformation file that contains a list of pattern * and transformation operation list pairs. That is, it is a sequence of pairs of a * {@link TregexPattern} pattern on one or more lines, then a * blank line (empty or whitespace), then a list of transformation operations one per line * (as specified by <b>Legal operation syntax</b> below) to apply when the pattern is matched, * and then another blank line (empty or whitespace). * Note the need for blank lines: The code crashes if they are not present as separators * (although the blank line at the end of the file can be omitted). * The script file can include comment lines, either whole comment lines or * trailing comments introduced by %, which extend to the end of line. A needed percent * mark can be escaped by a preceding backslash. * <p> * For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node, * and relabel the SQ node to S, your transformation file would look like this: * * <blockquote> * <code> * SBARQ=n1 < SQ=n2<br> * <br> * excise n1 n1<br> * relabel n2 S * </code> * </blockquote> * * <h4>Options:</h4> * <ul> * <li><code>-treeFile <filename></code> specify the name of the file that has the trees you want to transform. * <li><code>-po <matchPattern> <operation></code> Apply a single operation to every tree using the specified match pattern and the specified operation. Use this option * when you want to quickly try the effect of one pattern/surgery combination, and are too lazy to write a transformation file. * <li><code>-s</code> Print each output tree on one line (default is pretty-printing). * <li><code>-m</code> For every tree that had a matching pattern, print "before" (prepended as "Operated on:") and "after" (prepended as "Result:"). Unoperated trees just pass through the transducer as usual. * <li><code>-encoding X</code> Uses character set X for input and output of trees. * </ul> * * <h4>Legal operation syntax:</h4> * * <ul> * <li><code>delete <name></code> deletes the node and everything below it. * <li><code>prune <name></code> Like delete, but if, after the pruning, the parent has no children anymore, the parent is pruned too. * <li><code>excise <name1> <name2></code> * The name1 node should either dominate or be the same as the name2 node. This excises out everything from * name1 to name2. All the children of name2 go into the parent of name1, where name1 was. * <li><code>relabel <name> <new-label></code> Relabels the node to have the new label. " There are three possible forms: <code>relabel nodeX VP</code> - for changing a node label to an alphanumeric string, " <code>relabel nodeX /''/</code> - for relabeling a node to something that isn't a valid identifier without quoting, and " <code>relabel nodeX /^VB(.*)$/verb\\/$1/</code> - for regular expression based relabeling. In the last case, all matches + " of the regular expression against the node label are replaced with the replacement String. This has the semantics of " Java/Perl's replaceAll: you may use capturing groups and put them in replacements with $n. Also, as in the example " you can escape a slash in the middle of the second and third forms with \\/ and \\\\. * <li><code>insert <name> <position></code> or <code>insert <tree> <position></code> * inserts the named node or tree into the position specified. * <li><code>move <name> <position></code> moves the named node into the specified position * <p>Right now the only ways to specify position are: * <p> * <code>$+ <name></code> the left sister of the named node<br> * <code>$- <name></code> the right sister of the named node<br> * <code>>i</code> the i_th daughter of the named node<br> * <code>>-i</code> the i_th daughter, counting from the right, of the named node. * <li><code>replace <name1> <name2></code> or <code>replace <name1> <tree></code> * deletes name1 and inserts tree or a copy of name2 in its place. * <li><code>adjoin <auxiliary_tree> <name></code> Adjoins the specified auxiliary tree into the named node. * The daughters of the target node will become the daughters of the foot of the auxiliary tree. * <li><code>adjoinH <auxiliary_tree> <name></code> Similar to adjoin, but preserves the target node * and makes it the root of <tree>. (It is still accessible as <code>name</code>. The root of the * auxiliary tree is ignored.) * <li> <code>adjoinF <auxiliary_tree> <name></code></dt> Similar to adjoin, * but preserves the target node and makes it the foot of <tree>. * (It is still accessible as <code>name</code>, and retains its status as parent of its children. * The root of the auxiliary tree is ignored.) * <li> <dt><code>coindex <name1> <name2> ... <nameM> </code> Puts a (Penn Treebank style) * coindexation suffix of the form "-N" on each of nodes name_1 through name_m. The value of N will be * automatically generated in reference to the existing coindexations in the tree, so that there is never * an accidental clash of indices across things that are not meant to be coindexed. * </ul> * * @param args a list of names of files each of which contains a single tregex matching pattern plus a list, one per line, * of transformation operations to apply to the matched pattern. * @throws Exception If an I/O or pattern syntax error */ public static void main(String[] args) throws Exception { String encoding = "UTF-8"; String encodingOption = "-encoding"; if(args.length==0) { System.err.println("Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>"); System.exit(0); } String treePrintFormats; String singleLineOption = "-s"; String verboseOption = "-v"; String matchedOption = "-m"; // if set, then print original form of trees that are matched & thus operated on String patternOperationOption = "-po"; String treeFileOption = "-treeFile"; Map<String,Integer> flagMap = new HashMap<String,Integer>(); flagMap.put(patternOperationOption,2); flagMap.put(treeFileOption,1); flagMap.put(singleLineOption,0); flagMap.put(encodingOption,1); Map<String,String[]> argsMap = StringUtils.argsToMap(args,flagMap); args = argsMap.get(null); if(argsMap.containsKey(verboseOption)) verbose = true; if(argsMap.containsKey(singleLineOption)) treePrintFormats = "oneline,"; else treePrintFormats = "penn,"; if(argsMap.containsKey(encodingOption)) encoding = argsMap.get(encodingOption)[0]; TreePrint tp = new TreePrint(treePrintFormats, new PennTreebankLanguagePack()); PrintWriter pwOut = new PrintWriter(new OutputStreamWriter(System.out,encoding), true); tp.setPrintWriter(pwOut); Treebank trees = new DiskTreebank(new TregexPattern.TRegexTreeReaderFactory(), encoding); if (argsMap.containsKey(treeFileOption)) { trees.loadPath(argsMap.get(treeFileOption)[0]); } List<Pair<TregexPattern,TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern,TsurgeonPattern>>(); if (argsMap.containsKey(patternOperationOption)) { TregexPattern matchPattern = TregexPattern.compile(argsMap.get(patternOperationOption)[0]); TsurgeonPattern p = parseOperation(argsMap.get(patternOperationOption)[1]); ops.add(new Pair<TregexPattern,TsurgeonPattern>(matchPattern,p)); } else { for (String arg : args) { List<Pair<TregexPattern,TsurgeonPattern>> pairs = getOperationsFromFile(arg, encoding); for (Pair<TregexPattern,TsurgeonPattern> pair : pairs) { if (verbose) { System.err.println(pair.second()); } ops.add(pair); } } } for (Tree t : trees ) { Tree original = t.deepCopy(); Tree result = processPatternsOnTree(ops, t); if (argsMap.containsKey(matchedOption) && matchedOnTree) { pwOut.println("Operated on: "); displayTree(original,tp,pwOut); pwOut.println("Result: "); } displayTree(result,tp,pwOut); } } private static void displayTree(Tree t, TreePrint tp, PrintWriter pw) { if (t==null) { System.out.println("null"); } else { tp.printTree(t,pw); } } /** * Parses a tsurgeon script text input and compiles a tregex pattern and a list * of tsurgeon operations into a pair. * * @param reader Reader to read patterns from * @return A pair of a tregex and tsurgeon pattern read from a file, or <code>null</code> * when the operations in the Reader have been exhausted * @throws IOException If any IO problem */ public static Pair<TregexPattern, TsurgeonPattern> getOperationFromReader(BufferedReader reader) throws IOException { String patternString = getPatternFromFile(reader); // System.err.println("Read tregex pattern: " + patternString); if ("".equals(patternString)) { return null; } TregexPattern matchPattern; try { matchPattern = TregexPattern.compile(patternString); } catch (edu.stanford.nlp.trees.tregex.ParseException e) { System.err.println("Error parsing your tregex pattern:\n" + patternString); throw new RuntimeException(e); } TsurgeonPattern collectedPattern = getTsurgeonOperationsFromReader(reader); return new Pair<TregexPattern,TsurgeonPattern>(matchPattern,collectedPattern); } /** * Assumes that we are at the beginning of a tsurgeon script file and gets the string for the * tregex pattern leading the file * @return tregex pattern string */ public static String getPatternFromFile(BufferedReader reader) throws IOException { StringBuilder matchString = new StringBuilder(); for (String thisLine; (thisLine = reader.readLine()) != null; ) { if (matchString.length() > 0 && emptyLinePattern.matcher(thisLine).matches()) { // A blank line after getting some real content (not just comments or nothing) break; } Matcher m = commentPattern.matcher(thisLine); if (m.matches()) { // delete it thisLine = m.replaceFirst(""); } if ( ! emptyLinePattern.matcher(thisLine).matches()) { matchString.append(thisLine); } } return matchString.toString(); } /** * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and parses * these out, collecting them into one operation. Stops on a whitespace line. * * @throws IOException */ public static TsurgeonPattern getTsurgeonOperationsFromReader(BufferedReader reader) throws IOException { List<TsurgeonPattern> operations = new ArrayList<TsurgeonPattern>(); for (String thisLine; (thisLine = reader.readLine()) != null; ) { if (emptyLinePattern.matcher(thisLine).matches()) { break; } thisLine = removeComments(thisLine); if (emptyLinePattern.matcher(thisLine).matches()) { continue; } // System.err.println("Read tsurgeon op: " + thisLine); try { operations.add(TsurgeonParser.parse(thisLine)); } catch (ParseException e) { System.err.println("Error parsing your Tsurgeon operation:\n" + thisLine); throw new RuntimeException(e.toString()); } } return collectOperations(operations); } private static String removeComments(String line) { Matcher m = commentPattern.matcher(line); line = m.replaceFirst(""); Matcher m1 = escapedCommentCharacterPattern.matcher(line); line = m1.replaceAll(commentIntroducingCharacter); return line; } /** * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and returns * them as a String, mirroring the way the strings appear in the file. This is helpful * for lazy evaluation of the operations, as in a GUI, * because you do not parse the operations on load. Comments are still excised. * @throws IOException */ public static String getTsurgeonTextFromReader(BufferedReader reader) throws IOException { StringBuilder sb = new StringBuilder(); for (String thisLine; (thisLine = reader.readLine()) != null; ) { thisLine = removeComments(thisLine); if (emptyLinePattern.matcher(thisLine).matches()) { continue; } sb.append(thisLine); sb.append('\n'); } return sb.toString(); } /** * Parses a tsurgeon script file and compiles all operations in the file into a list * of pairs of tregex and tsurgeon patterns. * * @param filename file containing the tsurgeon script * @return A pair of a tregex and tsurgeon pattern read from a file * @throws IOException If there is any I/O problem */ public static List<Pair<TregexPattern, TsurgeonPattern>> getOperationsFromFile(String filename, String encoding) throws IOException { List<Pair<TregexPattern,TsurgeonPattern>> operations = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding)); for ( ; ; ) { Pair<TregexPattern, TsurgeonPattern> operation = getOperationFromReader(reader); if (operation == null) { break; } operations.add(operation); } reader.close(); return operations; } /** * Applies {#processPattern} to a collection of trees. * @param matchPattern A {@link TregexPattern} to be matched against a {@link Tree}. * @param p A {@link TsurgeonPattern} to apply. * @param inputTrees The input trees to be processed * @return A List of the transformed trees */ public static List<Tree> processPatternOnTrees(TregexPattern matchPattern, TsurgeonPattern p, Collection<Tree> inputTrees) { List<Tree> result = new ArrayList<Tree>(); for (Tree tree : inputTrees) result.add(processPattern(matchPattern,p,tree)); return result; } /** * Tries to match a pattern against a tree. If it succeeds, apply the surgical operations contained in a {@link TsurgeonPattern}. * @param matchPattern A {@link TregexPattern} to be matched against a {@link Tree}. * @param p A {@link TsurgeonPattern} to apply. * @param t the {@link Tree} to match against and perform surgery on. * @return t, which has been surgically modified. */ public static Tree processPattern(TregexPattern matchPattern, TsurgeonPattern p, Tree t) { TregexMatcher m = matchPattern.matcher(t); while(m.find()) { t = p.evaluate(t,m); if(t==null) break; m = matchPattern.matcher(t); } return t; } private static boolean matchedOnTree; // hack-in field for seeing whether there was a match. public static Tree processPatternsOnTree(List<Pair<TregexPattern, TsurgeonPattern>> ops, Tree t) { matchedOnTree = false; for (Pair<TregexPattern,TsurgeonPattern> op : ops) { try { if (DEBUG) { System.err.println("Running pattern " + op.first()); } TregexMatcher m = op.first().matcher(t); while (m.find()) { matchedOnTree = true; t = op.second().evaluate(t,m); if (t == null) { return null; } m = op.first().matcher(t); } } catch (NullPointerException npe) { throw new RuntimeException("Tsurgeon.processPatternsOnTree failed to match label for pattern: " + op.first() + ", " + op.second(), npe); } } return t; } /** * Parses an operation string into a {@link TsurgeonPattern}. Throws an {@link IllegalArgumentException} if * the operation string is ill-formed. * <p> * Example of use: * <p> * <tt> * TsurgeonPattern p = Tsurgeon.parseOperation("prune ed"); * </tt> * @param operationString The operation to perform, as a text string * @return the operation pattern. */ public static TsurgeonPattern parseOperation(String operationString) { try { return new TsurgeonPatternRoot(new TsurgeonPattern[] {TsurgeonParser.parse(operationString)} ); } catch(ParseException e) { throw new IllegalArgumentException("Ill-formed operation string: " + operationString, e); } } /** * Collects a list of operation patterns into a sequence of operations to be applied. Required to keep track of global properties * across a sequence of operations. For example, if you want to insert a named node and then coindex it with another node, * you will need to collect the insertion and coindexation operations into a single TsurgeonPattern so that tsurgeon is aware * of the name of the new node and coindexation becomes possible. * @param patterns a list of {@link TsurgeonPattern} operations that you want to collect together into a single compound operation * @return a new {@link TsurgeonPattern} that performs all the operations in the sequence of the <code>patterns</code> argument */ public static TsurgeonPattern collectOperations(List<TsurgeonPattern> patterns) { return new TsurgeonPatternRoot(patterns.toArray(new TsurgeonPattern[patterns.size()])); } }