Tsurgeon.java example

Explorer

clade-master
- stanford-ner-2011-09-14
  - NERDemo.java
  - src
    - edu
      - stanford
        nlp
        classify
        AbstractLinearClassifierFactory.java
        AdaptedGaussianPriorObjectiveFunction.java
        BiasedLogConditionalObjectiveFunction.java
        Classifier.java
        ClassifierCreator.java
        ClassifierFactory.java
        CrossValidator.java
        Dataset.java
        GeneralDataset.java
        GeneralizedExpectationObjectiveFunction.java
        LinearClassifier.java
        LinearClassifierFactory.java
        LogConditionalObjectiveFunction.java
        LogPrior.java
        PRCurve.java
        ProbabilisticClassifier.java
        ProbabilisticClassifierCreator.java
        RVFClassifier.java
        RVFDataset.java
        SemiSupervisedLogConditionalObjectiveFunction.java
        WeightedDataset.java
        fsm
        DFSA.java
        DFSAState.java
        DFSATransition.java
        ie
        AbstractSequenceClassifier.java
        AcquisitionsPrior.java
        EmpiricalNERPrior.java
        EntityCachingAbstractSequencePrior.java
        NERDemo.java
        NERFeatureFactory.java
        NERServer.java
        SeminarsPrior.java
        UniformPrior.java
        crf
        CRFClassifier.java
        CRFClassifierEvaluator.java
        CRFCliqueTree.java
        CRFDatum.java
        CRFFeatureExporter.java
        CRFLabel.java
        CRFLogConditionalObjectiveFloatFunction.java
        CRFLogConditionalObjectiveFunction.java
        FactorTable.java
        FloatFactorTable.java
        NERGUI.java
        pascal
        AcronymModel.java
        Alignment.java
        AlignmentFactory.java
        CliqueTemplates.java
        DateTemplate.java
        DefaultTeXHyphenData.java
        InfoTemplate.java
        PascalTemplate.java
        Prior.java
        RelationalModel.java
        TeXHyphenator.java
        international
        morph
        MorphoFeatureSpecification.java
        MorphoFeatures.java
        io
        BZip2PipedOutputStream.java
        EncodingFileReader.java
        EncodingPrintWriter.java
        ExtensionFileFilter.java
        IOUtils.java
        NumberRangesFileFilter.java
        RegExFileFilter.java
        RuntimeIOException.java
        ling
        AnnotationLookup.java
        BasicDatum.java
        CategoryWordTag.java
        CategoryWordTagFactory.java
        CoreAnnotation.java
        CoreAnnotations.java
        CoreLabel.java
        CyclicCoreLabel.java
        Datum.java
        Document.java
        Featurizable.java
        HasCategory.java
        HasContext.java
        HasIndex.java
        HasOffset.java
        HasTag.java
        HasWord.java
        Label.java
        LabelFactory.java
        Labeled.java
        LabeledWord.java
        RVFDatum.java
        Sentence.java
        StringLabel.java
        StringLabelFactory.java
        TaggedWord.java
        TaggedWordFactory.java
        ValueLabel.java
        Word.java
        WordFactory.java
        WordLemmaTag.java
        WordLemmaTagFactory.java
        WordTag.java
        WordTagFactory.java
        math
        ADMath.java
        ArrayMath.java
        DoubleAD.java
        SloppyMath.java
        maxent
        Convert.java
        movetrees
        EmptyTreeLeaf.java
        HasTrace.java
        objectbank
        DelimitRegExIterator.java
        IdentityFunction.java
        IteratorFromReaderFactory.java
        LineIterator.java
        ObjectBank.java
        ReaderIteratorFactory.java
        ResettableReaderIteratorFactory.java
        TokenizerFactory.java
        optimization
        AbstractCachingDiffFloatFunction.java
        AbstractCachingDiffFunction.java
        AbstractStochasticCachingDiffFunction.java
        AbstractStochasticCachingDiffUpdateFunction.java
        CGMinimizer.java
        CmdEvaluator.java
        DiffFloatFunction.java
        DiffFunction.java
        Evaluator.java
        FloatFunction.java
        Function.java
        GoldenSectionLineSearch.java
        HasEvaluators.java
        HasFloatInitial.java
        HasInitial.java
        HybridMinimizer.java
        LineSearcher.java
        MemoryEvaluator.java
        Minimizer.java
        QNMinimizer.java
        ResultStoringFloatMonitor.java
        ResultStoringMonitor.java
        SGDMinimizer.java
        SGDToQNMinimizer.java
        SMDMinimizer.java
        SQNMinimizer.java
        ScaledSGDMinimizer.java
        StochasticCalculateMethods.java
        StochasticDiffFunctionTester.java
        StochasticInPlaceMinimizer.java
        StochasticMinimizer.java
        pipeline
        Annotation.java
        ChunkAnnotationUtils.java
        CoreMapAttributeAggregator.java
        LabeledChunkIdentifier.java
        process
        AbstractTokenizer.java
        Americanize.java
        CoreLabelTokenFactory.java
        CoreTokenFactory.java
        LexedTokenFactory.java
        ListProcessor.java
        Morpha.java
        Morphology.java
        PTB2TextLexer.java
        PTBLexer.java
        PTBTokenizer.java
        Tokenizer.java
        TokenizerAdapter.java
        WhitespaceLexer.java
        WhitespaceTokenizer.java
        WordShapeClassifier.java
        WordToSentenceProcessor.java
        WordTokenFactory.java
        sequences
        BeamBestSequenceFinder.java
        BestSequenceFinder.java
        Clique.java
        CoNLLDocumentReaderAndWriter.java
        ColumnDocumentReaderAndWriter.java
        CoolingSchedule.java
        DocumentReaderAndWriter.java
        ExactBestSequenceFinder.java
        FactoredSequenceListener.java
        FactoredSequenceModel.java
        FeatureFactory.java
        KBestSequenceFinder.java
        LatticeWriter.java
        ObjectBankWrapper.java
        PlainTextDocumentReaderAndWriter.java
        SeqClassifierFlags.java
        SequenceGibbsSampler.java
        SequenceListener.java
        SequenceModel.java
        SequenceSampler.java
        ViterbiSearchGraphBuilder.java
        stats
        AbstractCounter.java
        AccuracyStats.java
        ClassicCounter.java
        Counter.java
        Counters.java
        Distribution.java
        GeneralizedCounter.java
        IntCounter.java
        MultiClassAccuracyStats.java
        MultiClassChunkEvalStats.java
        MultiClassPrecisionRecallExtendedStats.java
        MultiClassPrecisionRecallStats.java
        ProbabilityDistribution.java
        Sampler.java
        Scorer.java
        SimpleGoodTuring.java
        TwoDimensionalCounter.java
        trees
        AbstractCollinsHeadFinder.java
        AbstractTreebankLanguagePack.java
        BobChrisTreeNormalizer.java
        CollinsHeadFinder.java
        CollocationFinder.java
        CompositeTreeTransformer.java
        CompositeTreebank.java
        Constituent.java
        ConstituentFactory.java
        CoordinationTransformer.java
        Dependencies.java
        Dependency.java
        DependencyFactory.java
        DependencyTreeTransformer.java
        DiskTreebank.java
        EnglishGrammaticalRelations.java
        GrammaticalRelation.java
        GrammaticalStructure.java
        GrammaticalStructureFactory.java
        HeadFinder.java
        Labeled.java
        LabeledConstituent.java
        LabeledScoredTreeFactory.java
        LabeledScoredTreeNode.java
        LabeledScoredTreeReaderFactory.java
        MemoryTreebank.java
        ModCollinsHeadFinder.java
        NPTmpRetainingTreeNormalizer.java
        NamedDependency.java
        ParentalTreeWrapper.java
        PennTreeReader.java
        PennTreeReaderFactory.java
        PennTreebankLanguagePack.java
        PennTreebankTokenizer.java
        QPTreeTransformer.java
        SemanticHeadFinder.java
        SimpleConstituent.java
        SimpleConstituentFactory.java
        SimpleTree.java
        SimpleTreeFactory.java
        SimpleTreeReaderFactory.java
        TransformingTreebank.java
        Tree.java
        TreeCoreAnnotations.java
        TreeFactory.java
        TreeFunctions.java
        TreeGraph.java
        TreeGraphNode.java
        TreeGraphNodeFactory.java
        TreeNormalizer.java
        TreePrint.java
        TreeReader.java
        TreeReaderFactory.java
        TreeTokenizerFactory.java
        TreeTransformer.java
        TreeVisitor.java
        Treebank.java
        TreebankLanguagePack.java
        Trees.java
        TypedDependency.java
        UnnamedDependency.java
        WordNetConnection.java
        WordStemmer.java
        international
        pennchinese
        CHTBLexer.java
        CHTBTokenizer.java
        CTBTreeReaderFactory.java
        ChineseEnglishWordMap.java
        ChineseHeadFinder.java
        ChineseTreebankLanguagePack.java
        FragDiscardingPennTreeReader.java
        tregex
        CoordinationPattern.java
        DescriptionPattern.java
        ParseException.java
        Relation.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        TregexMatcher.java
        TregexParser.java
        TregexParserConstants.java
        TregexParserTokenManager.java
        TregexPattern.java
        TregexPatternCompiler.java
        VariableStrings.java
        tsurgeon
        AdjoinNode.java
        AdjoinToFootNode.java
        AdjoinToHeadNode.java
        AuxiliaryTree.java
        CoindexNodes.java
        CoindexationGenerator.java
        DeleteNode.java
        ExciseNode.java
        FetchNode.java
        HoldTreeNode.java
        InsertNode.java
        JJTTsurgeonParserState.java
        MoveNode.java
        Node.java
        ParseException.java
        PruneNode.java
        RelabelNode.java
        ReplaceNode.java
        SimpleCharStream.java
        SimpleNode.java
        Token.java
        TokenMgrError.java
        TreeLocation.java
        Tsurgeon.java
        TsurgeonParser.java
        TsurgeonParserConstants.java
        TsurgeonParserTokenManager.java
        TsurgeonParserTreeConstants.java
        TsurgeonPattern.java
        TsurgeonPatternRoot.java
        util
        AbstractIterator.java
        ArrayCoreMap.java
        ArrayHeap.java
        ArrayMap.java
        ArrayUtils.java
        Beam.java
        BinaryHeapPriorityQueue.java
        ByteStreamGobbler.java
        CollectionFactory.java
        CollectionUtils.java
        CollectionValuedMap.java
        ConcatenationIterator.java
        ConcurrentHashSet.java
        CoreMap.java
        DeltaCollectionValuedMap.java
        DeltaMap.java
        ErasureUtils.java
        Factory.java
        FilePathProcessor.java
        FileProcessor.java
        Filter.java
        FilteredIterator.java
        Filters.java
        FixedPrioritiesPriorityQueue.java
        Function.java
        Generics.java
        HasInterval.java
        HashIndex.java
        HashableCoreMap.java
        Heap.java
        IdentityHashSet.java
        Index.java
        IntPair.java
        IntQuadruple.java
        IntTriple.java
        IntTuple.java
        IntUni.java
        Interner.java
        Interval.java
        MapFactory.java
        MemoryMonitor.java
        MetaClass.java
        MutableDouble.java
        MutableInteger.java
        PaddedList.java
        Pair.java
        PriorityQueue.java
        ReflectionLoading.java
        Scored.java
        ScoredComparator.java
        ScoredObject.java
        Sets.java
        StreamGobbler.java
        StringUtils.java
        SystemUtils.java
        Timing.java
        Triple.java
        TypesafeMap.java
        XMLUtils.java
        concurrent
        SynchronizedInterner.java

// Tsurgeon
// Copyright (c) 2004-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//    Support/Questions: parser-user@lists.stanford.edu
//    Licensing: parser-support@lists.stanford.edu
//    http://www-nlp.stanford.edu/software/tregex.shtml

package edu.stanford.nlp.trees.tregex.tsurgeon;

import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Pair;

import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.*;

/** Tsurgeon provides a way of editing trees based on a set of operations that
 *  are applied to tree locations matching a tregex pattern.
 *  A simple example from the command-line:
 *  <blockquote>
 * java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon -treeFile atree
 *        exciseNP renameVerb
 * </blockquote>
 * The file <code>atree</code> has Penn Treebank (S-expression) format trees.
 * The other (here, two) files have Tsurgeon operations.  These consist of
 * a list of pairs of a tregex expression on one or more
 * lines, a blank line, and then some number of lines of Tsurgeon operations and then
 * another blank line.
 * <p>
 * Tsurgeon uses the Tregex engine to match tree patterns on trees;
 * for more information on Tregex's tree-matching functionality,
 * syntax, and semantics, please see the documentation for the
 * {@link TregexPattern} class.
 * <p>

 * If you want to use Tsurgeon as an API, the relevant method is
 * {@link #processPattern}.  You will also need to look at the
 * {@link TsurgeonPattern} class and the {@link Tsurgeon#parseOperation} method.
 * <p>
 * Here's the simplest form of invocation on a single Tree:
 * <pre>
 * Tree t = Tree.valueOf("(ROOT (S (NP (NP (NNP Bank)) (PP (IN of) (NP (NNP America)))) (VP (VBD called)) (. .)))");
 * TregexPattern pat = TregexPattern.compile("NP <1 (NP << Bank) <2 PP=remove");
 * TsurgeonPattern surgery = Tsurgeon.parseOperation("excise remove remove");
 * Tsurgeon.processPattern(pat, surgery, t).pennPrint();
 * </pre>
 * <p>
 * Here is another sample invocation:
 * <pre>
 * TregexPattern matchPattern = TregexPattern.compile("SQ=sq < (/^WH/ $++ VP)");
 * List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>();
 *
 * TsurgeonPattern p = Tsurgeon.parseOperation("relabel sq S");
 *
 * ps.add(p);
 *
 * Treebank lTrees;
 * List<Tree> result = Tsurgeon.processPatternOnTrees(matchPattern,Tsurgeon.collectOperations(ps),lTrees);
 * </pre>
 * <p>
 * <i>Note:</i> If you want to apply multiple surgery patterns, you will not want to call
 * processPatternOnTrees for each individual pattern.  Rather, you should either call
 * processPatternsOnTree and loop through the trees yourself, or, as above, collect all the
 * surgery patterns into one TsurgeonPattern and then call processPatternOnTrees.
 * Either of these latter methods is much faster.
 * <p>
 * For more information on using Tsurgeon from the command line,
 * see the {@link #main} method and the package Javadoc.
 *
 * @author Roger Levy
 */
public class Tsurgeon {

  /** Compile-time switch: when true, processPatternsOnTree prints each tregex pattern to stderr before running it. */
  private static final boolean DEBUG = false;
  /** Set by the -v command-line option in main; when true, main echoes each operation loaded from a script file to stderr. */
  static boolean verbose; // = false;

  /** Matches a line that is empty or consists solely of whitespace. */
  private static final Pattern emptyLinePattern = Pattern.compile("^\\s*$");
  /** The character that introduces a comment in a tsurgeon script. */
  private static final String commentIntroducingCharacter = "%";
  /** Matches from a % that is not immediately preceded by a backslash through to the end of the line. */
  private static final Pattern commentPattern = Pattern.compile("(?<!\\\\)%.*$");
  /** Matches a backslash immediately followed by the comment character, i.e. an escaped %. */
  private static final Pattern escapedCommentCharacterPattern = Pattern.compile("\\\\" + commentIntroducingCharacter);

  /** Private constructor: this class exposes only static members. */
  private Tsurgeon() {} // not an instantiable class

  /** Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n
   *
   * <h4>Arguments:</h4>
   *
   * Each argument should be the name of a transformation file that contains a list of pattern
   * and transformation operation list pairs.  That is, it is a sequence of pairs of a
   * {@link TregexPattern} pattern on one or more lines, then a
   * blank line (empty or whitespace), then a list of transformation operations one per line
   * (as specified by <b>Legal operation syntax</b> below) to apply when the pattern is matched,
   * and then another blank line (empty or whitespace).
   * Note the need for blank lines: The code crashes if they are not present as separators
   * (although the blank line at the end of the file can be omitted).
   * The script file can include comment lines, either whole comment lines or
   * trailing comments introduced by %, which extend to the end of line.  A needed percent
   * mark can be escaped by a preceding backslash.
   * <p>
   * For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node,
   * and relabel the SQ node to S, your transformation file would look like this: 
   *
   * <blockquote>
   * <code>
   *    SBARQ=n1 < SQ=n2<br>
   *    <br>
   *    excise n1 n1<br>
   *    relabel n2 S
   * </code>
   * </blockquote>
   *
   * <h4>Options:</h4>
   * <ul>
   *   <li><code>-treeFile <filename></code>  specify the name of the file that has the trees you want to transform.
   *   <li><code>-po <matchPattern> <operation></code>  Apply a single operation to every tree using the specified match pattern and the specified operation.  Use this option
   *   when you want to quickly try the effect of one pattern/surgery combination, and are too lazy to write a transformation file.
   *   <li><code>-s</code> Print each output tree on one line (default is pretty-printing).
   *   <li><code>-m</code> For every tree that had a matching pattern, print "before" (prepended as "Operated on:") and "after" (prepended as "Result:").  Unoperated trees just pass through the transducer as usual.
   *   <li><code>-encoding X</code> Uses character set X for input and output of trees.
   * </ul>
   *
   * <h4>Legal operation syntax:</h4>
   *
   * <ul>
   *   <li><code>delete <name></code>  deletes the node and everything below it.
   *   <li><code>prune <name></code>  Like delete, but if, after the pruning, the parent has no children anymore, the parent is pruned too.
   * <li><code>excise <name1> <name2></code>
   *   The name1 node should either dominate or be the same as the name2 node.  This excises out everything from
   * name1 to name2.  All the children of name2 go into the parent of name1, where name1 was.
   * <li><code>relabel <name> <new-label></code> Relabels the node to have the new label.
   *     There are three possible forms: <code>relabel nodeX VP</code> - for changing a node label to an alphanumeric string,
   *     <code>relabel nodeX /''/</code> - for relabeling a node to something that isn't a valid identifier without quoting, and
   *     <code>relabel nodeX /^VB(.*)$/verb\\/$1/</code> - for regular expression based relabeling. In the last case, all matches
   *     of the regular expression against the node label are replaced with the replacement String.  This has the semantics of
   *     Java/Perl's replaceAll: you may use capturing groups and put them in replacements with $n. Also, as in the example,
   *     you can escape a slash in the middle of the second and third forms with \\/ and \\\\.
   * <li><code>insert <name> <position></code> or <code>insert <tree> <position></code>
   *   inserts the named node or tree into the position specified.
   * <li><code>move <name> <position></code> moves the named node into the specified position
   * <p>Right now the  only ways to specify position are:
   * <p>
   *      <code>$+ <name></code>     the left sister of the named node<br>
   *      <code>$- <name></code>     the right sister of the named node<br>
   *      <code>>i</code> the i_th daughter of the named node<br>
   *      <code>>-i</code> the i_th daughter, counting from the right, of the named node.
   * <li><code>replace <name1> <name2></code> or <code>replace <name1> <tree></code>
   *     deletes name1 and inserts tree or a copy of name2 in its place.
   * <li><code>adjoin <auxiliary_tree> <name></code> Adjoins the specified auxiliary tree into the named node.
   *     The daughters of the target node will become the daughters of the foot of the auxiliary tree.
   * <li><code>adjoinH <auxiliary_tree> <name></code> Similar to adjoin, but preserves the target node
   *     and makes it the root of <tree>. (It is still accessible as <code>name</code>.  The root of the
   *     auxiliary tree is ignored.)
   * <li> <code>adjoinF <auxiliary_tree> <name></code>  Similar to adjoin,
   *     but preserves the target node and makes it the foot of <tree>.
   *     (It is still accessible as <code>name</code>, and retains its status as parent of its children.
   *     The root of the auxiliary tree is ignored.)
   * <li> <code>coindex <name1> <name2> ... <nameM> </code> Puts a (Penn Treebank style)
   *     coindexation suffix of the form "-N" on each of nodes name_1 through name_m.  The value of N will be
   *     automatically generated in reference to the existing coindexations in the tree, so that there is never
   *     an accidental clash of indices across things that are not meant to be coindexed.
   * </ul>
   *
   * @param args a list of names of files each of which contains a single tregex matching pattern plus a list, one per line,
   *        of transformation operations to apply to the matched pattern.
   * @throws Exception If an I/O or pattern syntax error
   */
  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      System.err.println("Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>");
      System.exit(0);
    }

    String encoding = "UTF-8";
    String encodingOption = "-encoding";
    String singleLineOption = "-s";
    String verboseOption = "-v";
    String matchedOption = "-m"; // if set, then print original form of trees that are matched & thus operated on
    String patternOperationOption = "-po";
    String treeFileOption = "-treeFile";

    // Number of arguments each of these flags is declared to consume.
    Map<String,Integer> flagMap = new HashMap<String,Integer>();
    flagMap.put(patternOperationOption,2);
    flagMap.put(treeFileOption,1);
    flagMap.put(singleLineOption,0);
    flagMap.put(encodingOption,1);
    Map<String,String[]> argsMap = StringUtils.argsToMap(args,flagMap);
    // The entries stored under the null key are used below as the operation-file names.
    args = argsMap.get(null);

    if (argsMap.containsKey(verboseOption)) {
      verbose = true;
    }
    String treePrintFormats = argsMap.containsKey(singleLineOption) ? "oneline," : "penn,";
    if (argsMap.containsKey(encodingOption)) {
      encoding = requiredFlagArgs(argsMap, encodingOption, 1)[0];
    }
    // Looked up once here rather than once per tree in the output loop.
    boolean printMatched = argsMap.containsKey(matchedOption);

    TreePrint tp = new TreePrint(treePrintFormats, new PennTreebankLanguagePack());
    PrintWriter pwOut = new PrintWriter(new OutputStreamWriter(System.out,encoding), true);
    tp.setPrintWriter(pwOut);

    Treebank trees = new DiskTreebank(new TregexPattern.TRegexTreeReaderFactory(), encoding);
    if (argsMap.containsKey(treeFileOption)) {
      trees.loadPath(requiredFlagArgs(argsMap, treeFileOption, 1)[0]);
    }

    List<Pair<TregexPattern,TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern,TsurgeonPattern>>();
    if (argsMap.containsKey(patternOperationOption)) {
      // A single pattern/operation pair given directly on the command line;
      // in this mode the operation-file arguments are not read.
      String[] patternAndOperation = requiredFlagArgs(argsMap, patternOperationOption, 2);
      TregexPattern matchPattern = TregexPattern.compile(patternAndOperation[0]);
      TsurgeonPattern p = parseOperation(patternAndOperation[1]);
      ops.add(new Pair<TregexPattern,TsurgeonPattern>(matchPattern,p));
    } else {
      for (String arg : args) {
        List<Pair<TregexPattern,TsurgeonPattern>> pairs = getOperationsFromFile(arg, encoding);
        for (Pair<TregexPattern,TsurgeonPattern> pair : pairs) {
          if (verbose) {
            System.err.println(pair.second());
          }
          ops.add(pair);
        }
      }
    }

    for (Tree t : trees ) {
      Tree original = t.deepCopy();
      Tree result = processPatternsOnTree(ops, t);
      // matchedOnTree is reset and set by processPatternsOnTree for the tree just processed.
      if (printMatched && matchedOnTree) {
        pwOut.println("Operated on: ");
        displayTree(original,tp,pwOut);
        pwOut.println("Result: ");
      }
      displayTree(result,tp,pwOut);
    }
  }

  /**
   * Returns the values supplied for a command-line flag, failing with a readable message
   * (rather than an ArrayIndexOutOfBoundsException at the use site) when too few were given.
   *
   * @param argsMap The parsed command line, mapping each flag to its values
   * @param flag The flag to look up
   * @param count The minimum number of values the flag needs
   * @return The values recorded for the flag; at least <code>count</code> of them
   * @throws IllegalArgumentException If fewer than <code>count</code> values are present
   */
  private static String[] requiredFlagArgs(Map<String,String[]> argsMap, String flag, int count) {
    String[] values = argsMap.get(flag);
    if (values == null || values.length < count) {
      throw new IllegalArgumentException("Option " + flag + " requires " + count + " argument(s)");
    }
    return values;
  }

  /**
   * Writes one tree to the given writer, or the literal text "null" when there is no tree.
   *
   * @param t The tree to print; may be null
   * @param tp The printer used to format a non-null tree
   * @param pw The writer that receives the output
   */
  private static void displayTree(Tree t, TreePrint tp, PrintWriter pw) {
    if (t == null) {
      // Use the supplied writer, not System.out directly, so this line goes through
      // the same encoding and the same buffer as the rest of the output.
      pw.println("null");
    } else {
      tp.printTree(t,pw);
    }
  }

  /**
   * Reads the next entry of a tsurgeon script: the text of a tregex pattern, which is
   * compiled, followed by the tsurgeon operation lines that belong to it, which are
   * collected into a single operation.
   *
   * @param reader Reader to read the script from
   * @return The compiled tregex pattern paired with its collected tsurgeon operations,
   *    or <code>null</code> when no more pattern text could be read from the Reader
   * @throws IOException If any IO problem
   * @throws RuntimeException Wrapping the tregex ParseException if the pattern text does not compile
   */
  public static Pair<TregexPattern, TsurgeonPattern> getOperationFromReader(BufferedReader reader) throws IOException {
    String tregexText = getPatternFromFile(reader);
    if (tregexText.length() == 0) {
      // nothing but blank lines, whole-line comments, or end of input
      return null;
    }

    final TregexPattern compiled;
    try {
      compiled = TregexPattern.compile(tregexText);
    } catch (edu.stanford.nlp.trees.tregex.ParseException parseFailure) {
      System.err.println("Error parsing your tregex pattern:\n" + tregexText);
      throw new RuntimeException(parseFailure);
    }

    // The operation lines follow the pattern in the same reader.
    return new Pair<TregexPattern,TsurgeonPattern>(compiled, getTsurgeonOperationsFromReader(reader));
  }

  /**
   * Reads the text of one tregex pattern from a tsurgeon script.  Lines are concatenated
   * (with no separator) until a blank line is met after some pattern text has been
   * gathered, or until the input ends.  Blank lines before any pattern text are skipped,
   * as are lines that are entirely a comment (i.e., that begin with the comment character);
   * all other lines are taken verbatim.
   *
   * @param reader Reader positioned at the start of a pattern
   * @return The tregex pattern string; empty if no pattern text was found
   * @throws IOException If any IO problem
   */
  public static String getPatternFromFile(BufferedReader reader) throws IOException {
    StringBuilder patternText = new StringBuilder();
    String line = reader.readLine();
    while (line != null) {
      boolean blank = emptyLinePattern.matcher(line).matches();
      if (blank && patternText.length() > 0) {
        // A blank line after getting some real content ends the pattern
        break;
      }
      // matches() must consume the whole line, so only whole-line comments are dropped here
      boolean wholeLineComment = commentPattern.matcher(line).matches();
      if ( ! blank && ! wholeLineComment) {
        patternText.append(line);
      }
      line = reader.readLine();
    }
    return patternText.toString();
  }

  /**
   * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and parses
   * these out, collecting them into one operation.  Stops at the first empty or
   * whitespace-only line, or at the end of input.  Comments are removed from each line
   * before it is parsed, and lines that contain nothing but a comment are skipped.
   *
   * @param reader Reader positioned at the first operation line
   * @return A single pattern combining every operation read, in the order read
   * @throws IOException If any IO problem
   * @throws RuntimeException If a line cannot be parsed as a tsurgeon operation; the
   *    underlying ParseException is attached as the cause
   */
  public static TsurgeonPattern getTsurgeonOperationsFromReader(BufferedReader reader) throws IOException {
    List<TsurgeonPattern> operations = new ArrayList<TsurgeonPattern>();
    for (String thisLine; (thisLine = reader.readLine()) != null; ) {
      // The blank-line test is made on the raw line: a comment-only line does not end the list.
      if (emptyLinePattern.matcher(thisLine).matches()) {
        break;
      }
      thisLine = removeComments(thisLine);
      if (emptyLinePattern.matcher(thisLine).matches()) {
        continue;
      }
      // System.err.println("Read tsurgeon op: " + thisLine);
      try {
        operations.add(TsurgeonParser.parse(thisLine));
      } catch (ParseException e) {
        System.err.println("Error parsing your Tsurgeon operation:\n" + thisLine);
        // Same message as before, but keep the cause so the parser's stack trace is not lost.
        throw new RuntimeException(e.toString(), e);
      }
    }
    return collectOperations(operations);
  }


  /**
   * Strips a trailing comment from a script line and then unescapes the comment character.
   *
   * @param line A line of a tsurgeon script
   * @return The line with the first unescaped comment character and everything after it
   *    removed, and with each backslash-escaped comment character replaced by the bare character
   */
  private static String removeComments(String line) {
    String withoutComment = commentPattern.matcher(line).replaceFirst("");
    return escapedCommentCharacterPattern.matcher(withoutComment).replaceAll(commentIntroducingCharacter);
  }


  /**
   * Assumes the given reader has only tsurgeon operations (not a tregex pattern), and returns
   * them unparsed as text, one operation per line.  This suits lazy evaluation of the
   * operations, as in a GUI, since nothing is parsed on load.  Comments are removed, and
   * lines that are blank once comments are removed are left out.  Reads to the end of the input.
   *
   * @param reader Reader to read operation lines from
   * @return The remaining lines, each terminated by a newline character
   * @throws IOException If any IO problem
   */
  public static String getTsurgeonTextFromReader(BufferedReader reader) throws IOException {
    StringBuilder text = new StringBuilder();
    String rawLine;
    while ((rawLine = reader.readLine()) != null) {
      String stripped = removeComments(rawLine);
      if ( ! emptyLinePattern.matcher(stripped).matches()) {
        text.append(stripped).append('\n');
      }
    }
    return text.toString();
  }

  /**
   * Parses a tsurgeon script file and compiles all operations in the file into a list
   * of pairs of tregex and tsurgeon patterns.  The file is closed before this method
   * returns, whether or not reading and parsing succeed.
   *
   * @param filename file containing the tsurgeon script
   * @param encoding name of the character encoding used to read the file
   * @return A list with one pair of a tregex and tsurgeon pattern per script entry, in file order
   * @throws IOException If there is any I/O problem
   */
  public static List<Pair<TregexPattern, TsurgeonPattern>> getOperationsFromFile(String filename, String encoding) throws IOException {
    List<Pair<TregexPattern,TsurgeonPattern>> operations = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
    InputStream stream = new FileInputStream(filename);
    try {
      // Created inside the try so the stream is also released if the encoding name is rejected.
      BufferedReader reader = new BufferedReader(new InputStreamReader(stream, encoding));
      for (Pair<TregexPattern, TsurgeonPattern> operation; (operation = getOperationFromReader(reader)) != null; ) {
        operations.add(operation);
      }
    } finally {
      // Closing the underlying stream releases the file handle even when parsing throws.
      stream.close();
    }
    return operations;
  }

  /**
   * Applies {@link #processPattern} to each tree of a collection, in iteration order.
   *
   * @param matchPattern A {@link TregexPattern} to be matched against each {@link Tree}.
   * @param p A {@link TsurgeonPattern} to apply wherever the pattern matches.
   * @param inputTrees The input trees to be processed
   * @return A new List holding the result of processing each input tree
   */
  public static List<Tree> processPatternOnTrees(TregexPattern matchPattern, TsurgeonPattern p, Collection<Tree> inputTrees) {
    List<Tree> transformed = new ArrayList<Tree>();
    for (Tree inputTree : inputTrees) {
      Tree outputTree = processPattern(matchPattern, p, inputTree);
      transformed.add(outputTree);
    }
    return transformed;
  }

  /**
   * Repeatedly matches a pattern against a tree and, for each match found, applies the
   * surgical operations contained in a {@link TsurgeonPattern}.  After every application
   * matching starts afresh on the resulting tree; processing ends when the pattern no
   * longer matches or when the operations yield <code>null</code>.
   *
   * @param matchPattern A {@link TregexPattern} to be matched against a {@link Tree}.
   * @param p A {@link TsurgeonPattern} to apply.
   * @param t the {@link Tree} to match against and perform surgery on.
   * @return the tree after surgery; <code>null</code> if an application of the operations returned null.
   */
  public static Tree processPattern(TregexPattern matchPattern, TsurgeonPattern p, Tree t) {
    Tree current = t;
    // A fresh matcher is built on the latest tree after every successful operation.
    for (TregexMatcher m = matchPattern.matcher(current); m.find(); m = matchPattern.matcher(current)) {
      current = p.evaluate(current, m);
      if (current == null) {
        return null;
      }
    }
    return current;
  }

  private static boolean matchedOnTree; // hack-in field for seeing whether there was a match.

  /**
   * Applies a sequence of (tregex pattern, tsurgeon operation) pairs to one tree.  The pairs
   * are handled in list order; each one is applied repeatedly, re-matching on the resulting
   * tree, until its pattern no longer matches, and only then is the next pair tried.
   * As a side effect, the static <code>matchedOnTree</code> flag is cleared on entry and
   * set as soon as any pattern matches.
   *
   * @param ops The pattern/operation pairs to apply
   * @param t The tree to operate on
   * @return The resulting tree, or <code>null</code> as soon as an operation returns null
   * @throws RuntimeException Wrapping any NullPointerException raised while matching or operating
   */
  public static Tree processPatternsOnTree(List<Pair<TregexPattern, TsurgeonPattern>> ops, Tree t) {
    matchedOnTree = false;
    Tree current = t;
    for (Pair<TregexPattern,TsurgeonPattern> op : ops) {
      try {
        if (DEBUG) {
          System.err.println("Running pattern " + op.first());
        }
        // Rebuild the matcher on the updated tree after each successful operation.
        for (TregexMatcher m = op.first().matcher(current); m.find(); m = op.first().matcher(current)) {
          matchedOnTree = true;
          current = op.second().evaluate(current, m);
          if (current == null) {
            return null;
          }
        }
      } catch (NullPointerException npe) {
        throw new RuntimeException("Tsurgeon.processPatternsOnTree failed to match label for pattern: " + op.first() + ", " + op.second(), npe);
      }
    }
    return current;
  }



  /**
   * Parses a single operation string into a {@link TsurgeonPattern}, wrapped in a
   * {@link TsurgeonPatternRoot} of its own.
   * <p>
   * Example of use:
   * <p>
   * <tt>
   * TsurgeonPattern p = Tsurgeon.parseOperation("prune ed");
   * </tt>
   *
   * @param operationString The operation to perform, as a text string
   * @return the operation pattern.
   * @throws IllegalArgumentException If the operation string is ill-formed; the parser's
   *    exception is attached as the cause
   */
  public static TsurgeonPattern parseOperation(String operationString) {
    TsurgeonPattern parsed;
    try {
      parsed = TsurgeonParser.parse(operationString);
    } catch (ParseException e) {
      throw new IllegalArgumentException("Ill-formed operation string: " + operationString, e);
    }
    return new TsurgeonPatternRoot(new TsurgeonPattern[] { parsed });
  }

  /**
   * Bundles a list of operation patterns into one compound operation that applies them in order.
   * Bundling is required to keep track of global properties across a sequence of operations.
   * For example, to insert a named node and then coindex it with another node, the insertion and
   * coindexation operations must be collected into a single TsurgeonPattern so that tsurgeon is
   * aware of the name of the new node and coindexation becomes possible.
   *
   * @param patterns a list of {@link TsurgeonPattern} operations to combine into a single compound operation
   * @return a new {@link TsurgeonPattern} that performs all the operations in the sequence of the <code>patterns</code> argument
   */
  public static TsurgeonPattern collectOperations(List<TsurgeonPattern> patterns) {
    TsurgeonPattern[] sequence = patterns.toArray(new TsurgeonPattern[patterns.size()]);
    return new TsurgeonPatternRoot(sequence);
  }

}