CaseFinder.java example

Explorer
avro-master
- doc
  - examples
    - java-example
      - src
        main
        java
        example
        GenericMain.java
        SpecificMain.java
    - mr-example
      - src
        main
        java
        example
        AvroWordCount.java
        GenerateData.java
        MapReduceAvroWordCount.java
        MapReduceColorCount.java
        MapredColorCount.java
- lang
  - java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.avro.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/** Parser for files containing test cases consisting of
 * <code><String,String></code> pairs, where the first string is
 * the input to the test case, and the second string is the expected
 * output of the test case.
 *
 * <p> A test-case file is a sequence of <a
 * href="https://en.wikipedia.org/wiki/Here_document">here documents</a>
 * ("heredocs"), very similar in syntax to Unix Shell heredocs.
 * Heredocs labeled "INPUT" indicate the start of a new case, and
 * these INPUT heredocs are the inputs of test cases.  Following an
 * "INPUT" heredoc can come zero or more "expected-output" heredocs.
 * Each of these expected-output heredocs defines what we call a
 * <dfn>subcase</dfn>.  The assumption here is that for each
 * interesting test input, there are often multiple different tests
 * one could run, each with different expected outputs.
 *
 * <p> Consumers of this class call the {@link #find} method to find
 * all subcases marked with a given label.  For example, imagine the
 * following test-case file:
 * <blockquote> <pre>
 *    <<INPUT 0
 *    <<VALUE 0
 *    <<PPRINT 0
 *    <<INPUT 1+1
 *    <<VALUE 2
 *    <<PPRINT 1 + 1
 *    <<SEXP
 *    (+ 1 1)
 *    SEXP
 * </pre> </blockquote>
 * Calling {@link #find} on the label "VALUE" will return two test
 * cases, the pair <code><"0","0"></code> and
 * <code><"1+1","2"></code>.  Calling it on the label "PPRINT"
 * will return <code><"0","0"></code> and <code><"1+1","1 +
 * 1"></code>.  Notice that there need not be a subcase for every
 * INPUT.  In the case of "SEXP", for example, {@link #find} will
 * return only the single pair <code><"1+1","(+ 1 1)"></code>.
 *
 * <p> There are two forms of heredocs, single-line and multi-line.
 * The examples above (except "SEXP") are single-line heredocs.  The
 * general syntax for these is:
 * <blockquote> <pre>
 * ^<<([a-zA-Z][_a-zA-Z0-9]*) (.*)$
 * </pre> </blockquote>
 * The first group in this regex is the label of the heredoc, and the
 * second group is the text of the heredoc.  A single space separates
 * the two groups and is not part of the heredoc (subsequent spaces
 * <em>will</em> be included in the heredoc).  A "line terminator" as
 * defined by the Java language (i.e., CR, LF, or CR followed by LF)
 * terminates a single-line heredoc but is not included in the text
 * of the heredoc.
 *
 * <p> As the name implies, multi-line heredocs are spread across
 * multiple lines, as in this example:
 * <blockquote> <pre>
 *    <<INPUT
 *    1
 *    +1 +
 *    1
 *    INPUT
 *    <<VALUE 3
 *    <<PPRINT 1 + 1 + 1
 * </pre> </blockquote>
 * In this case, the input to the test case is spread across multiple
 * lines (the line terminators in these documents are preserved as
 * part of the document text).  Multi-line heredocs can be used for
 * both the inputs of test cases and the expected outputs of them.
 *
 * <p> The syntax of multi-line heredocs obeys the following pseudo-regex:
 * <blockquote> <pre>
 * ^<<([a-zA-Z][_a-zA-Z0-9]*)$(.*)$^\1$
 * </pre> </blockquote>
 * That is, as illustrated by the example, a multi-line heredoc named
 * "LABEL" consists of the text <code>&lt;&lt;LABEL</code> on a line by
 * itself, followed by the text of the heredoc, followed by the text
 * <code>LABEL</code> on a line by itself (if LABEL starts a line but
 * is not the <em>only</em> text on that line, then that entire line
 * is part of the heredoc, and the heredoc is not terminated by that
 * line).
 *
 * <p>In multi-line heredocs, neither the line terminator that
 * terminates the start of the document, nor the one just before the
 * label that ends the heredoc, are part of the text of the heredoc.
 * Thus, for example, the text of the multi-line input from above
 * would be exactly <code>"1\n+1 +\n1"</code>.  If you want a new
 * line at the end of a multi-line heredoc, put a blank line before
 * the label ending the heredoc.
 *
 * <p>Also in multi-line heredocs, line-terminators within the heredoc
 * are normalized to line-feeds ('\n').  Thus, for example, when a
 * test file written on a Windows machine is parsed on any machine,
 * the Windows-style line terminators within heredocs will be
 * translated to Unix-style line terminators, no matter what platform
 * the tests are run on.
 *
 * <p> Note that lines between heredocs are ignored, and can be used
 * to provide spacing between and/or commentary on the test cases.
 */
public class CaseFinder {
  /** Two-character prefix that opens every heredoc start line. */
  private static final String HEREDOC_PREFIX = "<<";
  /** Label of the heredoc that starts a new test case. */
  private static final String NEW_CASE_NAME = "INPUT";
  private static final String LABEL_REGEX = "[a-zA-Z][_a-zA-Z0-9]*";
  /** Precompiled form of {@link #LABEL_REGEX}, used to validate labels. */
  private static final Pattern LABEL_PATTERN = Pattern.compile(LABEL_REGEX);
  /** Start line of a heredoc: group 1 is the label, group 2 the rest of
    * the line (empty for a multi-line heredoc). */
  private static final Pattern START_LINE_PATTERN
    = Pattern.compile("^<<("+LABEL_REGEX+")(.*)$");

  /** Scan test-case file <code>in</code> looking for test subcases
    * marked with <code>label</code>.  Any such cases are appended
    * (in order) to the <code>cases</code> parameter as
    * <code>{input, expectedOutput}</code> arrays.  If <code>label</code>
    * equals the string <code>"INPUT"</code>, then the appended pairs
    * are <code>{input, null}</code>, one for every heredoc named INPUT
    * found in the stream.
    *
    * <p>A line only counts as the start of a heredoc named
    * <code>label</code> if the label ends right after
    * <code>&lt;&lt;label</code>; a search for "VALUE" therefore does
    * not pick up a heredoc named "VALUES".  If a case contains several
    * heredocs named <code>label</code>, only the first one is used.
    *
    * <p>Note that while looking for markers this method does not skip
    * over the bodies of multi-line heredocs carrying other labels, so a
    * body line that itself looks like a marker for INPUT or
    * <code>label</code> is treated as one.
    *
    * <p>Once <code>label</code> has been validated, <code>in</code> is
    * always closed before this method returns or throws.
    *
    * @param in reader positioned at the start of the test-case file
    * @param label name of the subcases to collect
    * @param cases list the matching cases are appended to
    * @return the <code>cases</code> list that was passed in
    * @throws IllegalArgumentException if <code>label</code> is not a
    *   valid heredoc label (the reader is left untouched in that case)
    * @throws IOException if reading fails, if a single-line heredoc
    *   lacks the space after its label, or if a multi-line heredoc is
    *   not terminated before end-of-file */
  public static List<Object[]> find(BufferedReader in, String label,
                                    List<Object[]> cases)
    throws IOException
  {
    if (! LABEL_PATTERN.matcher(label).matches())
      throw new IllegalArgumentException("Bad case subcase label: " + label);

    final boolean inputsOnly = label.equals(NEW_CASE_NAME);

    try {
      for (String line = in.readLine();;) {
        // Find next new case
        while (line != null && !startsHereDoc(line, NEW_CASE_NAME))
          line = in.readLine();
        if (line == null) break;
        String input = processHereDoc(in, line);

        if (inputsOnly) {
          cases.add(new Object[] { input, null });
          line = in.readLine();
          continue;
        }

        // Check to see if there's a subcase named "label" for that case;
        // stop at the next case (or end-of-file) if there is none.
        do {
          line = in.readLine();
        } while (line != null
                 && !startsHereDoc(line, NEW_CASE_NAME)
                 && !startsHereDoc(line, label));
        if (line == null || startsHereDoc(line, NEW_CASE_NAME)) continue;
        String expectedOutput = processHereDoc(in, line);

        cases.add(new Object[] { input, expectedOutput });
      }
    } finally {
      // Close on failure too, so a malformed file does not leak the reader.
      in.close();
    }
    return cases;
  }

  /** Returns true if <code>line</code> begins with
    * <code>&lt;&lt;label</code> and the label ends there, i.e. the
    * marker is followed by end-of-line or by a character that cannot
    * be part of a label.  Malformed start lines such as
    * <code>&lt;&lt;INPUT-x</code> are still reported as matches so
    * that {@link #processHereDoc} gets to reject them. */
  private static boolean startsHereDoc(String line, String label) {
    if (! line.startsWith(HEREDOC_PREFIX)
        || ! line.startsWith(label, HEREDOC_PREFIX.length()))
      return false;
    final int end = HEREDOC_PREFIX.length() + label.length();
    return line.length() == end || ! isLabelChar(line.charAt(end));
  }

  /** Returns true if <code>c</code> may appear after the first
    * character of a label (mirrors the character class in
    * {@link #LABEL_REGEX}). */
  private static boolean isLabelChar(char c) {
    return c == '_'
      || (c >= 'a' && c <= 'z')
      || (c >= 'A' && c <= 'Z')
      || (c >= '0' && c <= '9');
  }

  /** Reads and returns content of a heredoc.  Assumes
    * <code>docStart</code> is the start-of-heredoc line that was just
    * read from <code>in</code>.  For a single-line heredoc the text
    * after the separating space is returned and nothing more is read.
    * For a multi-line heredoc, lines are read up to and including the
    * line consisting solely of the heredoc's label; the body lines are
    * joined with line-feeds ('\n') and no line terminator follows the
    * final body line.
    *
    * @throws IllegalArgumentException if <code>docStart</code> is not
    *   the start of a heredoc
    * @throws IOException if a single-line heredoc lacks the space
    *   after its label, or if end-of-file is reached before a
    *   multi-line heredoc is terminated */
  private static String processHereDoc(BufferedReader in, String docStart)
    throws IOException
  {
    Matcher m = START_LINE_PATTERN.matcher(docStart);
    if (! m.matches())
      throw new IllegalArgumentException("Wasn't given the start of a heredoc (\""+docStart+"\")");
    String docName = m.group(1);

    // Determine if this is a single-line heredoc, and process if it is
    String singleLineText = m.group(2);
    if (singleLineText.length() != 0) {
      if (! singleLineText.startsWith(" "))
        throw new IOException("Single-line heredoc missing initial space (\""+docStart+"\")");
      return singleLineText.substring(1);
    }

    // Process multi-line heredocs: join the body lines with '\n'
    StringBuilder result = new StringBuilder();
    boolean firstLine = true;
    for (String line = in.readLine(); ; line = in.readLine()) {
      if (line == null)
        throw new IOException("Here document (" + docName
                              + ") terminated by end-of-file.");
      if (line.equals(docName)) return result.toString();
      if (! firstLine) result.append('\n');
      result.append(line);
      firstLine = false;
    }
  }
}