/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.learnstructure;

import java.io.*;
import java.util.*;
import java.util.regex.*;

/*********************************************************
 * Tokenizer transforms a line of text into a list of Token objects.
 * Each Token is one of a handful of classes.
 *********************************************************/
public class Tokenizer {
  // The components of possible date patterns
  static String monthPatternStrs[] = {
    "(January|Jan|jan|February|Feb|feb|March|Mar|mar|April|Apr|apr|May|may|June|Jun|jun|July|Jul|jul|August|Aug|aug|September|Sep|sep|October|Oct|oct|November|Nov|nov|December|Dec|dec)",
    "([01]*\\d)"};
  static String dateSeparatorPatternStrs[] = {"(?:\\s+)", "(?:\\.)", "(?:\\/)"};
  static String dateDayPatternStr = "([0123]?\\d)";
  static String dateYearPatternStr = "([12]\\d{3})";
  static List<Pattern> monthFirstPatterns = new ArrayList<Pattern>();
  static List<Pattern> yearFirstPatterns = new ArrayList<Pattern>();
  static List<Pattern> dayFirstPatterns = new ArrayList<Pattern>();

  static Pattern ipAddrPattern = Pattern.compile("((?:(?:\\d+\\.){3,}\\d+)|(?:\\*\\.(?:(?:\\d+|\\*)\\.)*(?:\\d+|\\*)))");
  static Pattern permissionBitPattern = Pattern.compile("([drwx-]{9,})");
  static Pattern timePattern1 = Pattern.compile("(\\d\\d):(\\d\\d):(\\d\\d)");
  static Pattern timePattern2 = Pattern.compile("(\\d\\d):(\\d\\d)");
  static Pattern intPattern = Pattern.compile("([-+]?\\d+)");
  static Pattern intRangePattern = Pattern.compile("(\\d+)-(\\d+)");
  static Pattern floatPattern = Pattern.compile("([+-]?\\d*\\.\\d+)");
  static Pattern floatRangePattern = Pattern.compile("(\\d*\\.\\d+)-(\\d*\\.\\d+)");
  static Pattern stringPattern = Pattern.compile("((?:[\\S&&[^\\\"\\,\\;\\|\\[\\]\\{\\}\\<\\>\\(\\)\\']]){2,})");
  static Pattern charPattern = Pattern.compile("(\\S)");
  static Pattern eolPattern = Pattern.compile("(\\n)");
  static Pattern wsPattern = Pattern.compile("(\\s+)");
  static HashMap<String, String> complements;
  static HashMap<String, String> reverseComplements;

  /**
   * attemptParse() tries to parse the input string with the given token class.
   * If successful, it returns the remaining string and adds the token to the given list.
   * If not successful, it returns null and does not modify the given list.
   *
   * This method is used in two places:
   * 1) Text tokenization during the structure-learning phase.
   * 2) Guided parsing, after a learned structure-parser has been constructed.
   *
   * For case #1, we expect that a loop will call attemptParse() repeatedly until it finds a
   * token type that parses correctly. This is what happens inside tokenize() below.
   *
   * For case #2, we expect that the parse tree will contain a specific token type that *must*
   * be parsed, or else that branch of the parse tree is invalid. This is what happens inside
   * InferredType.BaseType.internalParse().
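   *
   * For illustration, a hypothetical case-#1 caller might look like the sketch below
   * (the input string is invented for this example):
   * <pre>{@code
   *   List<Token.AbstractToken> toks = new ArrayList<Token.AbstractToken>();
   *   String remainder = Tokenizer.attemptParse(Token.INT_TOKENCLASSID, null, "42 rest", toks);
   *   // On success, remainder is " rest" and toks holds one IntToken; on failure,
   *   // remainder is null and toks is unchanged, so the caller can try the next class.
   * }</pre>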
   */
  public static String attemptParse(int tokenClassId, String tokenParameter, String inputStr, List<Token.AbstractToken> outputToks) {
    switch (tokenClassId) {
    case Token.IPADDR_TOKENCLASSID: {
      Matcher m = ipAddrPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.IPAddrToken(m.group(1)));
        return cutChunk(m, inputStr);
      } else {
        return null;
      }
    }
    case Token.PERMISSIONS_TOKENCLASSID: {
      Matcher m = permissionBitPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.PermissionBits(m.group(1)));
        return cutChunk(m, inputStr);
      } else {
        return null;
      }
    }
    case Token.DATE_TOKENCLASSID: {
      for (Pattern p: monthFirstPatterns) {
        Matcher m = p.matcher(inputStr);
        if (m.lookingAt()) {
          // Two groups means a year-free "month day" form; three means "month day year".
          if (m.groupCount() == 2) {
            try {
              outputToks.add(new Token.DateToken(m.group(2), m.group(1)));
            } catch (IOException iex) {
              continue;
            }
          } else {
            try {
              outputToks.add(new Token.DateToken(m.group(2), m.group(1), m.group(3)));
            } catch (IOException iex) {
              continue;
            }
          }
          return cutChunk(m, inputStr);
        }
      }
      for (Pattern p: yearFirstPatterns) {
        Matcher m = p.matcher(inputStr);
        if (m.lookingAt()) {
          try {
            outputToks.add(new Token.DateToken(m.group(3), m.group(2), m.group(1)));
          } catch (IOException iex) {
            continue;
          }
          return cutChunk(m, inputStr);
        }
      }
      for (Pattern p: dayFirstPatterns) {
        Matcher m = p.matcher(inputStr);
        if (m.lookingAt()) {
          if (m.groupCount() == 2) {
            try {
              outputToks.add(new Token.DateToken(m.group(1), m.group(2)));
            } catch (IOException iex) {
              continue;
            }
          } else {
            try {
              outputToks.add(new Token.DateToken(m.group(1), m.group(2), m.group(3)));
            } catch (IOException iex) {
              continue;
            }
          }
          return cutChunk(m, inputStr);
        }
      }
      return null;
    }
    case Token.TIME_TOKENCLASSID: {
      Matcher m = timePattern1.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.TimeToken(m.group(1), m.group(2), m.group(3)));
        return cutChunk(m, inputStr);
      }
      m = timePattern2.matcher(inputStr);
      if (m.lookingAt()) {
        // No seconds field; default to "00".
        outputToks.add(new Token.TimeToken(m.group(1), m.group(2), "00"));
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.CHAR_TOKENCLASSID: {
      Matcher m = charPattern.matcher(inputStr);
      if (m.lookingAt()) {
        // If a specific character was requested, any other character fails the parse.
        if (tokenParameter != null && ! tokenParameter.equals("" + m.group(1).charAt(0))) {
          return null;
        }
        outputToks.add(new Token.CharToken(m.group(1).charAt(0)));
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.FLOAT_TOKENCLASSID: {
      Matcher m = floatPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.FloatToken(m.group(1)));
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.INT_TOKENCLASSID: {
      Matcher m = intPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.IntToken(m.group(1)));
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.STRING_TOKENCLASSID: {
      Matcher m = stringPattern.matcher(inputStr);
      if (m.lookingAt()) {
        Matcher m2 = intPattern.matcher(inputStr);
        Matcher m3 = floatPattern.matcher(inputStr);
        if (! (m2.lookingAt() || m3.lookingAt())) {
          outputToks.add(new Token.StringToken(m.group(1)));
          return cutChunk(m, inputStr);
        }
      }
      return null;
    }
    // CHAR???
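    // Note on the STRING case above: a candidate is rejected if an int or float pattern also
    // matches at the start of the input. For example, "123abc" matches stringPattern, but
    // intPattern.lookingAt() succeeds on its "123" prefix, so the STRING parse fails here;
    // the tokenize() loop below would then emit IntToken(123) followed by StringToken("abc").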
    case Token.EOL_TOKENCLASSID: {
      Matcher m = eolPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.EOLToken());
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.WHITESPACE_TOKENCLASSID: {
      Matcher m = wsPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.WhitespaceToken());
        return cutChunk(m, inputStr);
      }
      return null;
    }
    default: {
      return null;
    }
    }
  }

  /**
   * cutChunk() returns whatever remains of curS after the matched region is removed.
   */
  private static String cutChunk(Matcher m, String curS) {
    int lastGroupChar = m.end(m.groupCount());
    if (curS.length() > lastGroupChar) {
      return curS.substring(lastGroupChar);
    } else {
      return "";
    }
  }

  final static int CHAR_TOKENCLASSID = 1;
  final static int IPADDR_TOKENCLASSID = 2;
  final static int PERMISSIONS_TOKENCLASSID = 3;
  final static int DATE_TOKENCLASSID = 4;
  final static int TIME_TOKENCLASSID = 5;
  final static int INT_TOKENCLASSID = 6;
  final static int FLOAT_TOKENCLASSID = 7;
  final static int STRING_TOKENCLASSID = 8;
  final static int EOL_TOKENCLASSID = 9;
  final static int WHITESPACE_TOKENCLASSID = 10;
  final static int NOOP_TOKENCLASSID = 11;

  static {
    complements = new HashMap<String, String>();
    complements.put("[", "]");
    complements.put("{", "}");
    complements.put("\"", "\"");
    complements.put("'", "'");
    complements.put("<", ">");
    complements.put("(", ")");

    reverseComplements = new HashMap<String, String>();
    reverseComplements.put("]", "[");
    reverseComplements.put("}", "{");
    reverseComplements.put("\"", "\"");
    reverseComplements.put("'", "'");
    reverseComplements.put(">", "<");
    reverseComplements.put(")", "(");

    // Construct the date patterns
    for (String separatorPatternStr: dateSeparatorPatternStrs) {
      for (String monthPatternStr: monthPatternStrs) {
        // Create all legal combos of month, day, year, and separator
        monthFirstPatterns.add(Pattern.compile(monthPatternStr + separatorPatternStr + dateDayPatternStr + separatorPatternStr + dateYearPatternStr));
        yearFirstPatterns.add(Pattern.compile(dateYearPatternStr + separatorPatternStr + monthPatternStr + separatorPatternStr + dateDayPatternStr));
        dayFirstPatterns.add(Pattern.compile(dateDayPatternStr + separatorPatternStr + monthPatternStr + separatorPatternStr + dateYearPatternStr));
      }
    }
    // Year-free forms (e.g., "Jan 14"); these only use the named-month alternative.
    for (String separatorPatternStr: dateSeparatorPatternStrs) {
      monthFirstPatterns.add(Pattern.compile(monthPatternStrs[0] + separatorPatternStr + dateDayPatternStr));
      dayFirstPatterns.add(Pattern.compile(dateDayPatternStr + separatorPatternStr + monthPatternStrs[0]));
    }
  }

  /**
   * Accepts a single line of input and returns all the tokens for that line.
   * If the line cannot be parsed, we return null.
   */
  static public List<Token.AbstractToken> tokenize(String s) throws IOException {
    String curS = s;
    List<Token.AbstractToken> toksSoFar = new ArrayList<Token.AbstractToken>();

    // We now repeatedly pass through a series of text-extractor tests.
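    // The tests run in fixed priority order: meta-brackets first, then IP address,
    // permission bits, date, time, ranges, float, int, string, char, EOL, whitespace.
    // For illustration (a hypothetical trace, not verified output), a line such as
    //   "drwxr-xr-x mjc 4096 Jan 14 2011 12:30 src"
    // would yield roughly: PermissionBits, Whitespace, StringToken("mjc"), Whitespace,
    // IntToken(4096), Whitespace, DateToken, Whitespace, TimeToken, Whitespace,
    // StringToken("src").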
    while (curS.length() > 0) {
      // META
      char startChar = curS.charAt(0);
      if (complements.get("" + startChar) != null) {
        String closeChar = complements.get("" + startChar);
        int closeIndex = curS.indexOf(closeChar, 1);
        if (closeIndex >= 0) {
          // Recursively tokenize the bracketed region and wrap it in a MetaToken.
          toksSoFar.add(new Token.MetaToken(new Token.CharToken(curS.charAt(0)),
                                            new Token.CharToken(closeChar.charAt(0)),
                                            tokenize(curS.substring(1, closeIndex))));
          curS = curS.substring(closeIndex + 1);
          continue;
        }
      }

      // IP ADDR
      String attemptStr = attemptParse(Token.IPADDR_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // PERMISSION BITS
      attemptStr = attemptParse(Token.PERMISSIONS_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      //
      // DATE
      //
      // Because of the huge number of possible date patterns, and our desire not to perform
      // multi-token parsing, the date processing here is a bit of a mess.
      //
      attemptStr = attemptParse(Token.DATE_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // TIME
      attemptStr = attemptParse(Token.TIME_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // FLOAT RANGE
      Matcher m = floatRangePattern.matcher(curS);
      if (m.lookingAt()) {
        toksSoFar.add(new Token.FloatToken(m.group(1)));
        toksSoFar.add(new Token.CharToken('-'));
        toksSoFar.add(new Token.FloatToken(m.group(2)));
        curS = cutChunk(m, curS);
        continue;
      }

      // INTEGER RANGE
      // REMIND - mjc - Should there be a dedicated Token class for ranges?
      m = intRangePattern.matcher(curS);
      if (m.lookingAt()) {
        toksSoFar.add(new Token.IntToken(m.group(1)));
        toksSoFar.add(new Token.CharToken('-'));
        toksSoFar.add(new Token.IntToken(m.group(2)));
        curS = cutChunk(m, curS);
        continue;
      }

      // FLOAT
      attemptStr = attemptParse(Token.FLOAT_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // INTEGER
      attemptStr = attemptParse(Token.INT_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // STRING
      attemptStr = attemptParse(Token.STRING_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // CHAR
      attemptStr = attemptParse(Token.CHAR_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // EOL
      attemptStr = attemptParse(Token.EOL_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // WHITESPACE
      attemptStr = attemptParse(Token.WHITESPACE_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // DEFAULT
      // If execution reaches this point, no pattern applied, so the line cannot be parsed.
      return null;
    }
    return toksSoFar;
  }

  ///////////////////////////////////////////////////
  // main() tests the Tokenizer.
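  //
  // Example invocation (file name is hypothetical):
  //   java com.cloudera.recordbreaker.learnstructure.Tokenizer /var/log/messages -verbose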
  ////////////////////////////////////////////////////
  public static void main(String argv[]) throws IOException {
    if (argv.length < 1) {
      System.err.println("Usage: Tokenizer <datafile> (-verbose)");
      return;
    }
    File f = new File(argv[0]).getCanonicalFile();
    boolean verbose = false;
    for (int i = 1; i < argv.length; i++) {
      if ("-verbose".equals(argv[i])) {
        verbose = true;
      }
    }
    System.err.println("Input file: " + f.getCanonicalPath());

    // Store parse errors and results
    List<Integer> unparseableLineNos = new ArrayList<Integer>();
    List<String> unparseableStrs = new ArrayList<String>();
    List<Integer> parseableLineNos = new ArrayList<Integer>();
    List<List<Token.AbstractToken>> allChunks = new ArrayList<List<Token.AbstractToken>>();

    // Transform the text into a list of "chunks".
    // A single chunk corresponds to a line of text. A chunk is a list of Tokens.
    int totalCount = 0;
    int parsedCount = 0;
    int errorCount = 0;
    BufferedReader in = new BufferedReader(new FileReader(f));
    try {
      String s = in.readLine();
      int lineno = 0;
      while (s != null) {
        List<Token.AbstractToken> chunkToks = Tokenizer.tokenize(s);
        if (chunkToks != null) {
          allChunks.add(chunkToks);
          parseableLineNos.add(lineno);
          parsedCount++;
        } else {
          unparseableStrs.add(s);
          unparseableLineNos.add(lineno);
          errorCount++;
        }
        s = in.readLine();
        lineno++;
        totalCount++;
      }
    } finally {
      in.close();
    }
    System.err.println();
    System.err.println("Total lines: " + totalCount);
    System.err.println("Parsed lines: " + parsedCount + " (" + (1.0 * parsedCount / totalCount) + ")");
    System.err.println("Error lines: " + errorCount + " (" + (1.0 * errorCount / totalCount) + ")");

    //
    // Print out parsed tokens
    //
    if (verbose) {
      System.err.println();
      System.err.println("--RESULTS--------");
      int i = 0;
      for (List<Token.AbstractToken> chunk: allChunks) {
        System.err.print(parseableLineNos.get(i) + ". ");
        for (Token.AbstractToken tok: chunk) {
          System.err.print(tok + " ");
        }
        System.err.println();
        i++;
      }

      //
      // Print out error strings
      //
      System.err.println();
      System.err.println("--ERRORS---------");
      i = 0;
      for (String s: unparseableStrs) {
        System.err.println(unparseableLineNos.get(i) + ". " + s);
        i++;
      }
    }
  }
}