/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.learnstructure;

import java.io.*;
import java.util.*;
import java.util.regex.*;

/*********************************************************
 * Tokenizer transforms a line of text into a list of Token objects.
 * Each Token is one of a handful of classes.
 *********************************************************/
public class Tokenizer {
  // The components of possible date patterns
  static String monthPatternStrs[] = {
    "(January|Jan|jan|February|Feb|feb|March|Mar|mar|April|Apr|apr|May|may|June|Jun|jun|July|Jul|jul|August|Aug|aug|September|Sep|sep|October|Oct|oct|November|Nov|nov|December|Dec|dec)",
    "([01]*\\d)"};
  static String dateSeparatorPatternStrs[] = {"(?:\\s+)", "(?:\\.)", "(?:\\/)"};
  static String dateDayPatternStr = "([0123]?\\d)";
  static String dateYearPatternStr = "([12]\\d{3})";
  static List<Pattern> monthFirstPatterns = new ArrayList<Pattern>();
  static List<Pattern> yearFirstPatterns = new ArrayList<Pattern>();
  static List<Pattern> dayFirstPatterns = new ArrayList<Pattern>();

  static Pattern ipAddrPattern = Pattern.compile("((?:(?:\\d+\\.){3,}\\d+)|(?:\\*\\.(?:(?:\\d+|\\*)\\.)*(?:\\d+|\\*)))");
  static Pattern permissionBitPattern = Pattern.compile("([drwx-]{9,})");
  static Pattern timePattern1 = Pattern.compile("(\\d\\d):(\\d\\d):(\\d\\d)");
  static Pattern timePattern2 = Pattern.compile("(\\d\\d):(\\d\\d)");
  static Pattern intPattern = Pattern.compile("([-+]?\\d+)");
  static Pattern intRangePattern = Pattern.compile("(\\d+)-(\\d+)");
  static Pattern floatPattern = Pattern.compile("([+-]?\\d*\\.\\d+)");
  static Pattern floatRangePattern = Pattern.compile("(\\d*\\.\\d+)-(\\d*\\.\\d+)");
  static Pattern stringPattern = Pattern.compile("((?:[\\S&&[^\\\"\\,\\;\\|\\[\\]\\{\\}\\<\\>\\(\\)\\']]){2,})");
  static Pattern charPattern = Pattern.compile("(\\S)");
  static Pattern eolPattern = Pattern.compile("(\\n)");
  static Pattern wsPattern = Pattern.compile("(\\s+)");
  static HashMap<String, String> complements;
  static HashMap<String, String> reverseComplements;

  /**
   * attemptParse() tries to parse the input string with the given token class.
   * If successful, it returns the remaining string and adds the token to the given list.
   * If not successful, it returns null and does not modify the given list.
   *
   * This method is used in two places:
   * 1) Text tokenization during the structure-learning phase.
   * 2) Guided parsing, after a learned structure-parser has been constructed.
   *
   * For case #1, we expect that a loop will call attemptParse() repeatedly until it finds a
   * token type that parses correctly. This is what happens inside tokenize() below.
   *
   * For case #2, we expect that the parse tree will contain a specific token type that *must*
   * be parsed, or else that branch of the parse tree is invalid. This is what happens inside
   * InferredType.BaseType.internalParse().
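   *
   * For illustration, a hypothetical case-#1 caller might look like the sketch below
   * (the input string is invented for this example):
   * <pre>{@code
   *   List<Token.AbstractToken> toks = new ArrayList<Token.AbstractToken>();
   *   String remainder = Tokenizer.attemptParse(Token.INT_TOKENCLASSID, null, "42 rest", toks);
   *   // On success, remainder is " rest" and toks holds one IntToken; on failure,
   *   // remainder is null and toks is unchanged, so the caller can try the next class.
   * }</pre>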
   */
  public static String attemptParse(int tokenClassId, String tokenParameter, String inputStr, List<Token.AbstractToken> outputToks) {
    switch (tokenClassId) {
    case Token.IPADDR_TOKENCLASSID: {
      Matcher m = ipAddrPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.IPAddrToken(m.group(1)));
        return cutChunk(m, inputStr);
      } else {
        return null;
      }
    }
    case Token.PERMISSIONS_TOKENCLASSID: {
      Matcher m = permissionBitPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.PermissionBits(m.group(1)));
        return cutChunk(m, inputStr);
      } else {
        return null;
      }
    }
    case Token.DATE_TOKENCLASSID: {
      for (Pattern p: monthFirstPatterns) {
        Matcher m = p.matcher(inputStr);
        if (m.lookingAt()) {
          // Two groups means a year-free "month day" form; three means "month day year".
          if (m.groupCount() == 2) {
            try {
              outputToks.add(new Token.DateToken(m.group(2), m.group(1)));
            } catch (IOException iex) {
              continue;
            }
          } else {
            try {
              outputToks.add(new Token.DateToken(m.group(2), m.group(1), m.group(3)));
            } catch (IOException iex) {
              continue;
            }
          }
          return cutChunk(m, inputStr);
        }
      }
      for (Pattern p: yearFirstPatterns) {
        Matcher m = p.matcher(inputStr);
        if (m.lookingAt()) {
          try {
            outputToks.add(new Token.DateToken(m.group(3), m.group(2), m.group(1)));
          } catch (IOException iex) {
            continue;
          }
          return cutChunk(m, inputStr);
        }
      }
      for (Pattern p: dayFirstPatterns) {
        Matcher m = p.matcher(inputStr);
        if (m.lookingAt()) {
          if (m.groupCount() == 2) {
            try {
              outputToks.add(new Token.DateToken(m.group(1), m.group(2)));
            } catch (IOException iex) {
              continue;
            }
          } else {
            try {
              outputToks.add(new Token.DateToken(m.group(1), m.group(2), m.group(3)));
            } catch (IOException iex) {
              continue;
            }
          }
          return cutChunk(m, inputStr);
        }
      }
      return null;
    }
    case Token.TIME_TOKENCLASSID: {
      Matcher m = timePattern1.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.TimeToken(m.group(1), m.group(2), m.group(3)));
        return cutChunk(m, inputStr);
      }
      m = timePattern2.matcher(inputStr);
      if (m.lookingAt()) {
        // No seconds field; default to "00".
        outputToks.add(new Token.TimeToken(m.group(1), m.group(2), "00"));
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.CHAR_TOKENCLASSID: {
      Matcher m = charPattern.matcher(inputStr);
      if (m.lookingAt()) {
        // If a specific character was requested, any other character fails the parse.
        if (tokenParameter != null && ! tokenParameter.equals("" + m.group(1).charAt(0))) {
          return null;
        }
        outputToks.add(new Token.CharToken(m.group(1).charAt(0)));
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.FLOAT_TOKENCLASSID: {
      Matcher m = floatPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.FloatToken(m.group(1)));
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.INT_TOKENCLASSID: {
      Matcher m = intPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.IntToken(m.group(1)));
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.STRING_TOKENCLASSID: {
      Matcher m = stringPattern.matcher(inputStr);
      if (m.lookingAt()) {
        Matcher m2 = intPattern.matcher(inputStr);
        Matcher m3 = floatPattern.matcher(inputStr);
        if (! (m2.lookingAt() || m3.lookingAt())) {
          outputToks.add(new Token.StringToken(m.group(1)));
          return cutChunk(m, inputStr);
        }
      }
      return null;
    }
    // CHAR???
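    // Note on the STRING case above: a candidate is rejected if an int or float pattern also
    // matches at the start of the input. For example, "123abc" matches stringPattern, but
    // intPattern.lookingAt() succeeds on its "123" prefix, so the STRING parse fails here;
    // the tokenize() loop below would then emit IntToken(123) followed by StringToken("abc").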
    case Token.EOL_TOKENCLASSID: {
      Matcher m = eolPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.EOLToken());
        return cutChunk(m, inputStr);
      }
      return null;
    }
    case Token.WHITESPACE_TOKENCLASSID: {
      Matcher m = wsPattern.matcher(inputStr);
      if (m.lookingAt()) {
        outputToks.add(new Token.WhitespaceToken());
        return cutChunk(m, inputStr);
      }
      return null;
    }
    default: {
      return null;
    }
    }
  }

  /**
   * cutChunk() returns whatever remains of curS after the matched region is removed.
   */
  private static String cutChunk(Matcher m, String curS) {
    int lastGroupChar = m.end(m.groupCount());
    if (curS.length() > lastGroupChar) {
      return curS.substring(lastGroupChar);
    } else {
      return "";
    }
  }

  final static int CHAR_TOKENCLASSID = 1;
  final static int IPADDR_TOKENCLASSID = 2;
  final static int PERMISSIONS_TOKENCLASSID = 3;
  final static int DATE_TOKENCLASSID = 4;
  final static int TIME_TOKENCLASSID = 5;
  final static int INT_TOKENCLASSID = 6;
  final static int FLOAT_TOKENCLASSID = 7;
  final static int STRING_TOKENCLASSID = 8;
  final static int EOL_TOKENCLASSID = 9;
  final static int WHITESPACE_TOKENCLASSID = 10;
  final static int NOOP_TOKENCLASSID = 11;

  static {
    complements = new HashMap<String, String>();
    complements.put("[", "]");
    complements.put("{", "}");
    complements.put("\"", "\"");
    complements.put("'", "'");
    complements.put("<", ">");
    complements.put("(", ")");

    reverseComplements = new HashMap<String, String>();
    reverseComplements.put("]", "[");
    reverseComplements.put("}", "{");
    reverseComplements.put("\"", "\"");
    reverseComplements.put("'", "'");
    reverseComplements.put(">", "<");
    reverseComplements.put(")", "(");

    // Construct the date patterns
    for (String separatorPatternStr: dateSeparatorPatternStrs) {
      for (String monthPatternStr: monthPatternStrs) {
        // Create all legal combos of month, day, year, and separator
        monthFirstPatterns.add(Pattern.compile(monthPatternStr + separatorPatternStr + dateDayPatternStr + separatorPatternStr + dateYearPatternStr));
        yearFirstPatterns.add(Pattern.compile(dateYearPatternStr + separatorPatternStr + monthPatternStr + separatorPatternStr + dateDayPatternStr));
        dayFirstPatterns.add(Pattern.compile(dateDayPatternStr + separatorPatternStr + monthPatternStr + separatorPatternStr + dateYearPatternStr));
      }
    }
    // Year-free forms (e.g., "Jan 14"); these only use the named-month alternative.
    for (String separatorPatternStr: dateSeparatorPatternStrs) {
      monthFirstPatterns.add(Pattern.compile(monthPatternStrs[0] + separatorPatternStr + dateDayPatternStr));
      dayFirstPatterns.add(Pattern.compile(dateDayPatternStr + separatorPatternStr + monthPatternStrs[0]));
    }
  }

  /**
   * Accepts a single line of input and returns all the tokens for that line.
   * If the line cannot be parsed, we return null.
   */
  static public List<Token.AbstractToken> tokenize(String s) throws IOException {
    String curS = s;
    List<Token.AbstractToken> toksSoFar = new ArrayList<Token.AbstractToken>();

    // We now repeatedly pass through a series of text-extractor tests.
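    // The tests run in fixed priority order: meta-brackets first, then IP address,
    // permission bits, date, time, ranges, float, int, string, char, EOL, whitespace.
    // For illustration (a hypothetical trace, not verified output), a line such as
    //   "drwxr-xr-x mjc 4096 Jan 14 2011 12:30 src"
    // would yield roughly: PermissionBits, Whitespace, StringToken("mjc"), Whitespace,
    // IntToken(4096), Whitespace, DateToken, Whitespace, TimeToken, Whitespace,
    // StringToken("src").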
    while (curS.length() > 0) {
      // META
      char startChar = curS.charAt(0);
      if (complements.get("" + startChar) != null) {
        String closeChar = complements.get("" + startChar);
        int closeIndex = curS.indexOf(closeChar, 1);
        if (closeIndex >= 0) {
          // Recursively tokenize the bracketed region and wrap it in a MetaToken.
          toksSoFar.add(new Token.MetaToken(new Token.CharToken(curS.charAt(0)),
                                            new Token.CharToken(closeChar.charAt(0)),
                                            tokenize(curS.substring(1, closeIndex))));
          curS = curS.substring(closeIndex + 1);
          continue;
        }
      }

      // IP ADDR
      String attemptStr = attemptParse(Token.IPADDR_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // PERMISSION BITS
      attemptStr = attemptParse(Token.PERMISSIONS_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      //
      // DATE
      //
      // Because of the huge number of possible date patterns, and our desire not to perform
      // multi-token parsing, the date processing here is a bit of a mess.
      //
      attemptStr = attemptParse(Token.DATE_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // TIME
      attemptStr = attemptParse(Token.TIME_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // FLOAT RANGE
      Matcher m = floatRangePattern.matcher(curS);
      if (m.lookingAt()) {
        toksSoFar.add(new Token.FloatToken(m.group(1)));
        toksSoFar.add(new Token.CharToken('-'));
        toksSoFar.add(new Token.FloatToken(m.group(2)));
        curS = cutChunk(m, curS);
        continue;
      }

      // INTEGER RANGE
      // REMIND - mjc - Should there be a dedicated Token class for ranges?
      m = intRangePattern.matcher(curS);
      if (m.lookingAt()) {
        toksSoFar.add(new Token.IntToken(m.group(1)));
        toksSoFar.add(new Token.CharToken('-'));
        toksSoFar.add(new Token.IntToken(m.group(2)));
        curS = cutChunk(m, curS);
        continue;
      }

      // FLOAT
      attemptStr = attemptParse(Token.FLOAT_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // INTEGER
      attemptStr = attemptParse(Token.INT_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // STRING
      attemptStr = attemptParse(Token.STRING_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // CHAR
      attemptStr = attemptParse(Token.CHAR_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // EOL
      attemptStr = attemptParse(Token.EOL_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // WHITESPACE
      attemptStr = attemptParse(Token.WHITESPACE_TOKENCLASSID, null, curS, toksSoFar);
      if (attemptStr != null) {
        curS = attemptStr;
        continue;
      }

      // DEFAULT
      // If execution reaches this point, no pattern applied, so the line cannot be parsed.
      return null;
    }
    return toksSoFar;
  }

  ///////////////////////////////////////////////////
  // main() tests the Tokenizer.
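  //
  // Example invocation (file name is hypothetical):
  //   java com.cloudera.recordbreaker.learnstructure.Tokenizer /var/log/messages -verbose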
  ////////////////////////////////////////////////////
  public static void main(String argv[]) throws IOException {
    if (argv.length < 1) {
      System.err.println("Usage: Tokenizer <datafile> (-verbose)");
      return;
    }
    File f = new File(argv[0]).getCanonicalFile();
    boolean verbose = false;
    for (int i = 1; i < argv.length; i++) {
      if ("-verbose".equals(argv[i])) {
        verbose = true;
      }
    }
    System.err.println("Input file: " + f.getCanonicalPath());

    // Store parse errors and results
    List<Integer> unparseableLineNos = new ArrayList<Integer>();
    List<String> unparseableStrs = new ArrayList<String>();
    List<Integer> parseableLineNos = new ArrayList<Integer>();
    List<List<Token.AbstractToken>> allChunks = new ArrayList<List<Token.AbstractToken>>();

    // Transform the text into a list of "chunks".
    // A single chunk corresponds to a line of text. A chunk is a list of Tokens.
    int totalCount = 0;
    int parsedCount = 0;
    int errorCount = 0;
    BufferedReader in = new BufferedReader(new FileReader(f));
    try {
      String s = in.readLine();
      int lineno = 0;
      while (s != null) {
        List<Token.AbstractToken> chunkToks = Tokenizer.tokenize(s);
        if (chunkToks != null) {
          allChunks.add(chunkToks);
          parseableLineNos.add(lineno);
          parsedCount++;
        } else {
          unparseableStrs.add(s);
          unparseableLineNos.add(lineno);
          errorCount++;
        }
        s = in.readLine();
        lineno++;
        totalCount++;
      }
    } finally {
      in.close();
    }
    System.err.println();
    System.err.println("Total lines: " + totalCount);
    System.err.println("Parsed lines: " + parsedCount + " (" + (1.0 * parsedCount / totalCount) + ")");
    System.err.println("Error lines: " + errorCount + " (" + (1.0 * errorCount / totalCount) + ")");

    //
    // Print out parsed tokens
    //
    if (verbose) {
      System.err.println();
      System.err.println("--RESULTS--------");
      int i = 0;
      for (List<Token.AbstractToken> chunk: allChunks) {
        System.err.print(parseableLineNos.get(i) + ". ");
        for (Token.AbstractToken tok: chunk) {
          System.err.print(tok + " ");
        }
        System.err.println();
        i++;
      }

      //
      // Print out error strings
      //
      System.err.println();
      System.err.println("--ERRORS---------");
      i = 0;
      for (String s: unparseableStrs) {
        System.err.println(unparseableLineNos.get(i) + ". " + s);
        i++;
      }
    }
  }
}