/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.learnstructure;

import java.io.*;
import java.util.*;

import org.apache.avro.Schema;
import org.apache.avro.util.Utf8;
import org.apache.avro.generic.GenericData;

/*********************************************************
 * Token is one of a handful of data types we expect to appear broadly in
 * log-style data: ints, floats, dates, IP addrs, timestamps, etc, etc.
 *
 * This file contains the superclass Token as well as the many subclasses.
 *********************************************************/
public class Token {
  static int EPOCH_START_YEAR = 1970;

  // Integer identifiers for each token class; used as switch keys below.
  final static int META_TOKENCLASSID = 0;
  final static int CHAR_TOKENCLASSID = 1;
  final static int IPADDR_TOKENCLASSID = 2;
  final static int PERMISSIONS_TOKENCLASSID = 3;
  final static int DATE_TOKENCLASSID = 4;
  final static int TIME_TOKENCLASSID = 5;
  final static int INT_TOKENCLASSID = 6;
  final static int FLOAT_TOKENCLASSID = 7;
  final static int STRING_TOKENCLASSID = 8;
  final static int EOL_TOKENCLASSID = 9;
  final static int WHITESPACE_TOKENCLASSID = 10;
  final static int NOOP_TOKENCLASSID = 11;

  /**
   * Builds an Avro record schema with the given name whose fields are all
   * of Avro type INT, in the order supplied.
   *
   * A fresh set of Schema.Field objects is created on every call, because
   * each returned schema gets its own field list.
   *
   * @param recordName name of the record schema
   * @param fieldNames names of the INT fields, in declaration order
   * @return the newly built record schema
   */
  private static Schema createIntRecordSchema(String recordName, String... fieldNames) {
    List<Schema.Field> fields = new ArrayList<Schema.Field>();
    for (String name : fieldNames) {
      fields.add(new Schema.Field(name, Schema.create(Schema.Type.INT), "", null));
    }
    Schema record = Schema.createRecord(recordName, "", "", false);
    record.setFields(fields);
    return record;
  }

  /**
   * Common base of all token types.  Carries the token class identifier and
   * an optional string parameter, and hosts the static per-class lookups
   * (data flag, display name, Avro schema).
   */
  static abstract class AbstractToken {
    /**
     * Returns the "has data" flag for a token class.
     *
     * @param tokenClassIdentifier one of the *_TOKENCLASSID constants
     * @return true for char, ipaddr, permissions, date, time, int, float and
     *         string; false for meta, eol, whitespace, noop and any unknown id
     */
    public static boolean hasData(int tokenClassIdentifier) {
      switch (tokenClassIdentifier) {
        case CHAR_TOKENCLASSID:
        case IPADDR_TOKENCLASSID:
        case PERMISSIONS_TOKENCLASSID:
        case DATE_TOKENCLASSID:
        case TIME_TOKENCLASSID:
        case INT_TOKENCLASSID:
        case FLOAT_TOKENCLASSID:
        case STRING_TOKENCLASSID:
          return true;
        case META_TOKENCLASSID:
        case EOL_TOKENCLASSID:
        case WHITESPACE_TOKENCLASSID:
        case NOOP_TOKENCLASSID:
          return false;
        default:
          // should never happen
          return false;
      }
    }

    /**
     * Returns the short display name of a token class.
     *
     * @param tokenClassIdentifier one of the *_TOKENCLASSID constants
     * @return the class name string, or null for an unknown identifier
     */
    public static String getClassStr(int tokenClassIdentifier) {
      switch (tokenClassIdentifier) {
        case META_TOKENCLASSID:
          return "meta";
        case CHAR_TOKENCLASSID:
          return "char";
        case IPADDR_TOKENCLASSID:
          return "ipaddr";
        case PERMISSIONS_TOKENCLASSID:
          return "permissionbits";
        case DATE_TOKENCLASSID:
          return "date";
        case TIME_TOKENCLASSID:
          return "time";
        case INT_TOKENCLASSID:
          return "int";
        case FLOAT_TOKENCLASSID:
          return "float";
        case STRING_TOKENCLASSID:
          return "string";
        case EOL_TOKENCLASSID:
          return "eol";
        case WHITESPACE_TOKENCLASSID:
          return "ws";
        case NOOP_TOKENCLASSID:
          return "noop";
        default:
          // should never happen
          return null;
      }
    }

    /**
     * Creates the Avro schema used for values of the given token class.
     *
     * @param tokenClassIdentifier one of the *_TOKENCLASSID constants
     * @param tokenParameter token parameter (not consulted by this method)
     * @param fieldName name given to the record schema for date and time tokens
     * @return a STRING, INT or DOUBLE schema for scalar classes, a record of
     *         three INT fields for date and time, or null for meta, eol,
     *         whitespace, noop and unknown identifiers
     */
    public static Schema createAvroSchema(int tokenClassIdentifier, String tokenParameter, String fieldName) {
      switch (tokenClassIdentifier) {
        case CHAR_TOKENCLASSID:
        case IPADDR_TOKENCLASSID:
        case PERMISSIONS_TOKENCLASSID:
        case STRING_TOKENCLASSID:
          return Schema.create(Schema.Type.STRING);
        case DATE_TOKENCLASSID:
          return createIntRecordSchema(fieldName, "month", "day", "year");
        case TIME_TOKENCLASSID:
          return createIntRecordSchema(fieldName, "hrs", "mins", "secs");
        case INT_TOKENCLASSID:
          return Schema.create(Schema.Type.INT);
        case FLOAT_TOKENCLASSID:
          return Schema.create(Schema.Type.DOUBLE);
        case META_TOKENCLASSID:
        case EOL_TOKENCLASSID:
        case WHITESPACE_TOKENCLASSID:
        case NOOP_TOKENCLASSID:
          return null;
        default:
          // should never happen
          return null;
      }
    }

    /**
     * Builds a descriptor string: the class name followed directly by the
     * token parameter, or by nothing when the parameter is null.
     *
     * @param classId one of the *_TOKENCLASSID constants
     * @param tokenParameter optional parameter; may be null
     * @return the concatenated descriptor
     */
    public static String getStrDesc(int classId, String tokenParameter) {
      String suffix = (tokenParameter != null) ? tokenParameter : "";
      return getClassStr(classId) + suffix;
    }

    int classId;
    String tokenParameter;

    /**
     * @param classId one of the *_TOKENCLASSID constants
     * @param tokenParameter optional parameter; may be null
     */
    public AbstractToken(int classId, String tokenParameter) {
      this.classId = classId;
      this.tokenParameter = tokenParameter;
    }

    /** @return the token class identifier given at construction */
    public int getClassId() {
      return classId;
    }

    /** @return the token parameter given at construction; may be null */
    public String getParameter() {
      return tokenParameter;
    }

    /** @return the descriptor from getStrDesc() for this token's class id and stored parameter */
    public String getId() {
      return getStrDesc(classId, tokenParameter);
    }

    /** @return the token's value object; null for tokens without a value */
    public abstract Object get();

    /** @return a string rendering of the token's value */
    public abstract String getSampleString();
  }

  /**
   * A token that groups a list of tokens between a start and an end
   * CharToken (presumably a bracketed or quoted region; confirm against
   * the tokenizer that builds these).
   */
  static class MetaToken extends AbstractToken {
    CharToken start;
    CharToken end;
    List<AbstractToken> contents;

    /**
     * @param start the opening character token
     * @param end the closing character token
     * @param contents the tokens between start and end
     */
    public MetaToken(CharToken start, CharToken end, List<AbstractToken> contents) {
      super(META_TOKENCLASSID, null);
      this.start = start;
      this.end = end;
      this.contents = contents;
    }

    public CharToken getStartToken() {
      return start;
    }

    public CharToken getEndToken() {
      return end;
    }

    /** @return the enclosed tokens (the same list instance passed to the constructor) */
    public List<Token.AbstractToken> getMiddleChunk() {
      return contents;
    }

    @Override
    public String toString() {
      StringBuilder buf = new StringBuilder();
      buf.append("META(").append(start).append("...").append(end).append("\n");
      for (AbstractToken tok : contents) {
        buf.append("\t").append(tok).append("\n");
      }
      buf.append(")");
      return buf.toString();
    }

    @Override
    public String getSampleString() {
      return toString();
    }

    /** @return the start token's character as a one-character string */
    @Override
    public String getParameter() {
      return "" + start.getChar();
    }

    /** @return a list holding the get() result of every enclosed token, in order */
    @Override
    public Object get() {
      List<Object> getResults = new ArrayList<Object>(contents.size());
      for (AbstractToken tok : contents) {
        getResults.add(tok.get());
      }
      return getResults;
    }
  }

  /** A single-character token; the character is also its token parameter. */
  static class CharToken extends AbstractToken {
    char c;

    public CharToken(char c) {
      super(CHAR_TOKENCLASSID, "" + c);
      this.c = c;
    }

    public char getChar() {
      return c;
    }

    @Override
    public String toString() {
      return "CHAR(" + c + ")";
    }

    @Override
    public String getSampleString() {
      return get().toString();
    }

    /** @return the character as an Avro Utf8 */
    @Override
    public Object get() {
      return new Utf8(String.valueOf(c));
    }
  }

  /** A token holding an IP address string, stored as given. */
  static class IPAddrToken extends AbstractToken {
    String s;

    public IPAddrToken(String s) {
      super(IPADDR_TOKENCLASSID, null);
      this.s = s;
    }

    @Override
    public String toString() {
      return "IPADDR(" + s + ")";
    }

    @Override
    public String getSampleString() {
      return get().toString();
    }

    /** @return the stored string as an Avro Utf8 */
    @Override
    public Object get() {
      return new Utf8(s);
    }
  }

  /** A token holding a permission-bits string, stored as given. */
  static class PermissionBits extends AbstractToken {
    String s;

    public PermissionBits(String s) {
      super(PERMISSIONS_TOKENCLASSID, null);
      this.s = s;
    }

    @Override
    public String toString() {
      return "PERMISSION-BITS(" + s + ")";
    }

    @Override
    public String getSampleString() {
      return get().toString();
    }

    /** @return the stored string as an Avro Utf8 */
    @Override
    public Object get() {
      return new Utf8(s);
    }
  }

  /**
   * A calendar date with day, month and optional year.  The year is -1
   * when the two-argument constructor is used.
   */
  static class DateToken extends AbstractToken {
    /** Three-letter month abbreviations; index + 1 is the month number. */
    private static final String[] MONTH_ABBREVIATIONS = {
      "jan", "feb", "mar", "apr", "may", "jun",
      "jul", "aug", "sep", "oct", "nov", "dec"
    };

    int month;
    int day;
    int year;

    /**
     * Creates a date without a year (year is set to -1).
     *
     * @param dayStr day of month as a decimal string, 1..31
     * @param monthStr month as a decimal string or three-letter abbreviation
     * @throws IOException if the day or month cannot be parsed or is out of range
     */
    public DateToken(String dayStr, String monthStr) throws IOException {
      super(DATE_TOKENCLASSID, null);
      this.day = parseDay(dayStr);
      this.month = parseMonth(monthStr);
      this.year = -1;
    }

    /**
     * Creates a date with an explicit year.
     *
     * @param dayStr day of month as a decimal string, 1..31
     * @param monthStr month as a decimal string or three-letter abbreviation
     * @param yrStr year as a decimal string, no earlier than EPOCH_START_YEAR
     * @throws IOException if any component cannot be parsed or is out of range
     */
    public DateToken(String dayStr, String monthStr, String yrStr) throws IOException {
      super(DATE_TOKENCLASSID, null);
      this.day = parseDay(dayStr);
      this.month = parseMonth(monthStr);
      this.year = parseYear(yrStr);
    }

    /** Parses and range-checks the day; non-numeric input is reported as IOException. */
    private static int parseDay(String dayStr) throws IOException {
      int parsed;
      try {
        parsed = Integer.parseInt(dayStr);
      } catch (NumberFormatException nfe) {
        // Previously swallowed, which left day == 0 and skipped the range check.
        throw new IOException("Illegal day value: " + dayStr, nfe);
      }
      if (parsed < 1 || parsed > 31) {
        throw new IOException("Illegal day value: " + parsed);
      }
      return parsed;
    }

    /** Converts and range-checks the month; unknown names and out-of-range numbers are rejected. */
    private int parseMonth(String monthStr) throws IOException {
      int parsed = convertMonthStr(monthStr);
      // convertMonthStr signals an unrecognized name with -1 and passes
      // through any integer, so both cases are caught by the range check.
      if (parsed < 1 || parsed > 12) {
        throw new IOException("Illegal month value: " + monthStr);
      }
      return parsed;
    }

    /** Parses and range-checks the year; non-numeric input is reported as IOException. */
    private static int parseYear(String yrStr) throws IOException {
      int parsed;
      try {
        parsed = Integer.parseInt(yrStr);
      } catch (NumberFormatException nfe) {
        // Previously swallowed, which left year == 0 and skipped the range check.
        throw new IOException("Illegal year value: " + yrStr, nfe);
      }
      if (parsed < EPOCH_START_YEAR) {
        throw new IOException("Illegal year value: " + parsed);
      }
      return parsed;
    }

    /**
     * Converts a month string to a number.
     *
     * @param monthStr a decimal integer, or a three-letter English month
     *        abbreviation (case-insensitive)
     * @return the parsed integer as-is (not range-checked) if numeric, 1..12
     *         for a recognized abbreviation, or -1 otherwise
     */
    int convertMonthStr(String monthStr) {
      try {
        return Integer.parseInt(monthStr);
      } catch (NumberFormatException ignored) {
        // Not numeric (or null); fall through to the abbreviation lookup.
      }
      for (int idx = 0; idx < MONTH_ABBREVIATIONS.length; idx++) {
        if (MONTH_ABBREVIATIONS[idx].equalsIgnoreCase(monthStr)) {
          return idx + 1;
        }
      }
      return -1;
    }

    @Override
    public String toString() {
      return "DATE(" + day + ", " + month + ", " + year + ")";
    }

    @Override
    public String getSampleString() {
      return "(" + day + ", " + month + ", " + year + ")";
    }

    /**
     * @return a GenericData.Record with a record schema named "date" and INT
     *         fields month, day and year
     */
    @Override
    public Object get() {
      GenericData.Record gdr =
          new GenericData.Record(createIntRecordSchema("date", "month", "day", "year"));
      gdr.put("month", month);
      gdr.put("day", day);
      gdr.put("year", year);
      return gdr;
    }
  }

  /** A time of day made of hour, minute and second components. */
  static class TimeToken extends AbstractToken {
    int hr;
    int min;
    int sec;

    /**
     * Parses the three components in order.  If one fails to parse, the
     * stack trace is printed and that component and all later ones keep
     * their default value of 0; no exception reaches the caller.
     */
    public TimeToken(String hrS, String minS, String secS) {
      super(TIME_TOKENCLASSID, null);
      try {
        this.hr = Integer.parseInt(hrS);
        this.min = Integer.parseInt(minS);
        this.sec = Integer.parseInt(secS);
      } catch (NumberFormatException nfe) {
        nfe.printStackTrace();
      }
    }

    @Override
    public String toString() {
      return "TIME(" + hr + ":" + min + ":" + sec + ")";
    }

    @Override
    public String getSampleString() {
      return "(" + hr + ", " + min + ", " + sec + ")";
    }

    /**
     * @return a GenericData.Record with a record schema named "timestamp" and
     *         INT fields hrs, mins and secs
     */
    @Override
    public Object get() {
      GenericData.Record gdr =
          new GenericData.Record(createIntRecordSchema("timestamp", "hrs", "mins", "secs"));
      gdr.put("hrs", hr);
      gdr.put("mins", min);
      gdr.put("secs", sec);
      return gdr;
    }
  }

  /** An integer token. */
  static class IntToken extends AbstractToken {
    int i;

    /**
     * Parses s as an int.  On failure the stack trace is printed and the
     * value stays 0; no exception reaches the caller.
     */
    public IntToken(String s) {
      super(INT_TOKENCLASSID, null);
      try {
        this.i = Integer.parseInt(s);
      } catch (NumberFormatException nfe) {
        nfe.printStackTrace();
      }
    }

    @Override
    public String toString() {
      return "INT(" + i + ")";
    }

    @Override
    public String getSampleString() {
      return get().toString();
    }

    /** @return the value boxed as an Integer */
    @Override
    public Object get() {
      return i;
    }
  }

  /** A floating-point token, stored as a double. */
  static class FloatToken extends AbstractToken {
    double f;

    /**
     * Parses s as a double.  On failure the stack trace is printed and the
     * value stays 0.0; no exception reaches the caller.
     */
    public FloatToken(String s) {
      super(FLOAT_TOKENCLASSID, null);
      try {
        this.f = Double.parseDouble(s);
      } catch (NumberFormatException nfe) {
        nfe.printStackTrace();
      }
    }

    @Override
    public String toString() {
      return "FLOAT(" + f + ")";
    }

    @Override
    public String getSampleString() {
      return get().toString();
    }

    /** @return the value boxed as a Double */
    @Override
    public Object get() {
      return f;
    }
  }

  /** A free-form string token, stored as given. */
  static class StringToken extends AbstractToken {
    String s;

    public StringToken(String s) {
      super(STRING_TOKENCLASSID, null);
      this.s = s;
    }

    @Override
    public String toString() {
      return "STRING(" + s + ")";
    }

    @Override
    public String getSampleString() {
      return get().toString();
    }

    /** @return the stored string as an Avro Utf8 */
    @Override
    public Object get() {
      return new Utf8(s);
    }
  }

  /** End-of-line marker token; carries no value. */
  static class EOLToken extends AbstractToken {
    public EOLToken() {
      super(EOL_TOKENCLASSID, null);
    }

    @Override
    public String toString() {
      return "EOL()";
    }

    @Override
    public String getSampleString() {
      return toString();
    }

    @Override
    public Object get() {
      return null;
    }
  }

  /** Whitespace marker token; carries no value. */
  static class WhitespaceToken extends AbstractToken {
    public WhitespaceToken() {
      super(WHITESPACE_TOKENCLASSID, null);
    }

    @Override
    public String toString() {
      return "WS()";
    }

    @Override
    public String getSampleString() {
      return toString();
    }

    @Override
    public Object get() {
      return null;
    }
  }

  /** No-op marker token; carries no value. */
  static class NoopToken extends AbstractToken {
    public NoopToken() {
      super(NOOP_TOKENCLASSID, null);
    }

    @Override
    public String toString() {
      return "NOOP()";
    }

    @Override
    public String getSampleString() {
      return toString();
    }

    @Override
    public Object get() {
      return null;
    }
  }
}