/* * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and limitations under the License. */ package org.apache.pig.piggybank.evaluation.util.apachelogparser; import java.io.IOException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.TimeZone; import org.apache.pig.EvalFunc; import org.apache.pig.FuncSpec; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; /** * DateExtractor has four different constructors which each allow for different functionality. The * incomingDateFormat ("dd/MMM/yyyy:HH:mm:ss Z" by default) is used to match the date string that gets passed in from the * log. The outgoingDateFormat ("yyyy-MM-dd" by default) is used to format the returned string. * * Different constructors exist for each combination; please use the appropriate respective constructor. * * Note that any data that exists in the SimpleDateFormat schema can be supported. For example, if you were * starting with the default incoming format and wanted to extract just the year, you would use the single * string constructor DateExtractor("yyyy"). * * From pig latin you will need to use aliases to use a non-default format, like * * define MyDateExtractor org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor("yyyy-MM"); * * A = FOREACH row GENERATE DateExtractor(dayTime); * * If a string cannot be parsed, null will be returned and an error message printed to stderr. * * By default, the DateExtractor uses the GMT timezone. You can use the three-parameter constructor to override the * timezone. */ public class DateExtractor extends EvalFunc<String> { private static String DEFAULT_INCOMING_DATE_FORMAT = "dd/MMM/yyyy:HH:mm:ss Z"; private static String DEFAULT_OUTGOING_DATE_FORMAT = "yyyy-MM-dd"; private static String DEFAULT_TZ_ID="GMT"; private SimpleDateFormat incomingDateFormat; private SimpleDateFormat outgoingDateFormat; /** * forms the formats based on default incomingDateFormat and default outgoingDateFormat * * @param outgoingDateString outgoingDateFormat is based on outgoingDateString */ public DateExtractor() { this(DEFAULT_INCOMING_DATE_FORMAT, DEFAULT_OUTGOING_DATE_FORMAT, DEFAULT_TZ_ID); } /** * forms the formats based on passed outgoingDateString and the default incomingDateFormat * * @param outgoingDateString outgoingDateFormat is based on outgoingDateString */ public DateExtractor(String outgoingDateString) { this(DEFAULT_INCOMING_DATE_FORMAT, outgoingDateString, "GMT"); } /** * forms the formats based on passed incomingDateString and outgoingDateString * * @param incomingDateString incomingDateFormat is based on incomingDateString * @param outgoingDateString outgoingDateFormat is based on outgoingDateString * */ public DateExtractor(String incomingDateString, String outgoingDateString) { this(incomingDateString, outgoingDateString, DEFAULT_TZ_ID); } /** * forms the formats based on passed incomingDateString and outgoingDateString * * @param incomingDateString incomingDateFormat is based on incomingDateString * @param outgoingDateString outgoingDateFormat is based on outgoingDateString * @param timeZoneID time zone id in which dates should be expressed. * */ public DateExtractor(String incomingDateString, String outgoingDateString, String timeZoneID) { TimeZone tz = TimeZone.getTimeZone(timeZoneID); incomingDateFormat = new SimpleDateFormat(incomingDateString); outgoingDateFormat = new SimpleDateFormat(outgoingDateString); incomingDateFormat.setTimeZone(tz); outgoingDateFormat.setTimeZone(tz); } @Override public String exec(Tuple input) throws IOException { if (input == null || input.size() == 0) return null; String str=""; try{ str = (String)input.get(0); Date date = incomingDateFormat.parse(str); return outgoingDateFormat.format(date); } catch (ParseException pe) { System.err.println("piggybank.evaluation.util.apachelogparser.DateExtractor: unable to parse date "+str); return null; } catch(Exception e){ throw new IOException("Caught exception processing input row ", e); } } @Override public List<FuncSpec> getArgToFuncMapping() throws FrontendException { List<FuncSpec> funcList = new ArrayList<FuncSpec>(); funcList.add(new FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY)))); return funcList; } }