/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
 * NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is
 * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package org.apache.pig.piggybank.storage;

import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DefaultTupleFactory;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

/**
 * RegExLoader is an abstract class used to parse logs based on a regular expression.
 *
 * There is a single abstract method, {@link #getPattern()}, which needs to return a Pattern. Each capturing
 * group of a matching line is returned as a separate {@link DataByteArray} field of the resulting tuple.
 *
 * Look to org.apache.pig.piggybank.storage.apachelog.CommonLogLoader for example usage.
 */
public abstract class RegExLoader extends LoadFunc {
    /** Line reader supplied through {@link #prepareToRead(RecordReader, PigSplit)}; null until then. */
    private LineRecordReader in = null;

    /**
     * Supplies the regular expression applied to every input line.
     *
     * @return the pattern whose capturing groups become the tuple fields
     */
    abstract public Pattern getPattern();

    /**
     * Reads lines from the current reader until one matches the pattern and converts it into a tuple.
     *
     * A single trailing carriage return is removed from each line before matching. The pattern is applied
     * with {@link Matcher#find()}, so it only has to match somewhere within the line. Lines that do not
     * match are skipped.
     *
     * @return a tuple with one field per capturing group (group 0 is not included); a field is null when
     *         its group did not participate in the match. Returns null once the reader has no more lines.
     * @throws IOException if the underlying reader fails
     */
    @Override
    public Tuple getNext() throws IOException {
        Pattern pattern = getPattern();
        Matcher matcher = pattern.matcher("");
        TupleFactory mTupleFactory = DefaultTupleFactory.getInstance();

        while (in.nextKeyValue()) {
            Text val = in.getCurrentValue();
            String line = val.toString();
            // Drop the '\r' left behind by CRLF line endings.
            if (line.length() > 0 && line.charAt(line.length() - 1) == '\r') {
                line = line.substring(0, line.length() - 1);
            }

            matcher.reset(line);
            if (!matcher.find()) {
                continue;
            }

            int groupCount = matcher.groupCount();
            ArrayList<DataByteArray> list = new ArrayList<DataByteArray>(groupCount);
            for (int i = 1; i <= groupCount; i++) {
                String group = matcher.group(i);
                // Matcher.group(int) yields null for an optional group that took no part in the match;
                // emit a null field instead of passing null to the DataByteArray constructor.
                list.add(group == null ? null : new DataByteArray(group));
            }
            return mTupleFactory.newTuple(list);
        }
        return null;
    }

    /**
     * Returns the input format used to read the data.
     *
     * @return a new {@link TextInputFormat}
     * @throws IOException declared by the overridden method; not thrown here
     */
    @SuppressWarnings("unchecked")
    @Override
    public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat();
    }

    /**
     * Stores the record reader that {@link #getNext()} pulls lines from.
     *
     * @param reader the reader to use; it is cast to {@link LineRecordReader}, so any other reader type
     *               fails with a ClassCastException
     * @param split  the split being read; not used by this loader
     * @throws IOException declared by the overridden method; not thrown here
     */
    @SuppressWarnings("unchecked")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
        in = (LineRecordReader) reader;
    }

    /**
     * Registers the given location as the input of the job via {@link FileInputFormat}.
     *
     * @param location the input location to load from
     * @param job      the job whose input paths are set
     * @throws IOException if the input paths cannot be set
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }
}