/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mapred; import java.io.*; import java.util.*; import junit.framework.TestCase; import org.apache.commons.logging.*; import org.apache.hadoop.fs.*; import org.apache.hadoop.io.*; import org.apache.hadoop.conf.*; public class TestSequenceFileInputFilter extends TestCase { private static final Log LOG = FileInputFormat.LOG; private static final int MAX_LENGTH = 15000; private static final Configuration conf = new Configuration(); private static final JobConf job = new JobConf(conf); private static final FileSystem fs; private static final Path inDir = new Path(System.getProperty("test.build.data",".") + "/mapred"); private static final Path inFile = new Path(inDir, "test.seq"); private static final Random random = new Random(1); private static final Reporter reporter = Reporter.NULL; static { FileInputFormat.setInputPaths(job, inDir); try { fs = FileSystem.getLocal(conf); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); } } private static void createSequenceFile(int numRecords) throws Exception { // create a file with length entries SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, inFile, Text.class, BytesWritable.class); try { for (int i = 1; i <= numRecords; i++) { Text key = new Text(Integer.toString(i)); byte[] data = new byte[random.nextInt(10)]; random.nextBytes(data); BytesWritable value = new BytesWritable(data); writer.append(key, value); } } finally { writer.close(); } } private int countRecords(int numSplits) throws IOException { InputFormat<Text, BytesWritable> format = new SequenceFileInputFilter<Text, BytesWritable>(); Text key = new Text(); BytesWritable value = new BytesWritable(); if (numSplits==0) { numSplits = random.nextInt(MAX_LENGTH/(SequenceFile.SYNC_INTERVAL/20))+1; } InputSplit[] splits = format.getSplits(job, numSplits); // check each split int count = 0; LOG.info("Generated " + splits.length + " splits."); for (int j = 0; j < splits.length; j++) { RecordReader<Text, BytesWritable> reader = format.getRecordReader(splits[j], job, reporter); try { while (reader.next(key, value)) { LOG.info("Accept record "+key.toString()); count++; } } finally { reader.close(); } } return count; } public void testRegexFilter() throws Exception { // set the filter class LOG.info("Testing Regex Filter with patter: \\A10*"); SequenceFileInputFilter.setFilterClass(job, SequenceFileInputFilter.RegexFilter.class); SequenceFileInputFilter.RegexFilter.setPattern(job, "\\A10*"); // clean input dir fs.delete(inDir, true); // for a variety of lengths for (int length = 1; length < MAX_LENGTH; length+= random.nextInt(MAX_LENGTH/10)+1) { LOG.info("******Number of records: "+length); createSequenceFile(length); int count = countRecords(0); assertEquals(count, length==0?0:(int)Math.log10(length)+1); } // clean up fs.delete(inDir, true); } public void testPercentFilter() throws Exception { LOG.info("Testing Percent Filter with frequency: 1000"); // set the filter class SequenceFileInputFilter.setFilterClass(job, SequenceFileInputFilter.PercentFilter.class); SequenceFileInputFilter.PercentFilter.setFrequency(job, 1000); // clean input dir fs.delete(inDir, true); // for a variety of lengths for (int length = 0; length < MAX_LENGTH; length+= random.nextInt(MAX_LENGTH/10)+1) { LOG.info("******Number of records: "+length); createSequenceFile(length); int count = countRecords(1); LOG.info("Accepted "+count+" records"); int expectedCount = length/1000; if (expectedCount*1000!=length) expectedCount++; assertEquals(count, expectedCount); } // clean up fs.delete(inDir, true); } public void testMD5Filter() throws Exception { // set the filter class LOG.info("Testing MD5 Filter with frequency: 1000"); SequenceFileInputFilter.setFilterClass(job, SequenceFileInputFilter.MD5Filter.class); SequenceFileInputFilter.MD5Filter.setFrequency(job, 1000); // clean input dir fs.delete(inDir, true); // for a variety of lengths for (int length = 0; length < MAX_LENGTH; length+= random.nextInt(MAX_LENGTH/10)+1) { LOG.info("******Number of records: "+length); createSequenceFile(length); LOG.info("Accepted "+countRecords(0)+" records"); } // clean up fs.delete(inDir, true); } public static void main(String[] args) throws Exception { TestSequenceFileInputFilter filter = new TestSequenceFileInputFilter(); filter.testRegexFilter(); } }