/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.hadoop.mergeutils; /* * Copyright 2010 - CommonCrawl Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.DataOutputStream; import java.io.IOException; import java.util.Random; import java.util.TreeMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.util.StringUtils; /** * A bunch of unit tests covering possible combinations of comparators. * * @author rana * */ public class MergeSortSpillWriterUnitTest { static abstract class BaseTest { private static final String keyForNumber(int number) { // establish pattern start location int patternStartIdx = number % 26; // establish pattern size ... int patternSize = (number % 100) + 1; // preallocate buffer StringBuffer buffer = new StringBuffer(patternSize); // build pattern int currPatternIdx = patternStartIdx; for (int i = 0; i < patternSize; ++i) { buffer.append((char) ('A' + currPatternIdx)); currPatternIdx = (currPatternIdx + 1) % 26; } return buffer.toString(); } private static final void swap(int[] arr, int i, int j) { int tmp = arr[i]; arr[i] = arr[j]; arr[j] = tmp; } TreeMap<Integer, Text> originalKeyValueMap = new TreeMap<Integer, Text>(); int index[]; String _testName = null; int _keySetSize = -1; int _indexBufferSize = -1; int _dataBufferSize = -1; int _spillBufferSize = -1; public BaseTest(String testName, int keySetSize, int indexBufferSize, int dataBufferSize, int spillBufferSize) { _testName = testName; _keySetSize = keySetSize; _indexBufferSize = indexBufferSize; _dataBufferSize = dataBufferSize; _spillBufferSize = spillBufferSize; } protected abstract MergeSortSpillWriter<IntWritable, Text> constructMerger(Configuration conf, RawDataSpillWriter<IntWritable, Text> writer, FileSystem tempFileSystem, Path tempFilePath, RawKeyValueComparator<IntWritable, Text> comparator, Class keyClass, Class valueClass) throws IOException; public void runTest() throws IOException { LOG.info("*************** STARTING TEST:" + _testName); LOG.info("Set Size:" + _keySetSize); LOG.info("Index Buffer Size:" + _indexBufferSize); LOG.info("Data Buffer Size:" + _dataBufferSize); LOG.info("Spill Buffer Size:" + _spillBufferSize); LOG.info(""); long testStartTime = System.currentTimeMillis(); // initialization here // create an array of keys and an index into them ... index = new int[_keySetSize]; for (int i = 0; i < _keySetSize; ++i) { index[i] = i; originalKeyValueMap.put(i, new Text(keyForNumber(i))); } // randomly shuffle the index Random r = new Random(); // Shuffle array for (int i = index.length; i > 1; i--) swap(index, i - 1, r.nextInt(i)); // ok create a spill writer that validates position and value RawDataSpillWriter<IntWritable, Text> validatingSpillWriter = new RawDataSpillWriter<IntWritable, Text>() { int closeCount = 0; // initial spill count to zero int spilledKeyCount = 0; DataInputBuffer keyReader = new DataInputBuffer(); DataInputBuffer valueReader = new DataInputBuffer(); IntWritable keyObject = new IntWritable(); Text valueObject = new Text(); @Override public void close() throws IOException { if (++closeCount > 1) { throw new IOException("Close Called One Too Many Times!"); } if (spilledKeyCount != index.length) { throw new IOException("Spilled Key Count:" + spilledKeyCount + " Excpected Key Count:" + index.length); } } @Override public void spillRawRecord(byte[] keyData, int keyOffset, int keyLength, byte[] valueData, int valueOffset, int valueLength) throws IOException { // LOG.info("Got Raw Record"); // initialize key / value readers . keyReader.reset(keyData, keyOffset, keyLength); valueReader.reset(valueData, valueOffset, valueLength); keyObject.readFields(keyReader); valueObject.readFields(valueReader); this.spillRecord(keyObject, valueObject); } @Override public void spillRecord(IntWritable key, Text value) throws IOException { // LOG.info("Got Key:" + key.get() + " Value:"+ value); // if keys don't match ... if (key.get() != spilledKeyCount) { throw new IOException("Got Key:" + key.get() + " Expected Key:" + spilledKeyCount); } // ok keys match... check that values match ... Text expectedValue = originalKeyValueMap.get(spilledKeyCount); // ok validate expected value .. if (expectedValue == null || value == null) { throw new IOException("Null Expected or Incoming Value"); } else { if (expectedValue.compareTo(value) != 0) { throw new IOException("Expected Value:" + expectedValue + " @index:" + spilledKeyCount + " differs from resulting value:" + value); } } spilledKeyCount++; } }; // create a local file system Configuration conf = new Configuration(); // create a raw comparator RawKeyValueComparator<IntWritable, Text> comparator = new RawKeyValueComparator<IntWritable, Text>() { DataInputBuffer keyReader1 = new DataInputBuffer(); DataInputBuffer keyReader2 = new DataInputBuffer(); @Override public int compare(IntWritable key1, Text value1, IntWritable key2, Text value2) { return key1.compareTo(key2); } @Override public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length, byte[] value2Data, int value2Offset, int value2Length) throws IOException { keyReader1.reset(key1Data, key1Offset, key1Length); keyReader2.reset(key2Data, key2Offset, key2Length); return ((Integer) keyReader1.readInt()).compareTo(keyReader2.readInt()); } }; // setup conf // number of records to store in RAM before doing an intermediate sort conf.setInt(MergeSortSpillWriter.SPILL_INDEX_BUFFER_SIZE_PARAM, _indexBufferSize); // size of intermediate buffer key value buffer ... conf.setInt(MergeSortSpillWriter.SPILL_DATA_BUFFER_SIZE_PARAM, _dataBufferSize); // set spill write buffer size ... conf.setInt(SequenceFileSpillWriter.SPILL_WRITER_BUFFER_SIZE_PARAM, _spillBufferSize); // ok create the spill writer MergeSortSpillWriter<IntWritable, Text> merger = constructMerger(conf, validatingSpillWriter, FileSystem .getLocal(conf), new Path("/tmp"), comparator, IntWritable.class, Text.class); // and finally ... spill the records in random order for (int i = 0; i < index.length; ++i) { merger.spillRecord(new IntWritable(index[i]), originalKeyValueMap.get(index[i])); } // ok close merger ... merger.close(); // now close the external spill writer ... validatingSpillWriter.close(); LOG.info("*************** ENDING TEST:" + _testName + " TOOK:" + (System.currentTimeMillis() - testStartTime)); } } public static class BasicOptimizedTest extends BaseTest { public BasicOptimizedTest() { super("OptimizedKeyGenerator - using Long ONLY Keys Test", 1000000, 10000, 10000 * 200, 1000000); } @Override protected MergeSortSpillWriter<IntWritable, Text> constructMerger(Configuration conf, RawDataSpillWriter<IntWritable, Text> writer, FileSystem tempFileSystem, Path tempFilePath, RawKeyValueComparator<IntWritable, Text> comparator, Class keyClass, Class valueClass) throws IOException { return new MergeSortSpillWriter<IntWritable, Text>(conf, writer, tempFileSystem, tempFilePath, new OptimizedKeyGeneratorAndComparator<IntWritable, Text>() { @Override public void generateOptimizedKeyForPair(IntWritable key, Text value, org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut) throws IOException { optimizedKeyOut.setLongKeyValue(key.get()); } @Override public int getGeneratedKeyType() { return OptimizedKey.KEY_TYPE_LONG; } }, keyClass, valueClass, false, null); } } public static class BasicOptimizedWithBufferOnlyTest extends BaseTest { public BasicOptimizedWithBufferOnlyTest(int keySetSize, int indexBufferSize, int dataBufferSize, int spillBufferSize) { super("OptimizedKeyGenerator - using Buffer ONLY Keys Test", keySetSize, indexBufferSize, dataBufferSize, spillBufferSize); } @Override protected MergeSortSpillWriter<IntWritable, Text> constructMerger(Configuration conf, RawDataSpillWriter<IntWritable, Text> writer, FileSystem tempFileSystem, Path tempFilePath, RawKeyValueComparator<IntWritable, Text> comparator, Class keyClass, Class valueClass) throws IOException { return new MergeSortSpillWriter<IntWritable, Text>(conf, writer, tempFileSystem, tempFilePath, new OptimizedKeyGeneratorAndComparator<IntWritable, Text>() { DataInputBuffer key1ReaderStream = new DataInputBuffer(); DataInputBuffer key2ReaderStream = new DataInputBuffer(); @Override public int compareOptimizedBufferKeys(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data, int key2Offset, int key2Length) throws IOException { key1ReaderStream.reset(key1Data, key1Offset, key1Length); key2ReaderStream.reset(key2Data, key2Offset, key2Length); return (int) (key1ReaderStream.readLong() - key2ReaderStream.readLong()); } @Override public void generateOptimizedKeyForPair(IntWritable key, Text value, org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut) throws IOException { // and set the buffer value by first obtaining an output stream // from key object DataOutputStream bufferOutput = optimizedKeyOut.getBufferKeyValueStream(); // and then writing into it bufferOutput.writeLong(key.get()); // and finally committing it by calling close bufferOutput.close(); } @Override public int getGeneratedKeyType() { return OptimizedKey.KEY_TYPE_BUFFER; } }, keyClass, valueClass, false, null); } } public static class BasicOptimizedWithLongAndBufferTest extends BaseTest { public BasicOptimizedWithLongAndBufferTest() { super("OptimizedKeyGenerator - using Long AND Buffer Keys Test", 1000000, 10000, 10000 * 200, 1000000); } @Override protected MergeSortSpillWriter<IntWritable, Text> constructMerger(Configuration conf, RawDataSpillWriter<IntWritable, Text> writer, FileSystem tempFileSystem, Path tempFilePath, RawKeyValueComparator<IntWritable, Text> comparator, Class keyClass, Class valueClass) throws IOException { return new MergeSortSpillWriter<IntWritable, Text>(conf, writer, tempFileSystem, tempFilePath, new OptimizedKeyGeneratorAndComparator<IntWritable, Text>() { DataInputBuffer key1ReaderStream = new DataInputBuffer(); DataInputBuffer key2ReaderStream = new DataInputBuffer(); @Override public int compareOptimizedBufferKeys(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data, int key2Offset, int key2Length) throws IOException { key1ReaderStream.reset(key1Data, key1Offset, key1Length); key2ReaderStream.reset(key2Data, key2Offset, key2Length); return (int) (key1ReaderStream.readLong() - key2ReaderStream.readLong()); } @Override public void generateOptimizedKeyForPair(IntWritable key, Text value, org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut) throws IOException { // set the long to dummy value to force secondary comparator to // trigger optimizedKeyOut.setLongKeyValue(0); // and set the buffer value by first obtaining an output stream // from key object DataOutputStream bufferOutput = optimizedKeyOut.getBufferKeyValueStream(); // and then writing into it bufferOutput.writeLong(key.get()); // and finally committing it by calling close bufferOutput.close(); } @Override public int getGeneratedKeyType() { return OptimizedKey.KEY_TYPE_LONG_AND_BUFFER; } }, keyClass, valueClass, false, null); } } public static class BasicTest extends BaseTest { public BasicTest() { super("Basic RawKeyValueComparator Test", 1000000, 10000, 10000 * 200, 1000000); } @Override protected MergeSortSpillWriter<IntWritable, Text> constructMerger(Configuration conf, RawDataSpillWriter<IntWritable, Text> writer, FileSystem tempFileSystem, Path tempFilePath, RawKeyValueComparator<IntWritable, Text> comparator, Class keyClass, Class valueClass) throws IOException { return new MergeSortSpillWriter<IntWritable, Text>(conf, writer, tempFileSystem, tempFilePath, null, comparator, keyClass, valueClass, false, null); } } public static final Log LOG = LogFactory.getLog(MergeSortSpillWriterUnitTest.class); public static void main(String[] args) { try { new BasicTest().runTest(); new BasicOptimizedTest().runTest(); new BasicOptimizedWithLongAndBufferTest().runTest(); new BasicOptimizedWithBufferOnlyTest(1000000, 10000, 10000 * 200, 1000000).runTest(); new BasicOptimizedWithBufferOnlyTest(1000000, 1000000, 1000000 * 200, 1000000).runTest(); // new // BasicOptimizedWithBufferOnlyTest(10000000,1000000,1000000*200,1000000).runTest(); } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); } } }