package org.apache.blur.mapreduce.lib; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.BufferedReader; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.Collection; import java.util.TreeSet; import org.apache.blur.MiniCluster; import org.apache.blur.server.TableContext; import org.apache.blur.store.buffer.BufferStore; import org.apache.blur.store.hdfs.HdfsDirectory; import org.apache.blur.thrift.generated.TableDescriptor; import org.apache.blur.utils.JavaHome; import org.apache.blur.utils.ShardUtil; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.lucene.index.DirectoryReader; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; public class BlurOutputFormatTest { private static Configuration _conf = new Configuration(); private static FileSystem _fileSystem; private static MiniCluster _miniCluster; private static Path _root; @BeforeClass public static void setupTest() throws Exception { JavaHome.checkJavaHome(); File file = new File("./target/tmp/BlurOutputFormatTest_tmp"); String pathStr = file.getAbsoluteFile().toURI().toString(); String hdfsPath = pathStr + "/hdfs"; System.setProperty("test.build.data", hdfsPath); System.setProperty("hadoop.log.dir", pathStr + "/hadoop_log"); _miniCluster = new MiniCluster(); _miniCluster.startDfs(hdfsPath); _fileSystem = _miniCluster.getFileSystem(); _root = new Path(_fileSystem.getUri() + "/testroot"); _miniCluster.startMrMiniCluster(); _conf = _miniCluster.getMRConfiguration(); BufferStore.initNewBuffer(128, 128 * 128); } @AfterClass public static void teardown() throws IOException { if (_miniCluster != null) { _miniCluster.stopMrMiniCluster(); _miniCluster.shutdownDfs(); } rm(new File("build")); } private static void rm(File file) { if (!file.exists()) { return; } if (file.isDirectory()) { for (File f : file.listFiles()) { rm(f); } } file.delete(); } @Before public void setup() { TableContext.clear(); } @Test public void testBlurOutputFormat() throws IOException, InterruptedException, ClassNotFoundException { Path input = getInDir(); Path output = getOutDir(); _fileSystem.delete(input, true); _fileSystem.delete(output, true); writeRecordsFile(new Path(input, "part1"), 1, 1, 1, 1, "cf1"); writeRecordsFile(new Path(input, "part2"), 1, 1, 2, 1, "cf1"); Job job = Job.getInstance(_conf, "blur index"); job.setJarByClass(BlurOutputFormatTest.class); job.setMapperClass(CsvBlurMapper.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.addInputPath(job, input); CsvBlurMapper.addColumns(job, "cf1", "col"); Path tablePath = new Path(new Path(_root, "table"), "test"); TableDescriptor tableDescriptor = new TableDescriptor(); tableDescriptor.setShardCount(1); tableDescriptor.setTableUri(tablePath.toString()); tableDescriptor.setName("test"); createShardDirectories(tablePath, 1); BlurOutputFormat.setupJob(job, tableDescriptor); BlurOutputFormat.setOutputPath(job, output); assertTrue(job.waitForCompletion(true)); Counters ctrs = job.getCounters(); System.out.println("Counters: " + ctrs); Path path = new Path(output, ShardUtil.getShardName(0)); dump(path, _conf); Collection<Path> commitedTasks = getCommitedTasks(path); assertEquals(1, commitedTasks.size()); DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, commitedTasks.iterator().next())); assertEquals(2, reader.numDocs()); reader.close(); } private Path getOutDir() { return new Path(_root, "out"); } private Path getInDir() { return new Path(_root, "in"); } private void dump(Path path, Configuration conf) throws IOException { FileSystem fileSystem = path.getFileSystem(conf); System.out.println(path); if (!fileSystem.isFile(path)) { FileStatus[] listStatus = fileSystem.listStatus(path); for (FileStatus fileStatus : listStatus) { dump(fileStatus.getPath(), conf); } } } private Collection<Path> getCommitedTasks(Path path) throws IOException { Collection<Path> result = new TreeSet<Path>(); FileSystem fileSystem = path.getFileSystem(_conf); FileStatus[] listStatus = fileSystem.listStatus(path); for (FileStatus fileStatus : listStatus) { Path p = fileStatus.getPath(); if (fileStatus.isDir() && p.getName().endsWith(".commit")) { result.add(p); } } return result; } @Test public void testBlurOutputFormatOverFlowTest() throws IOException, InterruptedException, ClassNotFoundException { Path input = getInDir(); Path output = getOutDir(); _fileSystem.delete(input, true); _fileSystem.delete(output, true); // 1500 * 50 = 75,000 writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1"); // 100 * 50 = 5,000 writeRecordsFile(new Path(input, "part2"), 1, 50, 2000, 100, "cf1"); Job job = Job.getInstance(_conf, "blur index"); job.setJarByClass(BlurOutputFormatTest.class); job.setMapperClass(CsvBlurMapper.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.addInputPath(job, input); CsvBlurMapper.addColumns(job, "cf1", "col"); Path tablePath = new Path(new Path(_root, "table"), "test"); TableDescriptor tableDescriptor = new TableDescriptor(); tableDescriptor.setShardCount(1); tableDescriptor.setTableUri(tablePath.toString()); tableDescriptor.setName("test"); createShardDirectories(tablePath, 1); BlurOutputFormat.setupJob(job, tableDescriptor); BlurOutputFormat.setOutputPath(job, output); BlurOutputFormat.setIndexLocally(job, true); BlurOutputFormat.setOptimizeInFlight(job, false); assertTrue(job.waitForCompletion(true)); Counters ctrs = job.getCounters(); System.out.println("Counters: " + ctrs); Path path = new Path(output, ShardUtil.getShardName(0)); Collection<Path> commitedTasks = getCommitedTasks(path); assertEquals(1, commitedTasks.size()); DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, commitedTasks.iterator().next())); assertEquals(80000, reader.numDocs()); reader.close(); } @Test public void testBlurOutputFormatOverFlowMultipleReducersTest() throws IOException, InterruptedException, ClassNotFoundException { Path input = getInDir(); Path output = getOutDir(); _fileSystem.delete(input, true); _fileSystem.delete(output, true); // 1500 * 50 = 75,000 writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1"); // 100 * 50 = 5,000 writeRecordsFile(new Path(input, "part2"), 1, 50, 2000, 100, "cf1"); Job job = Job.getInstance(_conf, "blur index"); job.setJarByClass(BlurOutputFormatTest.class); job.setMapperClass(CsvBlurMapper.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.addInputPath(job, input); CsvBlurMapper.addColumns(job, "cf1", "col"); Path tablePath = new Path(new Path(_root, "table"), "test"); TableDescriptor tableDescriptor = new TableDescriptor(); tableDescriptor.setShardCount(2); tableDescriptor.setTableUri(tablePath.toString()); tableDescriptor.setName("test"); createShardDirectories(output, 2); BlurOutputFormat.setupJob(job, tableDescriptor); BlurOutputFormat.setOutputPath(job, output); BlurOutputFormat.setIndexLocally(job, false); BlurOutputFormat.setDocumentBufferStrategy(job, DocumentBufferStrategyHeapSize.class); BlurOutputFormat.setMaxDocumentBufferHeapSize(job, 128 * 1024); assertTrue(job.waitForCompletion(true)); Counters ctrs = job.getCounters(); System.out.println("Counters: " + ctrs); long total = 0; for (int i = 0; i < tableDescriptor.getShardCount(); i++) { Path path = new Path(output, ShardUtil.getShardName(i)); Collection<Path> commitedTasks = getCommitedTasks(path); assertEquals(1, commitedTasks.size()); DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, commitedTasks.iterator().next())); total += reader.numDocs(); reader.close(); } assertEquals(80000, total); } @Test public void testBlurOutputFormatOverFlowMultipleReducersWithReduceMultiplierTest() throws IOException, InterruptedException, ClassNotFoundException { Path input = getInDir(); Path output = getOutDir(); _fileSystem.delete(input, true); _fileSystem.delete(output, true); // 1500 * 50 = 75,000 writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1"); // 100 * 50 = 5,000 writeRecordsFile(new Path(input, "part2"), 1, 50, 2000, 100, "cf1"); Job job = Job.getInstance(_conf, "blur index"); job.setJarByClass(BlurOutputFormatTest.class); job.setMapperClass(CsvBlurMapper.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.addInputPath(job, input); CsvBlurMapper.addColumns(job, "cf1", "col"); Path tablePath = new Path(new Path(_root, "table"), "test"); TableDescriptor tableDescriptor = new TableDescriptor(); tableDescriptor.setShardCount(7); tableDescriptor.setTableUri(tablePath.toString()); tableDescriptor.setName("test"); createShardDirectories(output, 7); BlurOutputFormat.setupJob(job, tableDescriptor); BlurOutputFormat.setOutputPath(job, output); int multiple = 2; BlurOutputFormat.setReducerMultiplier(job, multiple); assertTrue(job.waitForCompletion(true)); Counters ctrs = job.getCounters(); System.out.println("Counters: " + ctrs); long total = 0; for (int i = 0; i < tableDescriptor.getShardCount(); i++) { Path path = new Path(output, ShardUtil.getShardName(i)); Collection<Path> commitedTasks = getCommitedTasks(path); assertTrue(commitedTasks.size() >= multiple); for (Path p : commitedTasks) { DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, p)); total += reader.numDocs(); reader.close(); } } assertEquals(80000, total); } @Test(expected = IllegalArgumentException.class) public void testBlurOutputFormatValidateReducerCount() throws IOException, InterruptedException, ClassNotFoundException { Path input = getInDir(); Path output = getOutDir(); _fileSystem.delete(input, true); _fileSystem.delete(output, true); writeRecordsFile(new Path(input, "part1"), 1, 1, 1, 1, "cf1"); writeRecordsFile(new Path(input, "part2"), 1, 1, 2, 1, "cf1"); Job job = Job.getInstance(_conf, "blur index"); job.setJarByClass(BlurOutputFormatTest.class); job.setMapperClass(CsvBlurMapper.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.addInputPath(job, input); CsvBlurMapper.addColumns(job, "cf1", "col"); Path tablePath = new Path(new Path(_root, "table"), "test"); TableDescriptor tableDescriptor = new TableDescriptor(); tableDescriptor.setShardCount(1); tableDescriptor.setTableUri(tablePath.toString()); tableDescriptor.setName("test"); createShardDirectories(getOutDir(), 1); BlurOutputFormat.setupJob(job, tableDescriptor); BlurOutputFormat.setOutputPath(job, output); BlurOutputFormat.setReducerMultiplier(job, 2); job.setNumReduceTasks(4); job.submit(); } // @TODO this test to fail sometimes due to issues in the MR MiniCluster // @Test public void testBlurOutputFormatCleanupDuringJobKillTest() throws IOException, InterruptedException, ClassNotFoundException { Path input = getInDir(); Path output = getOutDir(); _fileSystem.delete(input, true); _fileSystem.delete(output, true); // 1500 * 50 = 75,000 writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1"); // 100 * 5000 = 500,000 writeRecordsFile(new Path(input, "part2"), 1, 5000, 2000, 100, "cf1"); Job job = Job.getInstance(_conf, "blur index"); job.setJarByClass(BlurOutputFormatTest.class); job.setMapperClass(CsvBlurMapper.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.addInputPath(job, input); CsvBlurMapper.addColumns(job, "cf1", "col"); Path tablePath = new Path(new Path(_root, "table"), "test"); TableDescriptor tableDescriptor = new TableDescriptor(); tableDescriptor.setShardCount(2); tableDescriptor.setTableUri(tablePath.toString()); tableDescriptor.setName("test"); createShardDirectories(getOutDir(), 2); BlurOutputFormat.setupJob(job, tableDescriptor); BlurOutputFormat.setOutputPath(job, output); BlurOutputFormat.setIndexLocally(job, false); job.submit(); boolean killCalled = false; while (!job.isComplete()) { Thread.sleep(1000); System.out.printf("Killed [" + killCalled + "] Map [%f] Reduce [%f]%n", job.mapProgress() * 100, job.reduceProgress() * 100); if (job.reduceProgress() > 0.7 && !killCalled) { job.killJob(); killCalled = true; } } assertFalse(job.isSuccessful()); for (int i = 0; i < tableDescriptor.getShardCount(); i++) { Path path = new Path(output, ShardUtil.getShardName(i)); FileSystem fileSystem = path.getFileSystem(job.getConfiguration()); FileStatus[] listStatus = fileSystem.listStatus(path); assertEquals(toString(listStatus), 0, listStatus.length); } } private String toString(FileStatus[] listStatus) { if (listStatus == null || listStatus.length == 0) { return ""; } String s = ""; for (FileStatus fileStatus : listStatus) { if (s.length() > 0) { s += ","; } s += fileStatus.getPath(); } return s; } public static String readFile(Path file) throws IOException { DataInputStream f = _fileSystem.open(file); BufferedReader b = new BufferedReader(new InputStreamReader(f)); StringBuilder result = new StringBuilder(); String line = b.readLine(); while (line != null) { result.append(line); result.append('\n'); line = b.readLine(); } b.close(); return result.toString(); } private Path writeRecordsFile(Path file, int starintgRowId, int numberOfRows, int startRecordId, int numberOfRecords, String family) throws IOException { _fileSystem.delete(file, false); DataOutputStream f = _fileSystem.create(file); PrintWriter writer = new PrintWriter(f); for (int row = 0; row < numberOfRows; row++) { for (int record = 0; record < numberOfRecords; record++) { writer.println(getRecord(row + starintgRowId, record + startRecordId, family)); } } writer.close(); return file; } private void createShardDirectories(Path outDir, int shardCount) throws IOException { _fileSystem.mkdirs(outDir); for (int i = 0; i < shardCount; i++) { _fileSystem.mkdirs(new Path(outDir, ShardUtil.getShardName(i))); } } private String getRecord(int rowId, int recordId, String family) { return rowId + "," + recordId + "," + family + ",valuetoindex"; } }