/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.text;

import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.Charsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests {@link SequenceFilesFromDirectory} in both its "sequential" and "mapreduce" modes,
 * once against a flat input directory and once against a chain of nested directories.
 *
 * <p>Each test writes small text files, runs the tool through its {@code main} entry point and
 * then verifies that every record found in the produced sequence file maps back to one of the
 * input files with the expected content.</p>
 */
public final class TestSequenceFilesFromDirectory extends MahoutTestCase {

  private static final Logger logger = LoggerFactory.getLogger(TestSequenceFilesFromDirectory.class);

  /** Key prefix handed to the tool via {@code --keyPrefix}; every record key starts with it. */
  private static final String KEY_PREFIX = "UID";

  /** Name of the single file holding one text document inside each nested directory. */
  private static final String NESTED_FILE_NAME = "file.txt";

  /** Name of the only chunk file the sequential mode is expected to write. */
  private static final String SEQUENTIAL_OUTPUT_NAME = "chunk-0";

  /** Name of the map output file inspected after a mapreduce run. */
  private static final String MR_OUTPUT_NAME = "part-m-00000";

  /** {file name, content} pairs written directly into a flat input directory. */
  private static final String[][] DATA1 = {
    {"test1", "This is the first text."},
    {"test2", "This is the second text."},
    {"test3", "This is the third text."}
  };

  /** {directory name, content} pairs; each directory is nested inside the previous one. */
  private static final String[][] DATA2 = {
    {"recursive_test1", "This is the first text."},
    {"recursive_test2", "This is the second text."},
    {"recursive_test3", "This is the third text."}
  };

  /** Runs the tool with {@code --method sequential} on a flat and on a nested input directory. */
  @Test
  public void testSequenceFileFromDirectoryBasic() throws Exception {
    // parameters
    Configuration configuration = getConfiguration();

    FileSystem fs = FileSystem.get(configuration);

    // create
    Path tmpDir = this.getTestTempDirPath();
    Path inputDir = new Path(tmpDir, "inputDir");
    fs.mkdirs(inputDir);

    Path outputDir = new Path(tmpDir, "outputDir");
    Path outputDirRecursive = new Path(tmpDir, "outputDirRecursive");

    Path inputDirRecursive = new Path(tmpDir, "inputDirRecur");
    fs.mkdirs(inputDirRecursive);

    // prepare input files
    createFilesFromArrays(configuration, inputDir, DATA1);

    SequenceFilesFromDirectory.main(sequentialArgs(inputDir, outputDir));

    // check output chunk files
    checkChunkFiles(configuration, outputDir, DATA1, KEY_PREFIX);

    createRecursiveDirFilesFromArrays(configuration, inputDirRecursive, DATA2);

    FileStatus fstInputPath = fs.getFileStatus(inputDirRecursive);
    String dirs = HadoopUtil.buildDirList(fs, fstInputPath);

    logger.info("\n\n ----- recursive dirs: {}", dirs);

    SequenceFilesFromDirectory.main(sequentialArgs(inputDirRecursive, outputDirRecursive));

    checkRecursiveChunkFiles(configuration, outputDirRecursive, DATA2, KEY_PREFIX);
  }

  /** Runs the tool with {@code --method mapreduce} on a flat and on a nested input directory. */
  @Test
  public void testSequenceFileFromDirectoryMapReduce() throws Exception {

    Configuration conf = getConfiguration();

    FileSystem fs = FileSystem.get(conf);

    // create
    Path tmpDir = this.getTestTempDirPath();
    Path inputDir = new Path(tmpDir, "inputDir");
    fs.mkdirs(inputDir);

    Path inputDirRecur = new Path(tmpDir, "inputDirRecur");
    fs.mkdirs(inputDirRecur);

    Path mrOutputDir = new Path(tmpDir, "mrOutputDir");
    Path mrOutputDirRecur = new Path(tmpDir, "mrOutputDirRecur");

    createFilesFromArrays(conf, inputDir, DATA1);

    SequenceFilesFromDirectory.main(mapReduceArgs(conf, inputDir, mrOutputDir));

    checkMRResultFiles(conf, mrOutputDir, DATA1, KEY_PREFIX);

    createRecursiveDirFilesFromArrays(conf, inputDirRecur, DATA2);

    FileStatus fstInputPath = fs.getFileStatus(inputDirRecur);
    String dirs = HadoopUtil.buildDirList(fs, fstInputPath);

    logger.info("\n\n ---- recursive dirs: {}", dirs);

    SequenceFilesFromDirectory.main(mapReduceArgs(conf, inputDirRecur, mrOutputDirRecur));

    checkMRResultFilesRecursive(conf, mrOutputDirRecur, DATA2, KEY_PREFIX);
  }

  /** Builds the command line for a sequential run reading {@code input} and writing {@code output}. */
  private static String[] sequentialArgs(Path input, Path output) {
    return new String[]{
      "--input", input.toString(),
      "--output", output.toString(),
      "--chunkSize", "64",
      "--charset", Charsets.UTF_8.name(),
      "--keyPrefix", KEY_PREFIX,
      "--method", "sequential"};
  }

  /**
   * Builds the command line for a mapreduce run reading {@code input} and writing {@code output}.
   * The {@code hadoop.tmp.dir} value of the test configuration is forwarded as a {@code -D} option.
   */
  private static String[] mapReduceArgs(Configuration conf, Path input, Path output) {
    return new String[]{
      "-Dhadoop.tmp.dir=" + conf.get("hadoop.tmp.dir"),
      "--input", input.toString(),
      "--output", output.toString(),
      "--chunkSize", "64",
      "--charset", Charsets.UTF_8.name(),
      "--method", "mapreduce",
      "--keyPrefix", KEY_PREFIX,
      "--fileFilterClass", "org.apache.mahout.text.TestPathFilter"};
  }

  /**
   * Writes one UTF-8 file per entry of {@code data} directly into {@code inputDir}.
   *
   * @param conf     configuration used to obtain the file system
   * @param inputDir directory receiving the files
   * @param data     {file name, content} pairs
   * @throws IOException if a file cannot be created or written
   */
  private static void createFilesFromArrays(Configuration conf, Path inputDir, String[][] data) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (String[] aData : data) {
      try (OutputStreamWriter writer =
               new OutputStreamWriter(fs.create(new Path(inputDir, aData[0])), Charsets.UTF_8)) {
        writer.write(aData[1]);
      }
    }
  }

  /**
   * Creates a chain of nested directories below {@code inputDir}, one level per entry of
   * {@code data}, and writes a UTF-8 {@code file.txt} with the entry's content into each level.
   *
   * <p>Directories are created through the Hadoop {@link FileSystem} rather than
   * {@code java.io.File}, so the path is interpreted by the same file system that later writes
   * and reads the files, and a failed directory creation is reported instead of being ignored.</p>
   *
   * @param configuration configuration used to obtain the file system
   * @param inputDir      root below which the nested directories are created
   * @param data          {directory name, content} pairs
   * @throws IOException if a directory or file cannot be created or written
   */
  private static void createRecursiveDirFilesFromArrays(Configuration configuration, Path inputDir,
                                                        String[][] data) throws IOException {
    FileSystem fs = FileSystem.get(configuration);

    logger.info("createRecursiveDirFilesFromArrays > based on: {}", inputDir);

    Path currentDir = inputDir;
    for (String[] aData : data) {
      // Each entry goes one level deeper than the previous one.
      currentDir = new Path(currentDir, aData[0]);
      if (!fs.mkdirs(currentDir)) {
        throw new IOException("Could not create directory " + currentDir);
      }

      Path filePath = new Path(currentDir, NESTED_FILE_NAME);
      try (OutputStreamWriter writer = new OutputStreamWriter(fs.create(filePath), Charsets.UTF_8)) {
        writer.write(aData[1]);
      }
      logger.info("Created file: {}", filePath);
    }
  }

  /**
   * Verifies the output of a sequential run over a flat input directory: exactly one
   * {@code chunk-0} file whose records all map back to {@code data}.
   */
  private static void checkChunkFiles(Configuration configuration, Path outputDir, String[][] data,
                                      String prefix) throws IOException {
    Path chunk = findSingleOutputFile(configuration, outputDir, SEQUENTIAL_OUTPUT_NAME);
    assertRecordsMatch(configuration, chunk, flatExpectations(data, prefix));
  }

  /**
   * Verifies the output of a sequential run over the nested input directories: exactly one
   * {@code chunk-0} file whose records all map back to {@code data}.
   */
  private static void checkRecursiveChunkFiles(Configuration configuration, Path outputDir, String[][] data,
                                               String prefix) throws IOException {
    logger.info(" ----------- check_Recursive_ChunkFiles ------------");
    Path chunk = findSingleOutputFile(configuration, outputDir, SEQUENTIAL_OUTPUT_NAME);
    assertRecordsMatch(configuration, chunk, nestedExpectations(data, prefix));
  }

  /**
   * Verifies the {@code part-m-00000} file of a mapreduce run over a flat input directory:
   * every record in it must map back to {@code data}.
   */
  private static void checkMRResultFiles(Configuration conf, Path outputDir, String[][] data,
                                         String prefix) throws IOException {
    Path part = findSingleOutputFile(conf, outputDir.suffix("/" + MR_OUTPUT_NAME), MR_OUTPUT_NAME);
    assertRecordsMatch(conf, part, flatExpectations(data, prefix));
  }

  /**
   * Verifies the {@code part-m-00000} file of a mapreduce run over the nested input directories:
   * every record in it must map back to {@code data}.
   */
  private static void checkMRResultFilesRecursive(Configuration configuration, Path outputDir, String[][] data,
                                                  String prefix) throws IOException {
    Path part = findSingleOutputFile(configuration, outputDir.suffix("/" + MR_OUTPUT_NAME), MR_OUTPUT_NAME);
    assertRecordsMatch(configuration, part, nestedExpectations(data, prefix));
  }

  /**
   * Lists {@code location} through {@link PathFilters#logsCRCFilter()} and asserts that exactly
   * one entry is returned and that it is named {@code expectedName}.
   *
   * @return the path of that single entry
   */
  private static Path findSingleOutputFile(Configuration configuration, Path location,
                                           String expectedName) throws IOException {
    FileSystem fs = FileSystem.get(configuration);
    FileStatus[] fileStatuses = fs.listStatus(location, PathFilters.logsCRCFilter());
    assertEquals(1, fileStatuses.length); // only one
    Path found = fileStatuses[0].getPath();
    assertEquals(expectedName, found.getName());
    return found;
  }

  /** Expected key to content mapping for a flat directory: {@code prefix/<file name>}. */
  private static Map<String, String> flatExpectations(String[][] data, String prefix) {
    Map<String, String> fileToData = new HashMap<>();
    for (String[] aData : data) {
      fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
    }
    return fileToData;
  }

  /**
   * Expected key to content mapping for the nested layout: the key of entry {@code i} is the
   * prefix followed by the directory names of entries {@code 0..i} and then {@code file.txt}.
   */
  private static Map<String, String> nestedExpectations(String[][] data, String prefix) {
    Map<String, String> fileToData = new HashMap<>();
    String currentPath = prefix;
    for (String[] aData : data) {
      currentPath += Path.SEPARATOR + aData[0];
      fileToData.put(currentPath + Path.SEPARATOR + NESTED_FILE_NAME, aData[1]);
    }
    return fileToData;
  }

  /**
   * Reads every record of the sequence file at {@code sequenceFile} and asserts that its trimmed
   * key is present in {@code expected} and that its trimmed value equals the expected content.
   */
  private static void assertRecordsMatch(Configuration configuration, Path sequenceFile,
                                         Map<String, String> expected) throws IOException {
    try (SequenceFileIterator<Text, Text> iterator =
             new SequenceFileIterator<>(sequenceFile, true, configuration)) {
      while (iterator.hasNext()) {
        Pair<Text, Text> record = iterator.next();
        String key = record.getFirst().toString().trim();
        String value = record.getSecond().toString().trim();
        logger.debug("{} >> {}", key, value);

        String retrievedData = expected.get(key);
        assertNotNull("Unexpected record key: " + key, retrievedData);
        assertEquals(retrievedData, value);
      }
    }
  }
}