/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.hadoop.rcfile;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import com.cloudera.cdk.morphline.api.AbstractMorphlineTest;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.Fields;
import com.google.common.collect.Lists;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;
/**
 * Tests the readRCFile morphline command by writing small RCFiles into a
 * MiniDFSCluster and verifying that the morphline emits the expected records
 * in both row-wise and column-wise read modes.
 */
public class ReadRCFileTest extends AbstractMorphlineTest {
  private static MiniDFSCluster cluster = null;
  private static FileSystem dfs = null;
  private Path testDirectory;
  private static final int NUM_RECORDS = 5;
  private static final int NUM_COLUMNS = 5;

  /** Starts a single shared MiniDFSCluster for all tests in this class. */
  @BeforeClass
  public static void setupFS() throws IOException {
    final Configuration conf = new Configuration();
    cluster = new MiniDFSCluster.Builder(conf).build();
    dfs = cluster.getFileSystem();
  }

  /** Shuts down the shared MiniDFSCluster after all tests have run. */
  @AfterClass
  public static void teardownFS() throws IOException {
    dfs = null;
    if (cluster != null) {
      cluster.shutdown();
      cluster = null;
    }
  }

  @Before
  public void setUp() throws Exception {
    super.setUp();
    // Fresh per-test directory; a local temp dir path is used as the working dir.
    testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  }

  @After
  public void tearDown() throws Exception {
    super.tearDown();
    dfs.delete(testDirectory, true);
  }

  /**
   * Verifies that a row-wise morphline configuration reads back every row of
   * the generated RCFile with all columns present on each record.
   */
  @Test
  public void testRCFileRowWise() throws Exception {
    morphline = createMorphline("test-morphlines/rcFileMorphlineRow");
    String rcFileName = "testRCFileRowWise.rc";
    List<Record> expected = setupRCFile(rcFileName, NUM_RECORDS, NUM_COLUMNS,
        true);
    Path inputFile = dfs.makeQualified(new Path(testDirectory, rcFileName));
    Record input = new Record();
    input.put(Fields.ATTACHMENT_NAME, inputFile.toString());
    input.put(Fields.ATTACHMENT_BODY, readPath(inputFile));
    startSession();
    assertEquals(1, collector.getNumStartEvents());
    assertTrue(morphline.process(input));
    assertTrue(areFieldsEqual(expected, collector.getRecords(), NUM_COLUMNS,
        NUM_RECORDS, true));
  }

  /**
   * Verifies that a column-wise morphline configuration emits one record per
   * cell, grouped column by column, matching the generated RCFile contents.
   */
  @Test
  public void testRCFileColumnWise() throws Exception {
    morphline = createMorphline("test-morphlines/rcFileMorphlineColumn");
    String rcFileName = "testRCFileColumnWise.rc";
    List<Record> expected = setupRCFile(rcFileName, NUM_RECORDS, NUM_COLUMNS,
        false);
    Path inputFile = dfs.makeQualified(new Path(testDirectory, rcFileName));
    Record input = new Record();
    input.put(Fields.ATTACHMENT_NAME, inputFile.toString());
    input.put(Fields.ATTACHMENT_BODY, readPath(inputFile));
    startSession();
    assertEquals(1, collector.getNumStartEvents());
    assertTrue(morphline.process(input));
    assertTrue(areFieldsEqual(expected, collector.getRecords(), NUM_COLUMNS,
        NUM_RECORDS, false));
  }

  /**
   * Writes an RCFile with {@code numRecords} rows of {@code maxColumns}
   * columns; each cell holds the Text "ROW-NUM:r, COLUMN-NUM:c".
   *
   * @param fileName file name created under the test directory
   * @param numRecords number of rows to write
   * @param maxColumns number of columns per row
   * @throws IOException if the file cannot be written
   */
  private void createRCFile(final String fileName, final int numRecords,
      final int maxColumns) throws IOException {
    SequenceFile.Metadata metadata = getMetadataForRCFile();
    Configuration conf = new Configuration();
    // RCFile requires the column count to be declared up front via conf.
    conf.set(RCFile.COLUMN_NUMBER_CONF_STR, String.valueOf(maxColumns));
    Path inputFile = dfs.makeQualified(new Path(testDirectory, fileName));
    RCFile.Writer rcFileWriter = new RCFile.Writer(dfs, conf, inputFile, null,
        metadata, null);
    try {
      for (int row = 0; row < numRecords; row++) {
        BytesRefArrayWritable dataWrite = new BytesRefArrayWritable(maxColumns);
        dataWrite.resetValid(maxColumns);
        for (int column = 0; column < maxColumns; column++) {
          Text sampleText = new Text(
              "ROW-NUM:" + row + ", COLUMN-NUM:" + column);
          ByteArrayDataOutput dataOutput = ByteStreams.newDataOutput();
          sampleText.write(dataOutput);
          dataWrite.set(column, new BytesRefWritable(dataOutput.toByteArray()));
        }
        rcFileWriter.append(dataWrite);
      }
    } finally {
      // Close in a finally block so a failed append does not leak the writer
      // (and its underlying DFS output stream).
      rcFileWriter.close();
    }
  }

  /**
   * Opens the given path for reading. The returned stream is handed to the
   * morphline as the attachment body; the morphline is expected to consume
   * and close it.
   */
  private InputStream readPath(final Path inputFile) throws IOException {
    FileSystem fs = inputFile.getFileSystem(new Configuration());
    return fs.open(inputFile);
  }

  /**
   * Creates the RCFile on disk and builds the list of records the morphline
   * is expected to produce for it.
   *
   * @param rowWise if true, one expected record per row containing all
   *        columns; if false, one expected record per cell, ordered column by
   *        column
   * @return the expected records in emission order
   */
  private List<Record> setupRCFile(final String fileName, final int numRecords,
      final int maxColumns, final boolean rowWise)
      throws IOException {
    createRCFile(fileName, numRecords, maxColumns);
    List<Record> expected = Lists.newArrayList();
    if (rowWise) {
      // Row-wise: each record carries every column of one row.
      for (int row = 0; row < numRecords; row++) {
        Record record = new Record();
        for (int column = 0; column < maxColumns; column++) {
          Text sampleText = new Text("ROW-NUM:" + row + ", COLUMN-NUM:"
              + column);
          record.put("field" + (column + 1), sampleText);
        }
        expected.add(record);
      }
    } else {
      // Column-wise: one record per cell, all rows of a column before the
      // next column.
      for (int column = 0; column < maxColumns; column++) {
        for (int row = 0; row < numRecords; row++) {
          Record record = new Record();
          Text sampleText = new Text("ROW-NUM:" + row + ", COLUMN-NUM:"
              + column);
          record.put("field" + (column + 1), sampleText);
          expected.add(record);
        }
      }
    }
    return expected;
  }

  /** Returns the SequenceFile metadata to embed in the generated RCFile. */
  private SequenceFile.Metadata getMetadataForRCFile() {
    return RCFile.createMetadata(new Text("metaField"), new Text("metaValue"));
  }

  /**
   * Compares expected and actual record lists field by field.
   *
   * @param rowWiseCheck if true, records are compared pairwise with all
   *        columns checked per record; if false, records are addressed as
   *        {@code (column * rowSize) + row} and a single field is checked
   * @return true iff both lists have the same size and all checked fields
   *         match
   */
  private boolean areFieldsEqual(List<Record> expected, List<Record> actual,
      final int columnSize, final int rowSize, final boolean rowWiseCheck) {
    if (expected.size() != actual.size()) {
      return false;
    }
    if (rowWiseCheck) {
      for (int i = 0; i < actual.size(); i++) {
        Record currentExpected = expected.get(i);
        Record currentActual = actual.get(i);
        if (!areRecordColumnsEqual(currentActual, currentExpected, columnSize)) {
          return false;
        }
      }
    } else {
      for (int i = 0; i < columnSize; i++) {
        String fieldName = "field" + (i + 1);
        for (int j = 0; j < rowSize; j++) {
          Record currentExpected = expected.get((i * rowSize) + j);
          Record currentActual = actual.get((i * rowSize) + j);
          if (!isRecordColumnEqual(currentActual, currentExpected, fieldName)) {
            return false;
          }
        }
      }
    }
    return true;
  }

  /** Returns true iff all {@code columnSize} fields match between the records. */
  private boolean areRecordColumnsEqual(final Record actual,
      final Record expected, final int columnSize) {
    for (int i = 0; i < columnSize; i++) {
      String fieldName = "field" + (i + 1);
      if (!isRecordColumnEqual(actual, expected, fieldName)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true iff the named field is equal in both records. Throws NPE if
   * the field is absent from {@code actual}, which also fails the test.
   */
  private boolean isRecordColumnEqual(final Record actual,
      final Record expected, final String fieldName) {
    return actual.get(fieldName).equals(expected.get(fieldName));
  }
}