/******************************************************************************* * Copyright 2013 * TU Darmstadt, FG Sprachtechnologie * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package org.dkpro.bigdata.io.hadoop; import static org.junit.Assert.assertEquals; import java.io.IOException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.JobConf; import org.dkpro.bigdata.io.hadoop.LeipzigInputFormat; import org.dkpro.bigdata.io.hadoop.LeipzigInputFormat.LeipzigRecordReader; import org.junit.Before; import org.junit.Test; /** * Unit tests for webcorpus.common.io.LeipzigInputFormat * * @author Johannes Simon * */ public class LeipzigInputFormatTest extends InputFormatTest { JobConf jobConf; // Test file is 364 bytes (not characters!) in size final static String SIMPLE_ARCHIVE = "src/test/resources/leipzig/simple-archive.leipzig"; final static String PROBLEMATIC_ARCHIVE = "src/test/resources/leipzig/problematic-archive.leipzig"; //final static String COMPLEX_ARCHIVE = "src/test/resources/leipzig/complex-archive.leipzig"; final static String SEMI_COMPLEX_ARCHIVE = "src/test/resources/leipzig/semi-complex-archive.leipzig"; //final static String LARGE_RECORD_ARCHIVE = "src/test/resources/leipzig/large-record-archive.leipzig"; @Before public void init() { jobConf = new JobConf(LeipzigInputFormatTest.class); } @Test public void testReadFromStart() throws IOException { FileSplit inputSplit = new FileSplit(new Path(SIMPLE_ARCHIVE), 0, 364, (String[])null); LeipzigRecordReader recordReader = new LeipzigRecordReader(inputSplit, jobConf); checkNRecordsRemaining(recordReader, 2); } @Test public void testReadParts() throws IOException { // If split ends within a record (e.g. 1 byte is well within the first record) // then RecordReader must read until the end of the record FileSplit inputSplit1 = new FileSplit(new Path(SIMPLE_ARCHIVE), 0, 1, (String[])null); // Also, if split begins within a record then RecordReader must start at // beginning of next record FileSplit inputSplit2 = new FileSplit(new Path(SIMPLE_ARCHIVE), 1, 364, (String[])null); LeipzigRecordReader recordReader1 = new LeipzigRecordReader(inputSplit1, jobConf); LeipzigRecordReader recordReader2 = new LeipzigRecordReader(inputSplit2, jobConf); checkNRecordsRemaining(recordReader1, 1); checkNRecordsRemaining(recordReader2, 1); } @Test public void testReadEmptyPart() throws IOException { // If split starts and ends within a record, than no record should be read FileSplit inputSplit = new FileSplit(new Path(SIMPLE_ARCHIVE), 1, 1, (String[])null); LeipzigRecordReader recordReader = new LeipzigRecordReader(inputSplit, jobConf); checkNRecordsRemaining(recordReader, 0); } final static int offsetInRecord1 = 10; final static int offsetInRecord2 = 200; @Test public void testReadFromOffset1() throws IOException { FileSplit inputSplit = new FileSplit(new Path(SIMPLE_ARCHIVE), offsetInRecord1, 364, (String[])null); LeipzigRecordReader recordReader = new LeipzigRecordReader(inputSplit, jobConf); checkNRecordsRemaining(recordReader, 1); } @Test public void testReadFromOffset2() throws IOException { FileSplit inputSplit = new FileSplit(new Path(SIMPLE_ARCHIVE), offsetInRecord2, 364, (String[])null); LeipzigRecordReader recordReader = new LeipzigRecordReader(inputSplit, jobConf); checkNRecordsRemaining(recordReader, 0); } @Test public void testReadProblematicArchive() throws IOException { Path filePath = new Path(PROBLEMATIC_ARCHIVE); FileSplit inputSplit = new FileSplit(filePath, 0, Integer.MAX_VALUE, (String[])null); LeipzigRecordReader recordReader = new LeipzigRecordReader(inputSplit, jobConf); checkNRecordsRemaining(recordReader, 9); } /* @Test public void testReadComplexArchive() throws IOException { Path filePath = new Path(COMPLEX_ARCHIVE); FileSplit inputSplit = new FileSplit(filePath, 0, Integer.MAX_VALUE, (String[])null); LeipzigRecordReader recordReader = new LeipzigRecordReader(inputSplit, jobConf); checkNRecordsRemaining(recordReader, 438); } */ @Test public void testReadSimpleArchiveInSplits() throws IOException { LeipzigInputFormat inputFormat = new LeipzigInputFormat(); String archiveFile = SIMPLE_ARCHIVE; assertEquals(2, readArchiveInSplits(archiveFile, 10, inputFormat, jobConf)); assertEquals(2, readArchiveInSplits(archiveFile, 1024 * 1, inputFormat, jobConf)); assertEquals(2, readArchiveInSplits(archiveFile, 1024 * 10, inputFormat, jobConf)); assertEquals(2, readArchiveInSplits(archiveFile, 1024 * 100, inputFormat, jobConf)); assertEquals(2, readArchiveInSplits(archiveFile, 1024 * 1000, inputFormat, jobConf)); } @Test public void testReadSemiComplexArchive() throws IOException { LeipzigInputFormat inputFormat = new LeipzigInputFormat(); String archiveFile = SEMI_COMPLEX_ARCHIVE; assertEquals(5, readArchiveInSplits(archiveFile, 1024 * 1000, inputFormat, jobConf)); assertEquals(5, readArchiveInSplits(archiveFile, 1024 * 100, inputFormat, jobConf)); assertEquals(5, readArchiveInSplits(archiveFile, 1024 * 10, inputFormat, jobConf)); assertEquals(5, readArchiveInSplits(archiveFile, Integer.MAX_VALUE, inputFormat, jobConf)); } @Test public void testReadSemiComplexArchiveInSplits() throws IOException { LeipzigInputFormat inputFormat = new LeipzigInputFormat(); String archiveFile = SEMI_COMPLEX_ARCHIVE; assertEquals(5, readArchiveInSplits(archiveFile, 1024 * 1000, inputFormat, jobConf)); assertEquals(5, readArchiveInSplits(archiveFile, 1024 * 100, inputFormat, jobConf)); assertEquals(5, readArchiveInSplits(archiveFile, 1024 * 10, inputFormat, jobConf)); assertEquals(5, readArchiveInSplits(archiveFile, 10, inputFormat, jobConf)); } /* @Test public void testReadLargeRecordArchiveInSplits() throws IOException { LeipzigInputFormat inputFormat = new LeipzigInputFormat(); String archiveFile = LARGE_RECORD_ARCHIVE; assertEquals(2, readArchiveInSplits(archiveFile, Integer.MAX_VALUE, inputFormat, jobConf)); } */ }