/*******************************************************************************
 * Copyright 2013
 * TU Darmstadt, FG Sprachtechnologie
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.io.hadoop;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.dkpro.bigdata.io.hadoop.WARCInputFormat.WARCRecordReader;
import org.junit.Before;
import org.junit.Test;

/**
 * Unit tests for {@link WARCInputFormat}, exercising its {@link WARCRecordReader}
 * on a small WARC fixture read from the beginning and from two non-zero offsets.
 *
 * @author Johannes Simon
 *
 */
public class WARCInputFormatTest
    extends InputFormatTest
{
    /** Job configuration handed to every record reader; recreated before each test. */
    JobConf jobConf;

    /** Location of the WARC fixture, relative to the module directory. */
    final static String ARCHIVE_SIMPLE = "src/test/resources/warc/simple-archive.warc";

    /**
     * Split length passed to every {@link FileSplit} in this class, regardless of the
     * start offset. NOTE(review): presumably the size of the fixture in bytes - confirm.
     */
    private static final long SPLIT_LENGTH = 2177;

    /**
     * Start offset used by {@link #testReadFromOffset1()}.
     * NOTE(review): the name suggests a position inside the first record - confirm.
     */
    final static int offsetInRecord1 = 10;

    /**
     * Start offset used by {@link #testReadFromOffset2()}.
     * NOTE(review): the name suggests a position inside the second record - confirm.
     */
    final static int offsetInRecord2 = 792;

    /**
     * Creates a fresh {@link JobConf} for each test.
     */
    @Before
    public void init()
    {
        jobConf = new JobConf(WARCInputFormatTest.class);
    }

    /**
     * Reads the fixture from offset 0 and expects two records.
     *
     * @throws IOException
     *             if the reader cannot be created or reading fails
     */
    @Test
    public void testReadSimpleArchive()
        throws IOException
    {
        WARCRecordReader recordReader = openReader(0);
        // The archive is expected to yield only 2 records; per the original author's
        // note, the leading version block is only meta data and is not counted.
        checkNRecordsRemaining(recordReader, 2);
    }

    /**
     * Reads the fixture starting at {@link #offsetInRecord1} and expects two records.
     *
     * @throws IOException
     *             if the reader cannot be created or reading fails
     */
    @Test
    public void testReadFromOffset1()
        throws IOException
    {
        WARCRecordReader recordReader = openReader(offsetInRecord1);
        checkNRecordsRemaining(recordReader, 2);
    }

    /**
     * Reads the fixture starting at {@link #offsetInRecord2} and expects one record.
     *
     * @throws IOException
     *             if the reader cannot be created or reading fails
     */
    @Test
    public void testReadFromOffset2()
        throws IOException
    {
        WARCRecordReader recordReader = openReader(offsetInRecord2);
        checkNRecordsRemaining(recordReader, 1);
    }

    /**
     * Builds a record reader over {@link #ARCHIVE_SIMPLE} for a split that starts at
     * the given offset and has length {@link #SPLIT_LENGTH}.
     *
     * @param start
     *            start offset of the split within the fixture
     * @return a new reader configured with the current {@link #jobConf}
     * @throws IOException
     *             if the reader cannot be created
     */
    private WARCRecordReader openReader(long start)
        throws IOException
    {
        Path filePath = new Path(ARCHIVE_SIMPLE);
        // The cast selects the (Path, long, long, String[]) constructor; a bare null
        // would be ambiguous between FileSplit's overloads.
        FileSplit inputSplit = new FileSplit(filePath, start, SPLIT_LENGTH, (String[]) null);
        return new WARCRecordReader(inputSplit, jobConf);
    }
}