/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.text;

import java.io.IOException;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import static org.apache.mahout.text.SequenceFilesFromDirectory.FILE_FILTER_CLASS_OPTION;

/**
 * RecordReader used with the MultipleTextFileInputFormat class to read full files as
 * k/v pairs and groups of files as single input splits. The key is the file's index
 * within the CombineFileSplit and the value is the file's raw bytes.
 */
public class WholeFileRecordReader extends RecordReader<IntWritable, BytesWritable> {

  private FileSplit fileSplit;
  private boolean processed = false;
  private Configuration configuration;
  private BytesWritable value = new BytesWritable();
  private IntWritable index;
  private String fileFilterClassName = null;
  private PathFilter pathFilter = null;

  public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext taskAttemptContext, Integer idx)
      throws IOException {
    this.fileSplit = new FileSplit(fileSplit.getPath(idx), fileSplit.getOffset(idx),
        fileSplit.getLength(idx), fileSplit.getLocations());
    this.configuration = taskAttemptContext.getConfiguration();
    this.index = new IntWritable(idx);
    this.fileFilterClassName = this.configuration.get(FILE_FILTER_CLASS_OPTION[0]);
  }

  @Override
  public IntWritable getCurrentKey() {
    return index;
  }

  @Override
  public BytesWritable getCurrentValue() {
    return value;
  }

  @Override
  public float getProgress() throws IOException {
    return processed ? 1.0f : 0.0f;
  }

  @Override
  public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
      throws IOException, InterruptedException {
    // Instantiate a custom PathFilter only when one other than the default PrefixAdditionFilter is configured.
    if (!StringUtils.isBlank(fileFilterClassName) &&
        !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
      try {
        pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance();
      } catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
        throw new IllegalStateException(e);
      }
    }
  }

  @Override
  public boolean nextKeyValue() throws IOException {
    if (!processed) {
      byte[] contents = new byte[(int) fileSplit.getLength()];
      Path file = fileSplit.getPath();
      FileSystem fs = file.getFileSystem(this.configuration);

      if (!fs.isFile(file)) {
        return false;
      }

      FileStatus[] fileStatuses;
      if (pathFilter != null) {
        fileStatuses = fs.listStatus(file, pathFilter);
      } else {
        fileStatuses = fs.listStatus(file);
      }

      if (fileStatuses.length == 1) {
        // Read the entire file into the value; the key is the file's index within the CombineFileSplit.
        try (FSDataInputStream in = fs.open(fileStatuses[0].getPath())) {
          IOUtils.readFully(in, contents, 0, contents.length);
          value.setCapacity(contents.length);
          value.set(contents, 0, contents.length);
        }
        processed = true;
        return true;
      }
    }
    return false;
  }

  @Override
  public void close() throws IOException {
    // Nothing to close; the input stream is closed in nextKeyValue().
  }
}
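
// A minimal usage sketch, not part of the original class: this reader emits one
// (IntWritable file index, BytesWritable file contents) pair per file and, per the class
// javadoc, is used through MultipleTextFileInputFormat. The driver wiring below is an
// assumption for illustration only; job name, paths, and output format are hypothetical.
//
//   Configuration conf = new Configuration();
//   Job job = Job.getInstance(conf, "whole-file-example");
//   job.setInputFormatClass(MultipleTextFileInputFormat.class);
//   job.setOutputKeyClass(IntWritable.class);
//   job.setOutputValueClass(BytesWritable.class);
//   job.setOutputFormatClass(SequenceFileOutputFormat.class);
//   FileInputFormat.addInputPath(job, new Path("/path/to/input"));
//   SequenceFileOutputFormat.setOutputPath(job, new Path("/path/to/output"));
//   job.waitForCompletion(true);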