/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.hadoop.rdf.io.input.readers;

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.jena.hadoop.rdf.io.RdfIOConstants;
import org.apache.jena.hadoop.rdf.io.input.util.RdfIOUtils;
import org.apache.jena.hadoop.rdf.io.input.util.TrackableInputStream;
import org.apache.jena.hadoop.rdf.io.input.util.TrackedInputStream;
import org.apache.jena.hadoop.rdf.io.input.util.TrackedPipedRDFStream;
import org.apache.jena.hadoop.rdf.types.AbstractNodeTupleWritable;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.ReaderRIOT;
import org.apache.jena.riot.lang.PipedRDFIterator;
import org.apache.jena.riot.lang.PipedRDFStream;
import org.apache.jena.riot.system.ParserProfile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An abstract implementation of a record reader that reads records from whole
 * files, i.e. the whole file must be kept together to allow tuples to be
 * successfully read. This currently only supports reading from file splits.
 * <p>
 * The keys produced are the approximate position in the file at which a tuple
 * was found and the values will be node tuples. Positions are approximate
 * because they are recorded after the point at which the most recent tuple was
 * parsed from the input, so they reflect the approximate position in the
 * stream immediately after the tuple was found.
 * </p>
 * <p>
 * Be aware that with whole file formats syntax compressions in the format may
 * mean that multiple tuples are produced with the same position and thus the
 * same key.
 * </p>
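 * <p>
 * As a rough illustration, a triple-based reader might be derived along the
 * following lines. This is a hypothetical sketch rather than a reference to a
 * concrete class in this module; it assumes that triple-oriented counterparts
 * such as {@code TrackedPipedTriplesStream} and {@code TripleWritable} are
 * available:
 * </p>
 * <pre>{@code
 * public class TurtleTripleReader extends AbstractWholeFileNodeTupleReader<Triple, TripleWritable> {
 *     protected PipedRDFIterator<Triple> getPipedIterator() {
 *         return new PipedRDFIterator<>();
 *     }
 *
 *     protected TrackedPipedRDFStream<Triple> getPipedStream(PipedRDFIterator<Triple> iterator,
 *             TrackableInputStream input) {
 *         return new TrackedPipedTriplesStream(iterator, input);
 *     }
 *
 *     protected Lang getRdfLanguage() {
 *         return Lang.TURTLE;
 *     }
 *
 *     protected TripleWritable createInstance(Triple tuple) {
 *         return new TripleWritable(tuple);
 *     }
 * }
 * }</pre>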
 *
 * @param <TValue>
 *            Value type
 * @param <T>
 *            Tuple type
 */
public abstract class AbstractWholeFileNodeTupleReader<TValue, T extends AbstractNodeTupleWritable<TValue>> extends RecordReader<LongWritable, T> {

    private static final Logger LOG = LoggerFactory.getLogger(AbstractWholeFileNodeTupleReader.class);

    private CompressionCodec compressionCodecs;
    private TrackedInputStream input;
    private LongWritable key;
    private long length;
    private T tuple;
    private TrackedPipedRDFStream<TValue> stream;
    private PipedRDFIterator<TValue> iter;
    private Thread parserThread;
    private boolean finished = false;
    private boolean ignoreBadTuples = true;
    private boolean parserFinished = false;
    private Throwable parserError = null;

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        LOG.debug("initialize({}, {})", genericSplit, context);

        // Assuming file split
        if (!(genericSplit instanceof FileSplit))
            throw new IOException("This record reader only supports FileSplit inputs");
        FileSplit split = (FileSplit) genericSplit;

        // Configuration
        Configuration config = context.getConfiguration();
        this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
        if (this.ignoreBadTuples)
            LOG.warn(
                    "Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour",
                    RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

        // Figure out what portion of the file to read
        if (split.getStart() > 0)
            throw new IOException("This record reader requires a file split which covers the entire file");
        final Path file = split.getPath();
        long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
        CompressionCodecFactory factory = new CompressionCodecFactory(config);
        this.compressionCodecs = factory.getCodec(file);
        LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
                split.getStart(), split.getLength(), totalLength));
        if (totalLength > split.getLength())
            throw new IOException("This record reader requires a file split which covers the entire file");

        // Open the file and prepare the input stream
        FileSystem fs = file.getFileSystem(config);
        FSDataInputStream fileIn = fs.open(file);
        this.length = split.getLength();
        if (this.compressionCodecs != null) {
            // Compressed input
            input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
        } else {
            // Uncompressed input
            input = new TrackedInputStream(fileIn);
        }

        // Set up background thread for parser
        iter = this.getPipedIterator();
        this.stream = this.getPipedStream(iter, this.input);
        ParserProfile profile = RdfIOUtils.createParserProfile(context, file);
        Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), profile);
        this.parserThread = new Thread(parserRunnable);
        this.parserThread.setDaemon(true);
        this.parserThread.start();
    }
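
    /*
     * A minimal driver-side sketch (illustrative, not part of this class):
     * the ignore bad tuples behaviour configured in initialize() above is
     * controlled through the standard Hadoop configuration mechanism, e.g.
     *
     *   Configuration config = new Configuration();
     *   config.setBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, false);
     *
     * With this setting jobs using this reader surface parser errors as
     * IOExceptions from nextKeyValue() instead of logging and swallowing them.
     */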

    /**
     * Gets the RDF iterator to use
     *
     * @return Iterator
     */
    protected abstract PipedRDFIterator<TValue> getPipedIterator();

    /**
     * Gets the RDF stream to parse to
     *
     * @param iterator
     *            Iterator
     * @param input
     *            Input stream being tracked
     * @return RDF stream
     */
    protected abstract TrackedPipedRDFStream<TValue> getPipedStream(PipedRDFIterator<TValue> iterator,
            TrackableInputStream input);

    /**
     * Gets the RDF language to use for parsing
     *
     * @return RDF language
     */
    protected abstract Lang getRdfLanguage();

    /**
     * Creates the runnable upon which the parsing will run
     *
     * @param reader
     *            Reader
     * @param input
     *            Input
     * @param stream
     *            Stream
     * @param lang
     *            Language to use for parsing
     * @param profile
     *            Parser profile
     * @return Parser runnable
     */
    private Runnable createRunnable(@SuppressWarnings("rawtypes") final AbstractWholeFileNodeTupleReader reader,
            final InputStream input, final PipedRDFStream<TValue> stream, final Lang lang, final ParserProfile profile) {
        return new Runnable() {
            @Override
            public void run() {
                try {
                    ReaderRIOT riotReader = RDFDataMgr.createReader(lang);
                    riotReader.setParserProfile(profile);
                    riotReader.read(input, null, lang.getContentType(), stream, null);
                    reader.setParserFinished(null);
                } catch (Throwable e) {
                    reader.setParserFinished(e);
                }
            }
        };
    }

    /**
     * Sets the parser thread finished state
     *
     * @param e
     *            Error (if any)
     */
    private void setParserFinished(Throwable e) {
        synchronized (this.parserThread) {
            this.parserError = e;
            this.parserFinished = true;
        }
    }

    /**
     * Waits for the parser thread to have reported as finished
     *
     * @throws InterruptedException
     *             Thrown if interrupted while waiting for the parser thread
     */
    private void waitForParserFinished() throws InterruptedException {
        do {
            synchronized (this.parserThread) {
                if (this.parserFinished)
                    return;
            }
            Thread.sleep(50);
        } while (true);
    }

    /**
     * Creates an instance of a writable tuple from the given tuple value
     *
     * @param tuple
     *            Tuple value
     * @return Writable tuple
     */
    protected abstract T createInstance(TValue tuple);

    @Override
    public boolean nextKeyValue() throws IOException {
        // Reuse key for efficiency
        if (key == null) {
            key = new LongWritable();
        }
        if (this.finished)
            return false;

        try {
            if (this.iter.hasNext()) {
                Long l = this.stream.getPosition();
                if (l != null) {
                    this.key.set(l);
                    // For compressed input the actual length from which we
                    // calculate progress is likely less than the actual
                    // uncompressed length so we may need to increment the
                    // length as we go along
                    // We always add 1 more than the current length because we
                    // don't want to report 100% progress until we really have
                    // finished
                    if (this.compressionCodecs != null && l > this.length)
                        this.length = l + 1;
                }
                this.tuple = this.createInstance(this.iter.next());
                return true;
            } else {
                // Need to ensure that the parser thread has finished in order
                // to determine whether we finished without error
                this.waitForParserFinished();
                if (this.parserError != null) {
                    LOG.error("Error parsing whole file, aborting further parsing", this.parserError);
                    if (!this.ignoreBadTuples)
                        throw new IOException("Error parsing whole file at position " + this.input.getBytesRead()
                                + ", aborting further parsing", this.parserError);
                }

                this.key = null;
                this.tuple = null;
                this.finished = true;
                // This is necessary so that when compressed input is used we
                // report 100% progress once we've reached the genuine end of
                // the stream
                if (this.compressionCodecs != null)
                    this.length--;
                return false;
            }
        } catch (Throwable e) {
            // Failed to read the tuple on this line
            LOG.error("Error parsing whole file, aborting further parsing", e);
            if (!this.ignoreBadTuples) {
                this.iter.close();
                throw new IOException("Error parsing whole file at position " + this.input.getBytesRead()
                        + ", aborting further parsing", e);
            }
            this.key = null;
            this.tuple = null;
            this.finished = true;
            return false;
        }
    }

    @Override
    public LongWritable getCurrentKey() {
        return this.key;
    }

    @Override
    public T getCurrentValue() {
        return this.tuple;
    }
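
    /*
     * Worked example for the progress calculation below (illustrative numbers
     * only): with a last recorded position of 500 bytes in a split of length
     * 2000 bytes, getProgress() reports 500 / 2000 = 0.25. For compressed
     * input nextKeyValue() grows this.length to stay just ahead of the
     * observed position, so the ratio stays below 1.0 until the stream is
     * genuinely exhausted.
     */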
    @Override
    public float getProgress() {
        float progress = 0.0f;
        if (this.key == null) {
            // We've either not started or we've finished
            progress = (this.finished ? 1.0f : 0.0f);
        } else if (this.key.get() == Long.MIN_VALUE) {
            // We don't have a position so we're either in progress or finished
            progress = (this.finished ? 1.0f : 0.5f);
        } else {
            // We're some way through the file
            progress = this.key.get() / (float) this.length;
        }
        LOG.debug("getProgress() --> {}", progress);
        return progress;
    }

    @Override
    public void close() throws IOException {
        this.iter.close();
        this.input.close();
        this.finished = true;
    }
}
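
/*
 * Usage sketch (hypothetical, not part of this module): a concrete subclass of
 * this reader is typically returned from a matching input format which also
 * disables splitting, since this reader rejects splits that do not cover the
 * whole file, e.g.
 *
 *   public class ExampleWholeFileInputFormat extends FileInputFormat<LongWritable, TripleWritable> {
 *       @Override
 *       protected boolean isSplitable(JobContext context, Path file) {
 *           // Whole file formats must never be split
 *           return false;
 *       }
 *
 *       @Override
 *       public RecordReader<LongWritable, TripleWritable> createRecordReader(InputSplit split,
 *               TaskAttemptContext context) {
 *           return new TurtleTripleReader();
 *       }
 *   }
 */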