/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.icij.extract.parser; import java.io.*; import java.util.concurrent.Executor; import java.util.function.Function; import org.apache.tika.parser.Parser; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; /** * Reader for the content from a given binary stream. This class uses a background parsing task with a {@link Parser} * to parse the content from a given input stream. A {@link ContentHandler} class and a pipe is used to convert the * push-based SAX event stream to the pull-based character stream defined by the {@link Reader} interface. * * Based on an implementation from the Tika source. This version adds functionality for markup output. * * @since 1.0.0-beta */ public class ParsingReader extends Reader { /** * Executor for background parsing tasks. */ private final Executor executor = new ParsingExecutor(); /** * Parser instance used for parsing the given binary stream. */ protected final Parser parser; /** * Buffered read end of the pipe. */ protected final Reader reader; /** * Write end of the pipe. */ private final Writer writer; /** * The binary stream being parsed. */ protected final InputStream input; /** * Metadata associated with the document being parsed. */ protected final Metadata metadata; /** * The parse context. */ protected final ParseContext context; /** * Receives SAX events. */ protected final ContentHandler handler; /** * An exception (if any) thrown by the parsing thread. */ private transient Throwable throwable; /** * Utility method that returns a {@link Metadata} instance for a document with the given name. * * @param name resource name (or <code>null</code>) * @return metadata instance */ private static Metadata getMetadata(final String name) { final Metadata metadata = new Metadata(); if (name != null && name.length() > 0) { metadata.set(Metadata.RESOURCE_NAME_KEY, name); } return metadata; } /** * Creates a reader for the content of the given binary stream. * * @param input binary stream * @throws IOException if the document can not be parsed */ public ParsingReader(final InputStream input) throws IOException { this(new AutoDetectParser(), input, getMetadata(null), new ParseContext()); context.set(Parser.class, parser); } /** * Creates a reader for the content of the given binary stream with the given name. * * @param input binary stream * @param name document name * @throws IOException if the document can not be parsed */ public ParsingReader(final InputStream input, final String name) throws IOException { this(new AutoDetectParser(), input, getMetadata(name), new ParseContext()); context.set(Parser.class, parser); } public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext context) throws IOException { this(parser, input, metadata, context, BodyContentHandler::new); } /** * Creates a reader for the content of the given binary stream * with the given document metadata. The given parser is used for the * parsing task that is run with the given executor. * * The created reader will be responsible for closing the given stream. * The stream and any associated resources will be closed at or before * the time when the {@link #close()} method is called on this reader. * * @param parser parser instance * @param input binary stream * @param metadata document metadata * @param context parsing context * @throws IOException if the document can not be parsed */ public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext context, final Function<Writer, ContentHandler> handler) throws IOException { final PipedReader pipedReader = new PipedReader(); this.parser = parser; reader = new BufferedReader(pipedReader); try { writer = new PipedWriter(pipedReader); } catch (IOException e) { throw new IllegalStateException(e); // Should never happen. } this.input = input; this.metadata = metadata; this.context = context; // Generate the handler. this.handler = handler.apply(writer); parse(); // TIKA-203: Buffer first character to force metadata extraction. reader.mark(1); //noinspection ResultOfMethodCallIgnored reader.read(); reader.reset(); } /** * Reads parsed text from the pipe connected to the parsing thread. * Fails if the parsing thread has thrown an exception. * * @param buffer character buffer * @param off start offset within the buffer * @param len maximum number of characters to read * @throws IOException if the parsing thread has failed or * if for some reason the pipe does not work properly */ @Override public int read(final char[] buffer, final int off, final int len) throws IOException { if (throwable instanceof IOException) { throw (IOException) throwable; } else if (throwable != null) { throw new IOException("", throwable); } return reader.read(buffer, off, len); } /** * Closes the read end of the pipe. If the parsing thread is still * running, next write to the pipe will fail and cause the thread * to stop. Thus there is no need to explicitly terminate the thread. * * @throws IOException if the pipe cannot be closed */ @Override public void close() throws IOException { reader.close(); } /** * Parses the given binary stream and writes the text content to the write end of the pipe. * * Potential exceptions (including the one caused if the read end is closed unexpectedly) are stored before the * input stream is closed and processing is stopped. */ public void parse() { executor.execute(()-> { try { parser.parse(input, handler, metadata, context); } catch (Throwable t) { throwable = t; } try { input.close(); } catch (Throwable t) { if (throwable == null) { throwable = t; } } try { writer.close(); } catch (Throwable t) { if (throwable == null) { throwable = t; } } }); } /** * The executor for background parsing tasks. */ private class ParsingExecutor implements Executor { /** * Executes the given task in a daemon thread. * * @param task background parsing task */ @Override public void execute(final Runnable task) { String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (name != null) { name = "ICIJ Extract: " + name; } else { name = "ICIJ Extract"; } final Thread thread = new Thread(task, name); thread.setDaemon(true); thread.start(); } } }