// LazyTextExtractorField.java example
//
// Explorer
// tika-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.example;

import java.io.InputStream;
import java.io.Reader;
import java.util.concurrent.Executor;

import org.apache.jackrabbit.core.query.lucene.FieldNames;
import org.apache.jackrabbit.core.value.InternalValue;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * <code>LazyTextExtractorField</code> implements a Lucene field whose String
 * value is produced lazily by a background text extraction task. The task
 * parses a binary {@link InternalValue} with a Tika {@link Parser} and
 * collects the character content, up to a maximum field length.
 * <p>
 * {@link #stringValue()} blocks until the extraction has finished;
 * {@link #isExtractorFinished()} can be used to poll without blocking.
 *
 * @see #isExtractorFinished()
 */
@SuppressWarnings("serial")
public class LazyTextExtractorField extends AbstractField {
    /**
     * The logger instance for this class.
     */
    private static final Logger LOG = LoggerFactory.getLogger(LazyTextExtractorField.class);

    /**
     * The exception used to forcibly terminate the extraction process when the
     * maximum field length is reached.
     * <p>
     * This shared instance must not be logged, since its stack trace is
     * meaningless (it is captured once, at class initialization).
     */
    private static final SAXException STOP = new SAXException("max field length reached");

    /**
     * The extracted text content of the given binary value. Set to non-null
     * when the text extraction task finishes (successfully or not).
     */
    private volatile String extract = null;

    /**
     * Creates a new <code>LazyTextExtractorField</code> for the full text
     * field and immediately submits the extraction task to the given executor.
     *
     * @param parser         the parser used to extract text from the value
     * @param value          the binary value to extract text from; it is
     *                       discarded once the extraction task has run
     * @param metadata       the metadata passed to the parser
     * @param executor       the executor that runs the extraction task
     * @param highlighting   set to <code>true</code> to enable result
     *                       highlighting support (the field is then stored
     *                       and indexed with term vector offsets)
     * @param maxFieldLength the maximum number of characters to extract
     */
    public LazyTextExtractorField(Parser parser, InternalValue value,
                                  Metadata metadata, Executor executor, boolean highlighting,
                                  int maxFieldLength) {
        super(FieldNames.FULLTEXT, highlighting ? Store.YES : Store.NO,
                Field.Index.ANALYZED, highlighting ? TermVector.WITH_OFFSETS
                        : TermVector.NO);
        executor.execute(new ParsingTask(parser, value, metadata,
                maxFieldLength));
    }

    /**
     * Returns the extracted text. This method blocks until the text extraction
     * task has been completed.
     * <p>
     * If the waiting thread is interrupted, the interrupt status is restored
     * and an empty string is returned.
     *
     * @return the string value of this field
     */
    public synchronized String stringValue() {
        try {
            while (!isExtractorFinished()) {
                wait();
            }
            return extract;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up the stack can
            // still observe that this thread was asked to stop.
            Thread.currentThread().interrupt();
            LOG.error("Text extraction thread was interrupted", e);
            return "";
        }
    }

    /**
     * @return always <code>null</code>
     */
    public Reader readerValue() {
        return null;
    }

    /**
     * @return always <code>null</code>
     */
    public byte[] binaryValue() {
        return null;
    }

    /**
     * @return always <code>null</code>
     */
    public TokenStream tokenStreamValue() {
        return null;
    }

    /**
     * Checks whether the text extraction task has finished.
     *
     * @return <code>true</code> if the extracted text is available
     */
    public boolean isExtractorFinished() {
        return extract != null;
    }

    /**
     * Publishes the result of the extraction task and wakes up every thread
     * blocked in {@link #stringValue()}.
     *
     * @param value the extracted text (or an error marker); must not be null
     */
    private synchronized void setExtractedText(String value) {
        extract = value;
        // notifyAll rather than notify: several threads may be waiting for
        // the same field value and all of them must be released.
        notifyAll();
    }

    /**
     * Releases all resources associated with this field.
     */
    public void dispose() {
        // TODO: Cause the ContentHandler below to throw an exception
    }

    /**
     * The background task for extracting text from a binary value. The task
     * is its own SAX content handler: character events are appended to an
     * internal buffer until the maximum field length is reached.
     */
    private class ParsingTask extends DefaultHandler implements Runnable {
        private final Parser parser;

        private final InternalValue value;

        private final Metadata metadata;

        private final int maxFieldLength;

        /** Collects the extracted characters. */
        private final StringBuilder builder = new StringBuilder();

        private final ParseContext context = new ParseContext();

        public ParsingTask(Parser parser, InternalValue value,
                           Metadata metadata, int maxFieldLength) {
            this.parser = parser;
            this.value = value;
            this.metadata = metadata;
            this.maxFieldLength = maxFieldLength;
        }

        /**
         * Parses the value and publishes the collected text. On any failure
         * other than reaching the maximum field length, the marker string
         * <code>"TextExtractionError"</code> is published instead.
         */
        public void run() {
            try {
                try (InputStream stream = value.getStream()) {
                    // Parse with this task as the content handler so that the
                    // characters() override below actually receives the text.
                    parser.parse(stream, this, metadata, context);
                }
            } catch (LinkageError ignored) {
                // Deliberately ignored: whatever was collected so far (possibly
                // nothing) is published below.
                // NOTE(review): presumably caused by missing optional parser
                // libraries -- confirm.
            } catch (Throwable t) {
                if (t != STOP) {
                    LOG.debug("Failed to extract text.", t);
                    setExtractedText("TextExtractionError");
                    return;
                }
                // STOP only signals that maxFieldLength was reached; the
                // truncated text is still a valid result.
            } finally {
                value.discard();
            }
            setExtractedText(builder.toString());
        }

        /**
         * Appends character content to the buffer, truncating at the maximum
         * field length.
         *
         * @throws SAXException the shared {@link #STOP} instance once the
         *                      buffer has reached the maximum field length
         */
        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            int remaining = maxFieldLength - builder.length();
            // Guard against a non-positive limit: a negative length would make
            // StringBuilder.append throw instead of stopping cleanly.
            if (remaining > 0) {
                builder.append(ch, start, Math.min(length, remaining));
            }
            if (builder.length() >= maxFieldLength) {
                throw STOP;
            }
        }

        /**
         * Treats ignorable whitespace like regular character content.
         */
        @Override
        public void ignorableWhitespace(char[] ch, int start, int length)
                throws SAXException {
            characters(ch, start, length);
        }
    }
}