/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.core.query.lucene;
import java.io.InputStream;
import java.io.Reader;
import java.util.concurrent.Executor;
import org.apache.jackrabbit.core.LowPriorityTask;
import org.apache.jackrabbit.core.value.InternalValue;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * <code>LazyTextExtractorField</code> implements a Lucene field with a String
 * value that is lazily initialized by a background text extraction task. The
 * task parses the stream of a binary {@link InternalValue} with a Tika
 * {@link Parser} on the {@link Executor} given to the constructor. In
 * addition this class provides a method to find out whether the extraction
 * process is already finished.
 * <p>
 * {@link #stringValue()} blocks until the extraction task has published its
 * result; all other value accessors always return <code>null</code>.
 *
 * @see #isExtractorFinished()
 */
@SuppressWarnings("serial")
public class LazyTextExtractorField extends AbstractField {

    /**
     * The logger instance for this class.
     */
    private static final Logger log =
        LoggerFactory.getLogger(LazyTextExtractorField.class);

    /**
     * The extracted text content of the given binary value.
     * Set to non-null when the text extraction task finishes.
     */
    private volatile String extract = null;

    /**
     * Creates a new <code>LazyTextExtractorField</code> and immediately
     * submits the text extraction task to the given executor.
     *
     * @param parser
     *            the parser used to extract text from the binary value
     * @param value
     *            the binary value whose stream is parsed; it is discarded
     *            by the extraction task once parsing has ended
     * @param metadata
     *            the metadata handed to the parser along with the stream
     * @param executor
     *            the executor that runs the extraction task
     * @param highlighting
     *            set to <code>true</code> to enable result highlighting
     *            support (the field is then stored and gets term vectors
     *            with offsets)
     * @param maxFieldLength
     *            the limit passed to the {@link WriteOutContentHandler}
     *            that collects the extracted text
     * @param withNorms
     *            <code>true</code> to index the field with norms,
     *            <code>false</code> to index it without norms
     */
    public LazyTextExtractorField(
            Parser parser, InternalValue value, Metadata metadata,
            Executor executor, boolean highlighting, int maxFieldLength,
            boolean withNorms) {
        super(FieldNames.FULLTEXT,
                highlighting ? Store.YES : Store.NO,
                withNorms ? Field.Index.ANALYZED : Field.Index.ANALYZED_NO_NORMS,
                highlighting ? TermVector.WITH_OFFSETS : TermVector.NO);
        // Kept as the last statement so that the task only sees a fully
        // constructed field when it reports its result back.
        executor.execute(new ParsingTask(parser, value, metadata,
                maxFieldLength) {
            @Override
            public void setExtractedText(String value) {
                LazyTextExtractorField.this.setExtractedText(value);
            }
        });
    }

    /**
     * Returns the extracted text. This method blocks until the text
     * extraction task has been completed.
     * <p>
     * If the calling thread is interrupted while waiting, an empty string
     * is returned and the interrupt status of the thread is restored so
     * that the caller can still detect the interruption.
     *
     * @return the string value of this field, or an empty string if the
     *         wait was interrupted
     */
    public synchronized String stringValue() {
        try {
            // Loop to guard against spurious wake-ups.
            while (!isExtractorFinished()) {
                wait();
            }
            return extract;
        } catch (InterruptedException e) {
            log.error("Text extraction thread was interrupted", e);
            // Do not swallow the interruption: catching the exception
            // cleared the flag, so set it again for code further up.
            Thread.currentThread().interrupt();
            return "";
        }
    }

    /**
     * @return always <code>null</code>
     */
    public Reader readerValue() {
        return null;
    }

    /**
     * @return always <code>null</code>
     */
    public byte[] binaryValue() {
        return null;
    }

    /**
     * @return always <code>null</code>
     */
    public TokenStream tokenStreamValue() {
        return null;
    }

    /**
     * Checks whether the text extraction task has finished.
     *
     * @return <code>true</code> if the extracted text is available
     */
    public boolean isExtractorFinished() {
        return extract != null;
    }

    /**
     * Publishes the result of the extraction task and wakes up every thread
     * that is blocked in {@link #stringValue()}.
     *
     * @param value the extracted text; <code>null</code> is treated as an
     *              empty string, because a <code>null</code> extract means
     *              "not finished yet" and would leave waiters blocked
     */
    private synchronized void setExtractedText(String value) {
        extract = (value != null) ? value : "";
        // More than one thread may be waiting for the text, so wake all
        // of them rather than an arbitrary single one.
        notifyAll();
    }

    /**
     * Releases all resources associated with this field.
     */
    public void dispose() {
        // TODO: Cause the ContentHandler below to throw an exception
    }

    /**
     * The background task for extracting text from a binary value. The task
     * is itself the content handler that is passed to the parser; the
     * characters it receives are forwarded to a
     * {@link WriteOutContentHandler} created with the configured limit.
     */
    abstract static class ParsingTask extends BodyContentHandler implements LowPriorityTask {

        private final Parser parser;

        private final InternalValue value;

        private final Metadata metadata;

        private final WriteOutContentHandler writeOutContentHandler;

        /**
         * Creates a task that parses the given value.
         *
         * @param parser         the parser to use
         * @param value          the binary value to parse
         * @param metadata       the metadata handed to the parser
         * @param maxFieldLength the limit passed to the
         *                       {@link WriteOutContentHandler}
         */
        public ParsingTask(Parser parser, InternalValue value,
                Metadata metadata, int maxFieldLength) {
            this(new WriteOutContentHandler(maxFieldLength), parser, value,
                    metadata);
        }

        /**
         * Internal constructor that keeps a reference to the handler that
         * was passed to the superclass, so that the collected text and the
         * write limit status can be queried after parsing.
         */
        private ParsingTask(WriteOutContentHandler writeOutContentHandler,
                Parser parser, InternalValue value, Metadata metadata) {
            super(writeOutContentHandler);
            this.writeOutContentHandler = writeOutContentHandler;
            this.parser = parser;
            this.value = value;
            this.metadata = metadata;
        }

        /**
         * Parses the binary value and reports the outcome through
         * {@link #setExtractedText(String)}. The reported text is either the
         * content collected by the handler or the marker string
         * <code>TextExtractionError</code> when parsing failed for a reason
         * other than reaching the write limit. The value is discarded in
         * all cases, and a result is always reported so that threads
         * waiting for the text are never left blocked.
         */
        public void run() {
            // Stays null unless parsing failed; null means "report whatever
            // text the handler has collected".
            String failure = null;
            try {
                InputStream stream = value.getStream();
                try {
                    parser.parse(stream, this, metadata, new ParseContext());
                } finally {
                    stream.close();
                }
            } catch (LinkageError e) {
                // Capture and ignore errors caused by extraction libraries
                // not being present. This is equivalent to disabling
                // selected media types in configuration, so we can simply
                // ignore these errors.
            } catch (Throwable t) {
                // Capture and report any other full text extraction problems.
                // The special STOP exception is used for normal termination.
                if (!writeOutContentHandler.isWriteLimitReached(t)) {
                    log.debug("Failed to extract text from a binary property."
                            + " This is a fairly common case, and nothing to"
                            + " worry about. The stack trace is included to"
                            + " help improve the text extraction feature.", t);
                    failure = "TextExtractionError";
                }
            } finally {
                try {
                    value.discard();
                } finally {
                    // Publish the result even if discarding the value
                    // failed; otherwise waiters would block forever.
                    setExtractedText(failure != null
                            ? failure : writeOutContentHandler.toString());
                }
            }
        }

        /**
         * Receives the outcome of the extraction once the task has ended.
         *
         * @param value the extracted text, or the error marker string
         */
        protected abstract void setExtractedText(String value);
    }
}