/* * (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Contributors: * Florent Guillaume * Stephane Lacoin */ package org.nuxeo.ecm.core.storage; import java.io.IOException; import java.util.LinkedList; import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.nuxeo.ecm.core.api.Blob; import org.nuxeo.ecm.core.api.DocumentLocation; import org.nuxeo.ecm.core.api.DocumentModel; import org.nuxeo.ecm.core.api.IdRef; import org.nuxeo.ecm.core.api.blobholder.BlobHolder; import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder; import org.nuxeo.ecm.core.api.impl.DocumentLocationImpl; import org.nuxeo.ecm.core.api.impl.blob.StringBlob; import org.nuxeo.ecm.core.convert.api.ConversionException; import org.nuxeo.ecm.core.convert.api.ConversionService; import org.nuxeo.ecm.core.storage.FulltextUpdaterWork.IndexAndText; import org.nuxeo.ecm.core.utils.BlobsExtractor; import org.nuxeo.ecm.core.work.AbstractWork; import org.nuxeo.ecm.core.work.api.Work; import org.nuxeo.ecm.core.work.api.WorkManager; import org.nuxeo.runtime.api.Framework; /** * Work task that does fulltext extraction from the blobs of the given document. * <p> * The extracted fulltext is then passed to the single-threaded {@link FulltextUpdaterWork}. * <p> * This base abstract class must be subclassed in order to implement the proper * {@link #initFulltextConfigurationAndParser} depending on the storage. * * @since 5.7 */ public abstract class FulltextExtractorWork extends AbstractWork { private static final long serialVersionUID = 1L; private static final Log log = LogFactory.getLog(FulltextExtractorWork.class); protected static final String ANY2TEXT = "any2text"; protected static final String CATEGORY = "fulltextExtractor"; protected static final String TITLE = "fulltextExtractor"; protected final boolean excludeProxies; protected transient FulltextConfiguration fulltextConfiguration; protected transient FulltextParser fulltextParser; public FulltextExtractorWork(String repositoryName, String docId, String id, boolean excludeProxies) { super(id); setDocument(repositoryName, docId); this.excludeProxies = excludeProxies; } @Override public String getCategory() { return CATEGORY; } @Override public String getTitle() { return TITLE; } @Override public int getRetryCount() { // even read-only threads may encounter concurrent update exceptions // when trying to read a previously deleted complex property // due to read committed semantics, cf NXP-17384 return 1; } @Override public void work() { openSystemSession(); // if the runtime has shutdown (normally because tests are finished) // this can happen, see NXP-4009 if (session.getPrincipal() == null) { return; } initFulltextConfigurationAndParser(); setStatus("Extracting"); setProgress(Progress.PROGRESS_0_PC); extractBinaryText(); setProgress(Progress.PROGRESS_100_PC); setStatus("Done"); } /** * Initializes the fulltext configuration and parser. * * @since 5.9.5 */ public abstract void initFulltextConfigurationAndParser(); protected void extractBinaryText() { IdRef docRef = new IdRef(docId); if (!session.exists(docRef)) { // doc is gone return; } DocumentModel doc = session.getDocument(docRef); if (excludeProxies && doc.isProxy()) { // VCS proxies don't have any fulltext attached, it's // the target document that carries it return; } if (!fulltextConfiguration.isFulltextIndexable(doc.getType())) { // excluded by config return; } // Iterate on each index to set the binaryText column BlobsExtractor extractor = new BlobsExtractor(); DocumentLocation docLocation = new DocumentLocationImpl(doc); List<IndexAndText> indexesAndText = new LinkedList<IndexAndText>(); for (String indexName : fulltextConfiguration.indexNames) { if (!fulltextConfiguration.indexesAllBinary.contains(indexName) && fulltextConfiguration.propPathsByIndexBinary.get(indexName) == null) { // nothing to do: index not configured for blob continue; } extractor.setExtractorProperties(fulltextConfiguration.propPathsByIndexBinary.get(indexName), fulltextConfiguration.propPathsExcludedByIndexBinary.get(indexName), fulltextConfiguration.indexesAllBinary.contains(indexName)); List<Blob> blobs = extractor.getBlobs(doc); StringBlob stringBlob = blobsToStringBlob(blobs, docId); String text = fulltextParser.parse(stringBlob.getString(), null, stringBlob.getMimeType(), docLocation); int fullTextFieldSizeLimit = fulltextConfiguration.fulltextFieldSizeLimit; if (fullTextFieldSizeLimit != 0 && text.length() > fullTextFieldSizeLimit) { if (log.isDebugEnabled()) { log.debug(String.format( "Fulltext extract of length: %s for indexName: %s of document: %s truncated to length: %s", text.length(), indexName, docId, fullTextFieldSizeLimit)); } text = text.substring(0, fullTextFieldSizeLimit); } indexesAndText.add(new IndexAndText(indexName, text)); } if (!indexesAndText.isEmpty()) { Work work = new FulltextUpdaterWork(repositoryName, docId, false, true, indexesAndText); if (!fulltextConfiguration.fulltextSearchDisabled) { WorkManager workManager = Framework.getLocalService(WorkManager.class); workManager.schedule(work, true); } else { ((FulltextUpdaterWork)work).updateWithSession(session); } } } @Override public void cleanUp(boolean ok, Exception e) { super.cleanUp(ok, e); fulltextConfiguration = null; fulltextParser = null; } protected StringBlob blobsToStringBlob(List<Blob> blobs, String docId) { String mimeType = null; List<String> strings = new LinkedList<String>(); for (Blob blob : blobs) { try { SimpleBlobHolder bh = new SimpleBlobHolder(blob); BlobHolder result = convert(bh); if (result == null) { continue; } blob = result.getBlob(); if (blob == null) { continue; } if (StringUtils.isEmpty(mimeType) && StringUtils.isNotEmpty(blob.getMimeType())) { mimeType = blob.getMimeType(); } String string = new String(blob.getByteArray(), "UTF-8"); // strip '\0 chars from text if (string.indexOf('\0') >= 0) { string = string.replace("\0", " "); } strings.add(string); } catch (ConversionException | IOException e) { String msg = "Could not extract fulltext of file '" + blob.getFilename() + "' for document: " + docId + ": " + e; log.warn(msg); log.debug(msg, e); continue; } } return new StringBlob(StringUtils.join(strings, " "), mimeType); } protected BlobHolder convert(BlobHolder blobHolder) throws ConversionException { ConversionService conversionService = Framework.getLocalService(ConversionService.class); if (conversionService == null) { log.debug("No ConversionService available"); return null; } return conversionService.convert(ANY2TEXT, blobHolder, null); } }