/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.xwiki.search.solr.internal.metadata;

import java.util.List;
import java.util.Locale;
import java.util.Map;

import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;

import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.xwiki.component.annotation.Component;
import org.xwiki.model.EntityType;
import org.xwiki.model.reference.DocumentReference;
import org.xwiki.model.reference.EntityReference;
import org.xwiki.model.reference.EntityReferenceSerializer;
import org.xwiki.rendering.renderer.BlockRenderer;
import org.xwiki.rendering.renderer.printer.DefaultWikiPrinter;
import org.xwiki.rendering.renderer.printer.WikiPrinter;
import org.xwiki.rendering.syntax.Syntax;
import org.xwiki.search.solr.internal.api.FieldUtils;
import org.xwiki.search.solr.internal.api.SolrFieldNameEncoder;

import com.xpn.xwiki.XWikiContext;
import com.xpn.xwiki.XWikiException;
import com.xpn.xwiki.doc.XWikiAttachment;
import com.xpn.xwiki.doc.XWikiDocument;
import com.xpn.xwiki.objects.BaseObject;
import com.xpn.xwiki.objects.BaseProperty;

/**
 * Extract the metadata to be indexed from document.
 *
 * @version $Id: a6cf8e4deab3b10e030a10964e70eb2ccffe24f6 $
 * @since 4.3M2
 */
@Component
@Named("document")
@Singleton
public class DocumentSolrMetadataExtractor extends AbstractSolrMetadataExtractor
{
    /**
     * BlockRenderer component used to render the wiki content before indexing.
     */
    @Inject
    @Named("plain/1.0")
    private BlockRenderer renderer;

    /**
     * Used to serialize the author and creator references stored in the index.
     */
    @Inject
    private EntityReferenceSerializer<String> entityReferenceSerializer;

    /**
     * Used to serialize entity reference to be used in dynamic field names.
     */
    @Inject
    @Named("solr")
    private EntityReferenceSerializer<String> fieldNameSerializer;

    /**
     * Used to encode dynamic field names that may contain special characters.
     */
    @Inject
    private SolrFieldNameEncoder fieldNameEncoder;

    /**
     * Fills the given Solr document with the fields of the (translated) document targeted by the passed reference:
     * full name, title, raw and rendered content, version, comment, locale(s), authors, dates, hidden flag and the
     * extra data (objects and attachments).
     *
     * @param solrDocument the Solr document to fill
     * @param entityReference the reference of the document to index
     * @return {@code false} if no translated document could be obtained for the reference (nothing is indexed in that
     *         case), {@code true} once all the fields have been set
     * @throws Exception propagated from the document access and rendering calls made while extracting the fields
     */
    @Override
    public boolean setFieldsInternal(LengthSolrInputDocument solrDocument, EntityReference entityReference)
        throws Exception
    {
        DocumentReference documentReference = new DocumentReference(entityReference);

        XWikiContext xcontext = this.xcontextProvider.get();

        XWikiDocument translatedDocument = getTranslatedDocument(documentReference);
        if (translatedDocument == null) {
            return false;
        }

        Locale locale = getLocale(documentReference);

        solrDocument.setField(FieldUtils.FULLNAME, localSerializer.serialize(documentReference));

        // Rendered title.
        String plainTitle = translatedDocument.getRenderedTitle(Syntax.PLAIN_1_0, xcontext);
        solrDocument.setField(FieldUtils.getFieldName(FieldUtils.TITLE, locale), plainTitle);

        // Raw Content
        solrDocument.setField(FieldUtils.getFieldName(FieldUtils.DOCUMENT_RAW_CONTENT, locale),
            translatedDocument.getContent());

        // Rendered content
        WikiPrinter plainContentPrinter = new DefaultWikiPrinter();
        this.renderer.render(translatedDocument.getXDOM(), plainContentPrinter);
        solrDocument.setField(FieldUtils.getFieldName(FieldUtils.DOCUMENT_RENDERED_CONTENT, locale),
            plainContentPrinter.toString());

        solrDocument.setField(FieldUtils.VERSION, translatedDocument.getVersion());
        solrDocument.setField(FieldUtils.COMMENT, translatedDocument.getComment());

        solrDocument.setField(FieldUtils.DOCUMENT_LOCALE, translatedDocument.getLocale().toString());

        // Add locale inheritance
        addLocales(translatedDocument, translatedDocument.getLocale(), solrDocument);

        // Get both serialized user reference string and pretty user name
        setAuthors(solrDocument, translatedDocument, entityReference);

        // Document dates.
        solrDocument.setField(FieldUtils.CREATIONDATE, translatedDocument.getCreationDate());
        solrDocument.setField(FieldUtils.DATE, translatedDocument.getContentUpdateDate());

        // Document translations have their own hidden fields
        solrDocument.setField(FieldUtils.HIDDEN, translatedDocument.isHidden());

        // Add any extra fields (about objects, etc.) that can improve the findability of the document.
        setExtras(documentReference, solrDocument, locale);

        return true;
    }

    /**
     * Sets the author and creator fields, each both as a serialized reference and as a plain display name.
     *
     * @param solrDocument the Solr document
     * @param translatedDocument the XWiki document
     * @param entityReference the document reference (currently not read by this method)
     */
    private void setAuthors(SolrInputDocument solrDocument, XWikiDocument translatedDocument,
        EntityReference entityReference)
    {
        XWikiContext xcontext = this.xcontextProvider.get();

        String authorString = entityReferenceSerializer.serialize(translatedDocument.getAuthorReference());
        solrDocument.setField(FieldUtils.AUTHOR, authorString);
        String authorDisplayString =
            xcontext.getWiki().getPlainUserName(translatedDocument.getAuthorReference(), xcontext);
        solrDocument.setField(FieldUtils.AUTHOR_DISPLAY, authorDisplayString);

        String creatorString = entityReferenceSerializer.serialize(translatedDocument.getCreatorReference());
        solrDocument.setField(FieldUtils.CREATOR, creatorString);
        String creatorDisplayString =
            xcontext.getWiki().getPlainUserName(translatedDocument.getCreatorReference(), xcontext);
        solrDocument.setField(FieldUtils.CREATOR_DISPLAY, creatorDisplayString);
    }

    /**
     * @param documentReference the document's reference.
     * @param solrDocument the Solr document where to add the data.
     * @param locale the locale of which to index the extra data.
     * @throws XWikiException if problems occur.
     */
    protected void setExtras(DocumentReference documentReference, SolrInputDocument solrDocument, Locale locale)
        throws XWikiException
    {
        // We need to support the following types of queries:
        // * search for documents matching specific values in multiple XObject properties
        // * search for documents matching specific values in attachment meta data
        // In order to avoid using joins we have to index the XObjects and the attachments both separately and on the
        // document rows in the Solr index. This means we'll have duplicated information but we believe the increase in
        // the index size pays off if you take into account the simplified query syntax and the search speed.

        // Use the original document to get the objects and the attachments because the translated document is just a
        // lightweight document containing just the translated content and title.
        XWikiDocument originalDocument = getDocument(documentReference);

        // NOTE: To be able to still find translated documents, we need to redundantly index the same objects (including
        // comments) and attachments for each translation. If we don't do this then only the original document will be
        // found. That's why we pass the locale of the translated document to the following method calls.
        setObjects(solrDocument, locale, originalDocument);
        setAttachments(solrDocument, locale, originalDocument);
    }

    /**
     * Indexes the content of every object of the document and adds, once per class having at least one non-null
     * object, the serialized class reference to the {@link FieldUtils#CLASS} field.
     *
     * @param solrDocument the Solr document where to add the objects.
     * @param locale the locale for which to index the objects.
     * @param originalDocument the original document where the objects come from.
     */
    protected void setObjects(SolrInputDocument solrDocument, Locale locale, XWikiDocument originalDocument)
    {
        for (Map.Entry<DocumentReference, List<BaseObject>> objects : originalDocument.getXObjects().entrySet()) {
            boolean hasObjectsOfThisType = false;
            for (BaseObject object : objects.getValue()) {
                // Yes, the old core can return null objects. Skip them here instead of forwarding a null object to
                // the content extraction.
                if (object == null) {
                    continue;
                }

                hasObjectsOfThisType = true;
                setObjectContent(solrDocument, object, locale);
            }

            if (hasObjectsOfThisType) {
                solrDocument.addField(FieldUtils.CLASS, localSerializer.serialize(objects.getKey()));
            }
        }
    }

    /**
     * Indexes a property value in three ways: in a multi-valued field dedicated to the property, in a single-valued
     * sort field (skipped for long text/string values) and in a field collecting all the values of the objects of
     * the same class. It then delegates to the parent implementation.
     *
     * @param solrDocument the Solr document where to add the value
     * @param property the object property the value comes from
     * @param typedValue the value to index together with its type
     * @param locale the locale for which the value is indexed
     */
    @Override
    protected void setPropertyValue(SolrInputDocument solrDocument, BaseProperty<EntityReference> property,
        TypedValue typedValue, Locale locale)
    {
        Object value = typedValue.getValue();
        String type = typedValue.getType();

        // Compare the type by value: identity comparison on strings only works when the very same constant instance
        // is passed around.
        boolean isText = TypedValue.TEXT.equals(type);
        boolean isString = TypedValue.STRING.equals(type);

        // We need to be able to query an object property alone.
        EntityReference classReference = property.getObject().getRelativeXClassReference();
        EntityReference propertyReference =
            new EntityReference(property.getName(), EntityType.CLASS_PROPERTY, classReference);
        String serializedPropertyReference = fieldNameEncoder.encode(fieldNameSerializer.serialize(propertyReference));
        String prefix = "property." + serializedPropertyReference;
        // Note that we're using "addField" because we want to collect all the property values, even from multiple
        // objects of the same type.
        solrDocument.addField(FieldUtils.getFieldName(prefix, type, locale), value);

        // We need to be able to sort by a property value and for this we need a dedicated (single valued) field because
        // the field we just added is multiValued and multiValued fields are not sortable.
        // We don't need to sort on properties that hold large localized texts or large strings (e.g. TextArea).
        if ((!isText && !isString) || String.valueOf(value).length() <= SHORT_TEXT_LIMIT) {
            // Short localized texts are indexed as strings because a sort field is either non-tokenized (i.e. has no
            // Analyzer) or uses an Analyzer that only produces a single Term (i.e. uses the KeywordTokenizer).
            String sortType = "sort" + StringUtils.capitalize(isText ? TypedValue.STRING : type);
            // We're using "setField" because the sort fields must be single valued. The consequence is that for
            // properties with multiple values the last value we set will be used for sorting (e.g. if a document has
            // two objects of the same type then the value from the second object will be used for sorting).
            solrDocument.setField(FieldUtils.getFieldName(prefix, sortType, locale), value);
        }

        // We need to be able to query all properties of a specific type of object at once.
        String serializedClassReference = fieldNameEncoder.encode(fieldNameSerializer.serialize(classReference));
        String objectOfTypeFieldName = "object." + serializedClassReference;
        // The current method can be called multiple times for the same property value (but with a different type).
        // Since we don't care about the value type here (all the values are collected in a localized field) we need to
        // make sure we don't add the same value twice.
        addFieldValueOnce(solrDocument, FieldUtils.getFieldName(objectOfTypeFieldName, locale), value);

        // We need to be able to query all objects from a document at once.
        super.setPropertyValue(solrDocument, property, typedValue, locale);
    }

    /**
     * @param solrDocument the Solr document where to add the attachments data
     * @param locale the locale for which to index the attachments
     * @param originalDocument the original document, that should be used to access the attachments
     */
    private void setAttachments(SolrInputDocument solrDocument, Locale locale, XWikiDocument originalDocument)
    {
        for (XWikiAttachment attachment : originalDocument.getAttachmentList()) {
            setAttachment(solrDocument, locale, attachment);
        }
    }

    /**
     * Extracts the meta data from the given attachment and adds it to the given Solr document.
     *
     * @param solrDocument the Solr document where to add the attachment data
     * @param locale the locale for which to index the attachments
     * @param attachment the attachment to index
     */
    private void setAttachment(SolrInputDocument solrDocument, Locale locale, XWikiAttachment attachment)
    {
        XWikiContext xcontext = xcontextProvider.get();

        // "addField" is used throughout because a document can have several attachments.
        solrDocument.addField(FieldUtils.FILENAME, attachment.getFilename());
        solrDocument.addField(FieldUtils.MIME_TYPE, attachment.getMimeType(xcontext));
        solrDocument.addField(FieldUtils.ATTACHMENT_DATE, attachment.getDate());
        solrDocument.addField(FieldUtils.ATTACHMENT_SIZE, attachment.getLongSize());

        String attachmentTextContent = getContentAsText(attachment);
        solrDocument.addField(FieldUtils.getFieldName(FieldUtils.ATTACHMENT_CONTENT, locale), attachmentTextContent);

        // Index the full author reference for exact matching (faceting).
        String authorStringReference = entityReferenceSerializer.serialize(attachment.getAuthorReference());
        solrDocument.addField(FieldUtils.ATTACHMENT_AUTHOR, authorStringReference);

        try {
            // Index the author display name for free text search.
            String authorDisplayName = xcontext.getWiki().getPlainUserName(attachment.getAuthorReference(), xcontext);
            solrDocument.addField(FieldUtils.ATTACHMENT_AUTHOR_DISPLAY, authorDisplayName);
        } catch (Exception e) {
            // Best effort: a failure to resolve the display name is logged and the rest of the attachment data stays
            // indexed.
            this.logger.error("Failed to get author display name for attachment [{}]", attachment.getReference(), e);
        }
    }
}