/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nifi.provenance.index.lucene; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.nifi.flowfile.attributes.CoreAttributes; import org.apache.nifi.provenance.ProvenanceEventRecord; import org.apache.nifi.provenance.ProvenanceEventType; import org.apache.nifi.provenance.SearchableFields; import org.apache.nifi.provenance.lucene.LuceneUtil; import org.apache.nifi.provenance.search.SearchableField; import org.apache.nifi.provenance.serialization.StorageSummary; public class ConvertEventToLuceneDocument { private final Set<SearchableField> searchableEventFields; private final Set<SearchableField> searchableAttributeFields; public ConvertEventToLuceneDocument(final List<SearchableField> searchableEventFields, final List<SearchableField> searchableAttributes) { this.searchableEventFields = Collections.unmodifiableSet(new HashSet<>(searchableEventFields)); this.searchableAttributeFields = Collections.unmodifiableSet(new HashSet<>(searchableAttributes)); } private void addField(final Document doc, final SearchableField field, final String value) { if (value == null || (!field.isAttribute() && !searchableEventFields.contains(field))) { return; } doc.add(new StringField(field.getSearchableFieldName(), value.toLowerCase(), Store.NO)); } public Document convert(final ProvenanceEventRecord record, final StorageSummary persistedEvent) { final Document doc = new Document(); addField(doc, SearchableFields.FlowFileUUID, record.getFlowFileUuid()); addField(doc, SearchableFields.Filename, record.getAttribute(CoreAttributes.FILENAME.key())); addField(doc, SearchableFields.ComponentID, record.getComponentId()); addField(doc, SearchableFields.AlternateIdentifierURI, record.getAlternateIdentifierUri()); addField(doc, SearchableFields.EventType, record.getEventType().name()); addField(doc, SearchableFields.Relationship, record.getRelationship()); addField(doc, SearchableFields.Details, record.getDetails()); addField(doc, SearchableFields.ContentClaimSection, record.getContentClaimSection()); addField(doc, SearchableFields.ContentClaimContainer, record.getContentClaimContainer()); addField(doc, SearchableFields.ContentClaimIdentifier, record.getContentClaimIdentifier()); addField(doc, SearchableFields.SourceQueueIdentifier, record.getSourceQueueIdentifier()); addField(doc, SearchableFields.TransitURI, record.getTransitUri()); for (final SearchableField searchableField : searchableAttributeFields) { addField(doc, searchableField, LuceneUtil.truncateIndexField(record.getAttribute(searchableField.getSearchableFieldName()))); } // Index the fields that we always index (unless there's nothing else to index at all) if (!doc.getFields().isEmpty()) { // Always include Lineage Start Date because it allows us to make our Lineage queries more efficient. doc.add(new LongField(SearchableFields.LineageStartDate.getSearchableFieldName(), record.getLineageStartDate(), Store.NO)); // Always include Event Time because most queries are bound by a start and end time. doc.add(new LongField(SearchableFields.EventTime.getSearchableFieldName(), record.getEventTime(), Store.NO)); // We always include File Size because the UI wants to always render the controls for specifying this. This idea could be revisited. doc.add(new LongField(SearchableFields.FileSize.getSearchableFieldName(), record.getFileSize(), Store.NO)); // We always store the event Event ID in the Document but do not index it. It doesn't make sense to query based on Event ID because // if we want a particular Event ID, we can just obtain it directly from the EventStore. But when we obtain a Document, this info must // be stored so that we know how to lookup the event in the store. doc.add(new UnIndexedLongField(SearchableFields.Identifier.getSearchableFieldName(), persistedEvent.getEventId())); // If it's event is a FORK, or JOIN, add the FlowFileUUID for all child/parent UUIDs. final ProvenanceEventType eventType = record.getEventType(); if (eventType == ProvenanceEventType.FORK || eventType == ProvenanceEventType.CLONE || eventType == ProvenanceEventType.REPLAY) { for (final String uuid : record.getChildUuids()) { if (!uuid.equals(record.getFlowFileUuid())) { addField(doc, SearchableFields.FlowFileUUID, uuid); } } } else if (eventType == ProvenanceEventType.JOIN) { for (final String uuid : record.getParentUuids()) { if (!uuid.equals(record.getFlowFileUuid())) { addField(doc, SearchableFields.FlowFileUUID, uuid); } } } else if (eventType == ProvenanceEventType.RECEIVE && record.getSourceSystemFlowFileIdentifier() != null) { // If we get a receive with a Source System FlowFile Identifier, we add another Document that shows the UUID // that the Source System uses to refer to the data. final String sourceIdentifier = record.getSourceSystemFlowFileIdentifier(); final String sourceFlowFileUUID; final int lastColon = sourceIdentifier.lastIndexOf(":"); if (lastColon > -1 && lastColon < sourceIdentifier.length() - 2) { sourceFlowFileUUID = sourceIdentifier.substring(lastColon + 1); } else { sourceFlowFileUUID = null; } if (sourceFlowFileUUID != null) { addField(doc, SearchableFields.FlowFileUUID, sourceFlowFileUUID); } } return doc; } return null; } private static class UnIndexedLongField extends Field { static final FieldType TYPE = new FieldType(); static { TYPE.setIndexed(false); TYPE.setTokenized(true); TYPE.setOmitNorms(true); TYPE.setIndexOptions(IndexOptions.DOCS_ONLY); TYPE.setNumericType(FieldType.NumericType.LONG); TYPE.setStored(true); TYPE.freeze(); } public UnIndexedLongField(String name, long value) { super(name, TYPE); fieldsData = Long.valueOf(value); } } }