/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.provenance.lucene;

import java.io.File;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.nifi.processor.DataUnit;
import org.apache.nifi.provenance.SearchableFields;
import org.apache.nifi.provenance.search.SearchTerm;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Version;

/**
 * Static helpers for the Lucene-backed provenance index: small string
 * utilities, lookup of provenance log files, conversion of a provenance
 * query into a Lucene query, and ordering/grouping of result documents.
 */
public class LuceneUtil {

    public static final Version LUCENE_VERSION = Version.LATEST;

    /**
     * Returns the part of {@code value} that precedes the first occurrence of
     * {@code searchValue}.
     *
     * @param value the string to cut
     * @param searchValue the delimiter to look for
     * @return the text before the first match, or {@code value} itself if
     *         there is no match
     */
    public static String substringBefore(final String value, final String searchValue) {
        final int index = value.indexOf(searchValue);
        return (index < 0) ? value : value.substring(0, index);
    }

    /**
     * Returns the part of {@code value} that follows the first occurrence of
     * {@code searchValue}.
     *
     * @param value the string to cut
     * @param searchValue the delimiter to look for
     * @return the text after the first match (empty if the match ends the
     *         string), or {@code value} itself if there is no match or
     *         {@code searchValue} is empty
     */
    public static String substringAfter(final String value, final String searchValue) {
        final int index = value.indexOf(searchValue);
        if (index < 0) {
            return value;
        }

        // Skip the whole delimiter, not just its first character, so that
        // multi-character delimiters do not leak into the result.
        return value.substring(index + searchValue.length());
    }

    /**
     * Returns the part of {@code value} that precedes the last occurrence of
     * {@code searchValue}.
     *
     * @param value the string to cut
     * @param searchValue the delimiter to look for
     * @return the text before the last match, or {@code value} itself if
     *         there is no match
     */
    public static String substringBeforeLast(final String value, final String searchValue) {
        final int index = value.lastIndexOf(searchValue);
        return (index < 0) ? value : value.substring(0, index);
    }

    /**
     * Returns the part of {@code value} that follows the last occurrence of
     * {@code searchValue}.
     *
     * @param value the string to cut
     * @param searchValue the delimiter to look for
     * @return the text after the last match (empty if the match ends the
     *         string), or {@code value} itself if there is no match or
     *         {@code searchValue} is empty
     */
    public static String substringAfterLast(final String value, final String searchValue) {
        final int index = value.lastIndexOf(searchValue);

        // lastIndexOf("") yields value.length(); treat that like "no match".
        if (index < 0 || index >= value.length()) {
            return value;
        }

        // Skip the whole delimiter, not just its first character.
        return value.substring(index + searchValue.length());
    }

    /**
     * Finds the single provenance log file whose name starts with
     * {@code baseName + "."}.
     *
     * @param baseName the file name prefix (without the trailing dot)
     * @param allProvenanceLogs the candidate paths
     * @return the matching file, or {@code null} if zero or more than one
     *         file matches
     */
    public static File getProvenanceLogFile(final String baseName, final Collection<Path> allProvenanceLogs) {
        final List<File> logFiles = getProvenanceLogFiles(baseName, allProvenanceLogs);
        if (logFiles.size() != 1) {
            return null;
        }

        return logFiles.get(0);
    }

    /**
     * Finds all existing provenance log files whose name starts with
     * {@code baseName + "."}. If a matching path does not exist on disk, a
     * sibling with {@code ".gz"} appended to the name is used instead when
     * that one exists.
     *
     * @param baseName the file name prefix (without the trailing dot)
     * @param allProvenanceLogs the candidate paths
     * @return the files found, in iteration order of {@code allProvenanceLogs};
     *         never {@code null}
     */
    public static List<File> getProvenanceLogFiles(final String baseName, final Collection<Path> allProvenanceLogs) {
        final List<File> matchingFiles = new ArrayList<>();
        final String searchString = baseName + ".";

        for (final Path path : allProvenanceLogs) {
            final File file = path.toFile();
            if (!file.getName().startsWith(searchString)) {
                continue;
            }

            if (file.exists()) {
                matchingFiles.add(file);
            } else {
                // Fall back to the ".gz" variant of the same file name.
                final File gzFile = new File(file.getParentFile(), file.getName() + ".gz");
                if (gzFile.exists()) {
                    matchingFiles.add(gzFile);
                }
            }
        }

        return matchingFiles;
    }

    /**
     * Converts a provenance query into the equivalent Lucene query. Every
     * search term, the file size range and the event time range are combined
     * as mandatory ({@link Occur#MUST}) clauses. Search values containing
     * {@code *} or {@code ?} become wildcard queries; all others become exact
     * term queries. Values are lower-cased before being used as terms.
     *
     * @param query the provenance query to convert
     * @return a {@link MatchAllDocsQuery} if the query carries no search
     *         terms, no date bounds and no file size bounds; otherwise a
     *         {@link BooleanQuery} holding one clause per constraint
     * @throws IllegalArgumentException if a search term has a {@code null} value
     */
    public static org.apache.lucene.search.Query convertQuery(final org.apache.nifi.provenance.search.Query query) {
        // File size bounds must take part in this check; otherwise a query
        // that restricts only by file size would match every document.
        final boolean unconstrained = query.getStartDate() == null
                && query.getEndDate() == null
                && query.getMinFileSize() == null
                && query.getMaxFileSize() == null
                && query.getSearchTerms().isEmpty();
        if (unconstrained) {
            return new MatchAllDocsQuery();
        }

        final BooleanQuery luceneQuery = new BooleanQuery();
        for (final SearchTerm searchTerm : query.getSearchTerms()) {
            final String searchValue = searchTerm.getValue();
            if (searchValue == null) {
                throw new IllegalArgumentException("Empty search value not allowed (for term '" + searchTerm.getSearchableField().getFriendlyName() + "')");
            }

            final Term term = new Term(searchTerm.getSearchableField().getSearchableFieldName(), searchValue.toLowerCase());
            final org.apache.lucene.search.Query termQuery;
            if (searchValue.contains("*") || searchValue.contains("?")) {
                termQuery = new WildcardQuery(term);
            } else {
                termQuery = new TermQuery(term);
            }
            luceneQuery.add(new BooleanClause(termQuery, Occur.MUST));
        }

        final Long minBytes = query.getMinFileSize() == null ? null : DataUnit.parseDataSize(query.getMinFileSize(), DataUnit.B).longValue();
        final Long maxBytes = query.getMaxFileSize() == null ? null : DataUnit.parseDataSize(query.getMaxFileSize(), DataUnit.B).longValue();
        if (minBytes != null || maxBytes != null) {
            luceneQuery.add(NumericRangeQuery.newLongRange(SearchableFields.FileSize.getSearchableFieldName(), minBytes, maxBytes, true, true), Occur.MUST);
        }

        final Long minDateTime = query.getStartDate() == null ? null : query.getStartDate().getTime();
        final Long maxDateTime = query.getEndDate() == null ? null : query.getEndDate().getTime();
        if (maxDateTime != null || minDateTime != null) {
            luceneQuery.add(NumericRangeQuery.newLongRange(SearchableFields.EventTime.getSearchableFieldName(), minDateTime, maxDateTime, true, true), Occur.MUST);
        }

        return luceneQuery;
    }

    /**
     * Will sort documents by filename and then file offset so that we can
     * retrieve the records efficiently. When both documents carry a
     * {@link FieldNames#BLOCK_INDEX} field they are ordered by block index and
     * then by event identifier; otherwise they are ordered by
     * {@link FieldNames#STORAGE_FILE_OFFSET}.
     *
     * @param documents
     *            list of {@link Document}s, sorted in place
     */
    public static void sortDocsForRetrieval(final List<Document> documents) {
        Collections.sort(documents, new Comparator<Document>() {
            @Override
            public int compare(final Document o1, final Document o2) {
                final String filename1 = o1.get(FieldNames.STORAGE_FILENAME);
                final String filename2 = o2.get(FieldNames.STORAGE_FILENAME);

                final int filenameComp = filename1.compareTo(filename2);
                if (filenameComp != 0) {
                    return filenameComp;
                }

                // Each block index must come from its own document; reading
                // both from o1 would make this comparison always return 0.
                final IndexableField fileOffset1 = o1.getField(FieldNames.BLOCK_INDEX);
                final IndexableField fileOffset2 = o2.getField(FieldNames.BLOCK_INDEX);
                if (fileOffset1 != null && fileOffset2 != null) {
                    final int blockIndexResult = Long.compare(fileOffset1.numericValue().longValue(), fileOffset2.numericValue().longValue());
                    if (blockIndexResult != 0) {
                        return blockIndexResult;
                    }

                    final long eventId1 = o1.getField(SearchableFields.Identifier.getSearchableFieldName()).numericValue().longValue();
                    final long eventId2 = o2.getField(SearchableFields.Identifier.getSearchableFieldName()).numericValue().longValue();
                    return Long.compare(eventId1, eventId2);
                }

                final long offset1 = o1.getField(FieldNames.STORAGE_FILE_OFFSET).numericValue().longValue();
                final long offset2 = o2.getField(FieldNames.STORAGE_FILE_OFFSET).numericValue().longValue();
                return Long.compare(offset1, offset2);
            }
        });
    }

    /**
     * Will group documents based on the {@link FieldNames#STORAGE_FILENAME}.
     *
     * @param documents
     *            list of {@link Document}s which will be sorted via
     *            {@link #sortDocsForRetrieval(List)} for more efficient record
     *            retrieval.
     * @return a {@link Map} of document groups with
     *         {@link FieldNames#STORAGE_FILENAME} as key and {@link List} of
     *         {@link Document}s as value.
     */
    public static Map<String, List<Document>> groupDocsByStorageFileName(final List<Document> documents) {
        final Map<String, List<Document>> documentGroups = new HashMap<>();

        for (final Document document : documents) {
            final String fileName = document.get(FieldNames.STORAGE_FILENAME);
            List<Document> group = documentGroups.get(fileName);
            if (group == null) {
                group = new ArrayList<>();
                documentGroups.put(fileName, group);
            }
            group.add(document);
        }

        for (final List<Document> groupedDocuments : documentGroups.values()) {
            sortDocsForRetrieval(groupedDocuments);
        }

        return documentGroups;
    }

    /**
     * Truncate a single field so that it does not exceed Lucene's byte size limit on indexed terms.
     * The length is measured on the UTF-8 encoding of the field, and a multi-byte character that
     * would be split by the cut is dropped rather than emitted as garbage.
     *
     * @param field the string to be indexed
     * @return a string that can be indexed which is within Lucene's byte size limit, or null if anything goes wrong
     *         (a {@code null} input is returned as {@code null})
     */
    public static String truncateIndexField(String field) {
        if (field == null) {
            return field;
        }

        // Measure in UTF-8 rather than the platform default charset, because
        // IndexWriter.MAX_TERM_LENGTH is a limit on the UTF-8 encoded term.
        final Charset charset = StandardCharsets.UTF_8;
        final byte[] bytes = field.getBytes(charset);
        if (bytes.length <= IndexWriter.MAX_TERM_LENGTH) {
            return field;
        }

        // chop the field to maximum allowed byte length
        final ByteBuffer bbuf = ByteBuffer.wrap(bytes, 0, IndexWriter.MAX_TERM_LENGTH);

        try {
            // decode the chopped byte buffer back into a string; a character
            // cut in half at the end is malformed input and is ignored
            final CharsetDecoder decoder = charset.newDecoder();
            decoder.onMalformedInput(CodingErrorAction.IGNORE);
            decoder.reset();
            final CharBuffer cbuf = decoder.decode(bbuf);
            return cbuf.toString();
        } catch (CharacterCodingException shouldNotHappen) {
            // Not expected with malformed input ignored; null is the
            // documented result for "something went wrong".
            return null;
        }
    }
}