/* * Licensed under the Apache License, Version 2.0 (the "License"); * * You may not use this file except in compliance with the License. * * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * * See the License for the specific language governing permissions and * limitations under the License. * * Contributions from 2013-2017 where performed either by US government * employees, or under US Veterans Health Administration contracts. * * US Veterans Health Administration contributions by government employees * are work of the U.S. Government and are not subject to copyright * protection in the United States. Portions contributed by government * employees are USGovWork (17USC ยง105). Not subject to copyright. * * Contribution by contractors to the US Veterans Health Administration * during this period are contractually contributed under the * Apache License, Version 2.0. * * See: https://www.usa.gov/government-works * * Contributions prior to 2013: * * Copyright (C) International Health Terminology Standards Development Organisation. * Licensed under the Apache License, Version 2.0. * */ package sh.isaac.provider.query.lucene.indexers; //~--- JDK imports ------------------------------------------------------------ import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; import java.util.TreeMap; import java.util.UUID; import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Predicate; //~--- non-JDK imports -------------------------------------------------------- import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.glassfish.hk2.runlevel.RunLevel; import org.jvnet.hk2.annotations.Service; import sh.isaac.api.chronicle.ObjectChronology; import sh.isaac.api.component.sememe.SememeChronology; import sh.isaac.api.component.sememe.SememeType; import sh.isaac.api.component.sememe.version.DescriptionSememe; import sh.isaac.api.component.sememe.version.DynamicSememe; import sh.isaac.api.component.sememe.version.SememeVersion; import sh.isaac.api.constants.DynamicSememeConstants; import sh.isaac.api.index.IndexServiceBI; import sh.isaac.api.index.SearchResult; import sh.isaac.MetaData; import sh.isaac.provider.query.lucene.LuceneDescriptionType; import sh.isaac.provider.query.lucene.LuceneIndexer; import sh.isaac.provider.query.lucene.PerFieldAnalyzer; //~--- classes ---------------------------------------------------------------- /** * Lucene Manager for a Description index. Provides the description indexing * service. * * This has been redesigned such that is now creates multiple columns within the * index * * There is a 'everything' column, which gets all descriptions, to support the * standard search where you want to match on a text value anywhere it appears. * * There are 3 columns to support FSN / Synonym / Definition - to support * searching that subset of descriptions. There are also data-defined columns to * support extended definition types - for example - loinc description types - * to support searching terminology specific fields. * * Each of the columns above is also x2, as everything is indexed both with a * standard analyzer, and with a whitespace analyzer. * * @author aimeefurber * @author <a href="mailto:daniel.armbrust.list@gmail.com">Dan Armbrust</a> */ @Service(name = "description indexer") @RunLevel(value = 2) public class DescriptionIndexer extends LuceneIndexer implements IndexServiceBI { /** The Constant setupNidsSemaphore. */ private static final Semaphore setupNidsSemaphore = new Semaphore(1); /** The Constant sequencesSetup. */ private static final AtomicBoolean sequencesSetup = new AtomicBoolean(false); /** The Constant FIELD_INDEXED_STRING_VALUE. */ private static final String FIELD_INDEXED_STRING_VALUE = "_string_content_"; //~--- fields -------------------------------------------------------------- /** The sequence type map. */ private final HashMap<Integer, String> sequenceTypeMap = new HashMap<>(); /** The desc extended type sequence. */ private int descExtendedTypeSequence; //~--- constructors -------------------------------------------------------- /** * Instantiates a new description indexer. * * @throws IOException Signals that an I/O exception has occurred. */ // for HK2 only private DescriptionIndexer() throws IOException { super("descriptions"); } //~--- methods ------------------------------------------------------------- /** * Search the specified description type. * * @param query The query to apply * @param descriptionType - The type of description to search. If this is * passed in as null, this falls back to a standard description search that * searches all description types * @param sizeLimit The maximum size of the result list. * @param targetGeneration target generation that must be included in the * search or Long.MIN_VALUE if there is no need to wait for a target * generation. Long.MAX_VALUE can be passed in to force this query to wait * until any in progress indexing operations are completed - and then use * the latest index. * @return a List of <code>SearchResult</codes> that contains the nid of the * component that matched, and the score of that match relative to other * matches. */ public final List<SearchResult> query(String query, LuceneDescriptionType descriptionType, int sizeLimit, Long targetGeneration) { if (descriptionType == null) { return super.query(query, (Integer[]) null, sizeLimit, targetGeneration); } else { return search(buildTokenizedStringQuery(query, FIELD_INDEXED_STRING_VALUE + "_" + descriptionType.name(), false), sizeLimit, targetGeneration, null); } } /** * Search the specified description type. * * * @param query The query to apply * @param extendedDescriptionType - The UUID of an extended description type * - should be a child of the concept "Description type in source * terminology (ISAAC)" If this is passed in as null, * this falls back to a standard description search that searches all * description types * @param sizeLimit The maximum size of the result list. * @param targetGeneration target generation that must be included in the * search or Long.MIN_VALUE if there is no need to wait for a target * generation. Long.MAX_VALUE can be passed in to force this query to wait * until any in progress indexing operations are completed - and then use * the latest index. * @return a List of <code>SearchResult</codes> that contains the nid of the * component that matched, and the score of that match relative to other * matches. */ public final List<SearchResult> query(String query, UUID extendedDescriptionType, int sizeLimit, Long targetGeneration) { if (extendedDescriptionType == null) { return super.query(query, (Integer[]) null, sizeLimit, targetGeneration); } else { return search(buildTokenizedStringQuery(query, FIELD_INDEXED_STRING_VALUE + "_" + extendedDescriptionType.toString(), false), sizeLimit, targetGeneration, null); } } /** * A generic query API that handles most common cases. The cases handled for various component property types * are detailed below. * * NOTE - subclasses of LuceneIndexer may have other query(...) methods that allow for more specific and or complex * queries. Specifically both {@link DynamicSememeIndexer} and {@link DescriptionIndexer} have their own * query(...) methods which allow for more advanced queries. * * @param query The query to apply. * @param prefixSearch if true, utilize a search algorithm that is optimized for prefix searching, such as the searching * that would be done to implement a type-ahead style search. Does not use the Lucene Query parser. Every term (or token) * that is part of the query string will be required to be found in the result. * * Note, it is useful to NOT trim the text of the query before it is sent in - if the last word of the query has a * space character following it, that word will be required as a complete term. If the last word of the query does not * have a space character following it, that word will be required as a prefix match only. * * For example: * The query "family test" will return results that contain 'Family Testudinidae' * The query "family test " will not match on 'Testudinidae', so that will be excluded. * @param sememeConceptSequence the sememe concept sequence * @param sizeLimit The maximum size of the result list. * @param targetGeneration target generation that must be included in the search or Long.MIN_VALUE if there is no need * to wait for a target generation. Long.MAX_VALUE can be passed in to force this query to wait until any in progress * indexing operations are completed - and then use the latest index. * @return a List of {@link SearchResult} that contains the nid of the component that matched, and the score of that match relative * to other matches. */ @Override public List<SearchResult> query(String query, boolean prefixSearch, Integer[] sememeConceptSequence, int sizeLimit, Long targetGeneration) { return search(restrictToSememe(buildTokenizedStringQuery(query, FIELD_INDEXED_STRING_VALUE, prefixSearch), sememeConceptSequence), sizeLimit, targetGeneration, null); } /** * A generic query API that handles most common cases. The cases handled for various component property types * are detailed below. * * NOTE - subclasses of LuceneIndexer may have other query(...) methods that allow for more specific and or complex * queries. Specifically both {@link DynamicSememeIndexer} and {@link DescriptionIndexer} have their own * query(...) methods which allow for more advanced queries. * * @param query The query to apply. * @param prefixSearch if true, utilize a search algorithm that is optimized for prefix searching, such as the searching * that would be done to implement a type-ahead style search. Does not use the Lucene Query parser. Every term (or token) * that is part of the query string will be required to be found in the result. * * Note, it is useful to NOT trim the text of the query before it is sent in - if the last word of the query has a * space character following it, that word will be required as a complete term. If the last word of the query does not * have a space character following it, that word will be required as a prefix match only. * * For example: * The query "family test" will return results that contain 'Family Testudinidae' * The query "family test " will not match on 'Testudinidae', so that will be excluded. * @param sememeConceptSequence the sememe concept sequence * @param sizeLimit The maximum size of the result list. * @param targetGeneration target generation that must be included in the search or Long.MIN_VALUE if there is no need * to wait for a target generation. Long.MAX_VALUE can be passed in to force this query to wait until any in progress * indexing operations are completed - and then use the latest index. * @param filter - an optional filter on results - if provided, the filter should expect nids, and can return true, if * the nid should be allowed in the result, false otherwise. Note that this may cause large performance slowdowns, depending * on the implementation of your filter * @return a List of {@link SearchResult} that contains the nid of the component that matched, and the score of that match relative * to other matches. */ public List<SearchResult> query(String query, boolean prefixSearch, Integer[] sememeConceptSequence, int sizeLimit, Long targetGeneration, Predicate<Integer> filter) { return search(restrictToSememe(buildTokenizedStringQuery(query, FIELD_INDEXED_STRING_VALUE, prefixSearch), sememeConceptSequence), sizeLimit, targetGeneration, filter); } /** * Adds the fields. * * @param chronicle the chronicle * @param doc the doc */ @SuppressWarnings("unchecked") @Override protected void addFields(ObjectChronology<?> chronicle, Document doc) { if (chronicle instanceof SememeChronology) { final SememeChronology<?> sememeChronology = (SememeChronology<?>) chronicle; if (sememeChronology.getSememeType() == SememeType.DESCRIPTION) { indexDescription(doc, (SememeChronology<DescriptionSememe<? extends DescriptionSememe<?>>>) sememeChronology); incrementIndexedItemCount("Description"); } } } /** * Index chronicle. * * @param chronicle the chronicle * @return true, if successful */ @Override protected boolean indexChronicle(ObjectChronology<?> chronicle) { setupNidConstants(); if (chronicle instanceof SememeChronology) { final SememeChronology<?> sememeChronology = (SememeChronology<?>) chronicle; if (sememeChronology.getSememeType() == SememeType.DESCRIPTION) { return true; } } return false; } /** * Adds the field. * * @param doc the doc * @param fieldName the field name * @param value the value * @param tokenize the tokenize */ private void addField(Document doc, String fieldName, String value, boolean tokenize) { // index twice per field - once with the standard analyzer, once with the whitespace analyzer. if (tokenize) { doc.add(new TextField(fieldName, value, Field.Store.NO)); } doc.add(new TextField(fieldName + PerFieldAnalyzer.WHITE_SPACE_FIELD_MARKER, value, Field.Store.NO)); } /** * Index description. * * @param doc the doc * @param sememeChronology the sememe chronology */ private void indexDescription(Document doc, SememeChronology<DescriptionSememe<? extends DescriptionSememe<?>>> sememeChronology) { doc.add(new TextField(FIELD_SEMEME_ASSEMBLAGE_SEQUENCE, sememeChronology.getAssemblageSequence() + "", Field.Store.NO)); String lastDescText = null; String lastDescType = null; final TreeMap<Long, String> uniqueTextValues = new TreeMap<>(); for (final DescriptionSememe<? extends DescriptionSememe<?>> descriptionVersion: sememeChronology.getVersionList()) { final String descType = this.sequenceTypeMap.get(descriptionVersion.getDescriptionTypeConceptSequence()); // No need to index if the text is the same as the previous version. if ((lastDescText == null) || (lastDescType == null) || !lastDescText.equals(descriptionVersion.getText()) || !lastDescType.equals(descType)) { // Add to the field that carries all text addField(doc, FIELD_INDEXED_STRING_VALUE, descriptionVersion.getText(), true); // Add to the field that carries type-only text addField(doc, FIELD_INDEXED_STRING_VALUE + "_" + descType, descriptionVersion.getText(), true); uniqueTextValues.put(descriptionVersion.getTime(), descriptionVersion.getText()); lastDescText = descriptionVersion.getText(); lastDescType = descType; } } // index the extended description types - matching the text values and times above with the times of these annotations. String lastExtendedDescType = null; String lastValue = null; for (final SememeChronology<? extends SememeVersion<?>> sememeChronicle: sememeChronology.getSememeList()) { if (sememeChronicle.getSememeType() == SememeType.DYNAMIC) { @SuppressWarnings("unchecked") final SememeChronology<DynamicSememe<?>> sememeDynamicChronicle = (SememeChronology<DynamicSememe<?>>) sememeChronicle; for (final DynamicSememe<?> sememeDynamic: sememeDynamicChronicle.getVersionList()) { // If this sememe is the sememe recording a dynamic sememe extended type.... if (sememeDynamic.getAssemblageSequence() == this.descExtendedTypeSequence) { // this is a UUID, but we want to treat it as a string anyway final String extendedDescType = sememeDynamic.getData()[0] .getDataObject() .toString(); String value = null; // Find the text that was active at the time of this refex - timestamp on the refex must not be // greater than the timestamp on the value for (final Entry<Long, String> x: uniqueTextValues.entrySet()) { if ((value == null) || (x.getKey() <= sememeDynamic.getTime())) { value = x.getValue(); } else if (x.getKey() > sememeDynamic.getTime()) { break; } } if ((lastExtendedDescType == null) || (lastValue == null) || !lastExtendedDescType.equals(extendedDescType) || !lastValue.equals(value)) { if ((extendedDescType == null) || (value == null)) { throw new RuntimeException("design failure"); } // This is a UUID, but we only do exact matches - indexing ints as strings is faster when doing exact-match only addField(doc, FIELD_INDEXED_STRING_VALUE + "_" + extendedDescType, value, false); // Don't tokenize this lastValue = value; lastExtendedDescType = extendedDescType; } } } } } } /** * Setup nid constants. */ private void setupNidConstants() { // Can't put these in the start me, because if the database is not yet imported, then these calls will fail. if (!sequencesSetup.get()) { setupNidsSemaphore.acquireUninterruptibly(); try { if (!sequencesSetup.get()) { this.sequenceTypeMap.put(MetaData.FULLY_SPECIFIED_NAME.getConceptSequence(), LuceneDescriptionType.FSN.name()); this.sequenceTypeMap.put(MetaData.DEFINITION_DESCRIPTION_TYPE.getConceptSequence(), LuceneDescriptionType.DEFINITION.name()); this.sequenceTypeMap.put(MetaData.SYNONYM.getConceptSequence(), LuceneDescriptionType.SYNONYM.name()); this.descExtendedTypeSequence = DynamicSememeConstants.get().DYNAMIC_SEMEME_EXTENDED_DESCRIPTION_TYPE .getConceptSequence(); } sequencesSetup.set(true); } finally { setupNidsSemaphore.release(); } } } }