package lux.index;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import lux.index.analysis.DefaultAnalyzer;
import lux.index.analysis.ElementVisibility;
import lux.index.field.AttributeQNameField;
import lux.index.field.AttributeTextField;
import lux.index.field.DocumentField;
import lux.index.field.ElementQNameField;
import lux.index.field.ElementTextField;
import lux.index.field.FieldDefinition;
import lux.index.field.IDField;
import lux.index.field.PathField;
import lux.index.field.PathOccurrenceField;
import lux.index.field.PathValueField;
import lux.index.field.QNameValueField;
import lux.index.field.TinyBinaryField;
import lux.index.field.TinyBinarySolrField;
import lux.index.field.URIField;
import lux.index.field.XmlTextField;
import lux.xml.tinybin.TinyBinary;

import org.apache.lucene.util.Version;

/**
 * Maintains a list of field definitions and index options that inform indexing and search.
 * The configuration options and core indexing setup are immutable, but new fields may be added,
 * fields may be renamed, and namespace mappings may be defined.
 */
public class IndexConfiguration {

    public static final Version LUCENE_VERSION = Version.LUCENE_46;

    /** causes a document node to be built during indexing. Must be set if any XPathFields are to be defined. */
    public final static int BUILD_DOCUMENT = 0x00000001;

    /** Configure for use in solr; eg TinyBinarySolrField instead of TinyBinaryField */
    public final static int SOLR = 0x00000002;

    /** causes QNames indexes to include the full namespace uri. If not set, QNames are indexed lexically,
     * as {prefix}:{localname} without regard for any prefix mappings. Currently namespace-unaware indexing
     * and search is not fully supported. */
    public final static int NAMESPACE_AWARE = 0x00000004;

    /** causes a document to be stored in the index. This should generally always be enabled */
    public final static int STORE_DOCUMENT = 0x00000008;

    /** indicates that documents are to be stored in {@link TinyBinary} format. If this is not set,
     * documents are stored as serialized XML. */
    public final static int STORE_TINY_BINARY = 0x00000010;

    /** enables the lux_elt_name and lux_att_name fields, causing element and attribute
     * QNames to be indexed. If paths are indexed, this isn't really needed. */
    public final static int INDEX_QNAMES = 0x00000020;

    /** enables the lux_path field, causing element and attribute QName paths to be indexed. */
    public final static int INDEX_PATHS = 0x00000040;

    /** enables the lux_text, lux_elt_text, and lux_att_text fields,
     * causing element and attribute text to be indexed. */
    public final static int INDEX_FULLTEXT = 0x00000080;

    /** enables the lux_path_value field (if INDEX_PATHS is set), and the lux_qname_value field (if
     * INDEX_QNAMES is set), causing values to be indexed. This is an experimental feature that is not
     * fully supported. */
    public final static int INDEX_VALUES = 0x00000100;

    /** enables the computation and storage of term offsets in the index. Currently there is no reason to enable
     * this flag. In the future term offsets may be used to accelerate highlighting. */
    public final static int COMPUTE_OFFSETS = 0x00000200;

    /** causes all namespace information to be stripped from incoming documents */
    public final static int STRIP_NAMESPACES = 0x00000400;

    /** experimental: index each occurrence of each path as an unparsed string,
     * rather than indexing unique paths and tokenizing */
    public final static int INDEX_EACH_PATH = 0x00000800;

    /** mask covering all of the indexing options */
    public final static int INDEXES = INDEX_QNAMES | INDEX_PATHS | INDEX_FULLTEXT | INDEX_VALUES;

    /** the default indexing options */
    public final static int DEFAULT_OPTIONS = STORE_DOCUMENT | INDEX_QNAMES | INDEX_PATHS | INDEX_FULLTEXT | NAMESPACE_AWARE;

    /** unique identifier field that identifies a document */
    public final FieldDefinition URI = new URIField();

    /** field that stores xml documents */
    private final FieldDefinition XML_STORE = new DocumentField();

    /** field that stores documents in tiny binary form (used when STORE_TINY_BINARY is set without SOLR) */
    private final FieldDefinition TINY_BINARY_STORE = new TinyBinaryField();

    /** element QName field */
    private final FieldDefinition ELT_QNAME = new ElementQNameField();

    /** attribute QName field */
    private final FieldDefinition ATT_QNAME = new AttributeQNameField();

    /** path field */
    private final FieldDefinition PATH = new PathField();

    /** element text field indexes all the text along with element QNames. */
    private final FieldDefinition ELEMENT_TEXT = new ElementTextField();

    /** attribute text field indexes all the text along with attribute QNames. */
    private final FieldDefinition ATTRIBUTE_TEXT = new AttributeTextField();

    /** full text field that indexes all the text in a document (not including attribute values). */
    private final FieldDefinition XML_TEXT = new XmlTextField();

    // not fully supported?
    private final FieldDefinition PATH_VALUE = new PathValueField();

    private final FieldDefinition QNAME_VALUE = new QNameValueField();

    /** option bits; fixed at construction time */
    private final long options;

    /** maintains which field fulfills a given role */
    private final Map<FieldRole, FieldDefinition> fieldsByRole;

    /** map of fields by their lucene field name */
    private final Map<String, FieldDefinition> fieldsByName;

    /** analyzers keyed by field name; the null key holds the default analyzer */
    private final MultiFieldAnalyzer fieldAnalyzers;

    /** namespace mappings added via {@link #defineNamespaceMapping(String, String)} */
    private final Map<String, String> namespaceMap;

    /** explicitly-specified element visibility, keyed by clark name */
    private final Map<String, ElementVisibility> eltVis;

    /** visibility of elements that have no explicit entry in eltVis */
    private final ElementVisibility defVis;

    /** @return the analyzers associated with the fields to be indexed */
    public MultiFieldAnalyzer getFieldAnalyzers() {
        return fieldAnalyzers;
    }

    /**
     * Creates a configuration with the given options and registers the fields those options call for.
     * The NAMESPACE_AWARE bit is always set, whether or not it is present in the argument.
     * @param options a bitwise combination of the option constants defined in this class
     */
    public IndexConfiguration (long options) {
        namespaceMap = new HashMap<String, String>();
        fieldsByRole = new HashMap<FieldRole, FieldDefinition>();
        fieldsByName = new HashMap<String, FieldDefinition>();
        fieldAnalyzers = new MultiFieldAnalyzer();
        // the analyzer registered under the null key is the default one
        fieldAnalyzers.put(null, new DefaultAnalyzer());
        eltVis = new HashMap<String, ElementVisibility>();
        defVis = ElementVisibility.OPAQUE;
        // the URI field is registered regardless of the options
        addField (URI);
        this.options = options | NAMESPACE_AWARE;
        init();
    }

    /** Creates a configuration using {@link #DEFAULT_OPTIONS}. */
    public IndexConfiguration () {
        this (DEFAULT_OPTIONS);
    }

    /** Registers the built-in fields selected by the option bits, and finally the ID field. */
    private void init () {
        if (isOption (INDEX_QNAMES)) {
            addField(ELT_QNAME);
            addField(ATT_QNAME);
            if (isOption (INDEX_VALUES)) {
                addField(QNAME_VALUE);
            }
        }
        if (isOption (INDEX_PATHS)) {
            if (isOption (INDEX_EACH_PATH)) {
                addField (new PathOccurrenceField());
            } else {
                addField(PATH);
            }
            if (isOption (INDEX_VALUES)) {
                addField(PATH_VALUE);
            }
        }
        if (isOption (INDEX_FULLTEXT)) {
            addField (XML_TEXT);
            // element and attribute text fields are only added alongside a QName or path index
            if (isOption (INDEX_QNAMES) || isOption(INDEX_PATHS)) {
                addField (ELEMENT_TEXT);
                addField (ATTRIBUTE_TEXT);
            }
            /*
            if (// FIXME: do we need offsets ever? Perhaps if we make use of a better highlighter
                    XML_TEXT.getTermVector().withOffsets() ||
                    ELEMENT_TEXT.getTermVector().withOffsets() ||
                    ATTRIBUTE_TEXT.getTermVector().withOffsets()
                    ) {
                // We may not need to bother computing offsets at all
                options |= COMPUTE_OFFSETS;
            }
            */
        }
        if (isOption (STORE_DOCUMENT)) {
            if (isOption (STORE_TINY_BINARY )) {
                if (isOption(SOLR)) {
                    addField(new TinyBinarySolrField());
                } else {
                    addField(TINY_BINARY_STORE);
                }
            } else {
                addField(XML_STORE);
            }
        }
        addField (new IDField());
    }

    /**
     * Adds a new field, registering it by role (if it has one) and by name, and registering its
     * query analyzer under its name. Adding the very same field instance again is a no-op.
     * @param field the field to add
     * @throws IllegalStateException if a different field is already registered with the same role,
     * or (when no field has that role) with the same name
     */
    public void addField (FieldDefinition field) {
        FieldRole role = field.getFieldRole ();
        FieldDefinition existing = null;
        if (role != null) {
            existing = fieldsByRole.get(role);
        }
        if (existing == null) {
            existing = fieldsByName.get(field.getName());
        }
        if (existing != null) {
            if (existing != field) {
                throw new IllegalStateException ("Duplicate field name: " + field);
            }
            return;
        }
        if (role != null) {
            fieldsByRole.put(role, field);
        }
        fieldsByName.put(field.getName(), field);
        // get query analyzer
        fieldAnalyzers.put(field.getName(), field.getQueryAnalyzer());
    }

    /**
     * Get the effective name of a field, given its role. Fields may be renamed, or aliased, for
     * compatibility with existing schemas.
     * @param role the role whose field name is wanted
     * @return the effective name of the field, or the empty string if no field is registered for the role
     */
    public String getFieldName (FieldRole role) {
        FieldDefinition field = fieldsByRole.get(role);
        if (field == null) {
            return "";
        }
        return field.getName();
    }

    /**
     * rename an existing field; the new name is used in the index. The field's query analyzer is
     * also registered under the new name. Renaming a field to its current name does nothing.
     * @param field the definition of a field
     * @param name the new name to use
     * @throws IllegalArgumentException if the field is not renameable
     */
    public void renameField (FieldDefinition field, String name) {
        if (! field.isRenameable()) {
            throw new IllegalArgumentException("Attempt to rename field " + field + " whose name is fixed");
        }
        String currentName = field.getName();
        if (currentName.equals(name)) {
            return;
        }
        field.setName(name);
        fieldsByName.remove(currentName);
        fieldsByName.put(name, field);
        // addField registers an analyzer per field name; do the same for the new name so that
        // the analyzer lookup by name keeps finding this field's query analyzer
        fieldAnalyzers.put(name, field.getQueryAnalyzer());
    }

    /**
     * @return a read-only view of all the fields whose values are to be provided by this indexer.
     */
    public Collection<FieldDefinition> getFields () {
        // read-only, like getNamespaceMap and getVisibilityMap: fields are changed only through
        // addField and renameField, which keep the role, name and analyzer maps in step
        return Collections.unmodifiableCollection(fieldsByName.values());
    }

    /**
     * @param fieldName the role of the field
     * @return the field registered for the role, or null if there is none
     */
    public FieldDefinition getField (FieldRole fieldName) {
        return fieldsByRole.get(fieldName);
    }

    /**
     * @param fieldName the lucene field name
     * @return the field registered under the name, or null if there is none
     */
    public FieldDefinition getField (String fieldName) {
        return fieldsByName.get(fieldName);
    }

    /**
     * @return the name of the field registered for {@link FieldRole#XML_TEXT}, or the empty string
     * if no field is registered for that role
     */
    public String getDefaultFieldName () {
        FieldDefinition textField = fieldsByRole.get(FieldRole.XML_TEXT);
        if (textField != null) {
            return textField.getName();
        }
        return "";
    }

    /**
     * @param option an option bit constant
     * @return whether the option is set
     */
    public boolean isOption (int option) {
        return (options & option) != 0;
    }

    /** @return whether any of the options covered by the {@link #INDEXES} mask is set */
    public boolean isIndexingEnabled() {
        return (options & INDEXES) != 0;
    }

    /** @return a read-only view of the namespace mappings */
    public Map<String,String> getNamespaceMap () {
        return Collections.unmodifiableMap(namespaceMap);
    }

    /**
     * Defines (or redefines) a namespace mapping.
     * @param prefix the key under which the mapping is stored
     * @param namespaceURI the namespace uri to associate with the prefix
     */
    public void defineNamespaceMapping (String prefix, String namespaceURI) {
        namespaceMap.put(prefix, namespaceURI);
    }

    // The accessors below read the built-in field definitions directly, whether or not the
    // corresponding field was registered by the options in effect.

    /** @return the current name of the URI field */
    public String getUriFieldName () {
        return URI.getName();
    }

    /** @return the current name of the XML document storage field */
    public String getXmlFieldName () {
        return XML_STORE.getName();
    }

    /** @return the current name of the full text field */
    public String getTextFieldName () {
        return XML_TEXT.getName();
    }

    /** @return the current name of the element text field */
    public String getElementTextFieldName () {
        return ELEMENT_TEXT.getName();
    }

    /** @return the current name of the attribute text field */
    public String getAttributeTextFieldName () {
        return ATTRIBUTE_TEXT.getName();
    }

    /**
     * @param clarkName the name of an element in clark-notation: {namespace}local-name, or simply local-name
     * if the element name is in no namespace.
     * @return the explicitly-specified visibility of the element name, or null if the element has the default
     * visibility.
     */
    public ElementVisibility getElementVisibility (String clarkName) {
        return eltVis.get(clarkName);
    }

    /** sets the visibility of elements with the given name
     * @param clarkName the name of an element in clark-notation: {namespace}local-name, or simply local-name
     * if the element name is in no namespace.
     * @param vis the visibility of the element's content from the perspective of containing elements.
     */
    public void setElementVisibility (String clarkName, ElementVisibility vis) {
        eltVis.put(clarkName, vis);
    }

    /** @return the visibility of elements not explicitly specified using setElementVisibility.
     * Always {@link ElementVisibility#OPAQUE}.
     */
    public ElementVisibility getDefaultVisibility() {
        return defVis;
    }

    /*
    public void setDefaultVisibility(ElementVisibility vis) {
        this.defVis = vis;
    }
    */

    /** @return a read-only view of the explicitly-specified element visibilities, keyed by clark name */
    public Map<String,ElementVisibility> getVisibilityMap () {
        return Collections.unmodifiableMap(eltVis);
    }

}

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */