package lux.solr; import static lux.index.IndexConfiguration.*; import java.net.MalformedURLException; import java.net.URL; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.ArrayBlockingQueue; import lux.Compiler; import lux.exception.LuxException; import lux.index.FieldRole; import lux.index.IndexConfiguration; import lux.index.XmlIndexer; import lux.index.analysis.DefaultAnalyzer; import lux.index.analysis.WhitespaceGapAnalyzer; import lux.index.field.FieldDefinition; import lux.index.field.FieldDefinition.Type; import lux.index.field.XPathField; import lux.index.field.XmlTextField; import net.sf.saxon.s9api.Serializer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.PluginInfo; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrInfoMBean; import org.apache.solr.schema.BinaryField; import org.apache.solr.schema.CopyField; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.solr.schema.StrField; import org.apache.solr.schema.TextField; import org.apache.solr.schema.TrieIntField; import org.apache.solr.schema.TrieLongField; import org.apache.solr.update.processor.UpdateRequestProcessorChain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Wraps a {@link IndexConfiguration}, adding field definitions from information in Solr's configuration files: * solrconfig.xml and schema.xml */ public class SolrIndexConfig implements SolrInfoMBean { private static final String SOURCE_URL = "https://github.com/msokolov/lux"; private final IndexConfiguration indexConfig; private NamedList<String> xpathFieldConfig; private Compiler compiler; private ArrayBlockingQueue<XmlIndexer> indexerPool; private ArrayBlockingQueue<Serializer> serializerPool; private IndexSchema schema; private final Logger logger; public SolrIndexConfig (final IndexConfiguration indexConfig, NamedList<?> args) { this.indexConfig = indexConfig; indexerPool = new ArrayBlockingQueue<XmlIndexer>(8); serializerPool = new ArrayBlockingQueue<Serializer>(8); logger = LoggerFactory.getLogger(getClass()); if (args != null) { applySolrConfig(args); } compiler = new Compiler (indexConfig); } public Compiler getCompiler () { return compiler; } public XmlIndexer checkoutXmlIndexer () { // In tests it didn't seem to make any appreciable difference whether this // pool was present or not, but it salves my conscience XmlIndexer indexer = indexerPool.poll(); if (indexer == null) { indexer = new XmlIndexer (indexConfig, compiler); logger.debug("created new XmlIndexer"); } return indexer; } public void returnXmlIndexer (XmlIndexer doneWithIt) { indexerPool.offer(doneWithIt); // if the pool was full, we just drop the indexer as garbage } public Serializer checkoutSerializer() { Serializer serializer = serializerPool.poll(); if (serializer == null) { serializer = new Serializer(); serializer.setOutputProperty(Serializer.Property.ENCODING, "utf-8"); serializer.setOutputProperty(Serializer.Property.BYTE_ORDER_MARK, "no"); serializer.setOutputProperty(Serializer.Property.OMIT_XML_DECLARATION, "yes"); } return serializer; } public void returnSerializer (Serializer doneWithIt) { serializerPool.offer(doneWithIt); // if the pool was full, we just drop the serializer } public static SolrIndexConfig registerIndexConfiguration (SolrCore core) { // Read the init args from the LuxUpdateProcessorFactory's configuration NamedList<?> initArgs = null; for (PluginInfo info : core.getSolrConfig().getPluginInfos(UpdateRequestProcessorChain.class.getName())) { // FIXME: if there are multiple processors, we prefer the 'default' one, otherwise // just take the last? This is a bit lame, but it provides back-compat. We should at least // raise a warning if this is ambiguous initArgs = info.initArgs; if ("true".equals(info.attributes.get("default"))) { break; } } String configName = SolrIndexConfig.class.getName(); SolrInfoMBean configBean = core.getInfoRegistry().get(configName); SolrIndexConfig indexConfig; if (configBean != null) { indexConfig = (SolrIndexConfig) configBean; } else { int options = (INDEX_PATHS | INDEX_FULLTEXT | STORE_DOCUMENT | SOLR); indexConfig = SolrIndexConfig.makeIndexConfiguration(options, initArgs, configName); indexConfig.inform(core); core.getInfoRegistry().put(configName, indexConfig); } return indexConfig; } public static SolrIndexConfig makeIndexConfiguration (int options, final NamedList<?> args, String configName) { if (args != null) { if ("yes".equals(args.get("strip-namespaces"))) { options |= STRIP_NAMESPACES; } if ("yes".equals(args.get("namespace-aware"))) { options |= NAMESPACE_AWARE; } Object format = args.get("xml-format"); if (format != null) { if ("tiny".equals(format)) { options |= STORE_TINY_BINARY; } else if (! "xml".equals(format)) { throw new LuxException("invalid xml-format: " + format + ", must be one of: (xml,tiny)"); } } } IndexConfiguration indexConfig = new IndexConfiguration(options); return new SolrIndexConfig(indexConfig, args); } public void applyFieldConfiguration (NamedList<String> fields) { if (fields != null) { xpathFieldConfig = new NamedList<String>(); for (Entry<String,String> f : fields) { xpathFieldConfig.add(f.getKey(), f.getValue()); } } } private void applySolrConfig (@SuppressWarnings("rawtypes") final NamedList args) { NamedList<?> aliases = (NamedList<?>) args.get ("fieldAliases"); if (aliases == null) { return; } for (int i = 0; i < aliases.size(); i++) { String name = aliases.getName(i); Object value = aliases.getVal(i); if ("xmlFieldName".equals(name)) { indexConfig.renameField(indexConfig.getField(FieldRole.XML_STORE), value.toString()); logger.info("XML storage field name: {}", value.toString()); } else if ("uriFieldName".equals(name)) { logger.info("URI field name: {}", value.toString()); indexConfig.renameField(indexConfig.getField(FieldRole.URI), value.toString()); } else if ("textFieldName".equals(name)) { logger.info("XML text field name: {}", value.toString()); indexConfig.renameField(indexConfig.getField(FieldRole.XML_TEXT), value.toString()); } } @SuppressWarnings("unchecked") NamedList<String> fields = (NamedList<String>) args.get("fields"); if (fields != null) { applyFieldConfiguration(fields); } @SuppressWarnings("unchecked") NamedList<String> namespaces = (NamedList<String>) args.get("namespaces"); if (namespaces != null) { for (Entry<String,String> ns : namespaces) { indexConfig.defineNamespaceMapping(ns.getKey(), ns.getValue()); } } } public void inform(SolrCore core) { schema = core.getLatestSchema(); // XML_STORE is not listed explicitly by the indexer informField (indexConfig.getField(FieldRole.XML_STORE), core); // This must be run before informField() registers default analyzers with the Schema registerXmlTextFields(); for (FieldDefinition xmlField : indexConfig.getFields()) { informField (xmlField, core); } if (xpathFieldConfig != null) { addXPathFields(); } SchemaField uniqueKeyField = schema.getUniqueKeyField(); if (uniqueKeyField == null) { logger.error("{} schema does not define any unique field", core.getName()); } else if (! uniqueKeyField.getName().equals(indexConfig.getFieldName(FieldRole.URI))) { logger.error("{} schema defines a different unique field than the uri field declared in lux configuration", core.getName()); } // must call this after making changes to the field map: schema.refreshAnalyzers(); } private void informField (FieldDefinition xmlField, SolrCore core) { Map<String,SchemaField> schemaFields = schema.getFields(); Map<String,FieldType> fieldTypes = schema.getFieldTypes(); String fieldName = xmlField.getName(); if (schemaFields.containsKey(fieldName) && xmlField.getType() != Type.TOKENS) { // The Solr schema has a definition for this field, but it's not a TOKENS field: // We're only interested in TOKENS fields here; these need to install their own special field type since they wrap the // analyzer defined by the schema return; } // look up the type of this field using the mapping in this class FieldType fieldType = getFieldType(xmlField); if (! fieldTypes.containsKey(fieldType.getTypeName())) { // The Solr schema does not define this field type, so add it logger.info("{} defining fieldType: {}", core.getName(), fieldType.getTypeName()); fieldTypes.put(fieldType.getTypeName(), fieldType); } else { fieldType = fieldTypes.get(fieldType.getTypeName()); } // Add the field to the schema logger.info(core.getName() + " defining field: {} of type {}", fieldName, fieldType.getTypeName()); schemaFields.put(fieldName, new SchemaField (fieldName, fieldType, xmlField.getSolrFieldProperties(), "")); } private void registerXmlTextFields() { String xmlFieldName = indexConfig.getFieldName(FieldRole.XML_TEXT); SchemaField schemaField = schema.getFieldOrNull(xmlFieldName); Analyzer xmlAnalyzer = null; Analyzer xmlQueryAnalyzer = null; if (schemaField != null) { xmlAnalyzer = schemaField.getType().getAnalyzer(); xmlQueryAnalyzer = schemaField.getType().getQueryAnalyzer(); if (xmlAnalyzer != null) { for (FieldRole role : new FieldRole [ ] { FieldRole.XML_TEXT, FieldRole.ELEMENT_TEXT, FieldRole.ATTRIBUTE_TEXT }) { FieldDefinition field = indexConfig.getField(role); field.setAnalyzer(xmlAnalyzer); // this analyzer is used when indexing field.setQueryAnalyzer(xmlQueryAnalyzer); // this analyzer is used when indexing indexConfig.getFieldAnalyzers().put(field.getName(), xmlQueryAnalyzer); // this analyzer is used when parsing queries } } } for (CopyField copyField : schema.getCopyFieldsList(xmlFieldName)) { // register fields copied from lux_text with the indexer so that we feed them an XdmNode SchemaField destination = copyField.getDestination(); Analyzer analyzer = destination.getType().getAnalyzer(); if (analyzer == null) { // can this happen?? what about the query analyzer? can that be null? if (xmlAnalyzer != null) { analyzer = xmlAnalyzer; // why would you copy it then? } else { analyzer = new DefaultAnalyzer(); } } // TODO: should there be additional element and attribute text fields as well? XmlTextField xmlCopyField = new XmlTextField (destination.getName(), analyzer); xmlCopyField.setQueryAnalyzer(analyzer); indexConfig.addField(xmlCopyField); } } /** Add the xpathFields to the indexConfig using information about the field drawn from the schema. */ private void addXPathFields() { for (Entry<String,String> f : xpathFieldConfig) { SchemaField field = schema.getField(f.getKey()); FieldType fieldType = field.getType(); if (fieldType == null) { throw new SolrException(ErrorCode.SERVER_ERROR, "Field " + f.getKey() + " declared in lux config, but not defined in schema"); } XPathField xpathField = new XPathField(f.getKey(), f.getValue(), fieldType.getAnalyzer(), field.stored() ? Store.YES : Store.NO, field); indexConfig.addField(xpathField); } } private FieldType getFieldType(FieldDefinition xmlField) { // TODO - we should store a field type name in XmlField and just look that up instead // of trying to infer from the analyzer Analyzer analyzer = xmlField.getAnalyzer(); String fieldName = xmlField.getName(); if (analyzer == null) { if (! (xmlField.isStored() == Store.YES)) { throw new SolrException(ErrorCode.BAD_REQUEST, "invalid xml field: " + fieldName + "; no analyzer and not stored"); } switch (xmlField.getType()) { case STRING: return new StoredStringField (); case INT: return new NamedIntField(); case LONG: return new NamedLongField(); case BYTES: return new NamedBinaryField(); default: throw new SolrException (ErrorCode.BAD_REQUEST, "invalid stored field: " + fieldName + " with type: " + xmlField.getType()); } } if (xmlField.getType() == Type.TOKENS) { return new FieldableField(xmlField); } if (analyzer == null || analyzer instanceof KeywordAnalyzer) { return new StringField(); } if (analyzer instanceof WhitespaceGapAnalyzer) { return new PathField (); } throw new SolrException(ErrorCode.BAD_REQUEST, "invalid xml field: " + fieldName + "; unknown analyzer type: " + analyzer); } public IndexSchema getSchema () { return schema; } // subclasses of built-in Solr field types exist purely so we can name them. // Is that actually necessary? class StoredStringField extends StrField { StoredStringField () { typeName = "lux_stored_string"; } } class NamedIntField extends TrieIntField { public NamedIntField() { typeName = "int"; } } class NamedLongField extends TrieLongField { public NamedLongField() { typeName = "long"; } } class NamedBinaryField extends BinaryField { public NamedBinaryField() { typeName = "binary"; } } class StringField extends StrField { StringField () { typeName = "string"; } } class PathField extends TextField { PathField () { typeName = "lux_text_ws"; setAnalyzer(new WhitespaceGapAnalyzer()); setQueryAnalyzer(new WhitespaceGapAnalyzer()); } /* * REVIEW: do we need this? @Override protected Field.Index getFieldIndex(SchemaField field, String internalVal) { return Field.Index.ANALYZED; } */ } /** * enable pass-through of a Fieldable to Solr; this enables analysis to be performed outside of Solr */ class FieldableField extends TextField { FieldableField (FieldDefinition xmlField) { typeName = xmlField.getName() + "-fieldable-type"; this.analyzer = xmlField.getAnalyzer(); this.queryAnalyzer = xmlField.getQueryAnalyzer(); properties &= ~STORED; properties |= INDEXED|TOKENIZED; } @Override public Field createField(SchemaField field, Object val, float boost) { return (Field) val; } } public IndexConfiguration getIndexConfig() { return indexConfig; } @Override public String getName() { return SolrIndexConfig.class.getName(); } @Override public String getVersion() { return "1.0"; } @Override public String getDescription() { return "Lux index configuration"; } @Override public Category getCategory() { return Category.OTHER; } @Override public String getSource() { return SOURCE_URL; } private static URL[] docs; @Override public URL[] getDocs() { if (docs == null) { try { docs = new URL [] { new URL(SOURCE_URL) }; } catch (MalformedURLException e) { } } return docs; } @Override public NamedList<?> getStatistics() { return null; } } /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */