/**
* Copyright 2014 National University of Ireland, Galway.
*
* This file is part of the SIREn project. Project and contact information:
*
* https://github.com/rdelbru/SIREn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sindice.siren.solr.schema;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.schema.FieldProperties;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaAware;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TextField;
import org.apache.solr.search.QParser;
import org.sindice.siren.index.codecs.siren10.Siren10AForPostingsFormat;
import org.sindice.siren.solr.analysis.DatatypeAnalyzerFilterFactory;
import org.sindice.siren.solr.analysis.PositionAttributeFilterFactory;
import org.sindice.siren.solr.analysis.SirenPayloadFilterFactory;
import org.xml.sax.InputSource;
/**
* <code>SirenField</code> is the basic type for configurable tree-based data
* analysis.
*
* <p>
*
* This field type relies on:
* <ul>
* <li> an index analyzer configuration (no query analyzer is required)
* <li> a datatype analyzers configuration using the parameter
* <code>datatypeConfig</code>
* </ul>
*
* <p>
*
* This field type enforces certain field properties
* by throwing a {@link SolrException} if the field type does not set properly
* the properties. By default all the properties are set properly, i.e.,
* a user should not modify these properties. This field type enforces also
* the <code>postingsFormat</code> to <code>Siren10Afor</code>. The list of
* enforced field properties are:
* <ul>
* <li> indexed = true
* <li> tokenized = true
* <li> omitNorm = true
* <li> multiValued = false
* <li> omitTermFreqAndPositions = false
* <li> omitPositions = false
* <li> termVectors = false
* </ul>
*
* <p>
*
* A {@link SchemaField} can overwrite these properties, however an exception
* will be thrown when converting it to a Lucene's
* {@link org.apache.lucene.document.FieldType} in
* {@link #createField(SchemaField, Object, float)}.
*
* <p>
*
* This field type extends {@link TextField} to have the
* {@link FieldProperties#OMIT_TF_POSITIONS} set to false by default.
*/
public class SirenField extends TextField implements SchemaAware {
private String datatypeAnalyzerConfigPath;
private final AtomicReference<SirenDatatypeAnalyzerConfig> datatypeConfigRef = new AtomicReference<SirenDatatypeAnalyzerConfig>();
@Override
protected void init(final IndexSchema schema, final Map<String,String> args) {
// first call TextField.init to set omitTermFreqAndPositions to false
super.init(schema, args);
this.checkFieldTypeProperties();
// initialise specific SIREn's properties
this.datatypeAnalyzerConfigPath = args.get("datatypeConfig");
args.remove("datatypeConfig");
if (datatypeAnalyzerConfigPath == null) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField types require a 'datatypeConfig' " +
"parameter: " + this.typeName);
}
// set the posting format
this.postingsFormat = Siren10AForPostingsFormat.NAME;
super.init(schema, args);
}
private void checkFieldTypeProperties() {
if (this.isMultiValued()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField types can not be multiValued: " +
this.typeName);
}
if (!this.hasProperty(FieldProperties.OMIT_NORMS)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField types must omit norms: " +
this.typeName);
}
if (this.hasProperty(FieldProperties.OMIT_TF_POSITIONS)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField types can not omit term frequencies " +
"and positions: " + this.typeName);
}
if (this.hasProperty(FieldProperties.OMIT_POSITIONS)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField types can not omit positions: " +
this.typeName);
}
if (this.hasProperty(FieldProperties.STORE_TERMVECTORS)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField types can not store term vectors: " +
this.typeName);
}
}
@Override
public IndexableField createField(final SchemaField field, final Object value,
final float boost) {
if (!field.indexed()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances must be indexed: " +
field.getName());
}
if (field.multiValued()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances can not be multivalued: " +
field.getName());
}
if (!field.omitNorms()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances must omit norms: " +
field.getName());
}
if (field.omitTermFreqAndPositions()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances must not omit term " +
"frequencies and positions: " + field.getName());
}
if (field.omitPositions()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances must not omit term " +
"positions: " + field.getName());
}
if (field.storeTermVector()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances can not store term vectors: " +
field.getName());
}
return super.createField(field, value, boost);
}
@Override
protected IndexableField createField(final String name, final String val,
final org.apache.lucene.document.FieldType type,
final float boost){
if (!type.indexed()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances must be indexed: " + name);
}
if (!type.tokenized()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances must be tokenised: " + name);
}
if (!type.omitNorms()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances must omit norms: " + name);
}
if (!type.indexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances must not omit term " +
"frequencies and positions: " + name);
}
if (type.storeTermVectors()) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"SirenField instances can not store term vectors: " +
name);
}
return super.createField(name, val, type, boost);
}
@Override
public SortField getSortField(final SchemaField field, final boolean reverse) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Unsupported operation. Can not sort on SIREn field: "
+ field.getName());
}
@Override
public void write(final TextResponseWriter writer, final String name, final IndexableField f)
throws IOException {
writer.writeStr(name, f.stringValue(), true);
}
@Override
public Query getFieldQuery(final QParser parser, final SchemaField field,
final String externalVal) {
// Not useful for now in SIREn
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Not implemented operation."
+ field.getName());
}
@Override
public Query getRangeQuery(final QParser parser, final SchemaField field,
final String part1, final String part2,
final boolean minInclusive, final boolean maxInclusive) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Unsupported operation. Can not do range on SIREn field: "
+ field.getName());
}
@Override
public Analyzer getMultiTermAnalyzer() {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Unsupported operation. Use getAnalyzer instead.");
}
@Override
public boolean getAutoGeneratePhraseQueries() {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Unsupported operation.");
}
@Override
public void setAnalyzer(final Analyzer analyzer) {
this.analyzer = analyzer;
}
@Override
public void setQueryAnalyzer(final Analyzer analyzer) {
// SirenField does not require a query analyzer
queryAnalyzer = null;
}
public Map<String, Datatype> getDatatypes() {
return this.datatypeConfigRef.get().getDatatypes();
}
/**
* Load the datatype analyzer config file specified by the schema.
* <p/>
* This should be called whenever the datatype analyzer configuration file changes.
*/
private void loadDatatypeConfig(final IndexSchema schema) {
InputStream is;
log.info("Loading datatype analyzer configuration file at " + datatypeAnalyzerConfigPath);
try {
is = schema.getResourceLoader().openResource(datatypeAnalyzerConfigPath);
} catch (final IOException e) {
log.error("Error loading datatype analyzer configuration file at " + datatypeAnalyzerConfigPath, e);
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}
try {
final SirenDatatypeAnalyzerConfig newConfig =
new SirenDatatypeAnalyzerConfig(schema.getResourceLoader(),
datatypeAnalyzerConfigPath, new InputSource(is),
schema.getDefaultLuceneMatchVersion());
log.info("Read new datatype analyzer configuration " + newConfig);
datatypeConfigRef.set(newConfig);
} finally {
if (is != null) {
try {
is.close();
} catch (final IOException ignored) {
}
}
}
}
/**
* When index schema is informed, load the datatype config and append the
* SIREn's filters to the tokenizer chain.
*/
@Override
public void inform(final IndexSchema schema) {
// load the datatypes
this.loadDatatypeConfig(schema);
// Append the SIREn's filters and update the index analyzer reference
this.setAnalyzer(this.appendSirenFilters(
this.getAnalyzer(),
this.datatypeConfigRef.get().getDatatypes(),
schema.getDefaultLuceneMatchVersion()));
// tell the {@link IndexSchema} to refresh its analyzers
schema.refreshAnalyzers();
}
/**
* Append the mandatory SIREn filters, i.e.,
* {@link DatatypeAnalyzerFilterFactory},
* {@link PositionAttributeFilterFactory} and
* {@link SirenPayloadFilterFactory}, to the tokenizer chain.
*/
private Analyzer appendSirenFilters(final Analyzer analyzer,
final Map<String, Datatype> datatypes,
final Version luceneDefaultVersion) {
if (!(analyzer instanceof TokenizerChain)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Invalid index analyzer '" + analyzer.getClass() + "' received");
}
final TokenizerChain chain = (TokenizerChain) analyzer;
// copy the existing list of token filters
final TokenFilterFactory[] old = chain.getTokenFilterFactories();
final TokenFilterFactory[] filterFactories = new TokenFilterFactory[old.length + 3];
System.arraycopy(old, 0, filterFactories, 0, old.length);
// append the datatype analyzer filter factory
final DatatypeAnalyzerFilterFactory datatypeFactory = new DatatypeAnalyzerFilterFactory(luceneDefaultVersion);
datatypeFactory.register(datatypes);
filterFactories[old.length] = datatypeFactory;
// append the position attribute filter factory
filterFactories[old.length + 1] = new PositionAttributeFilterFactory();
// append the siren payload filter factory
filterFactories[old.length + 2] = new SirenPayloadFilterFactory();
// create a new tokenizer chain with the updated list of filter factories
return new TokenizerChain(chain.getCharFilterFactories(),
chain.getTokenizerFactory(), filterFactories);
}
}