/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.search.*;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import java.util.Map;
import java.util.List;
import java.util.ArrayList;
import java.io.IOException;
import java.io.StringReader;
/** <code>TextField</code> is the basic type for configurable text analysis.
* Analyzers for field types using this implementation should be defined in the schema.
*
*/
public class TextField extends FieldType {
protected boolean autoGeneratePhraseQueries;
/**
* Analyzer set by schema for text types to use when searching fields
* of this type, subclasses can set analyzer themselves or override
* getAnalyzer()
* This analyzer is used to process wildcard, prefix, regex and other multiterm queries. It
* assembles a list of tokenizer +filters that "make sense" for this, primarily accent folding and
* lowercasing filters, and charfilters.
*
* @see #getMultiTermAnalyzer
* @see #setMultiTermAnalyzer
*/
protected Analyzer multiTermAnalyzer=null;
@Override
protected void init(IndexSchema schema, Map<String,String> args) {
properties |= TOKENIZED;
if (schema.getVersion()> 1.1f) properties &= ~OMIT_TF_POSITIONS;
if (schema.getVersion() > 1.3f) {
autoGeneratePhraseQueries = false;
} else {
autoGeneratePhraseQueries = true;
}
String autoGeneratePhraseQueriesStr = args.remove("autoGeneratePhraseQueries");
if (autoGeneratePhraseQueriesStr != null)
autoGeneratePhraseQueries = Boolean.parseBoolean(autoGeneratePhraseQueriesStr);
super.init(schema, args);
}
/**
* Returns the Analyzer to be used when searching fields of this type when mult-term queries are specified.
* <p>
* This method may be called many times, at any time.
* </p>
* @see #getAnalyzer
*/
public Analyzer getMultiTermAnalyzer() {
return multiTermAnalyzer;
}
public void setMultiTermAnalyzer(Analyzer analyzer) {
this.multiTermAnalyzer = analyzer;
}
public boolean getAutoGeneratePhraseQueries() {
return autoGeneratePhraseQueries;
}
@Override
public SortField getSortField(SchemaField field, boolean reverse) {
/* :TODO: maybe warn if isTokenized(), but doesn't use LimitTokenCountFilter in it's chain? */
return getStringSort(field, reverse);
}
@Override
public void write(TextResponseWriter writer, String name, IndexableField f) throws IOException {
writer.writeStr(name, f.stringValue(), true);
}
@Override
public Query getFieldQuery(QParser parser, SchemaField field, String externalVal) {
return parseFieldQuery(parser, getQueryAnalyzer(), field.getName(), externalVal);
}
@Override
public Object toObject(SchemaField sf, BytesRef term) {
return term.utf8ToString();
}
@Override
public void setAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}
@Override
public void setQueryAnalyzer(Analyzer analyzer) {
this.queryAnalyzer = analyzer;
}
@Override
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
Analyzer multiAnalyzer = getMultiTermAnalyzer();
BytesRef lower = analyzeMultiTerm(field.getName(), part1, multiAnalyzer);
BytesRef upper = analyzeMultiTerm(field.getName(), part2, multiAnalyzer);
return new TermRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive);
}
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
if (part == null) return null;
TokenStream source;
try {
source = analyzerIn.tokenStream(field, new StringReader(part));
source.reset();
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
}
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
try {
if (!source.incrementToken())
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part);
termAtt.fillBytesRef();
if (source.incrementToken())
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"error analyzing range part: " + part, e);
}
try {
source.end();
source.close();
} catch (IOException e) {
throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
}
return BytesRef.deepCopyOf(bytes);
}
static Query parseFieldQuery(QParser parser, Analyzer analyzer, String field, String queryText) {
int phraseSlop = 0;
// most of the following code is taken from the Lucene QueryParser
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
TokenStream source;
try {
source = analyzer.tokenStream(field, new StringReader(queryText));
source.reset();
} catch (IOException e) {
throw new RuntimeException("Unable to initialize TokenStream to analyze query text", e);
}
CachingTokenFilter buffer = new CachingTokenFilter(source);
CharTermAttribute termAtt = null;
PositionIncrementAttribute posIncrAtt = null;
int numTokens = 0;
buffer.reset();
if (buffer.hasAttribute(CharTermAttribute.class)) {
termAtt = buffer.getAttribute(CharTermAttribute.class);
}
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
}
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
boolean hasMoreTokens = false;
if (termAtt != null) {
try {
hasMoreTokens = buffer.incrementToken();
while (hasMoreTokens) {
numTokens++;
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
if (positionIncrement != 0) {
positionCount += positionIncrement;
} else {
severalTokensAtSamePosition = true;
}
hasMoreTokens = buffer.incrementToken();
}
} catch (IOException e) {
// ignore
}
}
try {
// rewind the buffer stream
buffer.reset();
// close original stream - all tokens buffered
source.close();
}
catch (IOException e) {
// ignore
}
if (numTokens == 0)
return null;
else if (numTokens == 1) {
String term = null;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
// return newTermQuery(new Term(field, term));
return new TermQuery(new Term(field, term));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
// BooleanQuery q = newBooleanQuery(true);
BooleanQuery q = new BooleanQuery(true);
for (int i = 0; i < numTokens; i++) {
String term = null;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
// Query currentQuery = newTermQuery(new Term(field, term));
Query currentQuery = new TermQuery(new Term(field, term));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
}
else {
// phrase query:
// MultiPhraseQuery mpq = newMultiPhraseQuery();
MultiPhraseQuery mpq = new MultiPhraseQuery();
mpq.setSlop(phraseSlop);
List multiTerms = new ArrayList();
int position = -1;
for (int i = 0; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
if (positionIncrement > 0 && multiTerms.size() > 0) {
mpq.add((Term[])multiTerms.toArray(new Term[multiTerms.size()]),position);
multiTerms.clear();
}
position += positionIncrement;
multiTerms.add(new Term(field, term));
}
mpq.add((Term[])multiTerms.toArray(new Term[multiTerms.size()]),position);
return mpq;
}
}
else {
// PhraseQuery pq = newPhraseQuery();
PhraseQuery pq = new PhraseQuery();
pq.setSlop(phraseSlop);
int position = -1;
for (int i = 0; i < numTokens; i++) {
String term = null;
int positionIncrement = 1;
try {
boolean hasNext = buffer.incrementToken();
assert hasNext == true;
term = termAtt.toString();
if (posIncrAtt != null) {
positionIncrement = posIncrAtt.getPositionIncrement();
}
} catch (IOException e) {
// safe to ignore, because we know the number of tokens
}
position += positionIncrement;
pq.add(new Term(field, term),position);
}
return pq;
}
}
}
}