package org.apache.lucene.facet.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.attributes.CategoryAttributesIterable;
import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.index.streaming.CategoryAttributesStream;
import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
import org.apache.lucene.facet.index.streaming.CategoryTokenizer;
import org.apache.lucene.facet.index.streaming.CountingListTokenizer;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A utility class which allows attachment of {@link CategoryPath}s or
 * {@link CategoryAttribute}s to a given document using a taxonomy.<br>
 * Construction could be done with either a given {@link FacetIndexingParams} or
 * the default implementation {@link DefaultFacetIndexingParams}.<br>
 * A CategoryDocumentBuilder can be reused by repeatedly setting the categories
 * and building the document. Categories are provided either as
 * {@link CategoryAttribute} elements through {@link #setCategories(Iterable)},
 * or as {@link CategoryPath} elements through
 * {@link #setCategoryPaths(Iterable)}. Each "set" call discards the fields
 * prepared by the previous one.
 * <p>
 * Note that both {@link #setCategories(Iterable)} and
 * {@link #setCategoryPaths(Iterable)} return this
 * {@link CategoryDocumentBuilder}, allowing the following pattern: {@code new
 * CategoryDocumentBuilder(taxonomy,
 * params).setCategories(categories).build(doc)}.
 * <p>
 * The token-stream chain used for each field is assembled from the protected
 * factory methods {@link #getParentsStream(CategoryAttributesStream)},
 * {@link #getCategoryListTokenizer(TokenStream)} and
 * {@link #getCategoryTokenizer(TokenStream)}, which subclasses may override.
 *
 * @lucene.experimental
 */
public class CategoryDocumentBuilder {

  /**
   * A {@link TaxonomyWriter} for adding categories and retrieving their
   * ordinals.
   */
  protected final TaxonomyWriter taxonomyWriter;

  /**
   * Parameters to be used when indexing categories.
   */
  protected final FacetIndexingParams indexingParams;

  /**
   * The fields prepared by the most recent call to
   * {@link #setCategories(Iterable)} (or {@link #setCategoryPaths(Iterable)});
   * they are added to the document during
   * {@link CategoryDocumentBuilder#build(Document)}.
   */
  protected final ArrayList<Field> fieldList = new ArrayList<Field>();

  /**
   * Maps a field name to the category attributes that are indexed under that
   * field. Rebuilt from scratch by {@link #fillCategoriesMap(Iterable)}.
   */
  protected Map<String, List<CategoryAttribute>> categoriesMap;

  /**
   * Creating a facets document builder with default facet indexing
   * parameters.<br>
   * See:
   * {@link #CategoryDocumentBuilder(TaxonomyWriter, FacetIndexingParams)}
   *
   * @param taxonomyWriter
   *            to which new categories will be added, as well as translating
   *            known categories to ordinals
   */
  public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter) {
    this(taxonomyWriter, new DefaultFacetIndexingParams());
  }

  /**
   * Creating a facets document builder with a given facet indexing parameters
   * object.<br>
   *
   * @param taxonomyWriter
   *            to which new categories will be added, as well as translating
   *            known categories to ordinals
   * @param params
   *            holds all parameters the indexing process should use such as
   *            category-list parameters
   */
  public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter,
      FacetIndexingParams params) {
    this.taxonomyWriter = taxonomyWriter;
    this.indexingParams = params;
    this.categoriesMap = new HashMap<String, List<CategoryAttribute>>();
  }

  /**
   * Set the categories of the document builder from an {@link Iterable} of
   * {@link CategoryPath} objects.
   *
   * @param categoryPaths
   *            An iterable of CategoryPath objects which holds the categories
   *            (facets) which will be added to the document at
   *            {@link #build(Document)}. If {@code null}, any previously
   *            prepared fields are discarded and nothing else is done.
   * @return This CategoryDocumentBuilder, to enable this one line call:
   *         {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}.
   *         {@link #setCategoryPaths(Iterable)}.{@link #build(Document)}.
   * @throws IOException If there is a low-level I/O error.
   */
  public CategoryDocumentBuilder setCategoryPaths(
      Iterable<CategoryPath> categoryPaths) throws IOException {
    if (categoryPaths == null) {
      // Nothing to wrap; just drop whatever the previous "set" call prepared.
      fieldList.clear();
      return this;
    }
    return setCategories(new CategoryAttributesIterable(categoryPaths));
  }

  /**
   * Set the categories of the document builder from an {@link Iterable} of
   * {@link CategoryAttribute} objects.
   *
   * @param categories
   *            An iterable of {@link CategoryAttribute} objects which holds
   *            the categories (facets) which will be added to the document at
   *            {@link #build(Document)}. If {@code null}, any previously
   *            prepared fields are discarded and nothing else is done.
   * @return This CategoryDocumentBuilder, to enable this one line call:
   *         {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}.
   *         {@link #setCategories(Iterable)}.{@link #build(Document)}.
   * @throws IOException If there is a low-level I/O error.
   */
  public CategoryDocumentBuilder setCategories(
      Iterable<CategoryAttribute> categories) throws IOException {
    // Fields from a previous "set" call must never leak into the next build.
    fieldList.clear();
    if (categories == null) {
      return this;
    }

    // get field-name to a list of facets mapping as different facets could
    // be added to different category-lists on different fields
    fillCategoriesMap(categories);

    // creates a different stream for each different field
    for (Entry<String, List<CategoryAttribute>> e : categoriesMap
        .entrySet()) {
      // create a category attributes stream for the array of facets
      CategoryAttributesStream categoryAttributesStream = new CategoryAttributesStream(
          e.getValue());

      // Set a suitable {@link TokenStream} using the parents stream,
      // followed by CategoryListTokenizer and CategoryTokenizer
      // composition (the ordering of the last two is not mandatory).
      //
      // The parents stream is deliberately kept typed as TokenStream:
      // getParentsStream is an overridable hook declared to return
      // TokenStream, and the next stage only requires a TokenStream, so
      // downcasting here would needlessly fail for subclasses that return
      // a different implementation.
      TokenStream parentsStream = getParentsStream(categoryAttributesStream);
      CategoryListTokenizer categoryListTokenizer = getCategoryListTokenizer(parentsStream);
      CategoryTokenizer stream = getCategoryTokenizer(categoryListTokenizer);

      // Finally creating a suitable field with stream and adding it to a
      // master field-list, used during the build process (see
      // build(Document)). The field is not stored and omits norms.
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      ft.setOmitNorms(true);
      fieldList.add(new Field(e.getKey(), stream, ft));
    }

    return this;
  }

  /**
   * Get a stream of categories which includes the parents, according to
   * policies defined in indexing parameters.
   *
   * @param categoryAttributesStream
   *            The input stream
   * @return The parents stream.
   * @see OrdinalPolicy OrdinalPolicy (for policy of adding category tokens for parents)
   * @see PathPolicy PathPolicy (for policy of adding category <b>list</b> tokens for parents)
   */
  protected TokenStream getParentsStream(
      CategoryAttributesStream categoryAttributesStream) {
    return new CategoryParentsStream(categoryAttributesStream,
        taxonomyWriter, indexingParams);
  }

  /**
   * Fills the categories mapping between a field name and a list of
   * categories that belongs to it according to this builder's
   * {@link FacetIndexingParams} object. The mapping is cleared first, and
   * every category is cloned before it is stored.
   *
   * @param categories
   *            Iterable over the category attributes
   * @throws IOException If there is a low-level I/O error.
   */
  protected void fillCategoriesMap(Iterable<CategoryAttribute> categories)
      throws IOException {
    categoriesMap.clear();

    // for-each category
    for (CategoryAttribute category : categories) {
      // extracting the field-name to which this category belongs
      String fieldName = indexingParams.getCategoryListParams(
          category.getCategoryPath()).getTerm().field();

      // getting the list of categories which belongs to that field
      List<CategoryAttribute> list = categoriesMap.get(fieldName);

      // if no such list exists
      if (list == null) {
        // adding a new one to the map
        list = new ArrayList<CategoryAttribute>();
        categoriesMap.put(fieldName, list);
      }

      // adding a copy of the category to the list, so the stored entry is
      // a distinct object from the one handed out by the iterable
      list.add(category.clone());
    }
  }

  /**
   * Get a category list tokenizer (or a series of such tokenizers) to create
   * the <b>category list tokens</b>. This implementation delegates to
   * {@link #getCountingListTokenizer(TokenStream)}.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute} with the
   *            relevant data.
   * @return The category list tokenizer (or series of tokenizers) to be used
   *         in creating category list tokens.
   */
  protected CategoryListTokenizer getCategoryListTokenizer(
      TokenStream categoryStream) {
    return getCountingListTokenizer(categoryStream);
  }

  /**
   * Get a {@link CountingListTokenizer} for creating counting list token.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute}s with the
   *            relevant data.
   * @return A counting list tokenizer to be used in creating counting list
   *         token.
   */
  protected CountingListTokenizer getCountingListTokenizer(
      TokenStream categoryStream) {
    return new CountingListTokenizer(categoryStream, indexingParams);
  }

  /**
   * Get a {@link CategoryTokenizer} to create the <b>category tokens</b>.
   * This method can be overridden for adding more attributes to the category
   * tokens.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute} with the
   *            relevant data.
   * @return The {@link CategoryTokenizer} to be used in creating category
   *         tokens.
   * @throws IOException If there is a low-level I/O error.
   */
  protected CategoryTokenizer getCategoryTokenizer(TokenStream categoryStream)
      throws IOException {
    return new CategoryTokenizer(categoryStream, indexingParams);
  }

  /**
   * Adds the fields created in one of the "set" methods to the document.
   *
   * @param doc
   *            the document to which the prepared fields are added
   * @return the same {@code doc} instance that was passed in
   */
  public Document build(Document doc) {
    for (Field f : fieldList) {
      doc.add(f);
    }
    return doc;
  }
}