package org.apache.solr.handler.component;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.TermVectorParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Return term vectors for the documents in a query result set.
 * <p/>
 * Info available:
 * term, frequency, position, offset, IDF.
 * <p/>
 * <b>Note</b> Returning IDF can be expensive.
 */
public class TermVectorComponent extends SearchComponent implements SolrCoreAware {

  public static final String COMPONENT_NAME = "tv";

  protected NamedList initParams;

  public static final String TERM_VECTORS = "termVectors";

  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }

    NamedList termVectors = new NamedList();
    rb.rsp.add(TERM_VECTORS, termVectors);
    FieldOptions allFields = new FieldOptions();
    //figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    //boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
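    // Illustrative example only (added commentary, not part of the original source):
    // given the parameter names defined by COMPONENT_NAME and TermVectorParams, a
    // request such as
    //   ...&tv=true&tv.tf=true&tv.positions=true&tv.offsets=true&tv.df=true
    // enables this component and asks for term frequencies, positions, offsets and
    // document frequency for the fields named in tv.fl (falling back to fl below).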

    //short cut to all values.
    boolean all = params.getBool(TermVectorParams.ALL, false);
    if (all == true) {
      allFields.termFreq = true;
      allFields.positions = true;
      allFields.offsets = true;
      allFields.docFreq = true;
      allFields.tfIdf = true;
    }

    String fldLst = params.get(TermVectorParams.FIELDS);
    if (fldLst == null) {
      fldLst = params.get(CommonParams.FL);
    }

    //use this to validate our fields
    IndexSchema schema = rb.req.getSchema();
    //Build up our per field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList warnings = new NamedList();
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    //we have specific fields to retrieve
    if (fldLst != null) {
      String[] fields = SolrPluginUtils.split(fldLst);
      for (String field : fields) {
        SchemaField sf = schema.getFieldOrNull(field);
        if (sf != null) {
          if (sf.storeTermVector()) {
            FieldOptions option = fieldOptions.get(field);
            if (option == null) {
              option = new FieldOptions();
              option.fieldName = field;
              fieldOptions.put(field, option);
            }
            //get the per field mappings
            option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
            option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
            option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
            //Validate these are even an option
            option.positions = params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
            if (option.positions == true && sf.storeTermPositions() == false) {
              noPos.add(field);
            }
            option.offsets = params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
            if (option.offsets == true && sf.storeTermOffsets() == false) {
              noOff.add(field);
            }
          } else { //field doesn't have term vectors
            noTV.add(field);
          }
        } else {
          //field doesn't exist
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
        }
      }
    } //else, deal with all fields

    boolean hasWarnings = false;
    if (noTV.isEmpty() == false) {
      warnings.add("noTermVectors", noTV);
      hasWarnings = true;
    }
    if (noPos.isEmpty() == false) {
      warnings.add("noPositions", noPos);
      hasWarnings = true;
    }
    if (noOff.isEmpty() == false) {
      warnings.add("noOffsets", noOff);
      hasWarnings = true;
    }
    if (hasWarnings == true) {
      termVectors.add("warnings", warnings);
    }

    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && docIds.isEmpty() == false) {
      iter = docIds.iterator();
    } else {
      DocList list = listAndSet.docList;
      iter = list.iterator();
    }
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getReader();
    //the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors
    SchemaField keyField = schema.getUniqueKeyField();
    String uniqFieldName = null;
    if (keyField != null) {
      uniqFieldName = keyField.getName();
    }
    //Only load the id field to get the uniqueKey of that field
    SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(
        Collections.singleton(uniqFieldName),
        Collections.<String>emptySet());
    TVMapper mapper = new TVMapper(reader);
    mapper.fieldOptions = allFields; //this will only stay set if fieldOptions.isEmpty() (in other words, only if the user didn't set any fields)
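    // Sketch of the structure built by the loop below (added commentary; element
    // names are taken from the add() calls that follow). For each document the
    // response gains roughly:
    //   "doc-<docId>" -> { "uniqueKey" -> <key value>,
    //                      <fieldName> -> { <term> -> { tf, offsets, positions, df, tf-idf } } }
    // with only the entries that were actually requested and actually stored.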
    while (iter.hasNext()) {
      Integer docId = iter.next();
      NamedList docNL = new NamedList();
      mapper.docNL = docNL;
      termVectors.add("doc-" + docId, docNL);

      if (keyField != null) {
        Document document = reader.document(docId, fieldSelector);
        Fieldable uniqId = document.getField(uniqFieldName);
        String uniqVal = null;
        if (uniqId != null) {
          uniqVal = keyField.getType().storedToReadable(uniqId);
        }
        if (uniqVal != null) {
          docNL.add("uniqueKey", uniqVal);
          termVectors.add("uniqueKeyFieldName", uniqFieldName);
        }
      }

      if (fieldOptions.isEmpty() == false) {
        for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
          mapper.fieldOptions = entry.getValue();
          reader.getTermFreqVector(docId, entry.getKey(), mapper);
        }
      } else {
        //deal with all fields by using the allFieldMapper
        reader.getTermFreqVector(docId, mapper);
      }
    }
  }

  private List<Integer> getInts(String[] vals) {
    List<Integer> result = null;
    if (vals != null && vals.length > 0) {
      result = new ArrayList<Integer>(vals.length);
      for (int i = 0; i < vals.length; i++) {
        try {
          result.add(new Integer(vals[i]));
        } catch (NumberFormatException e) {
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
        }
      }
    }
    return result;
  }

  @Override
  public int distributedProcess(ResponseBuilder rb) throws IOException {
    int result = ResponseBuilder.STAGE_DONE;
    if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
      //Go ask each shard for its vectors
      // for each shard, collect the documents for that shard.
      HashMap<String, Collection<ShardDoc>> shardMap = new HashMap<String, Collection<ShardDoc>>();
      for (ShardDoc sdoc : rb.resultIds.values()) {
        Collection<ShardDoc> shardDocs = shardMap.get(sdoc.shard);
        if (shardDocs == null) {
          shardDocs = new ArrayList<ShardDoc>();
          shardMap.put(sdoc.shard, shardDocs);
        }
        shardDocs.add(sdoc);
      }
      // Now create a request for each shard to retrieve the stored fields
      for (Collection<ShardDoc> shardDocs : shardMap.values()) {
        ShardRequest sreq = new ShardRequest();
        sreq.purpose = ShardRequest.PURPOSE_GET_FIELDS;
        sreq.shards = new String[]{shardDocs.iterator().next().shard};
        sreq.params = new ModifiableSolrParams();
        // add original params
        sreq.params.add(rb.req.getParams());
        sreq.params.remove(CommonParams.Q); //remove the query
        ArrayList<String> ids = new ArrayList<String>(shardDocs.size());
        for (ShardDoc shardDoc : shardDocs) {
          ids.add(shardDoc.id.toString());
        }
        sreq.params.add(TermVectorParams.DOC_IDS, StrUtils.join(ids, ','));
        rb.addRequest(this, sreq);
      }
      result = ResponseBuilder.STAGE_DONE;
    }
    return result;
  }
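  /**
   * Orientation note (added commentary): TVMapper is the callback Lucene drives
   * while loading a document's term vectors. For each vectorized field Lucene
   * first calls {@link #setExpectations}, then calls {@link #map} once per term;
   * the mapper writes the requested information straight into the response
   * NamedList instead of materializing an intermediate term vector object.
   */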
  private static class TVMapper extends TermVectorMapper {
    private IndexReader reader;
    private NamedList docNL;
    //needs to be set for each new field
    FieldOptions fieldOptions;

    //internal vars not passed in by construction
    private boolean useOffsets, usePositions;
    //private Map<String, Integer> idfCache;
    private NamedList fieldNL;
    private Term currentTerm;

    public TVMapper(IndexReader reader) {
      this.reader = reader;
    }

    public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
      NamedList termInfo = new NamedList();
      fieldNL.add(term.utf8ToString(), termInfo);
      if (fieldOptions.termFreq == true) {
        termInfo.add("tf", frequency);
      }
      if (useOffsets == true) {
        NamedList theOffsets = new NamedList();
        termInfo.add("offsets", theOffsets);
        for (int i = 0; i < offsets.length; i++) {
          TermVectorOffsetInfo offset = offsets[i];
          theOffsets.add("start", offset.getStartOffset());
          theOffsets.add("end", offset.getEndOffset());
        }
      }
      if (usePositions == true) {
        NamedList positionsNL = new NamedList();
        for (int i = 0; i < positions.length; i++) {
          positionsNL.add("position", positions[i]);
        }
        termInfo.add("positions", positionsNL);
      }
      if (fieldOptions.docFreq == true) {
        termInfo.add("df", getDocFreq(term));
      }
      if (fieldOptions.tfIdf == true) {
        //note: this value is tf divided by df, not the classic tf * log(N/df) weighting
        double tfIdfVal = ((double) frequency) / getDocFreq(term);
        termInfo.add("tf-idf", tfIdfVal);
      }
    }

    private int getDocFreq(BytesRef term) {
      int result = 1;
      currentTerm = currentTerm.createTerm(term);
      try {
        Terms terms = MultiFields.getTerms(reader, currentTerm.field());
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator();
          if (termsEnum.seek(term) == TermsEnum.SeekStatus.FOUND) {
            result = termsEnum.docFreq();
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return result;
    }

    public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
      //currentTerm is needed by getDocFreq(), which backs both the df and tf-idf options
      if ((fieldOptions.docFreq == true || fieldOptions.tfIdf == true) && reader != null) {
        this.currentTerm = new Term(field);
      }
      useOffsets = storeOffsets && fieldOptions.offsets;
      usePositions = storePositions && fieldOptions.positions;
      fieldNL = new NamedList();
      docNL.add(field, fieldNL);
    }

    @Override
    public boolean isIgnoringPositions() {
      // if we are not interested in positions, then return true telling Lucene to skip loading them
      return fieldOptions.positions == false;
    }

    @Override
    public boolean isIgnoringOffsets() {
      // if we are not interested in offsets, then return true telling Lucene to skip loading them
      return fieldOptions.offsets == false;
    }
  }

  public void prepare(ResponseBuilder rb) throws IOException {
  }

  //////////////////////// NamedListInitializedPlugin methods //////////////////////

  @Override
  public void init(NamedList args) {
    super.init(args);
    this.initParams = args;
  }

  public void inform(SolrCore core) {
  }

  public String getVersion() {
    return "$Revision$";
  }

  public String getSourceId() {
    return "$Id:$";
  }

  public String getSource() {
    return "$Revision:$";
  }

  public String getDescription() {
    return "A Component for working with Term Vectors";
  }
}

class FieldOptions {
  String fieldName;
  boolean termFreq, positions, offsets, docFreq, tfIdf;
}
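// Added usage note (illustrative only; the names "tvComponent" and "/tvrh" are
// examples, not taken from this file): a SearchComponent like this one is
// typically wired into solrconfig.xml along these lines:
//
//   <searchComponent name="tvComponent"
//                    class="org.apache.solr.handler.component.TermVectorComponent"/>
//   <requestHandler name="/tvrh" class="org.apache.solr.handler.component.SearchHandler">
//     <lst name="defaults">
//       <bool name="tv">true</bool>
//     </lst>
//     <arr name="last-components">
//       <str>tvComponent</str>
//     </arr>
//   </requestHandler>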