/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.handler.component; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.TermVectorParams; import org.apache.solr.common.util.Base64; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.DocList; import org.apache.solr.search.DocListAndSet; import org.apache.solr.search.ReturnFields; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SolrReturnFields; import org.apache.solr.util.SolrPluginUtils; import org.apache.solr.util.plugin.SolrCoreAware; /** * Return term vectors for the documents in a query result set. * <p> * Info available: * term, frequency, position, offset, payloads, IDF. * <p> * <b>Note</b> Returning IDF can be expensive. * * <pre class="prettyprint"> * <searchComponent name="tvComponent" class="solr.TermVectorComponent"/> * * <requestHandler name="/terms" class="solr.SearchHandler"> * <lst name="defaults"> * <bool name="tv">true</bool> * </lst> * <arr name="last-component"> * <str>tvComponent</str> * </arr> * </requestHandler></pre> * * */ public class TermVectorComponent extends SearchComponent implements SolrCoreAware { public static final String COMPONENT_NAME = "tv"; public static final String TERM_VECTORS = "termVectors"; private static final String TV_KEY_WARNINGS = "warnings"; protected NamedList initParams; /** * Helper method for determining the list of fields that we should * try to find term vectors on. * <p> * Does simple (non-glob-supporting) parsing on the * {@link TermVectorParams#FIELDS} param if specified, otherwise it returns * the concrete field values specified in {@link CommonParams#FL} -- * ignoring functions, transformers, or literals. * </p> * <p> * If "fl=*" is used, or neither param is specified, then <code>null</code> * will be returned. If the empty set is returned, it means the "fl" * specified consisted entirely of things that are not real fields * (ie: functions, transformers, partial-globs, score, etc...) and not * supported by this component. * </p> */ private Set<String> getFields(ResponseBuilder rb) { SolrParams params = rb.req.getParams(); String[] fldLst = params.getParams(TermVectorParams.FIELDS); if (null == fldLst || 0 == fldLst.length || (1 == fldLst.length && 0 == fldLst[0].length())) { // no tv.fl, parse the main fl ReturnFields rf = new SolrReturnFields (params.getParams(CommonParams.FL), rb.req); if (rf.wantsAllFields()) { return null; } Set<String> fieldNames = rf.getLuceneFieldNames(); return (null != fieldNames) ? fieldNames : // return empty set indicating no fields should be used Collections.<String>emptySet(); } // otherwise us the raw fldList as is, no special parsing or globs Set<String> fieldNames = new LinkedHashSet<>(); for (String fl : fldLst) { fieldNames.addAll(Arrays.asList(SolrPluginUtils.split(fl))); } return fieldNames; } @Override public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(COMPONENT_NAME, false)) { return; } NamedList<Object> termVectors = new NamedList<>(); rb.rsp.add(TERM_VECTORS, termVectors); IndexSchema schema = rb.req.getSchema(); SchemaField keyField = schema.getUniqueKeyField(); String uniqFieldName = null; if (keyField != null) { uniqFieldName = keyField.getName(); } FieldOptions allFields = new FieldOptions(); //figure out what options we have, and try to get the appropriate vector allFields.termFreq = params.getBool(TermVectorParams.TF, false); allFields.positions = params.getBool(TermVectorParams.POSITIONS, false); allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false); allFields.payloads = params.getBool(TermVectorParams.PAYLOADS, false); allFields.docFreq = params.getBool(TermVectorParams.DF, false); allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false); //boolean cacheIdf = params.getBool(TermVectorParams.IDF, false); //short cut to all values. if (params.getBool(TermVectorParams.ALL, false)) { allFields.termFreq = true; allFields.positions = true; allFields.offsets = true; allFields.payloads = true; allFields.docFreq = true; allFields.tfIdf = true; } //Build up our per field mapping Map<String, FieldOptions> fieldOptions = new HashMap<>(); NamedList<List<String>> warnings = new NamedList<>(); List<String> noTV = new ArrayList<>(); List<String> noPos = new ArrayList<>(); List<String> noOff = new ArrayList<>(); List<String> noPay = new ArrayList<>(); Set<String> fields = getFields(rb); if ( null != fields ) { //we have specific fields to retrieve, or no fields for (String field : fields) { // workaround SOLR-3523 if (null == field || "score".equals(field)) continue; // we don't want to issue warnings about the uniqueKey field // since it can cause lots of confusion in distributed requests // where the uniqueKey field is injected into the fl for merging final boolean fieldIsUniqueKey = field.equals(uniqFieldName); SchemaField sf = schema.getFieldOrNull(field); if (sf != null) { if (sf.storeTermVector()) { FieldOptions option = fieldOptions.get(field); if (option == null) { option = new FieldOptions(); option.fieldName = field; fieldOptions.put(field, option); } //get the per field mappings option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq); option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq); option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf); //Validate these are even an option option.positions = params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions); if (option.positions && !sf.storeTermPositions() && !fieldIsUniqueKey){ noPos.add(field); } option.offsets = params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets); if (option.offsets && !sf.storeTermOffsets() && !fieldIsUniqueKey){ noOff.add(field); } option.payloads = params.getFieldBool(field, TermVectorParams.PAYLOADS, allFields.payloads); if (option.payloads && !sf.storeTermPayloads() && !fieldIsUniqueKey){ noPay.add(field); } } else {//field doesn't have term vectors if (!fieldIsUniqueKey) noTV.add(field); } } else { //field doesn't exist throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field); } } } //else, deal with all fields // NOTE: currently all types of warnings are schema driven, and guaranteed // to be consistent across all shards - if additional types of warnings // are added that might be different between shards, finishStage() needs // to be changed to account for that. if (!noTV.isEmpty()) { warnings.add("noTermVectors", noTV); } if (!noPos.isEmpty()) { warnings.add("noPositions", noPos); } if (!noOff.isEmpty()) { warnings.add("noOffsets", noOff); } if (!noPay.isEmpty()) { warnings.add("noPayloads", noPay); } if (warnings.size() > 0) { termVectors.add(TV_KEY_WARNINGS, warnings); } DocListAndSet listAndSet = rb.getResults(); List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS)); Iterator<Integer> iter; if (docIds != null && !docIds.isEmpty()) { iter = docIds.iterator(); } else { DocList list = listAndSet.docList; iter = list.iterator(); } SolrIndexSearcher searcher = rb.req.getSearcher(); IndexReader reader = searcher.getIndexReader(); //the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors //Only load the id field to get the uniqueKey of that //field final String finalUniqFieldName = uniqFieldName; final List<String> uniqValues = new ArrayList<>(); // TODO: is this required to be single-valued? if so, we should STOP // once we find it... final StoredFieldVisitor getUniqValue = new StoredFieldVisitor() { @Override public void stringField(FieldInfo fieldInfo, byte[] bytes) { uniqValues.add(new String(bytes, StandardCharsets.UTF_8)); } @Override public void intField(FieldInfo fieldInfo, int value) { uniqValues.add(Integer.toString(value)); } @Override public void longField(FieldInfo fieldInfo, long value) { uniqValues.add(Long.toString(value)); } @Override public Status needsField(FieldInfo fieldInfo) { return (fieldInfo.name.equals(finalUniqFieldName)) ? Status.YES : Status.NO; } }; while (iter.hasNext()) { Integer docId = iter.next(); NamedList<Object> docNL = new NamedList<>(); if (keyField != null) { reader.document(docId, getUniqValue); String uniqVal = null; if (uniqValues.size() != 0) { uniqVal = uniqValues.get(0); uniqValues.clear(); docNL.add("uniqueKey", uniqVal); termVectors.add(uniqVal, docNL); } } else { // support for schemas w/o a unique key, termVectors.add("doc-" + docId, docNL); } if ( null != fields ) { for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) { final String field = entry.getKey(); final Terms vector = reader.getTermVector(docId, field); if (vector != null) { TermsEnum termsEnum = vector.iterator(); mapOneVector(docNL, entry.getValue(), reader, docId, termsEnum, field); } } } else { // extract all fields final Fields vectors = reader.getTermVectors(docId); for (String field : vectors) { Terms terms = vectors.terms(field); if (terms != null) { TermsEnum termsEnum = terms.iterator(); mapOneVector(docNL, allFields, reader, docId, termsEnum, field); } } } } } private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException { NamedList<Object> fieldNL = new NamedList<>(); docNL.add(field, fieldNL); BytesRef text; PostingsEnum dpEnum = null; while((text = termsEnum.next()) != null) { String term = text.utf8ToString(); NamedList<Object> termInfo = new NamedList<>(); fieldNL.add(term, termInfo); final int freq = (int) termsEnum.totalTermFreq(); if (fieldOptions.termFreq == true) { termInfo.add("tf", freq); } int dpEnumFlags = 0; dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0; //payloads require offsets dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0; dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0; dpEnum = termsEnum.postings(dpEnum, dpEnumFlags); boolean atNextDoc = false; if (dpEnum != null) { dpEnum.nextDoc(); atNextDoc = true; } if (atNextDoc && dpEnumFlags != 0) { NamedList<Integer> positionsNL = null; NamedList<Number> theOffsets = null; NamedList<String> thePayloads = null; for (int i = 0; i < freq; i++) { final int pos = dpEnum.nextPosition(); if (fieldOptions.positions && pos >= 0) { if (positionsNL == null) { positionsNL = new NamedList<>(); termInfo.add("positions", positionsNL); } positionsNL.add("position", pos); } int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1; if (startOffset >= 0) { if (theOffsets == null) { theOffsets = new NamedList<>(); termInfo.add("offsets", theOffsets); } theOffsets.add("start", dpEnum.startOffset()); theOffsets.add("end", dpEnum.endOffset()); } BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null; if (payload != null) { if (thePayloads == null) { thePayloads = new NamedList<>(); termInfo.add("payloads", thePayloads); } thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length)); } } } int df = 0; if (fieldOptions.docFreq || fieldOptions.tfIdf) { df = reader.docFreq(new Term(field, text)); } if (fieldOptions.docFreq) { termInfo.add("df", df); } // TODO: this is not TF/IDF by anyone's definition! if (fieldOptions.tfIdf) { double tfIdfVal = ((double) freq) / df; termInfo.add("tf-idf", tfIdfVal); } } } private List<Integer> getInts(String[] vals) { List<Integer> result = null; if (vals != null && vals.length > 0) { result = new ArrayList<>(vals.length); for (int i = 0; i < vals.length; i++) { try { result.add(new Integer(vals[i])); } catch (NumberFormatException e) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e); } } } return result; } @Override public void prepare(ResponseBuilder rb) throws IOException { } @Override public void finishStage(ResponseBuilder rb) { if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) { NamedList<Object> termVectorsNL = new NamedList<>(); Map.Entry<String, Object>[] arr = new NamedList.NamedListEntry[rb.resultIds.size()]; for (ShardRequest sreq : rb.finished) { if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) == 0 || !sreq.params.getBool(COMPONENT_NAME, false)) { continue; } for (ShardResponse srsp : sreq.responses) { NamedList<Object> nl = (NamedList<Object>)srsp.getSolrResponse().getResponse().get(TERM_VECTORS); // Add metadata (that which isn't a uniqueKey value): Object warningsNL = nl.get(TV_KEY_WARNINGS); // assume if that if warnings is already present; we don't need to merge. if (warningsNL != null && termVectorsNL.indexOf(TV_KEY_WARNINGS, 0) < 0) { termVectorsNL.add(TV_KEY_WARNINGS, warningsNL); } // UniqueKey data SolrPluginUtils.copyNamedListIntoArrayByDocPosInResponse(nl, rb.resultIds, arr); } } // remove nulls in case not all docs were able to be retrieved SolrPluginUtils.removeNulls(arr, termVectorsNL); rb.rsp.add(TERM_VECTORS, termVectorsNL); } } @Override public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) { SolrParams params = rb.req.getParams(); if (!params.getBool(COMPONENT_NAME, false)) return; if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) == 0) { sreq.params.set(COMPONENT_NAME, "false"); } } //////////////////////// NamedListInitializedPlugin methods ////////////////////// @Override public void init(NamedList args) { super.init(args); this.initParams = args; } @Override public void inform(SolrCore core) { } @Override public String getDescription() { return "A Component for working with Term Vectors"; } @Override public Category getCategory() { return Category.QUERY; } } class FieldOptions { String fieldName; boolean termFreq, positions, offsets, payloads, docFreq, tfIdf; }