QueryElevationComponent.java example

Explorer
solr-analytics-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler.component;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SentinelIntSet;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.QueryElevationParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.util.DOMUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.Config;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.transform.ElevatedMarkerFactory;
import org.apache.solr.response.transform.ExcludedMarkerFactory;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SortSpec;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.VersionedFile;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;

/**
 * A component to elevate some documents to the top of the result set.
 *
 * @since solr 1.3
 */
public class QueryElevationComponent extends SearchComponent implements SolrCoreAware {
  private static Logger log = LoggerFactory.getLogger(QueryElevationComponent.class);

  // Constants used in solrconfig.xml
  static final String FIELD_TYPE = "queryFieldType";
  static final String CONFIG_FILE = "config-file";
  static final String EXCLUDE = "exclude";
  public static final String BOOSTED = "BOOSTED";
  public static final String EXCLUDED = "EXCLUDED";

  // Runtime param -- should be in common?

  private SolrParams initArgs = null;
  private Analyzer analyzer = null;
  private String idField = null;
  private FieldType idSchemaFT;

  boolean forceElevation = false;
  // Cache of query -> elevation maps. When the configuration is loaded from the
  // data directory, a separate map is kept for each IndexReader. When it is loaded
  // from the config directory, the single map is stored under the null key and
  // is never re-loaded.
  final Map<IndexReader, Map<String, ElevationObj>> elevationCache =
      new WeakHashMap<IndexReader, Map<String, ElevationObj>>();

  /**
   * The elevation rules configured for one query string: the documents to push to the top
   * of the result list and the documents to exclude.
   * <p>
   * All document ids are stored in their indexed form, as produced by the unique key
   * field type.
   */
  class ElevationObj {
    /** The query text exactly as configured. */
    final String text;
    /** The query text after {@link #getAnalyzedQuery(String)}; used as the lookup key. */
    final String analyzed;
    // Only the term queries are kept here, because whether they are actually applied as
    // exclusions depends on the markExcludes request parameter.
    final TermQuery[] exclude;
    /** Zero-boosted disjunction over all elevated document ids. */
    final BooleanQuery include;
    /** Sort priority per elevated id; ids listed earlier receive larger values. */
    final Map<BytesRef, Integer> priority;
    /** Indexed ids of the elevated documents. */
    final Set<String> ids;
    /** Indexed ids of the excluded documents. */
    final Set<String> excludeIds;

    /**
     * @param qstr    the query text these rules apply to
     * @param elevate readable ids of the documents to elevate, best first
     * @param exclude readable ids of the documents to exclude; may be null or empty
     * @throws IOException if analyzing the query text fails
     */
    ElevationObj(String qstr, List<String> elevate, List<String> exclude) throws IOException {
      text = qstr;
      analyzed = getAnalyzedQuery(qstr);
      ids = new HashSet<String>();
      excludeIds = new HashSet<String>();
      priority = new HashMap<BytesRef, Integer>();

      include = new BooleanQuery();
      include.setBoost(0);

      // Priorities count down from size + 5, so the first configured id sorts highest.
      int nextPriority = elevate.size() + 5;
      for (String readableId : elevate) {
        String indexedId = idSchemaFT.readableToIndexed(readableId);
        ids.add(indexedId);
        include.add(new TermQuery(new Term(idField, indexedId)), BooleanClause.Occur.SHOULD);
        priority.put(new BytesRef(indexedId), nextPriority);
        nextPriority--;
      }

      boolean hasExclusions = exclude != null && !exclude.isEmpty();
      if (hasExclusions) {
        TermQuery[] excluded = new TermQuery[exclude.size()];
        int slot = 0;
        for (String readableId : exclude) {
          String indexedId = idSchemaFT.readableToIndexed(readableId);
          excludeIds.add(indexedId);
          excluded[slot] = new TermQuery(new Term(idField, indexedId));
          slot++;
        }
        this.exclude = excluded;
      } else {
        this.exclude = null;
      }
    }
  }

  /**
   * Stores the component's configuration arguments as {@link SolrParams} so that
   * {@link #inform(SolrCore)} and the elevation map loading can read them later.
   *
   * @param args the configuration arguments for this component
   */
  @Override
  public void init(NamedList args) {
    this.initArgs = SolrParams.toSolrParams(args);
  }

  /**
   * Completes initialization once the {@link SolrCore} is available.
   * <p>
   * The method resolves the optional query analyzer named by the {@code queryFieldType}
   * argument, records the schema's unique key field, registers the "excluded" and
   * "elevated" marker transformer factories, reads the configured default for forced
   * elevation and finally locates the elevation file named by {@code config-file}.
   * <p>
   * In ZooKeeper mode only the existence of the file is checked here. Otherwise exactly one
   * of the config directory and the data directory must contain the file. A file in the
   * config directory is loaded immediately and cached under the {@code null} key. Whenever
   * the file was not found as a config file, the elevation map is preloaded for the reader
   * of the newest searcher.
   *
   * @param core the core this component is registered with
   * @throws SolrException if the field type is unknown, the schema has no unique key field,
   *                       or (wrapping the original failure) the elevation configuration
   *                       cannot be located or loaded
   */
  public void inform(SolrCore core) {
    String a = initArgs.get(FIELD_TYPE);
    if (a != null) {
      FieldType ft = core.getSchema().getFieldTypes().get(a);
      if (ft == null) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
            "Unknown FieldType: '" + a + "' used in QueryElevationComponent");
      }
      analyzer = ft.getQueryAnalyzer();
    }

    SchemaField sf = core.getSchema().getUniqueKeyField();
    if (sf == null) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "QueryElevationComponent requires the schema to have a uniqueKeyField.");
    }
    idSchemaFT = sf.getType();
    idField = sf.getName();

    // Register the marker factories; an empty configured name falls back to the default.
    String excludeName = initArgs.get(QueryElevationParams.EXCLUDE_MARKER_FIELD_NAME, "excluded");
    if (excludeName == null || excludeName.length() == 0) {
      excludeName = "excluded";
    }
    ExcludedMarkerFactory excludedMarkerFactory = new ExcludedMarkerFactory();
    core.addTransformerFactory(excludeName, excludedMarkerFactory);
    ElevatedMarkerFactory elevatedMarkerFactory = new ElevatedMarkerFactory();
    String markerName = initArgs.get(QueryElevationParams.EDITORIAL_MARKER_FIELD_NAME, "elevated");
    if (markerName == null || markerName.length() == 0) {
      markerName = "elevated";
    }
    core.addTransformerFactory(markerName, elevatedMarkerFactory);
    forceElevation = initArgs.getBool(QueryElevationParams.FORCE_ELEVATION, forceElevation);

    try {
      synchronized (elevationCache) {
        elevationCache.clear();
        String f = initArgs.get(CONFIG_FILE);
        if (f == null) {
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
              "QueryElevationComponent must specify argument: '" + CONFIG_FILE
                  + "' -- path to elevate.xml");
        }
        boolean exists = false;

        // check if using ZooKeeper
        ZkController zkController = core.getCoreDescriptor().getCoreContainer().getZkController();
        if (zkController != null) {
          // TODO : shouldn't have to keep reading the config name when it has been read before
          exists = zkController.configFileExists(zkController.readConfigName(core.getCoreDescriptor().getCloudDescriptor().getCollectionName()), f);
        } else {
          File fC = new File(core.getResourceLoader().getConfigDir(), f);
          File fD = new File(core.getDataDir(), f);
          // Equal results mean the file is either in both places or in neither; both are errors.
          if (fC.exists() == fD.exists()) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                "QueryElevationComponent missing config file: '" + f + "'\n"
                    + "either: " + fC.getAbsolutePath() + " or " + fD.getAbsolutePath() + " must exist, but not both.");
          }
          if (fC.exists()) {
            exists = true;
            log.info("Loading QueryElevation from: " + fC.getAbsolutePath());
            Config cfg = new Config(core.getResourceLoader(), f);
            // The null key marks a map that is shared by all readers.
            elevationCache.put(null, loadElevationMap(cfg));
          }
        }
        //in other words, we think this is in the data dir, not the conf dir
        if (!exists) {
          // preload the first data
          RefCounted<SolrIndexSearcher> searchHolder = null;
          try {
            searchHolder = core.getNewestSearcher(false);
            IndexReader reader = searchHolder.get().getIndexReader();
            getElevationMap(reader, core);
          } finally {
            if (searchHolder != null) searchHolder.decref();
          }
        }
      }
    } catch (Exception ex) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "Error initializing QueryElevationComponent.", ex);
    }
  }

  /**
   * Returns the query -> elevation map that applies to the given reader.
   * <p>
   * A map cached under the {@code null} key (configuration loaded from the config
   * directory) always wins. Otherwise the map cached for {@code reader} is returned, and
   * if there is none yet it is loaded, cached for that reader and returned. In ZooKeeper
   * mode the file is handed to {@link Config} by name only; otherwise the latest version
   * of the file in the core's data directory is parsed.
   *
   * @param reader the index reader the map is cached against
   * @param core   the core supplying the resource loader and data directory
   * @return the elevation map for the reader
   * @throws Exception if the configuration is missing or cannot be loaded
   */
  Map<String, ElevationObj> getElevationMap(IndexReader reader, SolrCore core) throws Exception {
    synchronized (elevationCache) {
      Map<String, ElevationObj> shared = elevationCache.get(null);
      if (shared != null) {
        return shared;
      }

      Map<String, ElevationObj> cached = elevationCache.get(reader);
      if (cached != null) {
        return cached;
      }

      String fileName = initArgs.get(CONFIG_FILE);
      if (fileName == null) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
            "QueryElevationComponent must specify argument: " + CONFIG_FILE);
      }
      log.info("Loading QueryElevation from data dir: " + fileName);

      boolean usingZooKeeper =
          core.getCoreDescriptor().getCoreContainer().getZkController() != null;
      Config cfg;
      if (usingZooKeeper) {
        cfg = new Config(core.getResourceLoader(), fileName, null, null);
      } else {
        InputStream latest = VersionedFile.getLatestFile(core.getDataDir(), fileName);
        cfg = new Config(core.getResourceLoader(), fileName, new InputSource(latest), null);
      }

      Map<String, ElevationObj> loaded = loadElevationMap(cfg);
      elevationCache.put(reader, loaded);
      return loaded;
    }
  }

  /**
   * Builds the query -> elevation map from a parsed elevation configuration.
   * <p>
   * Every {@code elevate/query} node contributes one {@link ElevationObj}. Its {@code doc}
   * children are read in document order; a child whose {@code exclude} attribute parses as
   * {@code true} is treated as an exclusion, every other child is elevated. Entries are
   * keyed by the analyzed form of the query text.
   *
   * @param cfg the parsed elevation configuration
   * @return a map from analyzed query text to its elevation rules
   * @throws IOException   if analyzing a query text fails
   * @throws SolrException if the {@code doc} children cannot be evaluated or two entries
   *                       analyze to the same query text
   */
  private Map<String, ElevationObj> loadElevationMap(Config cfg) throws IOException {
    XPath xpath = XPathFactory.newInstance().newXPath();
    Map<String, ElevationObj> map = new HashMap<String, ElevationObj>();
    NodeList nodes = (NodeList) cfg.evaluate("elevate/query", XPathConstants.NODESET);
    for (int i = 0; i < nodes.getLength(); i++) {
      Node node = nodes.item(i);
      String qstr = DOMUtil.getAttr(node, "text", "missing query 'text'");

      NodeList children;
      try {
        children = (NodeList) xpath.evaluate("doc", node, XPathConstants.NODESET);
      } catch (XPathExpressionException e) {
        // Keep the underlying XPath failure as the cause so it is not lost.
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
            "query requires '<doc .../>' child", e);
      }

      List<String> include = new ArrayList<String>();
      List<String> exclude = new ArrayList<String>();
      for (int j = 0; j < children.getLength(); j++) {
        Node child = children.item(j);
        String id = DOMUtil.getAttr(child, "id", "missing 'id'");
        String e = DOMUtil.getAttr(child, EXCLUDE, null);
        // parseBoolean is null-safe and avoids boxing; only "true" (any case) excludes.
        if (Boolean.parseBoolean(e)) {
          exclude.add(id);
        } else {
          include.add(id);
        }
      }

      ElevationObj elev = new ElevationObj(qstr, include, exclude);
      if (map.containsKey(elev.analyzed)) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
            "Boosting query defined twice for query: '" + elev.text + "' ('" + elev.analyzed + "')");
      }
      map.put(elev.analyzed, elev);
    }
    return map;
  }

  /**
   * Helpful for testing without loading config.xml.
   * <p>
   * Registers (or replaces) the elevation rules for {@code query} in the map cached for
   * {@code reader}, creating that map if it does not exist yet. The cache is accessed under
   * the same {@code elevationCache} lock used by the other methods of this component, and
   * the rules are built before the cache is touched so a failure leaves the cache unchanged.
   *
   * @param reader the index reader whose cached map is updated
   * @param query  the query text the rules apply to
   * @param ids    readable ids of the documents to elevate; {@code null} means none
   * @param ex     readable ids of the documents to exclude; {@code null} means none
   * @throws IOException If there is a low-level I/O error.
   */
  void setTopQueryResults(IndexReader reader, String query, String[] ids, String[] ex) throws IOException {
    if (ids == null) {
      ids = new String[0];
    }
    if (ex == null) {
      ex = new String[0];
    }

    ElevationObj obj = new ElevationObj(query, Arrays.asList(ids), Arrays.asList(ex));

    // WeakHashMap is not thread-safe; every other access goes through this lock.
    synchronized (elevationCache) {
      Map<String, ElevationObj> elev = elevationCache.get(reader);
      if (elev == null) {
        elev = new HashMap<String, ElevationObj>();
        elevationCache.put(reader, elev);
      }
      elev.put(obj.analyzed, obj);
    }
  }

  /**
   * Normalizes a query string with the configured query analyzer.
   * <p>
   * When no analyzer is configured the query is returned unchanged. Otherwise the query is
   * tokenized and the resulting terms are concatenated without any separator.
   *
   * @param query the raw query text
   * @return the analyzed form used as the key into the elevation map
   * @throws IOException if the analyzer fails while producing tokens
   */
  String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
      return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    try {
      tokens.reset();

      CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
      while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
      }
      tokens.end();
    } finally {
      // Always release the stream, even when tokenization fails part way through.
      tokens.close();
    }
    return norm.toString();
  }

  //---------------------------------------------------------------------------------
  // SearchComponent
  //---------------------------------------------------------------------------------

  /**
   * Rewrites the request's query and sort so that configured documents are elevated.
   * <p>
   * Nothing happens when elevation is disabled for the request or when the request has no
   * query. Otherwise the query string is analyzed and looked up in the elevation map. On
   * a match the elevated ids are published in the request context under {@link #BOOSTED},
   * the query is replaced (by the elevated documents only when exclusive, else by the
   * original query combined with them and the exclusions), and an {@code _elevate_} sort
   * field is placed in front of score-based sorts. When excludes are only marked, the
   * excluded ids are published under {@link #EXCLUDED} instead of being filtered out.
   * Debug information is attached when the request asks for it.
   *
   * @param rb the response builder carrying the request, query and sort spec
   * @throws IOException if analyzing the query string fails
   */
  @Override
  public void prepare(ResponseBuilder rb) throws IOException {
    SolrQueryRequest request = rb.req;
    SolrParams requestParams = request.getParams();

    // Elevation can be switched off for a single request.
    boolean enabled = requestParams.getBool(QueryElevationParams.ENABLE, true);
    if (!enabled) {
      return;
    }

    boolean exclusive = requestParams.getBool(QueryElevationParams.EXCLUSIVE, false);
    // The request may override the configured forceElevation default.
    boolean force = requestParams.getBool(QueryElevationParams.FORCE_ELEVATION, forceElevation);
    boolean markExcludes = requestParams.getBool(QueryElevationParams.MARK_EXCLUDES, false);

    Query userQuery = rb.getQuery();
    String rawQueryString = rb.getQueryString();
    if (userQuery == null || rawQueryString == null) {
      return;
    }

    String analyzedQuery = getAnalyzedQuery(rawQueryString);
    IndexReader reader = request.getSearcher().getIndexReader();
    ElevationObj booster;
    try {
      booster = getElevationMap(reader, request.getCore()).get(analyzedQuery);
    } catch (Exception ex) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "Error loading elevation", ex);
    }

    if (booster != null) {
      request.getContext().put(BOOSTED, booster.ids);

      // Swap in a query that also matches the forced documents.
      if (exclusive) {
        // Only the elevated documents are wanted.
        rb.setQuery(booster.include);
      } else {
        BooleanQuery combined = new BooleanQuery(true);
        combined.add(userQuery, BooleanClause.Occur.SHOULD);
        combined.add(booster.include, BooleanClause.Occur.SHOULD);
        if (booster.exclude != null) {
          if (markExcludes) {
            // Excluded documents stay in the results and are merely marked; the marker
            // transformer reads the ids from the request context.
            request.getContext().put(EXCLUDED, booster.excludeIds);
          } else {
            for (TermQuery excluded : booster.exclude) {
              combined.add(new BooleanClause(excluded, BooleanClause.Occur.MUST_NOT));
            }
          }
        }
        rb.setQuery(combined);
      }

      ElevationComparatorSource comparator = new ElevationComparatorSource(booster);
      SortSpec sortSpec = rb.getSortSpec();
      Sort requestedSort = sortSpec.getSort();
      if (requestedSort == null) {
        // No explicit sort: elevated documents first, then by score.
        SortField[] elevateThenScore = new SortField[]{
            new SortField("_elevate_", comparator, true),
            new SortField(null, SortField.Type.SCORE, false)
        };
        sortSpec.setSort(new Sort(elevateThenScore));
      } else {
        SortField[] current = requestedSort.getSort();
        List<SortField> sorts = new ArrayList<SortField>(current.length + 1);
        boolean modified = false;

        // When forced, elevation leads even if the primary sort is not by score.
        if (force && current[0].getType() != SortField.Type.SCORE) {
          sorts.add(new SortField("_elevate_", comparator, true));
          modified = true;
        }
        // Every score sort gets an elevation sort placed directly in front of it.
        for (int i = 0; i < current.length; i++) {
          SortField field = current[i];
          if (field.getType() == SortField.Type.SCORE) {
            sorts.add(new SortField("_elevate_", comparator, !field.getReverse()));
            modified = true;
          }
          sorts.add(field);
        }
        if (modified) {
          sortSpec.setSort(new Sort(sorts.toArray(new SortField[sorts.size()])));
        }
      }
    }

    // Debugging information
    if (rb.isDebug()) {
      List<String> match = null;
      if (booster != null) {
        // Collect the elevated terms.
        match = new ArrayList<String>(booster.priority.size());
        for (Object clause : booster.include.clauses()) {
          TermQuery elevated = (TermQuery) ((BooleanClause) clause).getQuery();
          match.add(elevated.getTerm().text());
        }
      }

      SimpleOrderedMap<Object> dbg = new SimpleOrderedMap<Object>();
      dbg.add("q", analyzedQuery);
      dbg.add("match", match);
      if (rb.isDebugQuery()) {
        rb.addDebugInfo("queryBoosting", dbg);
      }
    }
  }

  /**
   * Intentionally empty: this component does all of its work in
   * {@link #prepare(ResponseBuilder)} by modifying the query and sort.
   *
   * @param rb the response builder for the current request (unused)
   * @throws IOException never thrown by this implementation
   */
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    // Do nothing -- the real work is modifying the input query
  }

  //---------------------------------------------------------------------------------
  // SolrInfoMBean
  //---------------------------------------------------------------------------------

  /**
   * Returns a short, human-readable description of this component.
   *
   * @return the fixed description string
   */
  @Override
  public String getDescription() {
    return "Query Boosting -- boost particular documents for a given query";
  }

  /**
   * Returns the version-control location of this source file, in {@code $URL: ... $}
   * keyword form.
   *
   * @return the fixed source location string
   */
  @Override
  public String getSource() {
    return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_4_0/solr/core/src/java/org/apache/solr/handler/component/QueryElevationComponent.java $";
  }

  /**
   * Returns the documentation links for this component.
   *
   * @return a single-element array holding the component's wiki page
   * @throws RuntimeException if the hard-coded URL is rejected as malformed
   */
  @Override
  public URL[] getDocs() {
    final URL wikiPage;
    try {
      wikiPage = new URL("http://wiki.apache.org/solr/QueryElevationComponent");
    } catch (MalformedURLException e) {
      throw new RuntimeException(e);
    }
    return new URL[]{wikiPage};
  }
  /**
   * Comparator source that ranks documents by their configured elevation priority.
   * <p>
   * For each index segment the elevated ids are resolved to segment-local document numbers.
   * A document's sort value is its entry in {@code ElevationObj.priority}, or 0 when the
   * document is not elevated.
   */
  class ElevationComparatorSource extends FieldComparatorSource {
    private QueryElevationComponent.ElevationObj elevations;
    // ordSet and termValues together form a docId -> id-term map for the current segment:
    // ordSet holds the document numbers, termValues holds the term at the same slot.
    private SentinelIntSet ordSet;
    private BytesRef[] termValues;

    /**
     * @param elevations the elevation rules whose ids and priorities drive the ordering
     */
    public ElevationComparatorSource(final QueryElevationComponent.ElevationObj elevations) {
      this.elevations = elevations;
      int elevatedCount = elevations.ids.size();
      ordSet = new SentinelIntSet(elevatedCount, -1);
      // One value slot for every key slot of the set.
      termValues = new BytesRef[ordSet.keys.length];
    }

    @Override
    public FieldComparator<Integer> newComparator(String fieldname, final int numHits, int sortPos, boolean reversed) throws IOException {
      return new FieldComparator<Integer>() {
        private final int[] values = new int[numHits];
        private int bottomVal;
        private TermsEnum termsEnum;
        private DocsEnum docsEnum;
        // Ids already resolved to a document in an earlier segment; they are not looked up again.
        Set<String> seen = new HashSet<String>(elevations.ids.size());

        /** Returns the elevation priority of a segment-local document, or 0 if it has none. */
        private int docVal(int doc) {
          if (ordSet.size() <= 0) {
            return 0;
          }
          int slot = ordSet.find(doc);
          if (slot < 0) {
            return 0;
          }
          Integer prio = elevations.priority.get(termValues[slot]);
          return prio == null ? 0 : prio.intValue();
        }

        @Override
        public int compare(int slot1, int slot2) {
          // Priorities are small, so the subtraction cannot overflow.
          return values[slot1] - values[slot2];
        }

        @Override
        public void setBottom(int slot) {
          bottomVal = values[slot];
        }

        @Override
        public int compareBottom(int doc) {
          return bottomVal - docVal(doc);
        }

        @Override
        public void copy(int slot, int doc) {
          values[slot] = docVal(doc);
        }

        @Override
        public FieldComparator<Integer> setNextReader(AtomicReaderContext context) throws IOException {
          // Rebuild the docId -> id mapping for this segment from scratch.
          ordSet.clear();
          Fields fields = context.reader().fields();
          if (fields == null) {
            return this;
          }
          Terms terms = fields.terms(idField);
          if (terms == null) {
            return this;
          }
          termsEnum = terms.iterator(termsEnum);
          BytesRef term = new BytesRef();
          Bits liveDocs = context.reader().getLiveDocs();

          for (String id : elevations.ids) {
            if (seen.contains(id)) {
              continue;
            }
            term.copyChars(id);
            if (!termsEnum.seekExact(term, false)) {
              continue;
            }
            docsEnum = termsEnum.docs(liveDocs, docsEnum, 0);
            if (docsEnum == null) {
              continue;
            }
            int docId = docsEnum.nextDoc();
            if (docId == DocIdSetIterator.NO_MORE_DOCS) {
              continue;  // must have been deleted
            }
            int slot = ordSet.put(docId);
            termValues[slot] = BytesRef.deepCopyOf(term);
            seen.add(id);
            // The id field is a unique key, so no second live document is expected.
            assert docsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
          }
          return this;
        }

        @Override
        public Integer value(int slot) {
          return values[slot];
        }

        @Override
        public int compareDocToValue(int doc, Integer valueObj) {
          final int expected = valueObj.intValue();
          final int actual = docVal(doc);
          // Priorities are small, so the subtraction cannot overflow.
          return actual - expected;
        }
      };
    }
  }
}