package lux.solr;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;

import lux.Evaluator;
import lux.SearchIteratorBase;
import lux.exception.LuxException;
import lux.functions.SearchBase.QueryParser;
import lux.index.FieldRole;
import lux.index.IndexConfiguration;
import net.sf.saxon.om.DocumentInfo;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.om.SequenceIterator;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.trans.XPathException;

import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortField.Type;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SortSpec;
import org.apache.solr.util.SolrPluginUtils;

/**
 * Perform distributed XQuery searches. We mimic lazy evaluation by maintaining an iterator
 * that re-issues requests when its local cache is exhausted. Note: deep paging may be quite
 * expensive, since <em>all</em> results starting with the first must be retrieved for each page!
 * <p>
 * TODO: We could optimize better if we kept track of per-shard positions. To do that we'd have to
 * know which shard each result came from. This info is held in ResponseBuilder.resultIds, which is
 * a map from id to ShardDoc; each ShardDoc has a shard id. We can count the number of docs from
 * each shard and calculate a position from that.
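 * <p>
 * A minimal usage sketch (illustrative only, not part of the API contract): it assumes an
 * {@link Evaluator} whose query context is a {@link SolrQueryContext}, {@code queryString} is a
 * placeholder for a query in whatever syntax the chosen parser expects, and exception handling
 * is omitted.
 * <pre>
 *   CloudSearchIterator results = new CloudSearchIterator(eval, queryString, QueryParser.XML, null, 1);
 *   results.setLimit(50);  // retrieve up to 50 documents per distributed request
 *   NodeInfo doc;
 *   while ((doc = results.next()) != null) {
 *       // next() issues a new distributed request whenever the cached batch is exhausted
 *   }
 * </pre>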
 */
public class CloudSearchIterator extends SearchIteratorBase {

    private int limit; // = solr 'rows'
    private SolrQueryResponse response;
    private final String query;
    private final QueryParser queryParser;
    private final String xmlFieldName;
    private final String uriFieldName;
    private final String idFieldName;
    private final HashSet<String> fieldNames;
    private String[] effectiveCriteria;

    /**
     * Initialize the iterator.
     * @param eval the Evaluator holds context for the query
     * @param query the Lucene query to execute and iterate
     * @param queryParser the query parser to use: either the default parser, or {@link QueryParser#XML} for the XML query parser
     * @param sortCriteria the sort order for the results
     * @param start1 the 1-based start position at which to begin the iteration
     */
    public CloudSearchIterator (Evaluator eval, String query, QueryParser queryParser, String[] sortCriteria, int start1) {
        super (eval, sortCriteria, start1);
        this.limit = 20;
        this.queryParser = queryParser;
        this.query = query;
        IndexConfiguration indexConfig = eval.getCompiler().getIndexConfiguration();
        xmlFieldName = indexConfig.getFieldName(FieldRole.XML_STORE);
        uriFieldName = indexConfig.getFieldName(FieldRole.URI);
        idFieldName = indexConfig.getFieldName(FieldRole.ID);
        fieldNames = new HashSet<String>();
        fieldNames.add(xmlFieldName);
        fieldNames.add(uriFieldName);
        fieldNames.add(idFieldName);
    }

    @Override
    public SequenceIterator<NodeInfo> getAnother() throws XPathException {
        return new CloudSearchIterator(eval, query, queryParser, sortCriteria, start + 1);
    }

    public long count() {
        if (response == null) {
            this.limit = 0;
            doCloudSearch();
        }
        return getResultNumFound(response);
    }

    @Override
    public NodeInfo next() throws XPathException {
        for (;;) {
            if (response != null) {
                if (position >= getResultNumFound(response)) {
                    return null;
                }
                Object results = response.getValues().get("response");
                if (results == null) {
                    return null;
                }
                SolrDocumentList docs;
                if (results instanceof DocList) {
                    try {
                        docs = SolrPluginUtils.docListToSolrDocumentList((DocList) results,
                                (SolrIndexSearcher) eval.getSearcher().getWrappedSearcher(), fieldNames, null);
                    } catch (IOException e) {
                        throw new XPathException (e);
                    }
                } else if (results instanceof SolrDocumentList) {
                    docs = (SolrDocumentList) results;
                } else {
                    throw new XPathException ("Solr query response unexpectedly of type " + results.getClass().getName());
                }
                if (position < docs.getStart() + docs.size()) {
                    return getNextDocument (docs);
                }
                // otherwise fall through and get the next page of results
            }
            doCloudSearch();
        }
    }

    private NodeInfo getNextDocument (SolrDocumentList docs) {
        // FIXME: test pagination; there may be a bug here when start > 0
        SolrDocument doc = docs.get(position++ - (int) docs.getStart());
        String uri = (String) doc.getFirstValue(uriFieldName);
        Object oxml = doc.getFirstValue(xmlFieldName);
        Long id = (Long) doc.getFirstValue(idFieldName);
        if (id == null) {
            // try to support migrating an old index?
            throw new LuxException("This index has no lux docids: it cannot support Lux on Solr Cloud");
        }
        String xml = (oxml instanceof String) ? (String) oxml : null;
        byte[] bytes = (oxml instanceof byte[]) ? (byte[]) oxml : null;
        XdmNode node = eval.getDocReader().createXdmNode(id, uri, xml, bytes);
        DocumentInfo docNode = node.getUnderlyingNode().getDocumentRoot();
        docNode.setUserData(SolrDocument.class.getName(), doc);
        return docNode;
    }

    /*
     * Make a new query request, using this.query, with the start position calculated from the
     * passed-in ResponseBuilder, sorting based on sortCriteria, and fields=lux_xml.
     * Also: if the original request asks for debug output, pass that along.
     */
    private void doCloudSearch () {
        ResponseBuilder origRB = ((SolrQueryContext) eval.getQueryContext()).getResponseBuilder();
        ModifiableSolrParams params = new ModifiableSolrParams();
        params.add(CommonParams.Q, query);
        if (QueryParser.XML == queryParser) {
            params.add("defType", "xml");
        }
        params.add(CommonParams.START, Integer.toString(position));
        params.add(CommonParams.ROWS, Integer.toString(limit));
        //params.add(CommonParams.FL, uriFieldName, xmlFieldName, idFieldName);
        params.add(CommonParams.FL, "*");
        SolrParams origParams = origRB.req.getParams();
        String debug = origParams.get(CommonParams.DEBUG);
        if (debug != null) {
            params.add(CommonParams.DEBUG, debug);
        }
        params.add("distrib", "true");
        params.add("shards", origParams.get("shards"));
        SortSpec sortSpec = makeSortSpec();
        addSortParam (params, sortSpec);
        XQueryComponent xqueryComponent = ((SolrQueryContext) eval.getQueryContext()).getQueryComponent();
        SolrQueryRequest req = new CloudQueryRequest(xqueryComponent.getCore(), params, sortSpec);
        response = new SolrQueryResponse();
        xqueryComponent.getSearchHandler().handleRequest(req, response);
        eval.getQueryStats().docCount += getResultNumFound(response);
    }

    private long getResultNumFound (SolrQueryResponse rsp) {
        Object docs = rsp.getValues().get("response");
        if (docs != null) {
            if (docs instanceof DocList) {
                return ((DocList) docs).matches();
            } else if (docs instanceof SolrDocumentList) {
                return ((SolrDocumentList) docs).getNumFound();
            }
        }
        return 0;
    }

    private void addSortParam (ModifiableSolrParams params, SortSpec sortSpec) {
        for (SortField sortField : sortSpec.getSort().getSort()) {
            String dir = sortField.getReverse() ? "desc" : "asc";
            String field = sortField.getField();
            if (field != null) {
                params.add("sort", field + ' ' + dir);
            }
            // FIXME: Solr controls the sorting of missing values (first/last) with a *schema* setting,
            // but we insist on runtime control. We should raise an error here if the schema is not in
            // line with the runtime setting, since otherwise an incorrect ordering will result. And
            // provide some kind of "recover from the error" setting? Where?
        }
    }

    private String[] getEffectiveSortCriteria () {
        if (effectiveCriteria == null) {
            assert sortCriteria != null;
            ArrayList<String> tmp = new ArrayList<String>();
            for (String s : sortCriteria) {
                if (! s.equals(FieldRole.LUX_DOCID)) {
                    tmp.add(s);
                }
            }
            tmp.add(idFieldName);
            effectiveCriteria = tmp.toArray(new String[tmp.size()]);
        }
        return effectiveCriteria;
    }

    private SortSpec makeSortSpec () {
        Sort sort;
        // add the docid field as a fallback sorting criterion to enforce a consistent
        // document order
        if (sortCriteria != null) {
            sort = makeSortFromCriteria(getEffectiveSortCriteria());
        } else {
            //sort = new Sort (SortField.FIELD_SCORE, new SortField(uriFieldName, Type.STRING));
            sort = new Sort (new SortField(idFieldName, Type.LONG));
        }
        return new SortSpec (sort, position, limit);
    }

    /**
     * @param limit the maximum number of results to retrieve per batch
     */
    public void setLimit (int limit) {
        this.limit = limit;
    }

    /**
     * @return the maximum number of results to retrieve per batch
     */
    public int getLimit () {
        return limit;
    }

}