/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.client.solrj.io.stream; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.io.SolrClientCache; import org.apache.solr.client.solrj.io.Tuple; import org.apache.solr.client.solrj.io.comp.StreamComparator; import org.apache.solr.client.solrj.io.stream.expr.Explanation; import org.apache.solr.client.solrj.io.stream.expr.Explanation.ExpressionType; import org.apache.solr.client.solrj.io.stream.expr.Expressible; import org.apache.solr.client.solrj.io.stream.expr.StreamExplanation; import org.apache.solr.client.solrj.io.stream.expr.StreamExpression; import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionNamedParameter; import org.apache.solr.client.solrj.io.stream.expr.StreamFactory; import org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.TermsParams; import org.apache.solr.common.util.NamedList; import static org.apache.solr.common.params.CommonParams.DISTRIB; /** * Iterates over a gatherNodes() expression and scores the Tuples based on tf-idf. * * Expression Syntax: * * Default function call uses the "count(*)" field for termFreq. * * You can use a different value for termFreq by providing the termFreq param * scoreNodes(gatherNodes(...), termFreq="min(weight)") * **/ public class ScoreNodesStream extends TupleStream implements Expressible { private static final long serialVersionUID = 1; protected String zkHost; private TupleStream stream; private transient SolrClientCache clientCache; private Map<String, Tuple> nodes = new HashMap(); private Iterator<Tuple> tuples; private String termFreq; private boolean facet; private String bucket; private String facetCollection; public ScoreNodesStream(TupleStream tupleStream, String nodeFreqField) throws IOException { init(tupleStream, nodeFreqField); } public ScoreNodesStream(StreamExpression expression, StreamFactory factory) throws IOException { // grab all parameters out List<StreamExpression> streamExpressions = factory.getExpressionOperandsRepresentingTypes(expression, Expressible.class, TupleStream.class); StreamExpressionNamedParameter nodeFreqParam = factory.getNamedOperand(expression, "termFreq"); String docFreqField = "count(*)"; if(nodeFreqParam != null) { docFreqField = nodeFreqParam.getParameter().toString(); } if(1 != streamExpressions.size()){ throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - expecting a single stream but found %d",expression, streamExpressions.size())); } zkHost = factory.getDefaultZkHost(); if(null == zkHost){ throw new IOException("zkHost not found"); } TupleStream stream = factory.constructStream(streamExpressions.get(0)); init(stream, docFreqField); } private void init(TupleStream tupleStream, String termFreq) throws IOException{ this.stream = tupleStream; this.termFreq = termFreq; if(stream instanceof FacetStream) { FacetStream facetStream = (FacetStream) stream; if(facetStream.getBuckets().length != 1) { throw new IOException("scoreNodes operates over a single bucket. Num buckets:"+facetStream.getBuckets().length); } this.bucket = facetStream.getBuckets()[0].toString(); this.facetCollection = facetStream.getCollection(); this.facet = true; } } @Override public StreamExpression toExpression(StreamFactory factory) throws IOException{ return toExpression(factory, true); } private StreamExpression toExpression(StreamFactory factory, boolean includeStreams) throws IOException { // function name StreamExpression expression = new StreamExpression(factory.getFunctionName(this.getClass())); // nodeFreqField expression.addParameter(new StreamExpressionNamedParameter("termFreq", termFreq)); if(includeStreams){ // stream if(stream instanceof Expressible){ expression.addParameter(((Expressible)stream).toExpression(factory)); } else{ throw new IOException("This ScoreNodesStream contains a non-expressible TupleStream - it cannot be converted to an expression"); } } else{ expression.addParameter("<stream>"); } return expression; } @Override public Explanation toExplanation(StreamFactory factory) throws IOException { return new StreamExplanation(getStreamNodeId().toString()) .withChildren(new Explanation[]{ stream.toExplanation(factory) }) .withFunctionName(factory.getFunctionName(this.getClass())) .withImplementingClass(this.getClass().getName()) .withExpressionType(ExpressionType.STREAM_DECORATOR) .withExpression(toExpression(factory, false).toString()); } public void setStreamContext(StreamContext context) { this.clientCache = context.getSolrClientCache(); this.stream.setStreamContext(context); } public List<TupleStream> children() { List<TupleStream> l = new ArrayList(); l.add(stream); return l; } public void open() throws IOException { stream.open(); Tuple node = null; StringBuilder builder = new StringBuilder(); String field = null; String collection = null; while(true) { node = stream.read(); if(node.EOF) { break; } if(facet) { //Turn the facet tuple into a node. String nodeId = node.getString(bucket); node.put("node", nodeId); node.remove(bucket); node.put("collection", facetCollection); node.put("field", bucket); } if(!node.fields.containsKey("node")) { throw new IOException("node field not present in the Tuple"); } String nodeId = node.getString("node"); nodes.put(nodeId, node); if(builder.length() > 0) { builder.append(","); field = node.getString("field"); collection = node.getString("collection"); } builder.append(nodeId); } CloudSolrClient client = clientCache.getCloudSolrClient(zkHost); ModifiableSolrParams params = new ModifiableSolrParams(); params.add(CommonParams.QT, "/terms"); params.add(TermsParams.TERMS, "true"); params.add(TermsParams.TERMS_FIELD, field); params.add(TermsParams.TERMS_STATS, "true"); params.add(TermsParams.TERMS_LIST, builder.toString()); params.add(TermsParams.TERMS_LIMIT, Integer.toString(nodes.size())); params.add(DISTRIB, "true"); QueryRequest request = new QueryRequest(params); try { //Get the response from the terms component NamedList response = client.request(request, collection); NamedList<Number> stats = (NamedList<Number>)response.get("indexstats"); long numDocs = stats.get("numDocs").longValue(); NamedList<NamedList<Number>> fields = (NamedList<NamedList<Number>>)response.get("terms"); int size = fields.size(); for(int i=0; i<size; i++) { String fieldName = fields.getName(i); NamedList<Number> terms = fields.get(fieldName); int tsize = terms.size(); for(int t=0; t<tsize; t++) { String term = terms.getName(t); Number docFreq = terms.get(term); Tuple tuple = nodes.get(term); if(!tuple.fields.containsKey(termFreq)) { throw new Exception("termFreq field not present in the Tuple"); } Number termFreqValue = (Number)tuple.get(termFreq); float score = (float)(Math.log(termFreqValue.floatValue())+1.0) * (float) (Math.log((numDocs + 1) / (docFreq.doubleValue() + 1)) + 1.0); tuple.put("nodeScore", score); tuple.put("docFreq", docFreq); tuple.put("numDocs", numDocs); } } } catch (Exception e) { throw new IOException(e); } tuples = nodes.values().iterator(); } public void close() throws IOException { stream.close(); } public StreamComparator getComparator(){ return null; } public Tuple read() throws IOException { if(tuples.hasNext()) { return tuples.next(); } else { Map map = new HashMap(); map.put("EOF", true); return new Tuple(map); } } public StreamComparator getStreamSort(){ return null; } public int getCost() { return 0; } }