package org.apache.solr.handler.component;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Search component that collects per-field token statistics for the documents
 * in the current result list, intended for rendering a word cloud.
 *
 * <p>Request parameters read by this component:
 * <ul>
 *   <li>{@code wordcloud} - boolean switch, defaults to {@code true};</li>
 *   <li>{@code wordcloud.fl} - comma separated list of fields to analyse; when it
 *       is absent the component does nothing. Fields that are unknown to the
 *       schema or not stored are silently ignored.</li>
 * </ul>
 *
 * <p>The response gets a {@code wordcloud} entry: a map keyed by field name whose
 * values are maps with two entries, {@code tf} (token -> number of occurrences
 * in the stored values of the returned documents) and {@code idf}
 * (token -> {@code log10(N / df)}, where {@code N} is the number of documents
 * having the field and {@code df} the document frequency of the token).
 */
public class WordCloudComponent extends SearchComponent {
  public static final String COMPONENT_NAME = "wordcloud";

  /**
   * No preparation is required; all the work happens in
   * {@link #process(ResponseBuilder)}.
   */
  @Override
  public void prepare(ResponseBuilder rb) throws IOException {
    // intentionally empty
  }

  /**
   * Tokenizes the stored values of the requested fields for every document of
   * the result list and adds the resulting term/document frequencies to the
   * response.
   *
   * @param rb the response builder of the current request
   * @throws IOException if reading documents, analysing text or looking up
   *         index statistics fails
   */
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrQueryRequest req = rb.req;
    SolrParams params = req.getParams();
    if (!params.getBool(COMPONENT_NAME, true)) {
      return;
    }
    String wcFields = params.get("wordcloud.fl", null);
    if (wcFields == null) {
      return;
    }
    // No result list available (nothing was searched) - nothing to analyse.
    if (rb.getResults() == null || rb.getResults().docList == null) {
      return;
    }

    DocList ids = rb.getResults().docList;
    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = req.getCore().getLatestSchema();
    Analyzer analyzer = schema.getIndexAnalyzer();

    // Keep only the fields that exist in the schema and are stored.
    Set<String> requested = new HashSet<>(StrUtils.splitSmart(wcFields, ','));
    Map<String, FieldType> fieldsToLoad = new HashMap<>();
    Map<String, Map<String, Integer>> tokens = new HashMap<>();
    for (String name : requested) {
      SchemaField field = schema.getFieldOrNull(name);
      if (field == null || !field.stored()) {
        continue; // ignore this field
      }
      fieldsToLoad.put(name, field.getType());
      tokens.put(name, new HashMap<String, Integer>());
    }

    // Term frequencies: count tokens over the stored values of each document.
    DocIterator iterator = ids.iterator();
    while (iterator.hasNext()) {
      int id = iterator.nextDoc();
      Document doc = searcher.doc(id, fieldsToLoad.keySet());
      for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
        String name = en.getKey();
        String[] vals = doc.getValues(name);
        if (vals == null) {
          continue;
        }
        Map<String, Integer> counts = tokens.get(name);
        for (String s : vals) {
          countTokens(analyzer, name, en.getValue().indexedToReadable(s), counts);
        }
      }
    }

    // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)
    LeafReader reader = searcher.getSlowAtomicReader();
    Map<String, Object> ret = new HashMap<>();
    for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
      Map<String, Object> va = new HashMap<>();
      va.put("tf", field.getValue());
      va.put("idf", computeIdf(reader, field.getKey(), field.getValue().keySet()));
      ret.put(field.getKey(), va);
    }
    rb.rsp.add(COMPONENT_NAME, ret);
  }

  /**
   * Runs {@code text} through the analyzer and increments, in {@code counts},
   * the occurrence counter of every token produced.
   */
  private static void countTokens(Analyzer analyzer, String fieldName, String text,
      Map<String, Integer> counts) throws IOException {
    // try-with-resources guarantees close() on every path (early return or
    // exception); an unclosed stream breaks the next tokenStream() call.
    try (TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text))) {
      if (!stream.hasAttribute(CharTermAttribute.class)) {
        return; // empty stream
      }
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        String word = termAtt.toString();
        Integer seen = counts.get(word);
        counts.put(word, seen == null ? 1 : seen + 1);
      }
      stream.end();
    }
  }

  /**
   * Computes {@code log10(N / df)} for each of {@code words} in {@code field}.
   * Words whose document frequency is zero are omitted, and an empty map is
   * returned when the reader reports no usable document count for the field.
   */
  private static Map<String, Double> computeIdf(LeafReader reader, String field,
      Set<String> words) throws IOException {
    Map<String, Double> idfs = new HashMap<>();
    int numDocs = reader.getDocCount(field);
    if (numDocs <= 0) {
      return idfs; // statistic unavailable; avoids NaN / -Infinity values
    }
    for (String word : words) {
      int df = reader.docFreq(new Term(field, new BytesRef(word)));
      if (df > 0) {
        // Floating point division: int division would truncate the ratio.
        idfs.put(word, Math.log10((double) numDocs / df));
      }
    }
    return idfs;
  }

  @Override
  public String getDescription() {
    return "return tokens with TF and IDF for wordcloud";
  }

  @Override
  public String getSource() {
    return null;
  }
}