package org.apache.solr.handler.batch;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimpleCollector;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.StrField;
import org.apache.solr.schema.TextField;
import org.apache.solr.search.SolrIndexSearcher;
import org.noggit.JSONUtil;
/**
* Provider that dumps selected fields to disk - it can analyze the fields
* and show their values as indexed (but only for text/string fields).
*
* This method is INEFFICIENT and should be avoided for normal operations!
*
* How it works:
*
* 1. query for all documents that satisfy the search conditions
* 2. in a loop (for every doc):
*    - read the stored values of the selected fields for the current document
*    - analyze those values with the appropriate analyzer (for that field)
*    - collect all tokens and put them inside an array
*    - put the array into a fake solr doc
*    - dump the doc to disk in JSON
*
* Tokens that are indexed at the same position are joined by '|'
* (e.g. "field" : ["foo|fool", "bar", ...])
*
* These are the parameters:
*
* fields: list of fields to dump (comma separated)
* analyze: true/false - whether to dump values as indexed [default: true]
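*
* Example (illustrative only - the field names are made up and the handler
* URL depends on how this provider is registered):
*
*   fields=title,abstract&analyze=true
*
* produces one JSON object per matching document, e.g.
*   {"title" : ["hubble|hst", "telescope"], "abstract" : [...]}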
*/
public class BatchProviderDumpAnalyzedFields extends BatchProvider {
public void run(SolrQueryRequest req, BatchHandlerRequestQueue queue) throws Exception {
SolrParams params = req.getParams();
String jobid = params.get("jobid");
String workDir = params.get("#workdir");
SolrCore core = req.getCore();
IndexSchema schema = core.getLatestSchema();
final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();
final Analyzer analyzer = core.getLatestSchema().getIndexAnalyzer();
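// use the index-time analyzer so the dumped tokens match what was actually indexed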
String[] fields = params.getParams("fields");
if (fields == null) {
throw new SolrException(ErrorCode.BAD_REQUEST, "'fields' parameter missssssing");
}
for (String f: fields) {
  // each value may itself be a comma/space separated list of field names
  for (String ff: f.split("[\\s,]+")) {
    if (ff.isEmpty()) {
      continue;
    }
    SchemaField field = schema.getFieldOrNull(ff);
    if (field == null || !field.stored()) {
      throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot dump fields that do not exist or are not stored: " + ff);
    }
    fieldsToLoad.put(ff, field.getType());
  }
}
final boolean analyze = params.getBool("analyze", true);
Query query = this.getQuery(req);
SolrIndexSearcher se = req.getSearcher();
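// metadata about this dump; written as the "description" header of the output file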
HashMap<String, String> descr = new HashMap<String, String>();
descr.put("query", query.toString());
descr.put("indexDir", se.getPath());
descr.put("indexVersion", se.getVersion());
descr.put("maxDoc", Integer.toString(se.maxDoc()));
descr.put("date", new Date().toString());
File jobFile = new File(workDir, jobid);
// write JSON as UTF-8 regardless of the platform default encoding
final BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(jobFile), StandardCharsets.UTF_8), 1024*256);
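// output layout: {"description": {...}, "data": [ {doc1}, {doc2}, ... ]}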
out.write("{\n");
out.write("\"description\": " + JSONUtil.toJSON(descr).replace("\n", " ") + ",\n");
out.write("\"data\" : [\n");
final BatchHandlerRequestQueue batchQueue = queue;
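// stream the matching documents through a collector, writing each doc out
// as it is collected (the result set is never materialized in memory)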
se.search(query, new SimpleCollector() {
private LeafReader reader;
private int processed = 0;
private CharTermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
// per-document buffer, cleared and reused for every collected doc
private Map<String, List<String>> document = new HashMap<String, List<String>>();
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
reader = context.reader();
}
@Override
public void collect(int i) throws IOException {
// load only the stored values of the requested fields
Document d = reader.document(i, fieldsToLoad.keySet());
processed++;
document.clear();
if (processed % 10000 == 0) {
if(batchQueue.isStopped()) { // inside, because queue is synchronized
throw new IOException("Collector interrupted - stopping");
}
}
for (Entry<String,FieldType> en: fieldsToLoad.entrySet()) {
List<String> tokens = new ArrayList<String>(500);
String fName = en.getKey();
FieldType fType = en.getValue();
// only text/string schema fields can be meaningfully re-analyzed
boolean isText = fType instanceof TextField || fType instanceof StrField;
document.put(fName, tokens);
posIncrAtt = null;
String[] vals = d.getValues(fName);
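// when analysis is disabled, or the field is not a text/string type,
// dump the raw stored values as-is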
if (!analyze || !isText) {
  for (String s: vals) {
    tokens.add(s);
  }
}
else {
  for (String s: vals) {
    TokenStream buffer = analyzer.tokenStream(fName, new StringReader(fType.indexedToReadable(s)));
    try {
      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        continue; // empty stream, nothing to collect
      }
      termAtt = buffer.getAttribute(CharTermAttribute.class);
      posIncrAtt = buffer.hasAttribute(PositionIncrementAttribute.class)
          ? buffer.getAttribute(PositionIncrementAttribute.class) : null;
      buffer.reset();
      while (buffer.incrementToken()) {
        if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0 && !tokens.isEmpty()) {
          // token at the same position as the previous one (e.g. a synonym);
          // join it to the previous token with '|'
          tokens.set(tokens.size()-1, tokens.get(tokens.size()-1) + "|" + fType.indexedToReadable(termAtt.toString()));
        }
        else {
          tokens.add(fType.indexedToReadable(termAtt.toString()));
        }
      }
      buffer.end();
    }
    finally {
      buffer.close(); // always release the stream, even after an early continue
    }
  }
}
}
if (processed > 1) {
out.write(",\n");
}
// JSONUtil has no option to suppress newlines in its output; control
// characters inside JSON strings are escaped, so stripping the literal
// newlines between entries is safe
out.write(JSONUtil.toJSON(document, 0).replace("\n", ""));
}
@Override
public boolean needsScores() {
  return false; // only stored fields are read; relevance scores are not needed
}
});
//System.out.println("written + " + jobFile);
out.write("]\n");
out.write("}");
out.close();
}
@Override
public String getDescription() {
return "Dumps selected fields (for selected docs) to disk in JSON format";
}
}