package org.apache.solr.handler.batch;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;

/**
 * Provider that dumps the terms of selected fields to disk.
 *
 * <p>The resulting file is tab-separated and has 3 columns:
 *
 * <pre>
 *   term    termFreq    docFreq
 * </pre>
 *
 * <p>Each dumped field is introduced by a {@code # <fieldName>} comment line.
 * Several fields can be dumped at once.
 *
 * <p>Request parameters read by this provider:
 * <ul>
 *   <li>{@code fields}: one or more values, each a list of field names
 *       separated by commas and/or spaces; every field must exist in the
 *       schema and be indexed</li>
 *   <li>{@code jobid}: used as the name of the output file</li>
 *   <li>{@code #workdir}: directory in which the output file is created</li>
 * </ul>
 */
public class BatchProviderDumpTermFreqs extends BatchProvider {

  /** Size of the output buffer, in characters. */
  private static final int WRITE_BUFFER_SIZE = 1024 * 256;

  /** Number of terms written between two checks of the queue's stop flag. */
  private static final int STOP_CHECK_INTERVAL = 10000;

  /**
   * Writes term, total term frequency and document frequency for every term
   * of the requested fields into the file {@code <#workdir>/<jobid>}.
   *
   * @param req   request supplying the core, the searcher and the parameters
   *              {@code fields}, {@code jobid} and {@code #workdir}
   * @param queue queue that is polled (every {@value #STOP_CHECK_INTERVAL}
   *              terms) to find out whether the job should be aborted
   * @throws SolrException with {@code BAD_REQUEST} when {@code fields} is
   *         missing or names a field that does not exist or is not indexed
   * @throws IOException when writing fails or the queue reports it was stopped
   */
  public void run(SolrQueryRequest req, BatchHandlerRequestQueue queue) throws Exception {
    SolrCore core = req.getCore();
    SolrParams params = req.getParams();
    IndexSchema schema = core.getLatestSchema();

    String jobid = params.get("jobid");
    String workDir = params.get("#workdir");

    // Validate everything before the output file is created.
    final HashSet<String> fieldsToLoad = collectFields(params.getParams("fields"), schema);

    File jobFile = new File(workDir + "/" + jobid);

    // try-with-resources guarantees the file handle is released even when a
    // write fails or the job is aborted half-way through. The file is written
    // as UTF-8 explicitly so that the dump does not depend on the platform
    // default charset.
    try (BufferedWriter out = new BufferedWriter(
        new OutputStreamWriter(new FileOutputStream(jobFile), StandardCharsets.UTF_8),
        WRITE_BUFFER_SIZE)) {

      out.write("term");
      out.write("\t");
      out.write("termFreq");
      out.write("\t");
      out.write("docFreq");

      DirectoryReader ir = req.getSearcher().getIndexReader();
      int processed = 0; // counted across all fields

      for (String f : fieldsToLoad) {
        out.write("\n\n# " + f + "\n");

        Terms te = MultiFields.getTerms(ir, f);
        if (te == null) {
          out.write("# term stats is not available for this field");
          continue;
        }

        TermsEnum termsEnum = te.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
          out.write(term.utf8ToString());
          out.write("\t");
          out.write(Long.toString(termsEnum.totalTermFreq()));
          out.write("\t");
          out.write(Long.toString(termsEnum.docFreq()));
          out.write("\n");

          processed++;
          // Poll the queue only occasionally; the original author notes that
          // the queue is synchronized, so checking on every term is avoided.
          if (processed % STOP_CHECK_INTERVAL == 0 && queue.isStopped()) {
            throw new IOException("Collector interrupted - stopping");
          }
        }
      }
    }
  }

  /**
   * Splits the raw {@code fields} parameter values into individual field names
   * and verifies each of them against the schema.
   *
   * @param rawFields values of the {@code fields} parameter, may be {@code null}
   * @param schema    schema used to look the fields up
   * @return the set of distinct, validated field names
   * @throws SolrException with {@code BAD_REQUEST} when the parameter is
   *         missing or a field does not exist or is not indexed
   */
  private static HashSet<String> collectFields(String[] rawFields, IndexSchema schema) {
    if (rawFields == null) {
      throw new SolrException(ErrorCode.BAD_REQUEST,
          "Missing required parameter: fields");
    }

    final HashSet<String> fieldsToLoad = new HashSet<>();
    for (String f : rawFields) {
      for (String ff : f.split("( |,)")) {
        if (ff.isEmpty()) {
          // "a, b" or "a,,b" produce empty tokens; they are not field names.
          continue;
        }
        SchemaField field = schema.getFieldOrNull(ff);
        if (field == null || !field.indexed()) {
          throw new SolrException(ErrorCode.BAD_REQUEST,
              "We cannot dump fields that do not exist or are not indexed: " + ff);
        }
        fieldsToLoad.add(ff);
      }
    }
    return fieldsToLoad;
  }

  /**
   * Returns a short, human-readable description of this provider.
   * Note that the file produced by {@code run} is tab-separated.
   */
  @Override
  public String getDescription() {
    return "Dumps term, termFreq, and docFreq (for selected fields) to disk in CSV format";
  }
}