BatchProviderDumpTermFreqs.java example

Explorer
montysolr-master
- contrib
package org.apache.solr.handler.batch;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;

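/**
 * Provider that dumps term statistics of selected
 * indexed fields to disk.
 * The resulting file is tab-separated and has 3 columns:
 * 
 * #term #termFreq #docFreq
 * 
 * You can dump several fields at once, just
 * separate them by a comma (or a space). The terms
 * of each field are preceded by a "# fieldname" line.
 * 
 * Parameters:
 * 
 *    fields: string, comma (or space) separated list of 
 *            field names; every field must exist in the
 *            schema and be indexed
 */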
public class BatchProviderDumpTermFreqs extends BatchProvider {

	/** Size (in chars) of the buffer placed in front of the output file. */
	private static final int WRITE_BUFFER_SIZE = 1024 * 256;

	/** Number of terms written between two checks of the queue's stop flag. */
	private static final int STOP_CHECK_INTERVAL = 10000;

	/**
	 * Writes, for every requested field, one line per term containing the
	 * term, its total term frequency and its document frequency (tab
	 * separated). The output goes to the file named by the {@code jobid}
	 * parameter inside the directory given by the {@code #workdir} parameter
	 * and is encoded as UTF-8.
	 *
	 * @param req   request providing the parameters ({@code fields},
	 *              {@code jobid}, {@code #workdir}), the schema and the searcher
	 * @param queue polled periodically; when it reports that it was stopped
	 *              the dump is aborted
	 * @throws SolrException BAD_REQUEST when {@code fields} is missing, empty,
	 *                       or names a field that does not exist or is not indexed
	 * @throws IOException   when writing fails or the queue was stopped
	 */
	public void run(SolrQueryRequest req, BatchHandlerRequestQueue queue) throws Exception {

		SolrCore core = req.getCore();
		SolrParams params = req.getParams();
		IndexSchema schema = core.getLatestSchema();
		String jobid = params.get("jobid");
		String workDir = params.get("#workdir");

		// Validate the request before the output file is created, so that a
		// bad request does not leave an empty file behind.
		final HashSet<String> fieldsToLoad = collectFields(params, schema);

		DirectoryReader ir = req.getSearcher().getIndexReader();
		File jobFile = new File(workDir + "/" + jobid);

		// try-with-resources: the writer is flushed and closed on every exit
		// path, including the "interrupted" exception thrown below.
		// UTF-8 is set explicitly; terms are decoded with utf8ToString() and
		// must not be re-encoded with the platform default charset.
		try (BufferedWriter out = new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(jobFile), StandardCharsets.UTF_8),
				WRITE_BUFFER_SIZE)) {

			out.write("term");
			out.write("\t");
			out.write("termFreq");
			out.write("\t");
			out.write("docFreq");

			long processed = 0;
			for (String f : fieldsToLoad) {

				// every field gets its own section, introduced by a comment line
				out.write("\n\n# " + f + "\n");

				Terms terms = MultiFields.getTerms(ir, f);
				if (terms == null) {
					out.write("# term stats is not available for this field");
					continue;
				}

				TermsEnum termsEnum = terms.iterator();
				BytesRef term;
				while ((term = termsEnum.next()) != null) {
					out.write(term.utf8ToString());
					out.write("\t");
					out.write(Long.toString(termsEnum.totalTermFreq()));
					out.write("\t");
					out.write(Long.toString(termsEnum.docFreq()));
					out.write("\n");

					// The stop flag is polled only once per interval; the
					// original author notes that the queue is synchronized,
					// so the call is kept off the per-term path.
					if (++processed % STOP_CHECK_INTERVAL == 0 && queue.isStopped()) {
						throw new IOException("Collector interrupted - stopping");
					}
				}
			}
		}
	}

	/**
	 * Parses the (possibly repeated) {@code fields} parameter into a set of
	 * field names, checking that each one exists in the schema and is indexed.
	 */
	private static HashSet<String> collectFields(SolrParams params, IndexSchema schema) {
		String[] fields = params.getParams("fields");
		if (fields == null) {
			throw new SolrException(ErrorCode.BAD_REQUEST, "Missing required parameter: fields");
		}

		HashSet<String> fieldsToLoad = new HashSet<String>();
		for (String f : fields) {
			// split on runs of separators so that "a, b" gives [a, b]
			for (String ff : f.split("[ ,]+")) {
				if (ff.isEmpty()) {
					continue; // a leading separator produces one empty token
				}
				SchemaField field = schema.getFieldOrNull(ff);
				if (field == null || !field.indexed()) {
					throw new SolrException(ErrorCode.BAD_REQUEST, "We cannot dump fields that do not exist or are not indexed: " + ff);
				}
				fieldsToLoad.add(ff);
			}
		}

		if (fieldsToLoad.isEmpty()) {
			throw new SolrException(ErrorCode.BAD_REQUEST, "The parameter 'fields' does not contain any field name");
		}
		return fieldsToLoad;
	}

	/**
	 * Returns a short human-readable description of this provider.
	 */
	@Override
	public String getDescription() {
		return "Dumps term, termFreq, and docFreq (for selected fields) to disk in CSV format";
	}
}