package uk.ac.shef.dcs.jate.indexing; import org.apache.log4j.Logger; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.common.SolrInputDocument; import org.apache.tika.utils.ExceptionUtils; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.JATEProperties; import uk.ac.shef.dcs.jate.io.DocumentCreator; import uk.ac.shef.dcs.jate.model.JATEDocument; import uk.ac.shef.dcs.jate.util.SolrUtil; import java.io.IOException; import java.util.*; public class IndexingHandler { private static final Logger LOG = Logger.getLogger(IndexingHandler.class.getName()); public void index(List<String> tasks, int batchSize, DocumentCreator docCreator, SolrClient solrClient, JATEProperties properties){ StringBuilder msg = new StringBuilder("Beginning indexing dataset").append(", total docs="+tasks.size()); LOG.info(msg.toString()); int total=0, batches=0; StringBuilder skipped=new StringBuilder(); for(String task: tasks){ try { JATEDocument doc = docCreator.create(task); String content=doc.getContent().trim(); if(content.length()==0){ skipped.append(doc.getId()).append("\n"); continue; } total++; SolrInputDocument solrDoc = new SolrInputDocument(); solrDoc.addField(properties.getSolrFieldNameID(), doc.getId()); solrDoc.addField(properties.getSolrFieldNameJATENGramInfo(), doc.getContent()); solrDoc.addField(properties.getSolrFieldNameJATECTerms(), doc.getContent()); for(Map.Entry<String, String> field2Value : doc.getMapField2Content().entrySet()){ String field = field2Value.getKey(); String value = field2Value.getValue(); solrDoc.addField(field, value); } solrClient.add(solrDoc); if(total%batchSize==0) { batches++; LOG.info("Done batches: "+batches*batchSize); SolrUtil.commit(solrClient, LOG, String.valueOf(batches), String.valueOf(batchSize)); } } catch (JATEException e) { StringBuilder message = new StringBuilder("FAILED TO ADD DOC TO SOLR (no commit): "); message.append(task).append("\n") .append(ExceptionUtils.getStackTrace(e)).append("\n"); LOG.error(message.toString()); } catch (IOException e) { StringBuilder message = new StringBuilder("FAILED TO ADD DOC TO SOLR (no commit): "); message.append(task).append("\n") .append(ExceptionUtils.getStackTrace(e)).append("\n"); LOG.error(message.toString()); } catch (SolrServerException e) { StringBuilder message = new StringBuilder("FAILED TO ADD DOC TO SOLR (add): "); message.append(task).append("\n") .append(ExceptionUtils.getStackTrace(e)).append("\n"); LOG.error(message.toString()); } } SolrUtil.commit(solrClient,LOG,String.valueOf(batches+1), String.valueOf(batchSize)); msg=new StringBuilder("Complete indexing dataset. Total processed items = "); msg.append(total); if(skipped.length()!=0) msg.append("\n").append("Some items are skipped because of empty content. If you are not expecting this, check ") .append(DocumentCreator.class.getName()).append(" you have used for indexing, or try a different one.\n"); msg.append(skipped); if(skipped.length()==0) LOG.info(msg.toString()); else LOG.warn(msg.toString()); try { solrClient.close(); } catch (IOException e) { String message = "CANNOT CLOSE SOLR: \n"; LOG.error(message + ExceptionUtils.getStackTrace(e)); } } }