package org.myrobotlab.document.transformer; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.common.SolrInputDocument; import org.myrobotlab.document.Document; import org.myrobotlab.logging.LoggerFactory; import org.slf4j.Logger; /** * This stage will convert an MRL document to a solr document. It then batches * those documents and sends the batches to solr. Upon a flush call any partial * batches will be flushed. * * @author kwatters * */ public class SendToSolr extends AbstractStage { public final static Logger log = LoggerFactory.getLogger(SendToSolr.class); private String idField = "id"; private String fieldsField = "fields"; private boolean addFieldsField = false; private SolrServer solrServer = null; private String solrUrl = "http://localhost:8983/solr/collection1"; private boolean issueCommit = true; private int batchSize = 100; // private LinkedBlockingQueue<SolrInputDocument> batch = new // LinkedBlockingQueue<SolrInputDocument>(); // Synchronized list. needed for thread safety. private List<SolrInputDocument> batch = Collections.synchronizedList(new ArrayList<SolrInputDocument>()); // private String basicAuthUser = null; // private String basicAuthPass = null; // Batch size +/- @Override public void startStage(StageConfiguration config) { solrUrl = config.getProperty("solrUrl", solrUrl); issueCommit = config.getBoolParam("issueCommit", new Boolean(issueCommit)); batchSize = Integer.valueOf(config.getIntegerParam("batchSize", batchSize)); // basicAuthUser = config.getStringParam("basicAuthUser", basicAuthUser); // basicAuthPass = config.getStringParam("basicAuthPass", basicAuthPass); // Initialize a connection to the solr server on startup. if (solrServer == null) { // TODO: support an embeded solr instance log.info("Connecting to Solr at {}", solrUrl); // set credentials. // if (basicAuthUser != null) { // DefaultHttpClient httpClient = new DefaultHttpClient(); // httpClient.getCredentialsProvider().setCredentials(AuthScope.ANY, new // UsernamePasswordCredentials(basicAuthUser, basicAuthPass)); // create solr server with client. // solrServer = new HttpSolrServer( solrUrl , httpClient); // } else { solrServer = new HttpSolrServer(solrUrl); // } } else { log.info("Solr instance already created."); } } @Override public List<Document> processDocument(Document doc) { SolrInputDocument solrDoc = new SolrInputDocument(); // set the id field on the solr doc String docId = doc.getId(); for (String fieldName : doc.getFields()) { for (Object value : doc.getField(fieldName)) { solrDoc.addField(fieldName, value); } if (addFieldsField) { solrDoc.addField(fieldsField, fieldName); } } // prevent id field duplicate values. // remove the id field if it was set, solrDoc.removeField(idField); // make sure we add it back solrDoc.setField(idField, docId); // I guess we have the full document, we should send it // ArrayList<SolrInputDocument> solrDocs = new // ArrayList<SolrInputDocument>(); // solrDocs.add(solrDoc); try { synchronized (batch) { batch.add(solrDoc); if (batch.size() >= batchSize) { // System.out.println("Solr Server Flush Batch..."); // you are blocking? solrServer.add(batch); log.info("Sending Batch to Solr. Size: {}", batch.size()); // System.out.println("Solr batch sent.."); // batch.clear(); batch = Collections.synchronizedList(new ArrayList<SolrInputDocument>()); } else { // System.out.println("Batch Size " + batch.size()); } } } catch (SolrServerException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } // TODO: NO COMMITS HERE! // solrServer.commit(); return null; } @Override public void stopStage() { // TODO Auto-generated method stub flush(); } public synchronized void flush() { // Is this where I should flush the last batch? if (solrServer != null && batch.size() > 0) { try { log.info("flushing last batch. Size: {}", batch.size()); solrServer.add(batch); } catch (SolrServerException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { batch.clear(); } } // TODO: should we commit on flush? try { if (issueCommit) { log.info("Committing solr"); solrServer.commit(); } } catch (SolrServerException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } // super.flush(); } }