package de.dfki.km.leech.solr; import java.rmi.server.UID; import java.util.HashMap; import java.util.Map.Entry; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrClient; import org.apache.solr.common.SolrInputDocument; import org.apache.tika.metadata.Metadata; import de.dfki.inquisition.collections.MultiValueHashMap; import de.dfki.inquisition.text.StringUtils; import de.dfki.km.leech.metadata.LeechMetadata; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; import de.dfki.km.leech.sax.DataSinkContentHandler; public class ToSolrContentHandler extends DataSinkContentHandler { public static void main(String[] args) { } protected HashMap<String, Integer> m_hsField2MultiValDocCount = new HashMap<>(); protected MultiValueHashMap<String, String> m_hsStaticAttValuePairs = new MultiValueHashMap<String, String>(); protected SolrClient m_solrClient; protected String m_strSolrUrl; protected int m_iErrorEntityCount = 0; /** * Creates a new instance, without a cloudSolrClient (default is ConcurrentUpdateSolrClient) * * @param solrUrl */ public ToSolrContentHandler(String solrUrl) { this(solrUrl, false, null); } /** * Creates a new instance * * @param solrUrl the url(s) to the solr server. In the case cloudSolrClient is true, this is a list of zookeeper servers. In the case it is false, its the URL of the * solr server * @param cloudSolrClient true: the class will create a CloudSolrClient instance. false: creation of ConcurrentUpdateSolrClient * @param defaultCollection only necessary if the CloudSolrClient is used. If you use ConcurrentUpdateSolrClient, specify it either in the solrUrl OR here. Null or * empty values are possible. */ public ToSolrContentHandler(String solrUrl, boolean cloudSolrClient, String defaultCollection) { this.m_strSolrUrl = solrUrl; if(cloudSolrClient) { m_solrClient = new CloudSolrClient(solrUrl); ((CloudSolrClient) m_solrClient).setDefaultCollection(defaultCollection); } else { if(!StringUtils.nullOrWhitespace(defaultCollection)) { if(!solrUrl.endsWith("/")) solrUrl += "/"; solrUrl += defaultCollection; } // hier besser einen ConcurrentUpdateSolrClient nehmen, der soll beim Indexieren besser performen...ist ungefähr Faktor 10 schneller ^^ - allerdings muß man // mit der Fehlermeldung aufpassen // alt: m_solrClient = new HttpSolrClient(solrUrl); int iCores = Runtime.getRuntime().availableProcessors(); m_solrClient = new ConcurrentUpdateSolrClient(solrUrl, 2056, iCores / 2) { private static final long serialVersionUID = -8653784811055510844L; @Override public void handleError(Throwable ex) { m_iErrorEntityCount++; Logger.getLogger(ToSolrContentHandler.class.getName()).log(Level.SEVERE, "Error while insertion to SOLR (" + m_iErrorEntityCount + " errors yet). Check the SOLR logs. Error message: " + ex.getMessage()); } }; } } @Override public void crawlFinished() { try { m_solrClient.commit(); m_solrClient.optimize(); m_solrClient.close(); if(m_hsField2MultiValDocCount.size() > 0) Logger.getLogger(ToSolrContentHandler.class.getName()).info("Fields with according doc number with multivalued entries: " + m_hsField2MultiValDocCount); if(m_iErrorEntityCount > 0) Logger.getLogger(ToSolrContentHandler.class.getName()).warning( StringUtils.beautifyNumber(m_iErrorEntityCount) + " errors while inserting to SOLR. Check the SOLR logs."); else Logger.getLogger(ToSolrContentHandler.class.getName()).info(m_iErrorEntityCount + " errors while inserting to SOLR"); } catch (Exception e) { Logger.getLogger(ToSolrContentHandler.class.getName()).log(Level.SEVERE, "Error", e); } } /** * Sets some attribute value pairs that will be added to every crawled document. * * @return the current static attribute value pairs */ public MultiValueHashMap<String, String> getStaticAttributeValuePairs() { return m_hsStaticAttValuePairs; } @Override public void processErrorData(Metadata metadata) { // NOP } @Override public void processModifiedData(Metadata metadata, String strFulltext) { // sadly, there is no update method this.processRemovedData(metadata); this.processNewData(metadata, strFulltext); } @Override public void processNewData(Metadata metadata, String strFulltext) { try { SolrInputDocument doc = new SolrInputDocument(); if(metadata.getValues(LeechMetadata.id).length == 0) doc.addField(LeechMetadata.id, new UID().toString()); if(strFulltext != null && !strFulltext.isEmpty()) doc.addField(LeechMetadata.body, strFulltext); for (String strFieldName : metadata.names()) { String[] values = metadata.getValues(strFieldName); for (String strFieldValue : values) { doc.addField(strFieldName, strFieldValue); } if(values.length > 1) { Integer iMulti4Field = m_hsField2MultiValDocCount.get(strFieldName); if(iMulti4Field == null) iMulti4Field = 1; else iMulti4Field++; m_hsField2MultiValDocCount.put(strFieldName, iMulti4Field); } } // die statischen AttValue Paare MultiValueHashMap<String, String> mhsStaticAttributeValuePairs = getStaticAttributeValuePairs(); for (Entry<String, String> att2value : mhsStaticAttributeValuePairs.entryList()) doc.addField(att2value.getKey(), att2value.getValue()); m_solrClient.add(doc); } catch (Exception e) { Logger.getLogger(ToSolrContentHandler.class.getName()).log(Level.SEVERE, "Error", e); } } @Override public void processProcessedData(Metadata metadata) { // NOP } @Override public void processRemovedData(Metadata metadata) { try { m_solrClient.deleteById(metadata.get(IncrementalCrawlingHistory.dataEntityId)); } catch (Exception e) { Logger.getLogger(ToSolrContentHandler.class.getName()).log(Level.SEVERE, "Error", e); } } @Override public void processUnmodifiedData(Metadata metadata) { // NOP } /** * Sets some attribute value pairs that will be added to every crawled document. * * @param hsStaticAttValuePairs a multi value map containing the additional attribute value pairs * * @return this */ public ToSolrContentHandler setStaticAttributeValuePairs(MultiValueHashMap<String, String> hsStaticAttValuePairs) { m_hsStaticAttValuePairs = hsStaticAttValuePairs; return this; } }