/* * Copyright 2007 T-Rank AS * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package no.trank.openpipe.solr.step; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.stream.XMLStreamException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.methods.GetMethod; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import no.trank.openpipe.api.BasePipelineStep; import no.trank.openpipe.api.PipelineException; import no.trank.openpipe.api.PipelineStepStatus; import no.trank.openpipe.api.document.AnnotatedField; import no.trank.openpipe.api.document.Document; import no.trank.openpipe.api.document.DocumentOperation; import no.trank.openpipe.config.annotation.NotNull; import no.trank.openpipe.config.annotation.NullNotEmpty; import no.trank.openpipe.solr.SolrHttpDocumentPoster; import no.trank.openpipe.solr.analysis.TokenSerializer; import no.trank.openpipe.solr.xml.XmlInputStream; /** * A <tt>PipelineStep</tt> that posts a document to Solr. * * <p>{@linkplain Document}s are converted to <a href="http://wiki.apache.org/solr/UpdateXmlMessages">solr-update-xml * </a>. An URL to solr's schema.xml can be configured to validate field-names and dynamic fields. Typical URL being: * <tt>http://somehost:8983/solr/admin/get-file.jsp?file=schema.xml</tt></p> * * <p>There are two ways control what fields are included in the XML: * {@link #setExcludeInputFields(Set) excludeInputFields} and {@link #setIncludeInputFields(Set) includeInputFields}. * When <tt>includeInputFields</tt> is a not-empty set, only field-names in this set is included in the XML. When * <tt>includeInputFields</tt> is an empty set and <tt>excludeInputFields</tt> is not empty, field-names in * <tt>excludeInputFields</tt> is excluded from the XML</p> * * <p>It's possible to map a document field-name to a solr field-name using * {@link #setInputToOuputFieldMap(Map) inputToOuputFieldMap}. Mapping is applied after <tt>include</tt>-/ * <tt>excludeInputFields</tt></p> * * <p>To set document boost (<doc boost="2.0"/>), add a field that, after mapping, has the name * <tt>"boost"</tt>.</p> * * <p><b>Note</b> Field boost is currently <em>not</em> supported</p> * * <p><em>Example:</em> * <pre> * Document doc = new Document(); * doc.setOperation(DocumentOperation.ADD_VALUE); * doc.setFieldValue("boost", "2.0"); * doc.setFieldValue("url", "http://this/is/a/url"); * doc.setFieldValue("title", "Title"); * doc.setFieldValue("text", "This is the text"); * doc.setFieldValue("ignored", "This text is ignored"); * ... * SolrDocumentProcessor sdp = new SolrDocumentProcessor(); * sdp.setExcludeInputFields(Collections.singelton("ignored")); * sdp.setInputToOuputFieldMap(Collections.singletonMap("url", "id")); * sdp.execute(doc); * </pre> * gives the XML: * <pre> * <add> * <doc boost="2.0"> * <field name="id">http://this/is/a/url</field> * <field name="title">Title</field> * <field name="text">This is the text</field> * </doc> * </add> * </pre> * </p> * * @version $Revision$ */ public class SolrDocumentProcessor extends BasePipelineStep { protected static final String BOOST_KEY = "boost"; private static final Logger log = LoggerFactory.getLogger(SolrDocumentProcessor.class); private final Set<String> solrFields = new HashSet<String>(); private final Set<Pattern> solrDynamicFields = new HashSet<Pattern>(); @NullNotEmpty private String solrSchemaUrl; @NullNotEmpty private String idFieldName; @NotNull private Map<String, String> inputToOuputFieldMap = Collections.emptyMap(); @NotNull private Set<String> excludeInputFields = Collections.emptySet(); @NotNull private Set<String> includeInputFields = Collections.emptySet(); @NotNull private Set<String> tokenizedFields = Collections.emptySet(); private TokenSerializer serializer; @NotNull private SolrHttpDocumentPoster documentPoster; private HttpClient httpClient; private boolean optimizeOnSuccess; /** * Converts a document to XML and posts it to solr. * * @param doc the document to process. * * @return <tt>PipelineStepStatus.DEFAULT</tt>. * * @throws PipelineException if an error occures during processing or posting. * * @see SolrDocumentProcessor */ @Override public PipelineStepStatus execute(Document doc) throws PipelineException { try { // Post the document if (DocumentOperation.DELETE_VALUE.equals(doc.getOperation())) { if (idFieldName != null) { documentPoster.delete(doc.getFieldValues(idFieldName)); } else { log.warn("idFieldName not set -> delete not supported - ignoring"); } } else { final HashMap<String, List<String>> solrOutputDoc = new HashMap<String, List<String>>(); // Get what field we want to post to solr for (String inputField : doc.getFieldNames()) { if (!includeInputFields.isEmpty()) { if (includeInputFields.contains(inputField)) { addField(doc, inputField, solrOutputDoc); } } else if (!excludeInputFields.isEmpty()) { if (!excludeInputFields.contains(inputField)) { addField(doc, inputField, solrOutputDoc); } } else { addField(doc, inputField, solrOutputDoc); } } documentPoster.add(solrOutputDoc, findDocAttributes(solrOutputDoc)); } return PipelineStepStatus.DEFAULT; } catch (XMLStreamException e) { throw new PipelineException("Could not generate xml", e); } } private static Map<String, String> findDocAttributes(HashMap<String, List<String>> solrOutputDoc) { final List<String> boostList = solrOutputDoc.remove(BOOST_KEY); final Map<String, String> attribs; if (boostList != null && !boostList.isEmpty()) { if (boostList.size() > 1) { log.warn("Got multiple boost values {} for document", boostList); } attribs = Collections.singletonMap(BOOST_KEY, boostList.get(0)); } else { attribs = Collections.emptyMap(); } return attribs; } /** * Loads <tt>schema.xml</tt> if {@link #getSolrSchemaUrl() solrSchemaUrl} is not <tt>null</tt>. * * @throws PipelineException if {@link #getDocumentPoster() documentPoster} is <tt>null</tt>, if schema.xml could not * be parsed or if {@link #getTokenizedFields() tokenizedFields} is <em>not</em> empty and * {@link #getSerializer() serializer} is <tt>null</tt>. */ @Override public void prepare() throws PipelineException { super.prepare(); try { documentPoster.prepare(); } catch (MalformedURLException e) { throw new PipelineException("Post url is malformed", e); } catch (IOException e) { throw new PipelineException(e); } if (httpClient == null) { httpClient = new HttpClient(); } if (solrSchemaUrl != null) { try { loadIndexSchema(new URL(solrSchemaUrl)); } catch (Exception e) { throw new PipelineException(e); } } addField(BOOST_KEY); // Needed even if there is no schemaUrl if (!tokenizedFields.isEmpty() && serializer == null) { throw new PipelineException("TokenizedFields set, but no serializer configured"); } } /** * Finishes this batch, by posting outstanding documents (if any) to solr. Cleans up any resources. * * @throws PipelineException if post to solr failed. */ @Override public void finish(boolean success) throws PipelineException { if (serializer != null) { try { serializer.close(); } catch (IOException e) { // Ignoring } } try { documentPoster.finish(); if (success && optimizeOnSuccess) { documentPoster.optimize(); } } catch (XMLStreamException e) { throw new PipelineException("Could not write xml", e); } } public String getSolrSchemaUrl() { return solrSchemaUrl; } public void setSolrSchemaUrl(String solrSchemaUrl) { this.solrSchemaUrl = solrSchemaUrl; } public Set<String> getExcludeInputFields() { return excludeInputFields; } public void setExcludeInputFields(Set<String> excludeInputFields) { this.excludeInputFields = excludeInputFields; } public Set<String> getIncludeInputFields() { return includeInputFields; } public void setIncludeInputFields(Set<String> includeInputFields) { this.includeInputFields = includeInputFields; } public String getIdFieldName() { return idFieldName; } public void setIdFieldName(String idFieldName) { this.idFieldName = idFieldName; } public SolrHttpDocumentPoster getDocumentPoster() { return documentPoster; } public void setDocumentPoster(SolrHttpDocumentPoster documentPoster) { this.documentPoster = documentPoster; } public Map<String, String> getInputToOuputFieldMap() { return inputToOuputFieldMap; } public void setInputToOuputFieldMap(Map<String, String> inputToOuputFieldMap) { this.inputToOuputFieldMap = inputToOuputFieldMap; } public Set<String> getTokenizedFields() { return tokenizedFields; } public void setTokenizedFields(Set<String> tokenizedFields) { this.tokenizedFields = tokenizedFields; } public TokenSerializer getSerializer() { return serializer; } public void setSerializer(TokenSerializer serializer) { this.serializer = serializer; } /** * Gets an <em>unmodifiable</em> set of field-names. * * @return an <em>unmodifiable</em> set of field-names. */ protected Set<String> getSolrFields() { return Collections.unmodifiableSet(solrFields); } public void setHttpClient(HttpClient httpClient) { this.httpClient = httpClient; } @Override public String getRevision() { return "$Revision$"; } protected void addField(Document doc, String inputField, HashMap<String, List<String>> solrOutputDoc) throws PipelineException { final String ouputField = getOuputFieldName(inputField); if (solrSchemaUrl == null || solrFields.contains(ouputField) || matchesDynamicField(ouputField)) { List<String> fieldValueList = solrOutputDoc.get(ouputField); if (fieldValueList == null) { fieldValueList = new ArrayList<String>(); solrOutputDoc.put(ouputField, fieldValueList); } if (tokenizedFields.contains(inputField)) { fieldValueList.addAll(serialize(doc.getFields(inputField))); } else { fieldValueList.addAll(doc.getFieldValues(inputField)); } } else if (log.isDebugEnabled()) { log.debug("Field '{}' does not exist in solr schema, and does not match a dynamic field. Skipped.", ouputField); } } private List<String> serialize(List<AnnotatedField> fields) { final List<String> list = new ArrayList<String>(fields.size()); for (AnnotatedField field : fields) { list.add(serializer.serialize(field)); } return list; } protected String getOuputFieldName(String inputField) { final String mappedName = inputToOuputFieldMap.get(inputField); return mappedName == null ? inputField : mappedName; } protected boolean matchesDynamicField(String inputField) { for (Pattern dynamicField : solrDynamicFields) { if (dynamicField.matcher(inputField).matches()) { return true; } } return false; } private void loadIndexSchema(URL url) throws IOException, SAXException, ParserConfigurationException, XPathExpressionException { solrFields.clear(); solrDynamicFields.clear(); InputStream sIn; if (url.getProtocol().equals("file")) { sIn = url.openStream(); } else { GetMethod get = new GetMethod(url.toExternalForm()); httpClient.executeMethod(get); sIn = get.getResponseBodyAsStream(); } final InputStream in = new XmlInputStream(sIn); try { DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); org.w3c.dom.Document document = builder.parse(in); final XPath xpath = XPathFactory.newInstance().newXPath(); final NodeList nodes = (NodeList) xpath.evaluate("/schema/fields/field | /schema/fields/dynamicField", document, XPathConstants.NODESET); for (int i = 0; i < nodes.getLength(); i++) { final Node node = nodes.item(i); final String name = ((Element) node).getAttribute("name"); final String nodeName = node.getNodeName(); if ("field".equals(nodeName)) { addField(name); } else if ("dynamicField".equals(nodeName)) { addDynamicField(name); } } if (idFieldName == null) { Node idNode = (Node) xpath.evaluate("/schema/uniqueKey", document, XPathConstants.NODE); idFieldName = idNode.getTextContent().trim(); } } finally { try { in.close(); } catch (Exception e) { // Do nothing } } } protected boolean addField(String fieldName) { return solrFields.add(fieldName); } protected boolean addDynamicField(String fieldPattern) { return solrDynamicFields.add(Pattern.compile(fieldPattern.replaceAll("\\*", "\\.*"))); } public void setOptimizeOnSuccess(boolean optimizeOnSuccess) { this.optimizeOnSuccess = optimizeOnSuccess; } }