SolrDocumentProcessor.java example

Explorer

openpipe-master
- lemmatizer
  - src
    - main
      - java
        no
        trank
        openpipe
        lemmatizer
        LemmatizerStep.java
        model
        LemmaDeSerializer.java
        LemmaSuffix.java
        LemmatizeModel.java
        LemmatizeModelFactory.java
        parser
        Parser.java
        TextParser.java
        util
        TernarySearchTree.java
        TreeEntry.java
        TreeValue.java
        TreeValueFactory.java
    - test
      - java
        no
        trank
        openpipe
        lemmatizer
        model
        LemmaDeSerializerTest.java
        parser
        TextParserTest.java
        util
        TernarySearchTreeTest.java
- openpipe-core
  - src
    - main
      - java
        no
        trank
        openpipe
        MainSpring.java
        api
        BasePipelineExceptionHandler.java
        BasePipelineStep.java
        BaseSubPipeline.java
        DefaultPipelineExceptionHandler.java
        Finishable.java
        LoggingPipelineExceptionListener.java
        MultiInputFieldPipelineStep.java
        MultiInputOutputFieldPipelineStep.java
        MultiPipelineException.java
        Pipeline.java
        PipelineException.java
        PipelineExceptionHandler.java
        PipelineExceptionListener.java
        PipelineFlow.java
        PipelineFlowEnum.java
        PipelineRunner.java
        PipelineStatusCode.java
        PipelineStep.java
        PipelineStepStatus.java
        PipelineStepStatusCode.java
        SubPipeline.java
        document
        AnnotatedField.java
        Annotation.java
        BaseAnnotatedField.java
        BaseAnnotation.java
        BaseResolvedAnnotation.java
        ByteArrayRawData.java
        Document.java
        DocumentOperation.java
        DocumentProducer.java
        DomRawData.java
        FileRawData.java
        PreResolvedAnnotation.java
        RawData.java
        ResolvedAnnotation.java
        package-info.java
        package-info.java
        config
        BeanValidator.java
        annotation
        NotEmpty.java
        NotNull.java
        NullNotEmpty.java
        reader
        FileDocumentReader.java
        MultiXmlDocumentReader.java
        TextFileDocumentReader.java
        step
        AnnotateSentence.java
        AnnotateSpace.java
        AnnotationToField.java
        ChecksumFields.java
        ChopField.java
        ConvertDate.java
        CopyField.java
        Debug.java
        FieldPipelineSelector.java
        HierarchicalSplitter.java
        OperationPipelineSelector.java
        ParseXML.java
        ParseXMLXPath.java
        PipelineSelector.java
        RegexField.java
        RemoveFields.java
        SetField.java
        StripHtml.java
        Uppercase.java
        WriteXML.java
        xml
        DocumentWriter.java
        XMLStreamDocWriter.java
        util
        AcceptAllFileFilter.java
        FilesFirstComparator.java
        HexUtil.java
        IdentityHashSet.java
        Iterators.java
        QNameEditor.java
        RegexFileFilter.java
        log
        DefaultTimedLogger.java
        NoopTimedLogger.java
        TimedLogger.java
        TotalTimedLogger.java
    - test
      - java
        no
        trank
        openpipe
        api
        BaseSubPipelineTest.java
        DefaultPipelineExceptionHandlerTest.java
        PipelineTest.java
        document
        BaseAnnotatedFieldTest.java
        DocumentTest.java
        config
        BeanValidatorTest.java
        reader
        FileDocumentReaderTest.java
        MultiXmlDocumentReaderTest.java
        step
        AnnotationToFieldTest.java
        ChecksumFieldsTest.java
        ChopFieldTest.java
        ConvertDateTest.java
        HierarchicalSplitterTest.java
        OperationPipelineSelectorTest.java
        ParseXMLTest.java
        ParseXMLXPathTest.java
        RegexFieldTest.java
        SetFieldTest.java
        StripHtmlTest.java
        util
        HexUtilTest.java
        RegexFileFilterTest.java
- openpipe-jdbc
  - src
    - main
      - java
        no
        trank
        openpipe
        jdbc
        DocumentMapper.java
        HtmlJdbcStats.java
        JdbcAdmin.java
        JdbcDocumentProducer.java
        JdbcPoller.java
        JdbcStats.java
        MetaDataDocumentMapper.java
        NoopJdbcStats.java
        SimpleJdbcDocumentProducer.java
        store
        IdStateHolder.java
        StateDocumentProducer.java
        StateDocumentStep.java
        StringRowMapper.java
        TableDescription.java
    - test
      - java
        no
        trank
        openpipe
        jdbc
        SimpleJdbcDocumentProducerTest.java
        store
        StateDocumentProducerTest.java
- openpipe-lang
  - src
    - main
      - java
        no
        trank
        openpipe
        lang
        step
        LanguageIdentifier.java
    - test
      - java
        no
        trank
        openpipe
        lang
        step
        LanguageIdentifierTest.java
- openpipe-opennlp
  - src
    - main
      - java
        no
        trank
        openpipe
        opennlp
        io
        InputStreamGISModelReader.java
        step
        ONLPNEDetector.java
        ONLPSentenceDetector.java
        ONLPTokenizer.java
- openpipe-solr
  - src
    - main
      - java
        no
        trank
        openpipe
        solr
        SolrDocumentPostException.java
        SolrHttpDocumentPoster.java
        SolrXmlDocumentWriter.java
        UpdateOptions.java
        analysis
        AnnotationTokenStream.java
        Base64TokenSerializer.java
        TokenAnnotation.java
        TokenSerializer.java
        TokenStreamAnnotation.java
        step
        SolrAnalyzerStep.java
        SolrDocumentProcessor.java
        util
        TokenFilterFactoryFactory.java
        xml
        XmlInputStream.java
    - test
      - java
        no
        trank
        openpipe
        solr
        step
        SolrDocumentProcessorTest.java
        xml
        XmlInputStreamTest.java
- parse
  - src
    - main
      - java
        no
        trank
        openpipe
        parse
        api
        ParseData.java
        Parser.java
        ParserException.java
        ParserResult.java
        ParserResultImpl.java
        PipelineParseData.java
        package-info.java
        step
        DocumentParser.java
        package-info.java
- parse-misc
  - src
    - main
      - java
        no
        trank
        openpipe
        parse
        oo
        OOParser.java
        pdf
        PDFParser.java
        text
        TextDecoder.java
        TextParser.java
        xml
        XMLParser.java
    - test
      - java
        no
        trank
        openpipe
        parse
        oo
        OOParserTest.java
        xml
        XMLParserTest.java
- parse-ms
  - src
    - main
      - java
        no
        trank
        openpipe
        parse
        ms
        ExcelParser.java
        POIUtils.java
        PowerPointParser.java
        WordParser.java
    - test
      - java
        no
        trank
        openpipe
        parse
        ms
        AbstractMsParserTest.java
        ExcelParserTest.java
        PowerPointParserTest.java
        WordParserTest.java
- solr-producer
  - src
    - main
      - java
        no
        trank
        openpipe
        solr
        producer
        SolrDocumentProducer.java
        SolrUpdateServlet.java
        xml
        XmlStreamDocumentReader.java
    - test
      - java
        no
        trank
        openpipe
        solr
        producer
        SolrUpdateServletTest.java
        xml
        XmlStreamDocumentReaderTest.java
- solr-tokenizer
  - src
    - main
      - java
        no
        trank
        openpipe
        solr
        analysis
        BinaryIO.java
        BinaryTokenDeserializer.java
        BinaryTokenDeserializerFactory.java
        io
        Base64InputStream.java
        Base64Output.java
        Base64OutputBuffer.java
        Base64OutputStream.java
        schema
        Base64Type.java
        util
        IOUtil.java
    - test
      - java
        no
        trank
        openpipe
        solr
        analysis
        BinaryIOTest.java
        BinaryTokenDeserializerTest.java
        io
        Base64InputStreamTest.java
        Base64OutputBufferTest.java
        Base64OutputStreamTest.java
        util
        IOUtilTest.java
- tutorial-intranet
  - src
    - main
      - java
        no
        trank
        openpipe
        tutorial
        intranet
        Main.java
    - test
      - java
        TestMainIntranet.java
- wikipedia
  - src
    - main
      - java
        no
        trank
        openpipe
        wikipedia
        WikipediaDumpHandler.java
        download
        DownloadProgressListener.java
        DownloadProgressLogger.java
        DownloadingWikipediaDumpHandler.java
        HttpDownloader.java
        NullProgressListener.java
        producer
        InputStreamPrefixStripper.java
        WikiDocumentSplitter.java
        WikipediaDocumentProducer.java
        meta
        RssMetaParser.java
        step
        WikipediaUrlBuilder.java
    - test
      - java
        no
        trank
        openpipe
        wikipedia
        download
        HttpDownloaderTest.java
        producer
        WikiDocumentSplitterTest.java
        meta
        RssMetaParserTest.java
        step
        WikipediaUrlBuilderTest.java

/*
 * Copyright 2007  T-Rank AS
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package no.trank.openpipe.solr.step;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLStreamException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import no.trank.openpipe.api.BasePipelineStep;
import no.trank.openpipe.api.PipelineException;
import no.trank.openpipe.api.PipelineStepStatus;
import no.trank.openpipe.api.document.AnnotatedField;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.api.document.DocumentOperation;
import no.trank.openpipe.config.annotation.NotNull;
import no.trank.openpipe.config.annotation.NullNotEmpty;
import no.trank.openpipe.solr.SolrHttpDocumentPoster;
import no.trank.openpipe.solr.analysis.TokenSerializer;
import no.trank.openpipe.solr.xml.XmlInputStream;

/**
 * A <tt>PipelineStep</tt> that posts a document to Solr.
 * 
 * <p>{@linkplain Document}s are converted to <a href="http://wiki.apache.org/solr/UpdateXmlMessages">solr-update-xml
 * </a>. A URL to solr's schema.xml can be configured to validate field-names and dynamic fields. A typical URL is:
 * <tt>http://somehost:8983/solr/admin/get-file.jsp?file=schema.xml</tt></p>
 * 
 * <p>There are two ways to control which fields are included in the XML:
 * {@link #setExcludeInputFields(Set) excludeInputFields} and {@link #setIncludeInputFields(Set) includeInputFields}.
 * When <tt>includeInputFields</tt> is a non-empty set, only field-names in this set are included in the XML. When
 * <tt>includeInputFields</tt> is an empty set and <tt>excludeInputFields</tt> is not empty, field-names in
 * <tt>excludeInputFields</tt> are excluded from the XML.</p>
 * 
 * <p>It's possible to map a document field-name to a solr field-name using 
 * {@link #setInputToOuputFieldMap(Map) inputToOuputFieldMap}. Mapping is applied after <tt>include</tt>-/
 * <tt>excludeInputFields</tt></p>
 * 
 * <p>To set document boost (<tt>&lt;doc boost="2.0"/&gt;</tt>), add a field that, after mapping, has the name
 * <tt>"boost"</tt>.</p>
 * 
 * <p><b>Note</b> Field boost is currently <em>not</em> supported</p>
 * 
 * <p><em>Example:</em>
 * <pre>
 * Document doc = new Document();
 * doc.setOperation(DocumentOperation.ADD_VALUE);
 * doc.setFieldValue("boost", "2.0");
 * doc.setFieldValue("url", "http://this/is/a/url");
 * doc.setFieldValue("title", "Title");
 * doc.setFieldValue("text", "This is the text");
 * doc.setFieldValue("ignored", "This text is ignored");
 * ...
 * SolrDocumentProcessor sdp = new SolrDocumentProcessor();
 * sdp.setExcludeInputFields(Collections.singleton("ignored"));
 * sdp.setInputToOuputFieldMap(Collections.singletonMap("url", "id"));
 * sdp.execute(doc);
 * </pre>
 * gives the XML:
 * <pre>
 * &lt;add&gt;
 *   &lt;doc boost="2.0"&gt;
 *     &lt;field name="id"&gt;http://this/is/a/url&lt;/field&gt;
 *     &lt;field name="title"&gt;Title&lt;/field&gt;
 *     &lt;field name="text"&gt;This is the text&lt;/field&gt;
 *   &lt;/doc&gt;
 * &lt;/add&gt;
 * </pre>
 * </p>
 * 
 * @version $Revision$
 */
public class SolrDocumentProcessor extends BasePipelineStep {
   /** Output field name whose first value is posted as the document's <tt>boost</tt> attribute instead of a field. */
   protected static final String BOOST_KEY = "boost";
   private static final Logger log = LoggerFactory.getLogger(SolrDocumentProcessor.class);

   /** Output field names accepted when a schema URL is configured; filled from the schema plus {@link #BOOST_KEY}. */
   private final Set<String> solrFields = new HashSet<String>();
   /** Compiled patterns for the schema's <tt>dynamicField</tt> entries; an output name matching any of them is accepted. */
   private final Set<Pattern> solrDynamicFields = new HashSet<Pattern>();

   /** Optional URL of solr's schema.xml. When <tt>null</tt>, no schema is loaded and no output field is filtered out. */
   @NullNotEmpty
   private String solrSchemaUrl;
   /** Name of the document field whose values are sent as ids on delete; taken from the schema's uniqueKey if unset. */
   @NullNotEmpty
   private String idFieldName;
   /** Maps input (document) field-names to output (solr) field-names; unmapped names are used as-is. */
   @NotNull
   private Map<String, String> inputToOuputFieldMap = Collections.emptyMap();
   /** Input field-names to leave out; only consulted while {@link #includeInputFields} is empty. */
   @NotNull
   private Set<String> excludeInputFields = Collections.emptySet();
   /** When non-empty, the only input field-names that are posted. */
   @NotNull
   private Set<String> includeInputFields = Collections.emptySet();
   /** Input field-names whose fields are run through {@link #serializer} instead of being posted as plain values. */
   @NotNull
   private Set<String> tokenizedFields = Collections.emptySet();
   /** Serializer for tokenized fields; must be set when {@link #tokenizedFields} is not empty. */
   private TokenSerializer serializer;
   /** Performs the actual add/delete/optimize calls against solr. */
   @NotNull
   private SolrHttpDocumentPoster documentPoster;
   /** Client used to fetch the schema over non-<tt>file</tt> URLs; a default instance is created in prepare() if unset. */
   private HttpClient httpClient;
   /** Whether finish(true) should also ask the poster to optimize the index. */
   private boolean optimizeOnSuccess;

   /**
    * Hands a single document over to solr, either as a delete or as an add.
    *
    * <p>A document whose operation equals {@link DocumentOperation#DELETE_VALUE} is deleted using the values of the
    * <tt>idFieldName</tt> field; if no <tt>idFieldName</tt> is known the delete is skipped with a warning. Any other
    * document is added: the input fields are selected through <tt>includeInputFields</tt> /
    * <tt>excludeInputFields</tt>, collected under their output names, and passed to the document poster together
    * with the document attributes (boost).</p>
    *
    * @param doc the document to process.
    *
    * @return <tt>PipelineStepStatus.DEFAULT</tt>.
    *
    * @throws PipelineException if an error occurs while generating the XML.
    *
    * @see SolrDocumentProcessor
    */
   @Override
   public PipelineStepStatus execute(Document doc) throws PipelineException {
      try {
         if (DocumentOperation.DELETE_VALUE.equals(doc.getOperation())) {
            if (idFieldName == null) {
               log.warn("idFieldName not set -> delete not supported - ignoring");
            } else {
               documentPoster.delete(doc.getFieldValues(idFieldName));
            }
            return PipelineStepStatus.DEFAULT;
         }

         final HashMap<String, List<String>> outputDoc = new HashMap<String, List<String>>();
         for (String name : doc.getFieldNames()) {
            // The include-set takes precedence; the exclude-set is only consulted when no include-set is given.
            final boolean selected;
            if (!includeInputFields.isEmpty()) {
               selected = includeInputFields.contains(name);
            } else {
               selected = excludeInputFields.isEmpty() || !excludeInputFields.contains(name);
            }
            if (selected) {
               addField(doc, name, outputDoc);
            }
         }

         // Pulls the boost entry out of the map, so it must run before the map is posted.
         final Map<String, String> docAttributes = findDocAttributes(outputDoc);
         documentPoster.add(outputDoc, docAttributes);
         return PipelineStepStatus.DEFAULT;
      } catch (XMLStreamException e) {
         throw new PipelineException("Could not generate xml", e);
      }
   }

   /**
    * Extracts the document-level attributes from the collected output fields.
    *
    * <p>The {@link #BOOST_KEY} entry is <em>removed</em> from <tt>solrOutputDoc</tt>. If it held at least one value,
    * the first value is returned as the <tt>boost</tt> attribute; additional values are reported with a warning and
    * otherwise ignored.</p>
    *
    * @param solrOutputDoc the output fields; modified by this call.
    *
    * @return a single-entry map with the boost, or an empty map if no boost value was present.
    */
   private static Map<String, String> findDocAttributes(HashMap<String, List<String>> solrOutputDoc) {
      final List<String> boosts = solrOutputDoc.remove(BOOST_KEY);
      if (boosts == null || boosts.isEmpty()) {
         return Collections.<String, String>emptyMap();
      }
      if (boosts.size() > 1) {
         log.warn("Got multiple boost values {} for document", boosts);
      }
      return Collections.singletonMap(BOOST_KEY, boosts.get(0));
   }

   /**
    * Prepares the document poster and loads <tt>schema.xml</tt> if {@link #getSolrSchemaUrl() solrSchemaUrl} is not
    * <tt>null</tt>.
    *
    * <p>A default <tt>HttpClient</tt> is created if none has been set. {@link #BOOST_KEY} is always registered as a
    * known field, after the schema has been loaded, because loading the schema resets the known fields.</p>
    *
    * @throws PipelineException if the document poster could not be prepared, if schema.xml could not be loaded or
    * parsed (the message names the offending URL), or if {@link #getTokenizedFields() tokenizedFields} is
    * <em>not</em> empty and {@link #getSerializer() serializer} is <tt>null</tt>.
    */
   @Override
   public void prepare() throws PipelineException {
      super.prepare();
      try {
         documentPoster.prepare();
      } catch (MalformedURLException e) {
         throw new PipelineException("Post url is malformed", e);
      } catch (IOException e) {
         throw new PipelineException("Could not prepare document poster", e);
      }

      if (httpClient == null) {
         httpClient = new HttpClient();
      }

      if (solrSchemaUrl != null) {
         try {
            loadIndexSchema(new URL(solrSchemaUrl));
         } catch (Exception e) {
            // Deliberately broad: any failure while fetching/parsing the schema is reported as a PipelineException.
            throw new PipelineException("Could not load solr schema from '" + solrSchemaUrl + "'", e);
         }
      }
      addField(BOOST_KEY); // Needed even if there is no schemaUrl
      if (!tokenizedFields.isEmpty() && serializer == null) {
         throw new PipelineException("TokenizedFields set, but no serializer configured");
      }
   }

   /**
    * Finishes this batch, by posting outstanding documents (if any) to solr. Cleans up any resources.
    *
    * <p>The serializer (if any) is closed first; a failure to close it is logged and does not prevent the document
    * poster from being finished. The index is optimized only when <tt>success</tt> is <tt>true</tt> and
    * <tt>optimizeOnSuccess</tt> has been enabled.</p>
    *
    * @param success whether the batch completed successfully.
    *
    * @throws PipelineException if post to solr failed.
    */
   @Override
   public void finish(boolean success) throws PipelineException {
      if (serializer != null) {
         try {
            serializer.close();
         } catch (IOException e) {
            // Best effort: closing the serializer must not stop the outstanding documents from being posted.
            log.warn("Could not close token serializer", e);
         }
      }
      try {
         documentPoster.finish();
         if (success && optimizeOnSuccess) {
            documentPoster.optimize();
         }
      } catch (XMLStreamException e) {
         throw new PipelineException("Could not write xml", e);
      }
   }

   /**
    * Gets the URL of solr's <tt>schema.xml</tt>.
    *
    * @return the schema URL, or <tt>null</tt> if none has been set.
    */
   public String getSolrSchemaUrl() {
      return solrSchemaUrl;
   }

   /**
    * Sets the URL of solr's <tt>schema.xml</tt>. The schema is loaded by {@link #prepare()}; when the URL is
    * <tt>null</tt> no schema is loaded and output fields are not checked against it.
    *
    * @param solrSchemaUrl the schema URL, or <tt>null</tt> to disable schema checking.
    */
   public void setSolrSchemaUrl(String solrSchemaUrl) {
      this.solrSchemaUrl = solrSchemaUrl;
   }

   /**
    * Gets the input field-names that are left out of the posted document.
    *
    * @return the set of excluded input field-names (the stored instance, not a copy).
    */
   public Set<String> getExcludeInputFields() {
      return excludeInputFields;
   }

   /**
    * Sets the input field-names to leave out of the posted document. Only consulted while
    * <tt>includeInputFields</tt> is empty. The set is stored by reference.
    *
    * @param excludeInputFields the input field-names to exclude.
    */
   public void setExcludeInputFields(Set<String> excludeInputFields) {
      this.excludeInputFields = excludeInputFields;
   }

   /**
    * Gets the input field-names that are posted when the set is not empty.
    *
    * @return the set of included input field-names (the stored instance, not a copy).
    */
   public Set<String> getIncludeInputFields() {
      return includeInputFields;
   }

   /**
    * Sets the input field-names to post. When this set is not empty, only these fields are considered and
    * <tt>excludeInputFields</tt> is not consulted. The set is stored by reference.
    *
    * @param includeInputFields the input field-names to include.
    */
   public void setIncludeInputFields(Set<String> includeInputFields) {
      this.includeInputFields = includeInputFields;
   }

   /**
    * Gets the name of the document field whose values are used as ids when a document is deleted.
    *
    * @return the id field-name, or <tt>null</tt> if it is neither set nor read from the schema.
    */
   public String getIdFieldName() {
      return idFieldName;
   }

   /**
    * Sets the name of the document field whose values are used as ids when a document is deleted. If left
    * <tt>null</tt> and a schema is loaded, the schema's <tt>uniqueKey</tt> is used instead.
    *
    * @param idFieldName the id field-name.
    */
   public void setIdFieldName(String idFieldName) {
      this.idFieldName = idFieldName;
   }

   /**
    * Gets the poster used to send adds, deletes and optimize requests to solr.
    *
    * @return the document poster.
    */
   public SolrHttpDocumentPoster getDocumentPoster() {
      return documentPoster;
   }

   /**
    * Sets the poster used to send adds, deletes and optimize requests to solr.
    *
    * @param documentPoster the document poster.
    */
   public void setDocumentPoster(SolrHttpDocumentPoster documentPoster) {
      this.documentPoster = documentPoster;
   }

   /**
    * Gets the mapping from input (document) field-names to output (solr) field-names.
    *
    * @return the field-name mapping (the stored instance, not a copy).
    */
   public Map<String, String> getInputToOuputFieldMap() {
      return inputToOuputFieldMap;
   }

   /**
    * Sets the mapping from input (document) field-names to output (solr) field-names. Input names without an entry
    * keep their name. The map is stored by reference.
    *
    * @param inputToOuputFieldMap the field-name mapping.
    */
   public void setInputToOuputFieldMap(Map<String, String> inputToOuputFieldMap) {
      this.inputToOuputFieldMap = inputToOuputFieldMap;
   }

   /**
    * Gets the input field-names whose fields are serialized with the configured serializer before posting.
    *
    * @return the set of tokenized input field-names (the stored instance, not a copy).
    */
   public Set<String> getTokenizedFields() {
      return tokenizedFields;
   }

   /**
    * Sets the input field-names whose fields are serialized with the configured serializer before posting. The
    * names are matched against the <em>input</em> field-name, not the mapped output name. A non-empty set requires
    * a serializer to be set before {@link #prepare()} is called. The set is stored by reference.
    *
    * @param tokenizedFields the tokenized input field-names.
    */
   public void setTokenizedFields(Set<String> tokenizedFields) {
      this.tokenizedFields = tokenizedFields;
   }

   /**
    * Gets the serializer applied to tokenized fields.
    *
    * @return the serializer, or <tt>null</tt> if none has been set.
    */
   public TokenSerializer getSerializer() {
      return serializer;
   }

   /**
    * Sets the serializer applied to tokenized fields. The serializer is closed when this step is finished.
    *
    * @param serializer the serializer.
    */
   public void setSerializer(TokenSerializer serializer) {
      this.serializer = serializer;
   }

   /**
    * Gets an <em>unmodifiable</em> set of the known solr field-names.
    *
    * <p>The returned set is a read-only view of the internal set, not a snapshot: later changes to the known
    * fields are visible through it.</p>
    *
    * @return an <em>unmodifiable</em> set of field-names.
    */
   protected Set<String> getSolrFields() {
      return Collections.unmodifiableSet(solrFields);
   }

   /**
    * Sets the HTTP client used to fetch the schema from non-<tt>file</tt> URLs. If none is set, {@link #prepare()}
    * creates a default client.
    *
    * @param httpClient the HTTP client.
    */
   public void setHttpClient(HttpClient httpClient) {
      this.httpClient = httpClient;
   }

   /**
    * Gets the revision string of this step.
    *
    * @return the literal <tt>"$Revision$"</tt>; presumably a version-control keyword placeholder (TODO confirm).
    */
   @Override
   public String getRevision() {
      return "$Revision$";
   }

   /**
    * Copies the values of one input field into the output document, under its mapped output name.
    *
    * <p>When a schema URL is configured, the output name must be a known schema field or match a dynamic field;
    * otherwise the field is skipped (reported at debug level). Values of fields listed in <tt>tokenizedFields</tt>
    * are serialized, all other values are copied as they are. Values are appended to any values already collected
    * for the same output name.</p>
    *
    * @param doc the document to read the field from.
    * @param inputField the name of the field in <tt>doc</tt>.
    * @param solrOutputDoc the output document being built; modified by this call.
    *
    * @throws PipelineException declared for overriding implementations; not thrown by this implementation itself.
    */
   protected void addField(Document doc, String inputField, HashMap<String, List<String>> solrOutputDoc)
         throws PipelineException {
      final String targetName = getOuputFieldName(inputField);
      final boolean accepted =
            solrSchemaUrl == null || solrFields.contains(targetName) || matchesDynamicField(targetName);
      if (!accepted) {
         if (log.isDebugEnabled()) {
            log.debug("Field '{}' does not exist in solr schema, and does not match a dynamic field. Skipped.", targetName);
         }
         return;
      }

      List<String> collected = solrOutputDoc.get(targetName);
      if (collected == null) {
         collected = new ArrayList<String>();
         solrOutputDoc.put(targetName, collected);
      }

      if (!tokenizedFields.contains(inputField)) {
         collected.addAll(doc.getFieldValues(inputField));
      } else {
         collected.addAll(serialize(doc.getFields(inputField)));
      }
   }

   /**
    * Serializes each field with the configured serializer.
    *
    * @param fields the fields to serialize.
    *
    * @return the serialized form of each field, in the same order as <tt>fields</tt>.
    */
   private List<String> serialize(List<AnnotatedField> fields) {
      final int count = fields.size();
      final List<String> serialized = new ArrayList<String>(count);
      for (int i = 0; i < count; i++) {
         serialized.add(serializer.serialize(fields.get(i)));
      }
      return serialized;
   }

   /**
    * Resolves the output (solr) field-name for an input field-name.
    *
    * @param inputField the input field-name.
    *
    * @return the name <tt>inputField</tt> is mapped to, or <tt>inputField</tt> itself when the map yields no
    * (non-null) name for it.
    */
   protected String getOuputFieldName(String inputField) {
      final String mapped = inputToOuputFieldMap.get(inputField);
      if (mapped != null) {
         return mapped;
      }
      return inputField;
   }

   /**
    * Tells whether a field-name matches any of the registered dynamic-field patterns.
    *
    * @param inputField the field-name to test; the whole name must match a pattern.
    *
    * @return <tt>true</tt> if at least one dynamic-field pattern matches, otherwise <tt>false</tt>.
    */
   protected boolean matchesDynamicField(String inputField) {
      boolean matched = false;
      for (Pattern candidate : solrDynamicFields) {
         if (candidate.matcher(inputField).matches()) {
            matched = true;
            break;
         }
      }
      return matched;
   }

   /**
    * Loads solr's <tt>schema.xml</tt> and registers its fields and dynamic fields.
    *
    * <p>Previously registered fields and dynamic fields are discarded first. <tt>file</tt> URLs are opened directly,
    * all other URLs are fetched with the configured <tt>HttpClient</tt>. If no <tt>idFieldName</tt> has been set, it
    * is taken from the schema's <tt>uniqueKey</tt> element; when the schema has no (non-blank) <tt>uniqueKey</tt>,
    * <tt>idFieldName</tt> stays unset and a warning is logged.</p>
    *
    * @param url the location of the schema.
    *
    * @throws IOException if the schema could not be read, or the server answered with a non-2xx status or no body.
    * @throws SAXException if the schema is not well-formed XML.
    * @throws ParserConfigurationException if no XML parser could be created.
    * @throws XPathExpressionException if the schema could not be queried.
    */
   private void loadIndexSchema(URL url) throws IOException, SAXException, ParserConfigurationException, XPathExpressionException {
      solrFields.clear();
      solrDynamicFields.clear();

      GetMethod get = null;
      InputStream in = null;
      try {
         if (url.getProtocol().equals("file")) {
            in = url.openStream();
         } else {
            get = new GetMethod(url.toExternalForm());
            final int status = httpClient.executeMethod(get);
            // Fail early on error pages instead of feeding them to the XML parser.
            if (status < 200 || status >= 300) {
               throw new IOException("Could not fetch solr schema from " + url + ": HTTP status " + status);
            }
            in = get.getResponseBodyAsStream();
            if (in == null) {
               throw new IOException("Could not fetch solr schema from " + url + ": empty response");
            }
         }
         // Re-assigning keeps the raw stream reachable for the finally-block if the wrapper's constructor fails.
         in = new XmlInputStream(in);

         // NOTE(review): the parser uses default settings (DTDs/external entities enabled); acceptable only as long
         // as the schema URL is trusted configuration.
         final DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
         final org.w3c.dom.Document document = builder.parse(in);

         final XPath xpath = XPathFactory.newInstance().newXPath();
         final NodeList nodes = (NodeList) xpath.evaluate("/schema/fields/field | /schema/fields/dynamicField", document,
               XPathConstants.NODESET);

         for (int i = 0; i < nodes.getLength(); i++) {
            final Node node = nodes.item(i);
            final String name = ((Element) node).getAttribute("name");
            final String nodeName = node.getNodeName();
            if ("field".equals(nodeName)) {
               addField(name);
            } else if ("dynamicField".equals(nodeName)) {
               addDynamicField(name);
            }
         }

         if (idFieldName == null) {
            final Node idNode = (Node) xpath.evaluate("/schema/uniqueKey", document, XPathConstants.NODE);
            // A schema is not required to declare a uniqueKey; evaluate() yields null in that case.
            final String uniqueKey = idNode == null ? "" : idNode.getTextContent().trim();
            if (uniqueKey.length() > 0) {
               idFieldName = uniqueKey;
            } else {
               log.warn("No uniqueKey found in solr schema {} - idFieldName remains unset", url);
            }
         }

      } finally {
         try {
            if (in != null) {
               in.close();
            }
         } catch (Exception e) {
            // Best effort: the schema has been read (or reading already failed) at this point.
            log.debug("Could not close schema stream", e);
         } finally {
            if (get != null) {
               // Hands the connection back to the client's connection manager.
               get.releaseConnection();
            }
         }
      }
   }

   /**
    * Registers a field-name as a known solr field.
    *
    * @param fieldName the solr field-name.
    *
    * @return <tt>true</tt> if the name was not already registered.
    */
   protected boolean addField(String fieldName) {
      return solrFields.add(fieldName);
   }

   /**
    * Registers a solr dynamic-field name (a glob such as <tt>*_s</tt> or <tt>attr_*</tt>) as a pattern.
    *
    * <p>Each <tt>*</tt> matches any sequence of characters. Every other character is matched literally, so regex
    * metacharacters in the name (for instance the dot in <tt>meta.*</tt>) no longer change what is matched.</p>
    *
    * @param fieldPattern the dynamic-field name as written in the schema.
    *
    * @return <tt>true</tt> if the pattern was added, <tt>false</tt> if an identical pattern was already registered.
    */
   protected boolean addDynamicField(String fieldPattern) {
      final StringBuilder regex = new StringBuilder();
      int start = 0;
      for (int star = fieldPattern.indexOf('*'); star >= 0; star = fieldPattern.indexOf('*', start)) {
         if (star > start) {
            regex.append(Pattern.quote(fieldPattern.substring(start, star)));
         }
         regex.append(".*");
         start = star + 1;
      }
      if (start < fieldPattern.length()) {
         regex.append(Pattern.quote(fieldPattern.substring(start)));
      }

      // Pattern uses identity equality, so the set cannot detect duplicates by itself.
      final String expression = regex.toString();
      for (Pattern existing : solrDynamicFields) {
         if (existing.pattern().equals(expression)) {
            return false;
         }
      }
      return solrDynamicFields.add(Pattern.compile(expression));
   }

   /**
    * Sets whether the index should be optimized when a batch finishes successfully.
    *
    * @param optimizeOnSuccess <tt>true</tt> to have the document poster optimize the index after a successful batch.
    */
   public void setOptimizeOnSuccess(boolean optimizeOnSuccess) {
      this.optimizeOnSuccess = optimizeOnSuccess;
   }
}