ExtractingDocumentLoader.java example

Explorer
solrcene-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.extraction;

import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.BaseMarkupSerializer;
import org.apache.xml.serialize.XMLSerializer;
import org.apache.xml.serialize.TextSerializer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Locale;


/**
 * The class responsible for loading extracted content into Solr.
 *
 **/
public class ExtractingDocumentLoader extends ContentStreamLoader {
  /**
   * Extract Only supported format
   */
  public static final String TEXT_FORMAT = "text";
  /**
   * Extract Only supported format.  Default
   */
  public static final String XML_FORMAT = "xml";
  /**
   * XHTML XPath parser.
   */
  private static final XPathParser PARSER =
          new XPathParser("xhtml", XHTMLContentHandler.XHTML);

  final IndexSchema schema;
  final SolrParams params;
  final UpdateRequestProcessor processor;
  protected AutoDetectParser autoDetectParser;

  private final AddUpdateCommand templateAdd;

  protected TikaConfig config;
  protected SolrContentHandlerFactory factory;
  //protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;

  public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
                           TikaConfig config, SolrContentHandlerFactory factory) {
    this.params = req.getParams();
    schema = req.getSchema();
    this.config = config;
    this.processor = processor;

    templateAdd = new AddUpdateCommand();
    templateAdd.allowDups = false;
    templateAdd.overwriteCommitted = true;
    templateAdd.overwritePending = true;

    if (params.getBool(UpdateParams.OVERWRITE, true)) {
      templateAdd.allowDups = false;
      templateAdd.overwriteCommitted = true;
      templateAdd.overwritePending = true;
    } else {
      templateAdd.allowDups = true;
      templateAdd.overwriteCommitted = false;
      templateAdd.overwritePending = false;
    }
    //this is lightweight
    autoDetectParser = new AutoDetectParser(config);
    this.factory = factory;
  }


  /**
   * this must be MT safe... may be called concurrently from multiple threads.
   *
   * @param
   * @param
   */
  void doAdd(SolrContentHandler handler, AddUpdateCommand template)
          throws IOException {
    template.solrDoc = handler.newDocument();
    processor.processAdd(template);
  }

  void addDoc(SolrContentHandler handler) throws IOException {
    templateAdd.indexedId = null;
    doAdd(handler, templateAdd);
  }

  /**
   * @param req
   * @param stream
   * @throws java.io.IOException
   */
  public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream) throws IOException {
    errHeader = "ExtractingDocumentLoader: " + stream.getSourceInfo();
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ENGLISH));
      parser = config.getParser(mt);
    } else {
      parser = autoDetectParser;
    }
    if (parser != null) {
      Metadata metadata = new Metadata();
      metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
      metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
      metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
      metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName);
      }

      SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, schema);
      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        ContentHandler parsingHandler = handler;

        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly == true) {
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          if (extractFormat.equals(TEXT_FORMAT)) {
            serializer = new TextSerializer();
            serializer.setOutputCharStream(writer);
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          }
          if (xpathExpr != null) {
            Matcher matcher =
                    PARSER.parse(xpathExpr);
            serializer.startDocument();//The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
          }
        } else if (xpathExpr != null) {
          Matcher matcher =
                  PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } //else leave it as is

        //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
        parser.parse(inputStream, parsingHandler, metadata);
        if (extractOnly == false) {
          addDoc(handler);
        } else {
          //serializer is not null, so we need to call endDoc on it if using xpath
          if (xpathExpr != null){
            serializer.endDocument();
          }
          rsp.add(stream.getName(), writer.toString());
          writer.close();
          String[] names = metadata.names();
          NamedList metadataNL = new NamedList();
          for (int i = 0; i < names.length; i++) {
            String[] vals = metadata.getValues(names[i]);
            metadataNL.add(names[i], vals);
          }
          rsp.add(stream.getName() + "_metadata", metadataNL);
        }
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } catch (TikaException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } finally {
        IOUtils.closeQuietly(inputStream);
      }
    } else {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
  }


}