/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.extraction;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.lang.invoke.MethodHandles;
import java.util.Locale;

import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.loader.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.xml.serialize.BaseMarkupSerializer;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.TextSerializer;
import org.apache.xml.serialize.XMLSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * The class responsible for loading extracted content into Solr.
 */
public class ExtractingDocumentLoader extends ContentStreamLoader {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Extract Only supported format
   */
  public static final String TEXT_FORMAT = "text";

  /**
   * Extract Only supported format.  Default.
   */
  public static final String XML_FORMAT = "xml";

  /**
   * XHTML XPath parser.
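   * Compiles the {@code ExtractingParams.XPATH_EXPRESSION} request parameter into a
   * {@link Matcher}, so that only the matching portion of Tika's XHTML output is passed
   * on to the serializer or the {@link SolrContentHandler} (see {@link #load}).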
   */
  private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML);

  final SolrCore core;
  final SolrParams params;
  final UpdateRequestProcessor processor;
  final boolean ignoreTikaException;
  protected AutoDetectParser autoDetectParser;

  private final AddUpdateCommand templateAdd;

  protected TikaConfig config;
  protected ParseContextConfig parseContextConfig;
  protected SolrContentHandlerFactory factory;

  public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
                                  TikaConfig config, ParseContextConfig parseContextConfig,
                                  SolrContentHandlerFactory factory) {
    this.params = req.getParams();
    this.core = req.getCore();
    this.config = config;
    this.parseContextConfig = parseContextConfig;
    this.processor = processor;

    templateAdd = new AddUpdateCommand(req);
    templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);
    templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);

    //this is lightweight
    autoDetectParser = new AutoDetectParser(config);
    this.factory = factory;

    ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
  }

  /**
   * This must be MT-safe... may be called concurrently from multiple threads.
   */
  void doAdd(SolrContentHandler handler, AddUpdateCommand template) throws IOException {
    template.solrDoc = handler.newDocument();
    processor.processAdd(template);
  }

  void addDoc(SolrContentHandler handler) throws IOException {
    templateAdd.clear();
    doAdd(handler, templateAdd);
  }

  @Override
  public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream,
                   UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
      parser = autoDetectParser;
    }
    if (parser != null) {
      Metadata metadata = new Metadata();

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
      }

      // Provide the stream's content type as a hint for auto detection
      if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
      }

      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());

        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
        }

        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
        ContentHandler parsingHandler = handler;

        StringWriter writer = null;
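        // When extractOnly is set, parse events are serialized (as plain text or XML, per
        // ExtractingParams.EXTRACT_FORMAT) into this writer and returned in the response,
        // instead of being mapped into a Solr document.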
        BaseMarkupSerializer serializer = null;

        if (extractOnly) {
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          if (extractFormat.equals(TEXT_FORMAT)) {
            serializer = new TextSerializer();
            serializer.setOutputCharStream(writer);
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          }
          if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            //The MatchingContentHandler does not invoke startDocument.
            //See http://tika.markmail.org/message/kknu3hw7argwiqin
            serializer.startDocument();
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
          }
        } else if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } //else leave it as is

        try {
          //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
          //for getting the document.
          ParseContext context = parseContextConfig.create();
          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
          if (pwMapFile != null && pwMapFile.length() > 0) {
            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
            if (is != null) {
              log.debug("Password file supplied: {}", pwMapFile);
              epp.parse(is);
            }
          }
          context.set(PasswordProvider.class, epp);
          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
          if (resourcePassword != null) {
            epp.setExplicitPassword(resourcePassword);
            log.debug("Literal password supplied for file {}", resourceName);
          }

          parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
          if (ignoreTikaException) {
            log.warn("skip extracting text due to {}. metadata={}", e.getLocalizedMessage(), metadata);
          } else {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
          }
        }
        if (!extractOnly) {
          addDoc(handler);
        } else {
          //serializer is not null, so we need to call endDocument on it if using xpath
          if (xpathExpr != null) {
            serializer.endDocument();
          }
          rsp.add(stream.getName(), writer.toString());
          writer.close();
          String[] names = metadata.names();
          NamedList<String[]> metadataNL = new NamedList<>();
          for (String name : names) {
            String[] vals = metadata.getValues(name);
            metadataNL.add(name, vals);
          }
          rsp.add(stream.getName() + "_metadata", metadataNL);
        }
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } finally {
        IOUtils.closeQuietly(inputStream);
      }
    } else {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Stream type of " + streamType + " didn't match any known parsers.  Please supply the "
              + ExtractingParams.STREAM_TYPE + " parameter.");
    }
  }

  public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
    public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();

    /**
     * Keep all elements and their content.
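     * Keeping everything lets the XPath matching and the SolrContentHandler see as much of
     * the original markup as Tika's HtmlParser will emit.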
     *
     * Apparently <SCRIPT> and <STYLE> elements are blocked elsewhere.
     */
    @Override
    public boolean isDiscardElement(String name) {
      return false;
    }

    /** Lowercases the attribute name */
    @Override
    public String mapSafeAttribute(String elementName, String attributeName) {
      return attributeName.toLowerCase(Locale.ENGLISH);
    }

    /**
     * Lowercases the element name, but returns null for <BR>,
     * which suppresses the start-element event for <BR> tags.
     * This also suppresses the <BODY> tags because those
     * are handled internally by Tika's XHTMLContentHandler.
     */
    @Override
    public String mapSafeElement(String name) {
      String lowerName = name.toLowerCase(Locale.ROOT);
      return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName;
    }
  }
}
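// A minimal SolrJ sketch of how this loader is typically reached (illustrative only, not part
// of this class). It assumes the extracting handler is registered at "/update/extract", as in
// the stock example solrconfig.xml, and that "solrClient" is an existing SolrClient instance:
//
//   ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
//   up.addFile(new File("report.pdf"), "application/pdf");
//   up.setParam("literal.id", "doc1");                  // supply the uniqueKey for the document
//   up.setParam(ExtractingParams.EXTRACT_ONLY, "true"); // return extracted content instead of indexing it
//   solrClient.request(up);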