/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.handler.extraction; /** * The various Solr Parameters names to use when extracting content. * **/ public interface ExtractingParams { /** * Map all generated attribute names to field names with lowercase and underscores. */ public static final String LOWERNAMES = "lowernames"; /** * The param prefix for mapping Tika metadata to Solr fields. * <p/> * To map a field, add a name like: * <pre>fmap.title=solr.title</pre> * * In this example, the tika "title" metadata value will be added to a Solr field named "solr.title" * * */ public static final String MAP_PREFIX = "fmap."; /** * The boost value for the name of the field. The boost can be specified by a name mapping. * <p/> * For example * <pre> * map.title=solr.title * boost.solr.title=2.5 * </pre> * will boost the solr.title field for this document by 2.5 * */ public static final String BOOST_PREFIX = "boost."; /** * Pass in literal values to be added to the document, as in * <pre> * literal.myField=Foo * </pre> * */ public static final String LITERALS_PREFIX = "literal."; /** * Restrict the extracted parts of a document to be indexed * by passing in an XPath expression. All content that satisfies the XPath expr. * will be passed to the {@link SolrContentHandler}. * <p/> * See Tika's docs for what the extracted document looks like. * <p/> * @see #CAPTURE_ELEMENTS */ public static final String XPATH_EXPRESSION = "xpath"; /** * Only extract and return the content, do not index it. */ public static final String EXTRACT_ONLY = "extractOnly"; /** * Content output format if extractOnly is true. Default is "xml", alternative is "text". */ public static final String EXTRACT_FORMAT = "extractFormat"; /** * Capture attributes separately according to the name of the element, instead of just adding them to the string buffer */ public static final String CAPTURE_ATTRIBUTES = "captureAttr"; /** * Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default. This is different * then the case of passing in an XPath expression. * <p/> * The Capture field is based on the localName returned to the {@link SolrContentHandler} * by Tika, not to be confused by the mapped field. The field name can then * be mapped into the index schema. * <p/> * For instance, a Tika document may look like: * <pre> * <html> * ... * <body> * <p>some text here. <div>more text</div></p> * Some more text * </body> * </pre> * By passing in the p tag, you could capture all P tags separately from the rest of the t * Thus, in the example, the capture of the P tag would be: "some text here. more text" * */ public static final String CAPTURE_ELEMENTS = "capture"; /** * The type of the stream. If not specified, Tika will use mime type detection. */ public static final String STREAM_TYPE = "stream.type"; /** * Optional. The file name. If specified, Tika can take this into account while * guessing the MIME type. */ public static final String RESOURCE_NAME = "resource.name"; /** * Optional. If specified, the prefix will be prepended to all Metadata, such that it would be possible * to setup a dynamic field to automatically capture it */ public static final String UNKNOWN_FIELD_PREFIX = "uprefix"; /** * Optional. If specified and the name of a potential field cannot be determined, the default Field specified * will be used instead. */ public static final String DEFAULT_FIELD = "defaultField"; }