/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.parse.tika; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import javax.imageio.spi.ServiceRegistry; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.apache.tika.exception.TikaException; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.MimeTypesFactory; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.mortbay.log.Log; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** * Parse xml config file. */ public class TikaConfig { private final Map<String, Parser> parsers = new HashMap<String, Parser>(); private final MimeTypes mimeTypes; public TikaConfig(String file) throws TikaException, IOException, SAXException { this(new File(file)); } public TikaConfig(File file) throws TikaException, IOException, SAXException { this(getBuilder().parse(file)); } public TikaConfig(URL url) throws TikaException, IOException, SAXException { this(getBuilder().parse(url.toString())); } public TikaConfig(InputStream stream) throws TikaException, IOException, SAXException { this(getBuilder().parse(stream)); } /** * @deprecated This method will be removed in Apache Tika 1.0 * @see <a * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> */ public TikaConfig(InputStream stream, Parser delegate) throws TikaException, IOException, SAXException { this(stream); } public TikaConfig(Document document) throws TikaException, IOException { this(document.getDocumentElement()); } /** * @deprecated This method will be removed in Apache Tika 1.0 * @see <a * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> */ public TikaConfig(Document document, Parser delegate) throws TikaException, IOException { this(document); } public TikaConfig(Element element) throws TikaException, IOException { Element mtr = getChild(element, "mimeTypeRepository"); if (mtr != null && mtr.hasAttribute("resource")) { mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource")); } else { mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml"); } NodeList nodes = element.getElementsByTagName("parser"); for (int i = 0; i < nodes.getLength(); i++) { Element node = (Element) nodes.item(i); String name = node.getAttribute("class"); try { Class<?> parserClass = Class.forName(name); Object instance = parserClass.newInstance(); if (!(instance instanceof Parser)) { throw new TikaException( "Configured class is not a Tika Parser: " + name); } Parser parser = (Parser) instance; NodeList mimes = node.getElementsByTagName("mime"); if (mimes.getLength() > 0) { for (int j = 0; j < mimes.getLength(); j++) { parsers.put(getText(mimes.item(j)).trim(), parser); } } else { ParseContext context = new ParseContext(); for (MediaType type : parser.getSupportedTypes(context)) { parsers.put(type.toString(), parser); } } } catch (ClassNotFoundException e) { throw new TikaException("Configured parser class not found: " + name, e); } catch (IllegalAccessException e) { throw new TikaException("Unable to access a parser class: " + name, e); } catch (InstantiationException e) { throw new TikaException( "Unable to instantiate a parser class: " + name, e); } } } public TikaConfig() throws MimeTypeException, IOException { ParseContext context = new ParseContext(); Iterator<Parser> iterator = ServiceRegistry.lookupProviders( Parser.class, this.getClass().getClassLoader()); while (iterator.hasNext()) { Parser parser = iterator.next(); for (MediaType type : parser.getSupportedTypes(context)) { parsers.put(type.toString(), parser); } } mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml"); } /** * @deprecated This method will be removed in Apache Tika 1.0 * @see <a * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> */ public TikaConfig(Element element, Parser delegate) throws TikaException, IOException { this(element); } private String getText(Node node) { if (node.getNodeType() == Node.TEXT_NODE) { return node.getNodeValue(); } else if (node.getNodeType() == Node.ELEMENT_NODE) { StringBuilder builder = new StringBuilder(); NodeList list = node.getChildNodes(); for (int i = 0; i < list.getLength(); i++) { builder.append(getText(list.item(i))); } return builder.toString(); } else { return ""; } } /** * Returns the parser instance configured for the given MIME type. Returns * <code>null</code> if the given MIME type is unknown. * * @param mimeType * MIME type * @return configured Parser instance, or <code>null</code> */ public Parser getParser(String mimeType) { return parsers.get(mimeType); } public Map<String, Parser> getParsers() { return parsers; } public MimeTypes getMimeRepository() { return mimeTypes; } /** * Provides a default configuration (TikaConfig). Currently creates a new * instance each time it's called; we may be able to have it return a shared * instance once it is completely immutable. * * @return default configuration */ public static TikaConfig getDefaultConfig() { try { return new TikaConfig(); } catch (IOException e) { throw new RuntimeException("Unable to read default configuration", e); } catch (TikaException e) { throw new RuntimeException( "Unable to access default configuration", e); } } /** * @deprecated This method will be removed in Apache Tika 1.0 * @see <a * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> */ public static TikaConfig getDefaultConfig(Parser delegate) throws TikaException { return getDefaultConfig(); } private static DocumentBuilder getBuilder() throws TikaException { try { return DocumentBuilderFactory.newInstance().newDocumentBuilder(); } catch (ParserConfigurationException e) { throw new TikaException("XML parser not available", e); } } private static Element getChild(Element element, String name) { Node child = element.getFirstChild(); while (child != null) { if (child.getNodeType() == Node.ELEMENT_NODE && name.equals(child.getNodeName())) { return (Element) child; } child = child.getNextSibling(); } return null; } }