/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.parse; // JDK imports import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Vector; // Commons Logging imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; // Hadoop imports import org.apache.hadoop.conf.Configuration; // Nutch imports import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.ExtensionPoint; import org.apache.nutch.plugin.PluginRuntimeException; import org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.util.LogUtil; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.ObjectCache; /** Creates and caches {@link Parser} plugins.*/ public final class ParserFactory { public static final Log LOG = LogFactory.getLog(ParserFactory.class); /** Wildcard for default plugins. */ public static final String DEFAULT_PLUGIN = "*"; /** Empty extension list for caching purposes. */ private final List EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST; private Configuration conf; private ExtensionPoint extensionPoint; private ParsePluginList parsePluginList; public ParserFactory(Configuration conf) { this.conf = conf; ObjectCache objectCache = ObjectCache.get(conf); this.extensionPoint = PluginRepository.get(conf).getExtensionPoint( Parser.X_POINT_ID); this.parsePluginList = (ParsePluginList)objectCache.getObject(ParsePluginList.class.getName()); if (this.parsePluginList == null) { this.parsePluginList = new ParsePluginsReader().parse(conf); objectCache.setObject(ParsePluginList.class.getName(), this.parsePluginList); } if (this.extensionPoint == null) { throw new RuntimeException("x point " + Parser.X_POINT_ID + " not found."); } if (this.parsePluginList == null) { throw new RuntimeException( "Parse Plugins preferences could not be loaded."); } } /** * Function returns an array of {@link Parser}s for a given content type. * * The function consults the internal list of parse plugins for the * ParserFactory to determine the list of pluginIds, then gets the * appropriate extension points to instantiate as {@link Parser}s. * * @param contentType The contentType to return the <code>Array</code> * of {@link Parser}s for. * @param url The url for the content that may allow us to get the type from * the file suffix. * @return An <code>Array</code> of {@link Parser}s for the given contentType. * If there were plugins mapped to a contentType via the * <code>parse-plugins.xml</code> file, but never enabled via * the <code>plugin.includes</code> Nutch conf, then those plugins * won't be part of this array, i.e., they will be skipped. * So, if the ordered list of parsing plugins for * <code>text/plain</code> was <code>[parse-text,parse-html, * parse-rtf]</code>, and only <code>parse-html</code> and * <code>parse-rtf</code> were enabled via * <code>plugin.includes</code>, then this ordered Array would * consist of two {@link Parser} interfaces, * <code>[parse-html, parse-rtf]</code>. */ public Parser[] getParsers(String contentType, String url) throws ParserNotFound { List<Parser> parsers = null; List<Extension> parserExts = null; ObjectCache objectCache = ObjectCache.get(conf); // TODO once the MimeTypes is available // parsers = getExtensions(MimeUtils.map(contentType)); // if (parsers != null) { // return parsers; // } // Last Chance: Guess content-type from file url... // parsers = getExtensions(MimeUtils.getMimeType(url)); parserExts = getExtensions(contentType); if (parserExts == null) { throw new ParserNotFound(url, contentType); } parsers = new Vector<Parser>(parserExts.size()); for (Iterator i=parserExts.iterator(); i.hasNext(); ){ Extension ext = (Extension) i.next(); Parser p = null; try { //check to see if we've cached this parser instance yet p = (Parser) objectCache.getObject(ext.getId()); if (p == null) { // go ahead and instantiate it and then cache it p = (Parser) ext.getExtensionInstance(); objectCache.setObject(ext.getId(),p); } parsers.add(p); } catch (PluginRuntimeException e) { if (LOG.isWarnEnabled()) { e.printStackTrace(LogUtil.getWarnStream(LOG)); LOG.warn("ParserFactory:PluginRuntimeException when " + "initializing parser plugin " + ext.getDescriptor().getPluginId() + " instance in getParsers " + "function: attempting to continue instantiating parsers"); } } } return parsers.toArray(new Parser[]{}); } /** * Function returns a {@link Parser} instance with the specified * <code>extId</code>, representing its extension ID. If the Parser * instance isn't found, then the function throws a * <code>ParserNotFound</code> exception. If the function is able to find * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it * will return the already instantiated Parser. Otherwise, if it has to * instantiate the Parser itself , then this function will cache that Parser * in the internal <code>PARSER_CACHE</code>. * * @param id The string extension ID (e.g., * "org.apache.nutch.parse.rss.RSSParser", * "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser} * implementation to return. * @return A {@link Parser} implementation specified by the parameter * <code>id</code>. * @throws ParserNotFound If the Parser is not found (i.e., registered with * the extension point), or if the there a * {@link PluginRuntimeException} instantiating the {@link Parser}. */ public Parser getParserById(String id) throws ParserNotFound { Extension[] extensions = this.extensionPoint.getExtensions(); Extension parserExt = null; ObjectCache objectCache = ObjectCache.get(conf); if (id != null) { parserExt = getExtension(extensions, id); } if (parserExt == null) { parserExt = getExtensionFromAlias(extensions, id); } if (parserExt == null) { throw new ParserNotFound("No Parser Found for id [" + id + "]"); } // first check the cache if (objectCache.getObject(parserExt.getId()) != null) { return (Parser) objectCache.getObject(parserExt.getId()); // if not found in cache, instantiate the Parser } else { try { Parser p = (Parser) parserExt.getExtensionInstance(); objectCache.setObject(parserExt.getId(), p); return p; } catch (PluginRuntimeException e) { if (LOG.isWarnEnabled()) { LOG.warn("Canno initialize parser " + parserExt.getDescriptor().getPluginId() + " (cause: " + e.toString()); } throw new ParserNotFound("Cannot init parser for id [" + id + "]"); } } } /** * Finds the best-suited parse plugin for a given contentType. * * @param contentType Content-Type for which we seek a parse plugin. * @return a list of extensions to be used for this contentType. * If none, returns <code>null</code>. */ protected List<Extension> getExtensions(String contentType) { ObjectCache objectCache = ObjectCache.get(conf); // First of all, tries to clean the content-type String type = null; type = MimeUtil.cleanMimeType(contentType); List<Extension> extensions = (List<Extension>) objectCache.getObject(type); // Just compare the reference: // if this is the empty list, we know we will find no extension. if (extensions == EMPTY_EXTENSION_LIST) { return null; } if (extensions == null) { extensions = findExtensions(type); if (extensions != null) { objectCache.setObject(type, extensions); } else { // Put the empty extension list into cache // to remember we don't know any related extension. objectCache.setObject(type, EMPTY_EXTENSION_LIST); } } return extensions; } /** * searches a list of suitable parse plugins for the given contentType. * <p>It first looks for a preferred plugin defined in the parse-plugin * file. If none is found, it returns a list of default plugins. * * @param contentType Content-Type for which we seek a parse plugin. * @return List - List of extensions to be used for this contentType. * If none, returns null. */ private List<Extension> findExtensions(String contentType) { Extension[] extensions = this.extensionPoint.getExtensions(); // Look for a preferred plugin. List<String> parsePluginList = this.parsePluginList.getPluginList(contentType); List<Extension> extensionList = matchExtensions(parsePluginList, extensions, contentType); if (extensionList != null) { return extensionList; } // If none found, look for a default plugin. parsePluginList = this.parsePluginList.getPluginList(DEFAULT_PLUGIN); return matchExtensions(parsePluginList, extensions, DEFAULT_PLUGIN); } /** * Tries to find a suitable parser for the given contentType. * <ol> * <li>It checks if a parser which accepts the contentType * can be found in the <code>plugins</code> list;</li> * <li>If this list is empty, it tries to find amongst the loaded * extensions whether some of them might suit and warns the user.</li> * </ol> * @param plugins List of candidate plugins. * @param extensions Array of loaded extensions. * @param contentType Content-Type for which we seek a parse plugin. * @return List - List of extensions to be used for this contentType. * If none, returns null. */ private List<Extension> matchExtensions(List<String> plugins, Extension[] extensions, String contentType) { List<Extension> extList = new ArrayList<Extension>(); if (plugins != null) { for (String parsePluginId : plugins) { Extension ext = getExtension(extensions, parsePluginId, contentType); // the extension returned may be null // that means that it was not enabled in the plugin.includes // nutch conf property, but it was mapped in the // parse-plugins.xml // file. // OR it was enabled in plugin.includes, but the plugin's plugin.xml // file does not claim that the plugin supports the specified mimeType // in either case, LOG the appropriate error message to WARN level if (ext == null) { //try to get it just by its pluginId ext = getExtension(extensions, parsePluginId); if (LOG.isWarnEnabled()) { if (ext != null) { // plugin was enabled via plugin.includes // its plugin.xml just doesn't claim to support that // particular mimeType LOG.warn("ParserFactory:Plugin: " + parsePluginId + " mapped to contentType " + contentType + " via parse-plugins.xml, but " + "its plugin.xml " + "file does not claim to support contentType: " + contentType); } else { // plugin wasn't enabled via plugin.includes LOG.warn("ParserFactory: Plugin: " + parsePluginId + " mapped to contentType " + contentType + " via parse-plugins.xml, but not enabled via " + "plugin.includes in nutch-default.xml"); } } } if (ext != null) { // add it to the list extList.add(ext); } } } else { // okay, there were no list of plugins defined for // this mimeType, however, there may be plugins registered // via the plugin.includes nutch conf property that claim // via their plugin.xml file to support this contentType // so, iterate through the list of extensions and if you find // any extensions where this is the case, throw a // NotMappedParserException for (int i=0; i<extensions.length; i++) { if (extensions[i].getAttribute("contentType") != null && extensions[i].getAttribute("contentType").equals( contentType)) { extList.add(extensions[i]); } } if (extList.size() > 0) { if (LOG.isInfoEnabled()) { LOG.info("The parsing plugins: " + extList + " are enabled via the plugin.includes system " + "property, and all claim to support the content type " + contentType + ", but they are not mapped to it in the " + "parse-plugins.xml file"); } } else if (LOG.isDebugEnabled()) { LOG.debug("ParserFactory:No parse plugins mapped or enabled for " + "contentType " + contentType); } } return (extList.size() > 0) ? extList : null; } private boolean match(Extension extension, String id, String type) { return ((id.equals(extension.getId())) && (type.equals(extension.getAttribute("contentType")) || type.equals(DEFAULT_PLUGIN))); } /** Get an extension from its id and supported content-type. */ private Extension getExtension(Extension[] list, String id, String type) { for (int i=0; i<list.length; i++) { if (match(list[i], id, type)) { return list[i]; } } return null; } private Extension getExtension(Extension[] list, String id) { for (int i=0; i<list.length; i++) { if (id.equals(list[i].getId())) { return list[i]; } } return null; } private Extension getExtensionFromAlias(Extension[] list, String id) { return getExtension(list, parsePluginList.getAliases().get(id)); } }