/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse;

import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.protocol.Content;

/**
 * A utility class containing methods to simply perform parsing utilities such
 * as iterating through a preferred list of {@link Parser}s to obtain
 * {@link Parse} objects.
 *
 * <p>Unless the timeout is deactivated, every parser invocation runs in its
 * own thread and is abandoned (and cancelled) once it exceeds the configured
 * number of seconds.</p>
 *
 * @author mattmann
 * @author J&eacute;r&ocirc;me Charron
 * @author S&eacute;bastien Le Callonnec
 */
public class ParseUtil {

  /* our log stream */
  public static final Log LOG = LogFactory.getLog(ParseUtil.class);

  /** Value of <code>parser.timeout</code> that deactivates the timeout. */
  private static final int NO_TIMEOUT = -1;

  /** Timeout, in seconds, used when <code>parser.timeout</code> is not set. */
  private static final int DEFAULT_MAX_PARSE_TIME = 30;

  private final ParserFactory parserFactory;

  /** Parser timeout in seconds; {@link #NO_TIMEOUT} deactivates it. */
  private final int maxParseTime;

  /**
   * Creates a parse utility backed by a {@link ParserFactory} built from the
   * given configuration.
   *
   * @param conf the configuration handed to the {@link ParserFactory}; its
   *          <code>parser.timeout</code> property (seconds, default 30,
   *          <code>-1</code> to deactivate) bounds each parser invocation.
   */
  public ParseUtil(Configuration conf) {
    this.parserFactory = new ParserFactory(conf);
    this.maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
  }

  /**
   * Performs a parse by iterating through a list of preferred {@link Parser}s
   * until a successful parse is performed and a {@link Parse} object is
   * returned. If the parse is unsuccessful, a message is logged at the
   * <code>WARNING</code> level, and an empty parse is returned.
   *
   * @param content The content to try and parse.
   * @return &lt;key, {@link Parse}&gt; pairs.
   * @throws ParseException If no suitable parser is found to perform the parse.
   */
  public ParseResult parse(Content content) throws ParseException {
    Parser[] parsers = null;

    try {
      // The factory is always given a non-null URL string.
      parsers = this.parserFactory.getParsers(content.getContentType(),
          content.getUrl() != null ? content.getUrl() : "");
    } catch (ParserNotFound e) {
      warnNoSuitableParser(content);
      throw new ParseException(e.getMessage());
    }

    for (Parser parser : parsers) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Parsing [" + content.getUrl() + "] with [" + parser + "]");
      }
      ParseResult parseResult = parseWith(parser, content);
      // The first parser that yields a non-empty result wins.
      if (isSuccessful(parseResult)) {
        return parseResult;
      }
    }

    return unsuccessfulResult(content);
  }

  /**
   * Parses a {@link Content} object using the {@link Parser} specified by the
   * parameter <code>extId</code>, i.e., the Parser's extension ID. If a
   * suitable {@link Parser} is not found, then a <code>WARNING</code> level
   * message is logged, and a ParseException is thrown. If the parse is
   * unsuccessful for any other reason, then a <code>WARNING</code> level
   * message is logged, and the result of
   * <code>ParseStatus.getEmptyParseResult()</code> is returned.
   *
   * @param extId The extension implementation ID of the {@link Parser} to use
   *          to parse the specified content.
   * @param content The content to parse.
   *
   * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful,
   *         otherwise the empty parse result produced by
   *         <code>ParseStatus.getEmptyParseResult()</code>.
   *
   * @throws ParseException If there is no suitable {@link Parser} found to
   *           perform the parse.
   */
  public ParseResult parseByExtensionId(String extId, Content content)
      throws ParseException {
    Parser p = null;

    try {
      p = this.parserFactory.getParserById(extId);
    } catch (ParserNotFound e) {
      warnNoSuitableParser(content);
      throw new ParseException(e.getMessage());
    }

    ParseResult parseResult = parseWith(p, content);
    if (isSuccessful(parseResult)) {
      return parseResult;
    }
    return unsuccessfulResult(content);
  }

  /** Logs that no parser could be found for the given content. */
  private void warnNoSuitableParser(Content content) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("No suitable parser found when trying to parse content "
          + content.getUrl() + " of type " + content.getContentType());
    }
  }

  /** Tells whether a parser produced a usable (non-null, non-empty) result. */
  private static boolean isSuccessful(ParseResult parseResult) {
    return parseResult != null && !parseResult.isEmpty();
  }

  /**
   * Invokes the parser, under the timeout unless it has been deactivated.
   *
   * @return the parser's result; may be <code>null</code> when the
   *         time-limited run timed out or failed.
   */
  private ParseResult parseWith(Parser p, Content content) {
    if (maxParseTime != NO_TIMEOUT) {
      return runParser(p, content);
    }
    return p.getParse(content);
  }

  /** Logs the failed parse and builds the empty result handed to callers. */
  private ParseResult unsuccessfulResult(Content content) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Unable to successfully parse content " + content.getUrl()
          + " of type " + content.getContentType());
    }
    return new ParseStatus(new ParseException(
        "Unable to successfully parse content")).getEmptyParseResult(
        content.getUrl(), null);
  }

  /**
   * Runs the parser in a dedicated thread and waits at most
   * <code>maxParseTime</code> seconds for it to finish.
   *
   * <p>Whenever the wait ends without a result (timeout, failure inside the
   * parser, or interruption of the calling thread) the task is cancelled with
   * interruption, so the worker thread is asked to stop instead of being left
   * running in the background. A parser that ignores interruption may still
   * keep running after this method returns.</p>
   *
   * @param p the parser to run.
   * @param content the content to parse.
   * @return the parse result, or <code>null</code> if none was obtained in
   *         time.
   */
  private ParseResult runParser(Parser p, Content content) {
    ParseCallable pc = new ParseCallable(p, content);
    FutureTask<ParseResult> task = new FutureTask<ParseResult>(pc);
    ParseResult res = null;

    Thread t = new Thread(task);
    t.start();

    try {
      res = task.get(maxParseTime, TimeUnit.SECONDS);
    } catch (TimeoutException e) {
      LOG.warn("TIMEOUT parsing " + content.getUrl() + " with " + p);
      // Without this the worker would keep parsing after we have given up.
      task.cancel(true);
    } catch (InterruptedException e) {
      task.cancel(true);
      // Keep the interruption visible to the caller.
      Thread.currentThread().interrupt();
    } catch (Exception e) {
      // Typically the parser itself threw; record why instead of hiding it.
      LOG.warn("Error parsing " + content.getUrl() + " with " + p, e);
      task.cancel(true);
    }
    return res;
  }
}