/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// Nutch Imports
import org.apache.nutch.protocol.Content;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
/**
* A utility class providing convenience methods for parsing, such as
* iterating through a preferred list of {@link Parser}s to obtain
* {@link Parse} objects.
*
* @author mattmann
* @author Jérôme Charron
* @author Sébastien Le Callonnec
*/
public class ParseUtil {

  /* our log stream */
  public static final Log LOG = LogFactory.getLog(ParseUtil.class);

  /** Factory used to look up {@link Parser}s by content type/URL or by extension ID. */
  private final ParserFactory parserFactory;

  /**
   * Creates a ParseUtil whose {@link ParserFactory} is built from the given
   * configuration.
   *
   * @param conf the configuration passed to the {@link ParserFactory}
   */
  public ParseUtil(Configuration conf) {
    this.parserFactory = new ParserFactory(conf);
  }

  /**
   * Performs a parse by iterating through the preferred {@link Parser}s for
   * the content's type and URL, in the order the {@link ParserFactory} returns
   * them, until one produces a non-null, non-empty {@link ParseResult}. If no
   * parser succeeds, a message is logged at the <code>WARN</code> level and
   * <code>null</code> is returned.
   *
   * @param content The content to try and parse.
   * @return &lt;key, {@link Parse}&gt; pairs from the first successful parser,
   *         or <code>null</code> if none of the parsers produced a result.
   * @throws ParseException If no suitable parser is found to perform the
   *         parse; the underlying {@link ParserNotFound} is attached as cause.
   */
  public ParseResult parse(Content content) throws ParseException {
    Parser[] parsers;
    try {
      // The factory is given an empty string when the content has no URL.
      String url = content.getUrl() != null ? content.getUrl() : "";
      parsers = this.parserFactory.getParsers(content.getContentType(), url);
    } catch (ParserNotFound e) {
      throw parserNotFound(content, e);
    }

    for (Parser parser : parsers) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Parsing [" + content.getUrl() + "] with [" + parser + "]");
      }
      ParseResult parseResult = parser.getParse(content);
      if (isSuccessful(parseResult)) {
        return parseResult;
      }
    }

    warnUnableToParse(content);
    return null;
  }

  /**
   * Parses a {@link Content} object using the {@link Parser} specified by the
   * parameter <code>extId</code>, i.e., the Parser's extension ID. If a
   * suitable {@link Parser} is not found, a <code>WARN</code> level message is
   * logged and a ParseException is thrown. If the parser returns a null or
   * empty result, a <code>WARN</code> level message is logged and
   * <code>null</code> is returned.
   *
   * @param extId The extension implementation ID of the {@link Parser} to use
   *        to parse the specified content.
   * @param content The content to parse.
   *
   * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful,
   *         otherwise <code>null</code>.
   *
   * @throws ParseException If there is no suitable {@link Parser} found to
   *         perform the parse; the underlying {@link ParserNotFound} is
   *         attached as cause.
   */
  public ParseResult parseByExtensionId(String extId, Content content)
      throws ParseException {
    Parser parser;
    try {
      parser = this.parserFactory.getParserById(extId);
    } catch (ParserNotFound e) {
      throw parserNotFound(content, e);
    }

    ParseResult parseResult = parser.getParse(content);
    if (isSuccessful(parseResult)) {
      return parseResult;
    }

    warnUnableToParse(content);
    return null;
  }

  /** Returns true when the result is neither null nor empty. */
  private static boolean isSuccessful(ParseResult parseResult) {
    return parseResult != null && !parseResult.isEmpty();
  }

  /**
   * Logs the "no suitable parser" warning and builds the ParseException to
   * throw, keeping the original exception as its cause so the stack trace of
   * the lookup failure is not lost.
   */
  private static ParseException parserNotFound(Content content, ParserNotFound e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() +
               " of type " + content.getContentType());
    }
    ParseException wrapped = new ParseException(e.getMessage());
    // initCause is used so this does not rely on a (String, Throwable) constructor.
    wrapped.initCause(e);
    return wrapped;
  }

  /** Logs the warning emitted when no parser produced a usable result. */
  private static void warnUnableToParse(Content content) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Unable to successfully parse content " + content.getUrl() +
               " of type " + content.getContentType());
    }
  }
}