/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;

/**
 * Container for the output of a parse, held internally as a map of
 * &lt;{@link Text}, {@link Parse}&gt; pairs keyed by URL.
 *
 * <p>A parser may produce several results for one input: typically one that
 * belongs to the original URL itself, plus any number (possibly zero) that
 * belong to derived URLs (sub-URLs) such as parts of, or documents associated
 * with, the original.</p>
 *
 * <p>Iterating over a {@code ParseResult} walks the entries of that map.</p>
 */
public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {

  /** Parse outputs keyed by the URL or sub-URL they belong to. */
  private Map<Text, Parse> parseMap;

  /** URL that all stored parse outputs were derived from. */
  private String originalUrl;

  public static final Log LOG = LogFactory.getLog(ParseResult.class);

  /**
   * Creates an empty container for parse results.
   *
   * @param originalUrl the original url from which all parse results
   *        have been obtained.
   */
  public ParseResult(String originalUrl) {
    this.originalUrl = originalUrl;
    this.parseMap = new HashMap<Text, Parse>();
  }

  /**
   * Convenience factory that wraps a single {@link Parse} output in a
   * {@link ParseResult}, stored under the given url.
   *
   * @param url canonical url
   * @param parse single parse output
   * @return result containing the single parse output
   */
  public static ParseResult createParseResult(String url, Parse parse) {
    ParseResult single = new ParseResult(url);
    Text key = new Text(url);
    ParseText text = new ParseText(parse.getText());
    ParseData data = parse.getData();
    single.put(key, text, data);
    return single;
  }

  /**
   * Checks whether the result is empty.
   *
   * @return {@code true} if no parse output is stored
   */
  public boolean isEmpty() {
    return parseMap.isEmpty();
  }

  /**
   * Returns the number of stored parse outputs, counting both successful
   * and failed ones.
   *
   * @return number of entries in this result
   */
  public int size() {
    return parseMap.size();
  }

  /**
   * Looks up a single parse output by its url given as a string.
   *
   * @param key sub-url under which the parse output is stored.
   * @return parse output corresponding to this sub-url, or null.
   */
  public Parse get(String key) {
    Text lookupKey = new Text(key);
    return get(lookupKey);
  }

  /**
   * Looks up a single parse output by its url.
   *
   * @param key sub-url under which the parse output is stored.
   * @return parse output corresponding to this sub-url, or null.
   */
  public Parse get(Text key) {
    return parseMap.get(key);
  }

  /**
   * Stores a parse output; delegates to
   * {@link #put(String, ParseText, ParseData)} using the key's string form.
   *
   * @param key URL or sub-url of this parse result
   * @param text plain text result
   * @param data corresponding parse metadata of this result
   */
  public void put(Text key, ParseText text, ParseData data) {
    String keyAsString = key.toString();
    put(keyAsString, text, data);
  }

  /**
   * Stores a parse output under the given url. The third argument passed to
   * the {@link ParseImpl} constructor is {@code true} exactly when
   * {@code key} equals the original url of this result.
   *
   * @param key URL or sub-url of this parse result
   * @param text plain text result
   * @param data corresponding parse metadata of this result
   */
  public void put(String key, ParseText text, ParseData data) {
    Text storedKey = new Text(key);
    boolean matchesOriginal = key.equals(originalUrl);
    parseMap.put(storedKey, new ParseImpl(text, data, matchesOriginal));
  }

  /**
   * Iterates over all entries of the &lt;url, Parse&gt; map.
   *
   * @return iterator over the entry set of the backing map
   */
  public Iterator<Entry<Text, Parse>> iterator() {
    return parseMap.entrySet().iterator();
  }

  /**
   * Removes every result whose status is not successful (as determined
   * by {@link ParseStatus#isSuccess()}), logging a warning for each one.
   * Note that the effects of this operation cannot be reversed.
   */
  public void filter() {
    Iterator<Entry<Text, Parse>> entries = iterator();
    while (entries.hasNext()) {
      Entry<Text, Parse> current = entries.next();
      boolean succeeded = current.getValue().getData().getStatus().isSuccess();
      if (succeeded) {
        continue;
      }
      LOG.warn(current.getKey() + " is not parsed successfully, filtering");
      // Remove through the iterator so the traversal stays valid.
      entries.remove();
    }
  }

  /**
   * Convenience check that returns true only if all parses are successful.
   * Parse success is determined by {@link ParseStatus#isSuccess()}.
   *
   * @return {@code false} as soon as one unsuccessful parse is found,
   *         {@code true} otherwise (including when there are no entries)
   */
  public boolean isSuccess() {
    for (Entry<Text, Parse> current : this) {
      boolean succeeded = current.getValue().getData().getStatus().isSuccess();
      if (!succeeded) {
        return false;
      }
    }
    return true;
  }
}