/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.microformats.reltag;
// JDK imports
import java.net.URL;
import java.net.URLDecoder;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.StringUtil;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
/**
 * HTML parse filter that adds the microformat rel-tags of a document, if any
 * are found, to the parse metadata under the {@link #REL_TAG} key.
 *
 * <p>A rel-tag is an <code>&lt;a&gt;</code> element that carries an
 * <code>href</code> attribute and whose <code>rel</code> attribute contains
 * the token <code>tag</code>. The tag itself is the URL-decoded last path
 * segment of the link target.</p>
 *
 * @see <a href="http://www.microformats.org/wiki/rel-tag">
 * http://www.microformats.org/wiki/rel-tag</a>
 * @author Jérôme Charron
 */
public class RelTagParser implements HtmlParseFilter {

  public final static Log LOG = LogFactory.getLog(RelTagParser.class);

  /** Name of the parse-metadata key under which every rel-tag is added. */
  public final static String REL_TAG = "Rel-Tag";

  private Configuration conf = null;

  /**
   * Scans the HTML document looking for rel-tags and adds each distinct tag
   * to the parse metadata of the entry stored under the content's URL.
   *
   * @param content the fetched content; its URL selects the parse to
   *        annotate and its base URL is used to resolve relative hrefs
   * @param parseResult the parse result to annotate
   * @param metaTags the HTML meta tags of the document (not used here)
   * @param doc the DOM tree of the document
   * @return the same <code>parseResult</code> instance that was passed in
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Parse parse = parseResult.get(content.getUrl());
    if (parse == null) {
      // Nothing registered under this URL, so there is nothing to annotate.
      return parseResult;
    }

    Parser parser = new Parser(doc, toUrl(content.getBaseUrl()));
    Metadata metadata = parse.getData().getParseMeta();
    for (String tag : parser.getRelTags()) {
      metadata.add(REL_TAG, tag);
    }
    return parseResult;
  }

  /**
   * Converts a string to a {@link URL}.
   *
   * @param spec the string to convert, may be <code>null</code>
   * @return the URL, or <code>null</code> if <code>spec</code> is
   *         <code>null</code> or cannot be parsed as an absolute URL
   */
  private static URL toUrl(String spec) {
    if (spec == null) {
      return null;
    }
    try {
      return new URL(spec);
    } catch (java.net.MalformedURLException e) {
      // Without a usable base only absolute hrefs can be recognised.
      return null;
    }
  }

  /**
   * Walks a DOM tree once, at construction time, and collects the rel-tags
   * it contains.
   */
  private static class Parser {

    /** Collected tags; a TreeSet keeps them sorted and free of duplicates. */
    private final Set<String> tags = new TreeSet<String>();

    /** Context for resolving relative hrefs; <code>null</code> if unknown. */
    private final URL base;

    /**
     * @param node root of the DOM tree to scan
     * @param base URL against which relative hrefs are resolved, or
     *        <code>null</code> to accept absolute hrefs only
     */
    Parser(Node node, URL base) {
      this.base = base;
      parse(node);
    }

    /**
     * @return the tags found while scanning (never <code>null</code>)
     */
    Set<String> getRelTags() {
      return tags;
    }

    /**
     * Records the tag of <code>node</code> if it is a rel-tag link, then
     * recurses into its children.
     */
    void parse(Node node) {
      if (node.getNodeType() == Node.ELEMENT_NODE
          && "a".equalsIgnoreCase(node.getNodeName())) {
        collect(node.getAttributes());
      }

      // Recurse
      NodeList children = node.getChildNodes();
      for (int i = 0; children != null && i < children.getLength(); i++) {
        parse(children.item(i));
      }
    }

    /**
     * Adds the tag described by the attributes of an <code>&lt;a&gt;</code>
     * element, provided it has both an href and a rel containing "tag".
     */
    private void collect(NamedNodeMap attrs) {
      if (attrs == null) {
        return;
      }
      Node hrefNode = attrs.getNamedItem("href");
      Node relNode = attrs.getNamedItem("rel");
      if (hrefNode == null || relNode == null) {
        return;
      }
      // Finally checks that rel contains the "tag" token
      if (!containsTagToken(relNode.getNodeValue())) {
        return;
      }
      String tag = parseTag(base, hrefNode.getNodeValue());
      if (!StringUtil.isEmpty(tag)) {
        tags.add(tag);
      }
    }

    /**
     * Tells whether a rel attribute value contains the token "tag". The
     * value is treated as a whitespace-separated list, so values such as
     * <code>"tag nofollow"</code> are recognised; the comparison ignores
     * case.
     */
    private static boolean containsTagToken(String rel) {
      if (rel == null) {
        return false;
      }
      String[] tokens = rel.trim().split("\\s+");
      for (int i = 0; i < tokens.length; i++) {
        if ("tag".equalsIgnoreCase(tokens[i])) {
          return true;
        }
      }
      return false;
    }

    /**
     * Extracts the tag from a link target: the part of the URL path that
     * follows the last '/', URL-decoded as UTF-8.
     *
     * @param base context used to resolve a relative <code>href</code>; may
     *        be <code>null</code>, in which case only absolute hrefs work
     * @param href the raw href attribute value
     * @return the decoded tag (possibly empty), or <code>null</code> if the
     *         href is missing, blank or cannot be parsed or decoded
     */
    private final static String parseTag(URL base, String href) {
      if (href == null || href.trim().length() == 0) {
        // A blank href would resolve to the base URL itself.
        return null;
      }
      try {
        String path = new URL(base, href).getPath();
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        // Note: URLDecoder also turns '+' into a space.
        return URLDecoder.decode(lastSegment, "UTF-8");
      } catch (Exception ignored) {
        // Malformed tag: best effort, the link is simply skipped.
        return null;
      }
    }
  }

  /* ----------------------------- *
   * <implementation:Configurable> *
   * ----------------------------- */

  /**
   * Stores the configuration handed to this filter.
   *
   * @param conf the configuration to keep
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /**
   * @return the configuration last given to {@link #setConf(Configuration)},
   *         or <code>null</code> if none was set
   */
  public Configuration getConf() {
    return this.conf;
  }

  /* ------------------------------ *
   * </implementation:Configurable> *
   * ------------------------------ */
}