/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.analysis.lang;
// JDK imports
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
// DOM imports
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* Adds metadata identifying language of document if found
* We could also run statistical analysis here but we'd miss all other formats
*/
public class HTMLLanguageParser implements HtmlParseFilter {
public static final Log LOG = LogFactory.getLog(HTMLLanguageParser.class);
/* A static Map of ISO-639 language codes */
private static Map LANGUAGES_MAP = new HashMap();
static {
try {
Properties p = new Properties();
p.load(HTMLLanguageParser.class
.getResourceAsStream("langmappings.properties"));
Enumeration keys = p.keys();
while (keys.hasMoreElements()) {
String key = (String) keys.nextElement();
String[] values = p.getProperty(key).split(",", -1);
LANGUAGES_MAP.put(key, key);
for (int i=0; i<values.length; i++) {
LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
}
}
} catch (Exception e) {
if (LOG.isFatalEnabled()) { LOG.fatal(e.toString()); }
}
}
private Configuration conf;
/**
* Scan the HTML document looking at possible indications of content language<br>
* <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
* <li>2. meta dc.language (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
* <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
* <br>Only the first occurence of language is stored.
*/
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
// Trying to find the document's language
LanguageParser parser = new LanguageParser(doc);
String lang = parser.getLanguage();
if (lang != null) {
parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
}
return parseResult;
}
static class LanguageParser {
private String dublinCore = null;
private String htmlAttribute = null;
private String httpEquiv = null;
private String language = null;
LanguageParser(Node node) {
parse(node);
if (htmlAttribute != null) { language = htmlAttribute; }
else if (dublinCore != null) { language = dublinCore; }
else {language = httpEquiv; }
}
String getLanguage() {
return language;
}
void parse(Node node) {
NodeWalker walker = new NodeWalker(node);
while (walker.hasNext()) {
Node currentNode = walker.nextNode();
String nodeName = currentNode.getNodeName();
short nodeType = currentNode.getNodeType();
String lang = null;
if (nodeType == Node.ELEMENT_NODE) {
// Check for the lang HTML attribute
if (htmlAttribute == null) {
htmlAttribute = parseLanguage(((Element) currentNode).getAttribute("lang"));
}
// Check for Meta
if ("meta".equalsIgnoreCase(nodeName)) {
NamedNodeMap attrs = currentNode.getAttributes();
// Check for the dc.language Meta
if (dublinCore == null) {
for (int i=0; i<attrs.getLength(); i++) {
Node attrnode = attrs.item(i);
if ("name".equalsIgnoreCase(attrnode.getNodeName())) {
if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) {
Node valueattr = attrs.getNamedItem("content");
if (valueattr != null) {
dublinCore = parseLanguage(valueattr.getNodeValue());
}
}
}
}
}
// Check for the http-equiv content-language
if (httpEquiv == null) {
for (int i=0; i<attrs.getLength(); i++){
Node attrnode = attrs.item(i);
if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) {
if ("content-language".equals(attrnode.getNodeValue().toLowerCase())) {
Node valueattr = attrs.getNamedItem("content");
if (valueattr != null) {
httpEquiv = parseLanguage(valueattr.getNodeValue());
}
}
}
}
}
}
}
if ((dublinCore != null) &&
(htmlAttribute != null) &&
(httpEquiv != null)) {
return;
}
}
}
/**
* Parse a language string and return an ISO 639 primary code,
* or <code>null</code> if something wrong occurs, or if no language is found.
*/
final static String parseLanguage(String lang) {
if (lang == null) { return null; }
String code = null;
String language = null;
// First, split multi-valued values
String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);
int i = 0;
while ((language == null) && (i<langs.length)) {
// Then, get the primary code
code = langs[i].split("-")[0];
code = code.split("_")[0];
// Find the ISO 639 code
language = (String) LANGUAGES_MAP.get(code.toLowerCase());
i++;
}
return language;
}
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
}