package uk.bl.wa.parsers;
/*
* #%L
* warc-indexer
* $Id:$
* $HeadURL:$
* %%
* Copyright (C) 2013 - 2014 The UK Web Archive
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.ParseError;
import org.jsoup.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import uk.bl.wa.util.Instrument;
/**
 * Tika parser that uses Jsoup to extract a handful of features from (X)HTML
 * documents and records them in the supplied Tika {@link Metadata}:
 * <ul>
 * <li>the number of parse errors reported by Jsoup ({@link #NUM_PARSE_ERRORS}),</li>
 * <li>the set of outgoing {@code a[href]} links ({@link #LINK_LIST}),</li>
 * <li>the {@code OriginalPublicationDate} meta value ({@link #ORIGINAL_PUB_DATE}),</li>
 * <li>the first non-empty paragraph ({@link #FIRST_PARAGRAPH}),</li>
 * <li>the distinct element names used in the page ({@link #DISTINCT_ELEMENTS}),</li>
 * <li>any {@code rel=license} link targets ({@link Metadata#LICENSE_URL}).</li>
 * </ul>
 *
 * The parser never writes to the SAX {@link ContentHandler}; all output goes
 * to the metadata object.
 */
public class HtmlFeatureParser extends AbstractParser {

    /** Serialisation version, required because the Tika base class is serialisable. */
    private static final long serialVersionUID = 1631417895901342814L;

    private static final Log log = LogFactory.getLog(HtmlFeatureParser.class);

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    MediaType.text("html"),
                    MediaType.application("xhtml")
            )));

    /** Metadata key for the value of {@code meta[name=OriginalPublicationDate]}. */
    public static final String ORIGINAL_PUB_DATE = "OriginalPublicationDate";

    /**
     * Multi-valued property that {@link #parse} fills with the extracted links.
     * An explicit text-bag property allows the whole set to be stored in one call.
     */
    public static final Property LINK_LIST = Property.internalTextBag("LinkList");

    /** Alternative link property; not written by this class itself. */
    public static final Property LINKS = Property.internalTextBag("LINK-LIST");

    /** Metadata key for the text of the first non-empty {@code p} element. */
    public static final String FIRST_PARAGRAPH = "FirstParagraph";

    /** Multi-valued property holding the distinct element names found in the page. */
    public static final Property DISTINCT_ELEMENTS = Property.internalTextBag("DISTINCT-ELEMENTS");

    /** Property holding the number of parse errors Jsoup recorded. */
    public static final Property NUM_PARSE_ERRORS = Property
            .internalInteger("Html-Parse-Error-Count");

    /** Default upper bound on the number of parse errors that are tracked. */
    public static final int DEFAULT_MAX_PARSE_ERRORS = 1000;

    /**
     * Configuration key that switches link normalisation on.
     * Setting this to true also adds the field url_norm to the Solr document in WARCIndexer.
     */
    public static final String CONF_LINKS_NORMALISE = "warc.index.extract.linked.normalise";

    /** Link normalisation is off unless {@link #CONF_LINKS_NORMALISE} says otherwise. */
    public static final boolean DEFAULT_LINKS_NORMALISE = false;

    /** Longest element name that is recorded in {@link #DISTINCT_ELEMENTS}; longer names are cut. */
    private static final int MAX_ELEMENT_NAME_LENGTH = 100;

    /** Selectors that may carry a {@code rel=license} link, in the order they are reported. */
    private static final String[] LICENSE_SELECTORS = {
            "a[rel=license]", "link[rel=license]", "area[rel=license]"
    };

    // The parser to use, preferring the XML variation as it does not 'fix' the
    // mark-up.
    private Parser parser = Parser.xmlParser();

    // Max errors to return:
    private int max_errors;

    private final boolean normaliseLinks;

    private final AggressiveUrlCanonicalizer linkNormaliser = new AggressiveUrlCanonicalizer();

    /**
     * Creates a parser with the default settings (no link normalisation).
     */
    public HtmlFeatureParser() {
        this(ConfigFactory.empty());
    }

    /**
     * Creates a parser configured from the given configuration.
     *
     * @param conf configuration; only {@link #CONF_LINKS_NORMALISE} is read,
     *             falling back to {@link #DEFAULT_LINKS_NORMALISE} when absent.
     */
    public HtmlFeatureParser(Config conf) {
        normaliseLinks = conf.hasPath(CONF_LINKS_NORMALISE)
                ? conf.getBoolean(CONF_LINKS_NORMALISE)
                : DEFAULT_LINKS_NORMALISE;
        this.setMaxParseErrors(DEFAULT_MAX_PARSE_ERRORS);
    }

    /**
     * Sets the maximum number of parse errors the underlying Jsoup parser tracks.
     *
     * @param max_errors the new upper bound, passed straight to the Jsoup parser.
     */
    public void setMaxParseErrors(int max_errors) {
        this.max_errors = max_errors;
        parser.setTrackErrors(max_errors);
    }

    /**
     * @return the maximum number of parse errors that are tracked.
     */
    public int getMaxParseErrors() {
        return this.max_errors;
    }

    /**
     * @return the errors held by the underlying Jsoup parser.
     */
    public List<ParseError> getParseErrors() {
        return this.parser.getErrors();
    }

    /**
     * @return the media types handled by this parser: text/html and application/xhtml.
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses the stream with Jsoup and stores the extracted features in
     * {@code metadata}. The base URL for resolving relative links is taken from
     * {@link Metadata#RESOURCE_NAME_KEY}.
     *
     * Parsing is best-effort: if the document cannot be read or parsed the
     * problem is logged and the method returns without touching the metadata.
     *
     * @param stream   the HTML content.
     * @param handler  not used by this parser.
     * @param metadata source of the resource name and target for all extracted features.
     * @param context  not used by this parser.
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        final long start = System.nanoTime();

        // Pick up the URL:
        final String url = metadata.get(Metadata.RESOURCE_NAME_KEY);

        // Parse it using JSoup
        final Document doc = parseDocument(stream, url);
        if (doc == null) {
            return;
        }
        Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#jsoupparse", start);

        final long nonJsoupStart = System.nanoTime();

        // Record the number of errors found:
        final List<ParseError> errors = parser.getErrors();
        if (errors != null) {
            metadata.set(NUM_PARSE_ERRORS, errors.size());
        }

        // Get the links (no image links):
        final Set<String> links = this.extractLinks(doc, false);
        if (!links.isEmpty()) {
            metadata.set(LINK_LIST, links.toArray(new String[links.size()]));
        }

        // Get the publication date, from BBC pages. If the tag is repeated the
        // last occurrence wins, because set() overwrites.
        for (Element meta : doc.select("meta[name=OriginalPublicationDate]")) {
            metadata.set(ORIGINAL_PUB_DATE, meta.attr("content"));
        }

        // Grab the first paragraph with text, and extract the text:
        for (Element p : doc.select("p")) {
            final String text = p.text();
            if (text != null && text.trim().length() > 0) {
                metadata.set(FIRST_PARAGRAPH, text);
                break;
            }
        }

        // Grab the list of distinct elements used in the page:
        final Set<String> distinct = new HashSet<String>();
        for (Element e : doc.select("*")) {
            final String tagName = e.tag().getName();
            // Skip Jsoup's synthetic document root.
            if (!"#root".equals(tagName)) {
                distinct.add(StringUtils.left(tagName, MAX_ELEMENT_NAME_LENGTH));
            }
        }
        // For some elements, dig deeper and record attributes too:
        for (Element e : doc.select("link")) {
            distinct.add("link/@rel=" + e.attr("rel"));
        }
        // Store them:
        metadata.set(DISTINCT_ELEMENTS, distinct.toArray(new String[distinct.size()]));

        // Licence field, following:
        // http://www.whatwg.org/specs/web-apps/current-work/multipage/links.html#link-type-license
        for (String selector : LICENSE_SELECTORS) {
            for (Element e : doc.select(selector)) {
                metadata.add(Metadata.LICENSE_URL, e.attr("href"));
            }
        }

        Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#featureextract", nonJsoupStart);
    }

    /**
     * Reads the stream and parses it with the configured Jsoup parser, letting
     * Jsoup detect the character set. If the document declares a character set
     * name that Java rejects, the same bytes are parsed again as UTF-8.
     *
     * The content is buffered in memory first so that the retry sees the whole
     * document; retrying on the original stream would find it already consumed.
     *
     * @param stream the HTML content.
     * @param url    base URL handed to Jsoup for resolving relative links.
     * @return the parsed document, or {@code null} if reading or parsing failed
     *         (the failure is logged).
     */
    private Document parseDocument(InputStream stream, String url) {
        final byte[] content;
        try {
            content = readFully(stream);
        } catch (Exception e) {
            log.error("Jsoup parse failed: " + e, e);
            return null;
        }

        try {
            return Jsoup.parse(new java.io.ByteArrayInputStream(content), null, url, parser);
        } catch (java.nio.charset.IllegalCharsetNameException e) {
            log.warn("Jsoup parse had to assume UTF-8: " + e);
        } catch (Exception e) {
            log.error("Jsoup parse failed: " + e, e);
            return null;
        }

        // Second attempt with an explicit character set. The configured parser
        // is reused so that the recorded parse errors describe this document.
        try {
            return Jsoup.parse(new java.io.ByteArrayInputStream(content), "UTF-8", url, parser);
        } catch (Exception e) {
            log.error("Jsoup parse failed: " + e, e);
            return null;
        }
    }

    /**
     * Copies everything that can be read from the stream into a byte array.
     * The stream is not closed; that stays the caller's responsibility.
     *
     * @param stream the stream to drain.
     * @return the bytes read, possibly empty.
     * @throws IOException if reading fails.
     */
    private static byte[] readFully(InputStream stream) throws IOException {
        final java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
        final byte[] chunk = new byte[8192];
        int read;
        while ((read = stream.read(chunk)) != -1) {
            buffer.write(chunk, 0, read);
        }
        return buffer.toByteArray();
    }

    /**
     * Collects the absolute targets of all {@code a[href]} links in the
     * document, optionally together with the sources of all {@code img[src]}
     * elements. Other references (stylesheets etc.) are not extracted. Each
     * link is passed through {@link #normaliseLink(String)}.
     *
     * @param doc             the parsed document.
     * @param includeImgLinks whether image sources are collected as well.
     * @return the distinct links found; never {@code null}, possibly empty.
     */
    private Set<String> extractLinks(Document doc, boolean includeImgLinks) {
        final Set<String> linkset = new HashSet<String>();
        // All a with href
        collectLinks(linkset, doc, "a[href]", "abs:href");
        // All images:
        if (includeImgLinks) {
            collectLinks(linkset, doc, "img[src]", "abs:src");
        }
        return linkset;
    }

    /**
     * Adds the given attribute of every element matching the selector to the
     * target set. Empty values are skipped: Jsoup yields an empty string for an
     * {@code abs:} attribute that cannot be resolved to an absolute URL, and
     * such an entry carries no information.
     *
     * @param target    set receiving the (possibly normalised) links.
     * @param doc       the parsed document.
     * @param selector  CSS selector picking the elements to inspect.
     * @param attribute attribute key to read, e.g. {@code abs:href}.
     */
    private void collectLinks(Set<String> target, Document doc, String selector, String attribute) {
        for (Element element : doc.select(selector)) {
            final String link = element.attr(attribute);
            if (link == null || link.length() == 0) {
                continue;
            }
            target.add(normaliseLink(link));
        }
    }

    /**
     * Normalises links if the parser has been configured to do so.
     *
     * @param link a link from a HTML document.
     * @return the link in normalised form or the unchanged link if normalisation has not been enabled.
     */
    public String normaliseLink(String link) {
        return normaliseLinks ? linkNormaliser.canonicalize(link) : link;
    }

    /**
     * Convenience method that runs a default-configured parser over the stream.
     * Failures are logged rather than propagated, so the result may hold
     * nothing beyond the resource name.
     *
     * @param in  the HTML content.
     * @param url the URL of the document, stored as the resource name and used
     *            as base for resolving relative links.
     * @return the metadata collected; never {@code null}.
     */
    public static Metadata extractMetadata(InputStream in, String url) {
        final HtmlFeatureParser hfp = new HtmlFeatureParser();
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, url);
        try {
            hfp.parse(in, null, metadata, null);
        } catch (Exception e) {
            // Pass the exception on to the logger so the cause is not lost.
            log.error("Failed to extract Metadata.", e);
        }
        return metadata;
    }
}