/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.parser;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that collects, from a stream of HTML element events:
 * <ul>
 * <li>the text found between the start and end of the {@code body} element,</li>
 * <li>every URL referenced by {@code a}, {@code area}, {@code link}, {@code img},
 * {@code iframe}, {@code frame} and {@code embed} elements, plus the targets of
 * {@code meta http-equiv="refresh"} and {@code meta http-equiv="location"},</li>
 * <li>the {@code href} of the first {@code base} element that carries one.</li>
 * </ul>
 * Instances hold mutable per-document state and are meant to be used for a single parse.
 */
public class HtmlContentHandler extends DefaultHandler {

    /** Anchor texts longer than this many characters are cut and suffixed with "...". */
    private static final int MAX_ANCHOR_LENGTH = 100;

    /** Prefix that introduces the target URL inside a meta refresh {@code content} value. */
    private static final String REFRESH_URL_PREFIX = "url=";

    /** The element names this handler reacts to; every other element is ignored. */
    private enum Element {
        A, AREA, LINK, IFRAME, FRAME, EMBED, IMG, BASE, META, BODY
    }

    /** Maps element names to {@link Element} constants. */
    private static class HtmlFactory {
        private static final Map<String, Element> NAME_TO_ELEMENT = new HashMap<>();

        static {
            for (Element element : Element.values()) {
                // Locale.ROOT keeps the keys stable regardless of the default locale
                // (e.g. "IMG" must become "img", not a dotless-i variant).
                NAME_TO_ELEMENT.put(element.name().toLowerCase(Locale.ROOT), element);
            }
        }

        private HtmlFactory() {
        }

        /**
         * Looks up the element for a tag name, ignoring case.
         *
         * @param name the element name as reported by the parser; may be {@code null}
         * @return the matching element, or {@code null} when the name is {@code null} or not tracked
         */
        public static Element getElement(String name) {
            if (name == null) {
                return null;
            }
            return NAME_TO_ELEMENT.get(name.toLowerCase(Locale.ROOT));
        }
    }

    private String base;
    private String metaRefresh;
    private String metaLocation;

    private boolean isWithinBodyElement;
    private final StringBuilder bodyText = new StringBuilder();

    private final List<ExtractedUrlAnchorPair> outgoingUrls = new ArrayList<>();

    /** The link (a/area/link with an href) that is currently open, or {@code null}. */
    private ExtractedUrlAnchorPair curUrl;
    /** True while text should be accumulated as anchor text for {@link #curUrl}. */
    private boolean anchorFlag;
    private final StringBuilder anchorText = new StringBuilder();

    public HtmlContentHandler() {
    }

    /**
     * Records outgoing URLs, the base URL and meta redirects, and tracks entry into the body.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        Element element = HtmlFactory.getElement(localName);
        if (element == null) {
            return;
        }

        switch (element) {
            case A:
            case AREA:
            case LINK: {
                String href = attributes.getValue("href");
                if (href != null) {
                    anchorFlag = true;
                    curUrl = addOutgoingUrl(href);
                }
                break;
            }
            case IMG:
            case IFRAME:
            case FRAME:
            case EMBED: {
                String src = attributes.getValue("src");
                if (src != null) {
                    // Deliberately does not touch curUrl: an image or frame nested in a link
                    // must not steal the anchor text that belongs to the enclosing link.
                    addOutgoingUrl(src);
                }
                break;
            }
            case BASE: {
                // Only the first base element that provides an href is honoured.
                if (base == null) {
                    String href = attributes.getValue("href");
                    if (href != null) {
                        base = href;
                    }
                }
                break;
            }
            case META:
                handleMeta(attributes);
                break;
            case BODY:
                isWithinBodyElement = true;
                break;
            default:
                break;
        }
    }

    /**
     * Finalises the anchor text of the link being closed and tracks exit from the body.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        Element element = HtmlFactory.getElement(localName);
        if (element == Element.A || element == Element.AREA || element == Element.LINK) {
            anchorFlag = false;
            if (curUrl != null) {
                String anchor = anchorText.toString().replace('\n', ' ').replace('\t', ' ').trim();
                if (!anchor.isEmpty()) {
                    if (anchor.length() > MAX_ANCHOR_LENGTH) {
                        anchor = anchor.substring(0, MAX_ANCHOR_LENGTH) + "...";
                    }
                    curUrl.setAnchor(anchor);
                }
            }
            anchorText.setLength(0);
            curUrl = null;
        } else if (element == Element.BODY) {
            isWithinBodyElement = false;
        }
    }

    /**
     * Appends character data to the body text and, while a link is open, to its anchor text.
     * Characters reported outside the body element are ignored.
     */
    @Override
    public void characters(char ch[], int start, int length) throws SAXException {
        if (!isWithinBodyElement) {
            return;
        }
        bodyText.append(ch, start, length);
        if (anchorFlag) {
            anchorText.append(ch, start, length);
        }
    }

    /** @return the text collected so far from inside the body element */
    public String getBodyText() {
        return bodyText.toString();
    }

    /** @return the live list of URLs collected so far, in document order */
    public List<ExtractedUrlAnchorPair> getOutgoingUrls() {
        return outgoingUrls;
    }

    /** @return the href of the first base element seen, or {@code null} if there was none */
    public String getBaseUrl() {
        return base;
    }

    /** Creates a pair for {@code href}, appends it to the outgoing URLs and returns it. */
    private ExtractedUrlAnchorPair addOutgoingUrl(String href) {
        ExtractedUrlAnchorPair pair = new ExtractedUrlAnchorPair();
        pair.setHref(href);
        outgoingUrls.add(pair);
        return pair;
    }

    /**
     * Handles {@code <meta http-equiv="refresh" content="0;URL=...">} and
     * {@code <meta http-equiv="location" content="...">}. Only the first usable occurrence
     * of each kind is recorded.
     */
    private void handleMeta(Attributes attributes) {
        String equiv = attributes.getValue("http-equiv");
        String content = attributes.getValue("content");
        if (equiv == null || content == null) {
            return;
        }

        if (equiv.equalsIgnoreCase("refresh") && metaRefresh == null) {
            int pos = indexOfIgnoreCase(content, REFRESH_URL_PREFIX);
            // A refresh without a URL just reloads the page; there is nothing to record.
            if (pos != -1) {
                metaRefresh = content.substring(pos + REFRESH_URL_PREFIX.length());
                addOutgoingUrl(metaRefresh);
            }
        }

        if (equiv.equalsIgnoreCase("location") && metaLocation == null) {
            metaLocation = content;
            addOutgoingUrl(metaLocation);
        }
    }

    /**
     * Case-insensitive {@code indexOf} that works on the original string, so the returned
     * index is always valid for {@code text} (lower-casing a copy can change its length).
     */
    private static int indexOfIgnoreCase(String text, String needle) {
        int needleLength = needle.length();
        int lastStart = text.length() - needleLength;
        for (int i = 0; i <= lastStart; i++) {
            if (text.regionMatches(true, i, needle, 0, needleLength)) {
                return i;
            }
        }
        return -1;
    }
}