package guang.crawler.crawlWorker.parser; import guang.crawler.crawlWorker.util.LinkElement; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.helpers.DefaultHandler; /** * 这个类用来简单的处理HTML页面内容,包括:获取页面中去除了标签的纯文本信息;获取静态页面的URL等. * * @author yang */ public class HtmlContentHandler extends DefaultHandler { /** * 锚点文字信息的长度,避免过长的文字造成的影响 */ private final int MAX_ANCHOR_LENGTH = 100; /** * 页面中的base URL */ private String base; /** * 重定向meta信息 */ private String metaRefresh; /** * 重定向的目标地址 */ private String metaLocation; /** * 目前是否在Body元素之中 */ private boolean isWithinBodyElement; /** * body正文内容 */ private StringBuilder bodyText; /** * 对外的链接 */ private List<ExtractedUrlAnchorPair> outgoingUrls; /** * 目前正在处理的URL */ private ExtractedUrlAnchorPair curUrl = null; /** * 当前是否处于anchor之内 */ private boolean anchorFlag = false; /** * 锚点的内容 */ private StringBuilder anchorText = new StringBuilder(); public HtmlContentHandler() { this.isWithinBodyElement = false; this.bodyText = new StringBuilder(); this.outgoingUrls = new ArrayList<ExtractedUrlAnchorPair>(); } @Override public void characters(final char ch[], final int start, final int length) throws SAXException { if (this.isWithinBodyElement) { this.bodyText.append(ch, start, length); if (this.anchorFlag) { this.anchorText.append(new String(ch, start, length)); } } } @Override public void endDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void endElement(final String uri, final String localName, final String qName) throws SAXException { LinkElement element = LinkElement.getElement(localName); // 在这里只有Element.A,Element.AREA,Element.LINK三个元素,是因为需要采集它们的锚点文字信息,并不是说其他元素的链接地址不会被处理,它们同样也是被处理的。 if ((element == LinkElement.A) || (element == LinkElement.AREA) || (element == LinkElement.LINK)) { this.anchorFlag = false; if (this.curUrl != null) { // 去除换行和table字符,替换成空格 String anchor = this.anchorText.toString() .replaceAll("\n", " ") .replaceAll("\t", " ") .trim(); if (!anchor.isEmpty()) { if (anchor.length() > this.MAX_ANCHOR_LENGTH) { anchor = anchor.substring(0, this.MAX_ANCHOR_LENGTH) + "..."; } this.curUrl.setAnchor(anchor); } this.anchorText.delete(0, this.anchorText.length()); } // 由于锚点已经结束了,因此需要将其置为null。 this.curUrl = null; } if (element == LinkElement.BODY) { this.isWithinBodyElement = false; } } @Override public void endPrefixMapping(final String prefix) throws SAXException { // TODO Auto-generated method stub } @Override public void error(final SAXParseException exception) throws SAXException { // TODO Auto-generated method stub } @Override public void fatalError(final SAXParseException exception) throws SAXException { // TODO Auto-generated method stub } public String getBaseUrl() { return this.base; } public String getBodyText() { return this.bodyText.toString(); } public List<ExtractedUrlAnchorPair> getOutgoingUrls() { return this.outgoingUrls; } @Override public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException { // TODO Auto-generated method stub } @Override public void notationDecl(final String name, final String publicId, final String systemId) throws SAXException { // TODO Auto-generated method stub } @Override public void processingInstruction(final String target, final String data) throws SAXException { // TODO Auto-generated method stub } @Override public InputSource resolveEntity(final String publicId, final String systemId) throws SAXException, IOException { // TODO Auto-generated method stub return null; } @Override public void setDocumentLocator(final Locator locator) { // TODO Auto-generated method stub } @Override public void skippedEntity(final String name) throws SAXException { // TODO Auto-generated method stub } @Override public void startDocument() throws SAXException { // TODO Auto-generated method stub } @Override public void startElement(final String uri, final String localName, final String qName, final Attributes attributes) throws SAXException { LinkElement element = LinkElement.getElement(localName); if ((element == LinkElement.A) || (element == LinkElement.AREA) || (element == LinkElement.LINK)) { String href = attributes.getValue("href"); if (href != null) { this.anchorFlag = true; this.curUrl = new ExtractedUrlAnchorPair(); this.curUrl.setHref(href); this.outgoingUrls.add(this.curUrl); } return; } if (element == LinkElement.IMG) { String imgSrc = attributes.getValue("src"); if (imgSrc != null) { this.curUrl = new ExtractedUrlAnchorPair(); this.curUrl.setHref(imgSrc); this.outgoingUrls.add(this.curUrl); } return; } if ((element == LinkElement.IFRAME) || (element == LinkElement.FRAME) || (element == LinkElement.EMBED)) { String src = attributes.getValue("src"); if (src != null) { this.curUrl = new ExtractedUrlAnchorPair(); this.curUrl.setHref(src); this.outgoingUrls.add(this.curUrl); } return; } if (element == LinkElement.BASE) { if (this.base != null) { // We only consider the first occurrence of // the // Base element. String href = attributes.getValue("href"); if (href != null) { this.base = href; } } return; } if (element == LinkElement.META) { String equiv = attributes.getValue("http-equiv"); String content = attributes.getValue("content"); if ((equiv != null) && (content != null)) { equiv = equiv.toLowerCase(); // http-equiv="refresh" content="0;URL=http://foo.bar/..." if (equiv.equals("refresh") && (this.metaRefresh == null)) { int pos = content.toLowerCase() .indexOf("url="); if (pos != -1) { this.metaRefresh = content.substring(pos + 4); } this.curUrl = new ExtractedUrlAnchorPair(); this.curUrl.setHref(this.metaRefresh); this.outgoingUrls.add(this.curUrl); } // http-equiv="location" content="http://foo.bar/..." if (equiv.equals("location") && (this.metaLocation == null)) { this.metaLocation = content; this.curUrl = new ExtractedUrlAnchorPair(); this.curUrl.setHref(this.metaRefresh); this.outgoingUrls.add(this.curUrl); } } return; } if (element == LinkElement.BODY) { this.isWithinBodyElement = true; } } @Override public void startPrefixMapping(final String prefix, final String uri) throws SAXException { // TODO Auto-generated method stub } @Override public void unparsedEntityDecl(final String name, final String publicId, final String systemId, final String notationName) throws SAXException { // TODO Auto-generated method stub } @Override public void warning(final SAXParseException exception) throws SAXException { // TODO Auto-generated method stub } }