package guang.crawler.crawlWorker.parser; import guang.crawler.commons.Page; import guang.crawler.commons.WebURL; import guang.crawler.commons.parserData.BinaryParseData; import guang.crawler.commons.parserData.HtmlParseData; import guang.crawler.commons.parserData.TextParseData; import guang.crawler.crawlWorker.WorkerConfig; import guang.crawler.crawlWorker.url.URLCanonicalizer; import guang.crawler.crawlWorker.util.Util; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; import org.apache.log4j.Logger; import org.apache.tika.metadata.DublinCore; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.html.HtmlParser; /** * 解析器,用来对下载的页面进行解析 * * @author sun * */ public class Parser { protected static final Logger logger = Logger.getLogger(Parser.class.getName()); /** * HTML页面的解析器 */ private HtmlParser htmlParser; /** * 解析上下文 */ private ParseContext parseContext; public Parser() { this.htmlParser = new HtmlParser(); this.parseContext = new ParseContext(); } /** * 对页面进行解析 * * @param page * @param contextURL * @return */ public boolean parse(final Page page, final String contextURL) { // 如果页面中含有二进制页面内容. if (Util.hasBinaryContent(page.getContentType())) { if (!WorkerConfig.me() .isIncludeBinaryContentInCrawling()) { return false; } page.setParseData(BinaryParseData.getInstance()); return true; } // 如果页面中含有文本内容(txt,javascript,css) else if (Util.hasPlainTextContent(page.getContentType())) { // 如果只是一般的文本,而不是HTML页面,那么就没有什么好处理的了 try { TextParseData parseData = new TextParseData(); if (page.getContentCharset() == null) { parseData.setTextContent(new String(page.getContentData())); } else { parseData.setTextContent(new String(page.getContentData(), page.getContentCharset())); } page.setParseData(parseData); return true; } catch (Exception e) { Parser.logger.error(e.getMessage() + ", while parsing: " + page.getWebURL() .getURL()); } return false; } else { // 否则,其他类型都被算作HTML页面类型 // 将其当作HTML页面进行解析 Metadata metadata = new Metadata(); HtmlContentHandler contentHandler = new HtmlContentHandler(); InputStream inputStream = null; try { inputStream = new ByteArrayInputStream(page.getContentData()); this.htmlParser.parse(inputStream, contentHandler, metadata, this.parseContext); } catch (Exception e) { Parser.logger.error(e.getMessage() + ", while parsing: " + page.getWebURL() .getURL()); } finally { try { if (inputStream != null) { inputStream.close(); } } catch (IOException e) { Parser.logger.error(e.getMessage() + ", while parsing: " + page.getWebURL() .getURL()); } } // 将解析的结果设置到HtmlParseData以及Page中 if (page.getContentCharset() == null) { page.setContentCharset(metadata.get("Content-Encoding")); } HtmlParseData parseData = new HtmlParseData(); parseData.setText(contentHandler.getBodyText() .trim()); parseData.setTitle(metadata.get(DublinCore.TITLE)); // 处理获取的URL连接 List<WebURL> outgoingUrls = this.parseURLs(contextURL, contentHandler); parseData.setOutgoingUrls(outgoingUrls); try { if (page.getContentCharset() == null) { parseData.setHtml(new String(page.getContentData())); } else { parseData.setHtml(new String(page.getContentData(), page.getContentCharset())); } } catch (UnsupportedEncodingException e) { e.printStackTrace(); return false; } page.setParseData(parseData); return true; } } /** * 对URL进行解析,根据页面的<base>的设置,确定那些相对URL的路径. * * @param contextURL * @param contentHandler * @return */ private List<WebURL> parseURLs(String contextURL, final HtmlContentHandler contentHandler) { List<WebURL> outgoingUrls = new ArrayList<WebURL>(); String baseURL = contentHandler.getBaseUrl(); if (baseURL != null) { contextURL = baseURL; } int urlCount = 0; for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) { String href = urlAnchorPair.getHref(); href = href.trim(); if (href.length() == 0) { continue; } String hrefWithoutProtocol = href.toLowerCase(); if (href.startsWith("http://")) { hrefWithoutProtocol = href.substring(7); } if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("mailto:") && !hrefWithoutProtocol.contains("@")) { String url = URLCanonicalizer.getCanonicalURL(href, contextURL); if (url != null) { WebURL webURL = WebURL.newWebURL() .setURL(url) .setAnchor(urlAnchorPair.getAnchor()); outgoingUrls.add(webURL); urlCount++; if (urlCount > WorkerConfig.me() .getMaxOutgoingLinksToFollow()) { break; } } } } return outgoingUrls; } }