/** * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved. * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. * http://www.ewcms.com */ package com.ewcms.plugin.crawler.generate.parser; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.html.HtmlParser; import com.ewcms.plugin.crawler.generate.crawler.Configurable; import com.ewcms.plugin.crawler.generate.crawler.CrawlConfig; import com.ewcms.plugin.crawler.generate.crawler.Page; import com.ewcms.plugin.crawler.generate.url.URLCanonicalizer; import com.ewcms.plugin.crawler.generate.url.WebURL; import com.ewcms.plugin.crawler.generate.util.Util; /** * @author Yasser Ganjisaffar <lastname at gmail dot com> */ public class Parser extends Configurable { private HtmlParser htmlParser; private ParseContext parseContext; public Parser(CrawlConfig config) { super(config); htmlParser = new HtmlParser(); parseContext = new ParseContext(); } public boolean parse(Page page, String contextURL) { if (Util.hasBinaryContent(page.getContentType())) { if (!config.isIncludeBinaryContentInCrawling()) { return false; } else { page.setParseData(BinaryParseData.getInstance()); return true; } } else if (Util.hasPlainTextContent(page.getContentType())) { try { TextParseData parseData = new TextParseData(); parseData.setTextContent(new String(page.getContentData(), page.getContentCharset())); page.setParseData(parseData); return true; } catch (Exception e) { e.printStackTrace(); } return false; } Metadata metadata = new Metadata(); HtmlContentHandler contentHandler = new HtmlContentHandler(); InputStream inputStream = null; try { inputStream = new ByteArrayInputStream(page.getContentData()); htmlParser.parse(inputStream, contentHandler, metadata, parseContext); } catch (Exception e) { e.printStackTrace(); } finally { try { if (inputStream != null) { inputStream.close(); } } catch (IOException e) { e.printStackTrace(); } } if (page.getContentCharset() == null) { page.setContentCharset(metadata.get("Content-Encoding")); } HtmlParseData parseData = new HtmlParseData(); parseData.setText(contentHandler.getBodyText().trim()); parseData.setTitle(metadata.get(Metadata.TITLE)); List<WebURL> outgoingUrls = new ArrayList<WebURL>(); String baseURL = contentHandler.getBaseUrl(); if (baseURL != null) { contextURL = baseURL; } int urlCount = 0; for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) { String href = urlAnchorPair.getHref(); href = href.trim(); if (href.length() == 0) { continue; } String hrefWithoutProtocol = href.toLowerCase(); if (href.startsWith("http://")) { hrefWithoutProtocol = href.substring(7); } if (!hrefWithoutProtocol.contains("javascript:") && !hrefWithoutProtocol.contains("@")) { String url = URLCanonicalizer.getCanonicalURL(href, contextURL); if (url != null) { WebURL webURL = new WebURL(); webURL.setURL(url); webURL.setAnchor(urlAnchorPair.getAnchor()); outgoingUrls.add(webURL); urlCount++; if (urlCount > config.getMaxOutgoingLinksToFollow()) { break; } } } } parseData.setOutgoingUrls(outgoingUrls); try { if (page.getContentCharset() == null) { parseData.setHtml(new String(page.getContentData())); } else { parseData.setHtml(new String(page.getContentData(), page.getContentCharset())); } } catch (UnsupportedEncodingException e) { e.printStackTrace(); return false; } page.setParseData(parseData); return true; } }