/**
 * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
 * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 * http://www.ewcms.com
 */
package com.ewcms.plugin.crawler;

import static org.junit.Assert.*;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ewcms.plugin.crawler.manager.dao.GatherDAO;
import com.ewcms.plugin.crawler.manager.service.GatherService;
import com.ewcms.plugin.crawler.model.MatchBlock;

/**
 * Tests the result of filtering HTML with match blocks and filter blocks.
 *
 * @author wu_zhijun
 */
public class BlockIterationTest {

    private static final Logger logger = LoggerFactory.getLogger(BlockIterationTest.class);

    private GatherService gatherService;
    private GatherDAO gatherDAO;

    /**
     * Creates a {@link GatherService} wired to a Mockito mock of {@link GatherDAO}.
     */
    @Before
    public void setUp() {
        gatherService = new GatherService();
        gatherDAO = mock(GatherDAO.class);
        gatherService.setGatherDAO(gatherDAO);
    }

    /**
     * Checks the string handling used to strip a trailing {@code " > "} or {@code ", "}
     * separator from a selector expression.
     */
    @Test
    public void testRegex() {
        String matchRegex = "div#test > div#test1";
        assertFalse(matchRegex.endsWith(" > "));

        matchRegex = "div#test > div#test1 > ";
        // Assert the precondition instead of guarding with an if, so the
        // check below can never be skipped silently.
        assertTrue(matchRegex.endsWith(" > "));
        matchRegex = matchRegex.substring(0, matchRegex.length() - 3);
        assertEquals("div#test > div#test1", matchRegex);

        matchRegex = "div#test2, div#tets3, ";
        assertEquals("div#test2, div#tets3", matchRegex.substring(0, matchRegex.length() - 2));
    }

    /**
     * Fetches a live page and logs the HTML of the selected elements.
     *
     * <p>This depends on an external web site, so an I/O failure is logged
     * rather than failing the build.
     */
    @Test
    public void testJsoup() {
        try {
            Document doc = Jsoup.connect("http://sports.sina.com.cn/k/2011-12-14/00395867392.shtml").get();
            Elements elements = doc.select("h1#artibodyTitle, div#artibody");
            logger.info(elements.html());
        } catch (IOException e) {
            logger.warn("Unable to fetch the remote page, nothing was selected", e);
        }
    }

    /**
     * Runs the recursive match-block walk over a fixed HTML document containing
     * three tables, with {@code table.table_1} and {@code table.table_2} as the
     * root blocks and child blocks supplied through the mocked DAO.
     */
    @Test
    public void testMatchBlock() {
        String html = "<html>"
                + " <head>"
                + " <title>测试使用</title>"
                + " </head>"
                + " <body>"
                + " <table class='table_1'>"
                + " <tr class='tr_1'>"
                + " <td class='td_1'>"
                + " 内容1"
                + " </td>"
                + " </tr>"
                + " <tr class='tr_2'>"
                + " <td class='td_2'>"
                + " 内容2"
                + " </td>"
                + " </tr>"
                + " </table>"
                + " <table class='table_2'>"
                + " <tr class='tr_3'>"
                + " <td class='td'>"
                + " 内容3"
                + " </td>"
                + " </tr>"
                + " <tr class='tr_4'>"
                + " <td class='td'>"
                + " 内容4"
                + " </td>"
                + " </tr>"
                + " </table>"
                + " <table class='table_3'>"
                + " <tr class='tr_3'>"
                + " <td class='td_5'>"
                + " 内容5"
                + " </td>"
                + " </tr>"
                + " <tr class='tr_4'>"
                + " <td class='td'>"
                + " 内容6"
                + " </td>"
                + " </tr>"
                + " </table>"
                + " </body>"
                + "</html>";

        MatchBlock matchBlock_1 = newMatchBlock(1L, null, "table.table_1", 1L);
        MatchBlock matchBlock_2 = newMatchBlock(2L, null, "table.table_2", 2L);
        MatchBlock matchBlock_3 = newMatchBlock(3L, matchBlock_1, "tr.tr_3", 3L);
        MatchBlock matchBlock_4 = newMatchBlock(4L, matchBlock_1, "tr.tr_4", 4L);
        MatchBlock matchBlock_5 = newMatchBlock(5L, matchBlock_3, "td.td_5", 5L);

        List<MatchBlock> parents = new ArrayList<MatchBlock>();
        parents.add(matchBlock_1);
        parents.add(matchBlock_2);

        List<MatchBlock> matchBlocks_1 = new ArrayList<MatchBlock>();
        matchBlocks_1.add(matchBlock_3);
        matchBlocks_1.add(matchBlock_4);

        List<MatchBlock> matchBlocks_2 = new ArrayList<MatchBlock>();
        matchBlocks_2.add(matchBlock_5);

        when(gatherDAO.findChildMatchBlockByParentId(1L, 1L)).thenReturn(matchBlocks_1);
        when(gatherDAO.findChildMatchBlockByParentId(1L, 2L)).thenReturn(new ArrayList<MatchBlock>());
        when(gatherDAO.findChildMatchBlockByParentId(1L, 3L)).thenReturn(matchBlocks_2);

        Document doc = Jsoup.parse(html);
        StringBuilder sbHtml = new StringBuilder();
        childrenMatchBlock(1L, doc, parents, sbHtml);

        String result = sbHtml.toString();
        logger.info(result);

        // Both root blocks contribute their inner HTML; table_3 is not a
        // configured block, so its content must not be collected.
        assertTrue(result.contains("内容1"));
        assertTrue(result.contains("内容3"));
        assertFalse(result.contains("内容5"));
    }

    /**
     * Builds a {@link MatchBlock} with the given identity, parent, selector and sort order.
     *
     * @param id     block id
     * @param parent parent block, or {@code null} for a root block
     * @param regex  selector expression passed to {@code Document.select}
     * @param sort   sort order
     * @return the populated block
     */
    private static MatchBlock newMatchBlock(long id, MatchBlock parent, String regex, long sort) {
        MatchBlock block = new MatchBlock();
        block.setId(id);
        block.setParent(parent);
        block.setRegex(regex);
        block.setSort(sort);
        return block;
    }

    /**
     * Walks the given blocks in order. For each block, selects its elements in
     * {@code doc}, recurses into its child blocks (looked up through the DAO)
     * against a document re-parsed from the selected inner HTML, and then appends
     * the block's own inner HTML to {@code sbHtml}.
     *
     * @param gatherId    gather id passed to the DAO lookup
     * @param doc         document to select from
     * @param matchBlocks blocks to apply at this level
     * @param sbHtml      accumulator receiving the selected HTML
     */
    private void childrenMatchBlock(Long gatherId, Document doc, List<MatchBlock> matchBlocks,
            StringBuilder sbHtml) {
        for (MatchBlock matchBlock : matchBlocks) {
            Elements elements = doc.select(matchBlock.getRegex());
            String subHtml = elements.html();

            List<MatchBlock> children = gatherDAO.findChildMatchBlockByParentId(gatherId, matchBlock.getId());
            if (!children.isEmpty()) {
                Document subDoc = Jsoup.parse(subHtml);
                childrenMatchBlock(gatherId, subDoc, children, sbHtml);
            }

            // Child output (if any) precedes the block's own HTML.
            sbHtml.append(subHtml);
        }
    }
}