package org.codelibs.riverweb.transformer;

import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;

import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.codelibs.core.io.ResourceUtil;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.riverweb.config.RiverConfig;
import org.codelibs.riverweb.config.RiverConfigManager;
import org.junit.Test;

/**
 * Tests {@link ScrapingTransformer} by feeding it the
 * {@code html/fess_codelibs_org.html} resource and inspecting the data map
 * handed to {@code storeIndex}.
 */
public class ScrapingTransformerTest {

    /**
     * Registers scraping rules for the side menu and the first two sections,
     * runs {@code storeData} on the HTML resource and verifies the values
     * passed to the overridden {@code storeIndex}.
     */
    @Test
    public void fess_codelibs_org() {
        final RiverConfigManager riverConfigManager = new RiverConfigManager();

        // All content assertions live inside storeIndex(); this flag lets the
        // test fail instead of passing vacuously if that hook is never reached.
        final boolean[] storeIndexCalled = { false };

        final ScrapingTransformer transformer = new ScrapingTransformer() {
            @SuppressWarnings("unchecked")
            @Override
            protected void storeIndex(final ResponseData responseData, final Map<String, Object> dataMap) {
                storeIndexCalled[0] = true;
                System.out.println(dataMap);

                assertThat(((List<String>) ((Map<String, Object>) dataMap.get("nav")).get("sideMenus")).size(), is(27));

                assertThat(((Map<String, Object>) dataMap.get("section1")).get("title").toString(), is("What is Fess?"));
                assertThat(((List<String>) ((Map<String, Object>) dataMap.get("section1")).get("body")).size(), is(2));

                assertThat(((Map<String, Object>) dataMap.get("section2")).get("title").toString(), is("Features"));
                assertThat(((List<String>) ((Map<String, Object>) dataMap.get("section2")).get("body")).size(), is(12));
            }
        };
        transformer.riverConfigManager = riverConfigManager;

        final String sessionId = "test";
        final String url = "http://fess.codelibs.org/";

        final RiverConfig riverConfig = riverConfigManager.get(sessionId);
        transformer.riverConfigLocal.set(riverConfig);

        final Map<String, Map<String, Object>> scrapingRuleMap = new HashMap<String, Map<String, Object>>();
        addScrapingRuleMap(scrapingRuleMap, "text", "nav.sideMenus", "div.sidebar-nav ul li", Boolean.TRUE, Boolean.TRUE);
        addScrapingRuleMap(scrapingRuleMap, "text", "section1.title", "div.section:eq(0) h2", null, null);
        addScrapingRuleMap(scrapingRuleMap, "text", "section1.body", "div.section:eq(0) p", Boolean.TRUE, Boolean.TRUE);
        addScrapingRuleMap(scrapingRuleMap, "text", "section2.title", "div.section:eq(1) h2", null, null);
        addScrapingRuleMap(scrapingRuleMap, "text", "section2.body", "div.section:eq(1) ul li", Boolean.TRUE, Boolean.TRUE);

        final Map<String, Object> patternMap = new HashMap<String, Object>();
        patternMap.put("url", url);
        riverConfig.addScrapingRule(null, patternMap, scrapingRuleMap);

        // The response body is supplied as a File, so no stream is opened here
        // and there is nothing to close afterwards.
        final ResponseData responseData = new ResponseData();
        responseData.setSessionId(sessionId);
        responseData.setUrl(url);
        responseData.setResponseBody(ResourceUtil.getResourceAsFile("html/fess_codelibs_org.html"), false);
        responseData.setCharSet("UTF-8");

        final ResultData resultData = new ResultData();
        transformer.storeData(responseData, resultData);

        assertThat("storeIndex was never invoked, so no scraped data was verified", storeIndexCalled[0], is(true));
    }

    /**
     * Adds one rule entry to {@code scrapingRuleMap}, keyed by {@code property}.
     *
     * @param scrapingRuleMap map that receives the new entry
     * @param type            key under which {@code path} is stored in the entry (e.g. {@code "text"})
     * @param property        key of the entry within {@code scrapingRuleMap}
     * @param path            value stored under {@code type}
     * @param isArray         stored as {@code "is_array"} when non-null; omitted otherwise
     * @param trimSpaces      stored as {@code "trim_spaces"} when non-null; omitted otherwise
     */
    private void addScrapingRuleMap(final Map<String, Map<String, Object>> scrapingRuleMap, final String type,
            final String property, final String path, final Boolean isArray, final Boolean trimSpaces) {
        final Map<String, Object> valueMap = new HashMap<String, Object>();
        valueMap.put(type, path);
        if (isArray != null) {
            valueMap.put("is_array", isArray);
        }
        if (trimSpaces != null) {
            valueMap.put("trim_spaces", trimSpaces);
        }
        scrapingRuleMap.put(property, valueMap);
    }
}