/*
* Copyright 2012-2017 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.crawler.transformer;
import java.io.ByteArrayInputStream;
import java.io.StringWriter;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.codelibs.core.lang.ClassUtil;
import org.codelibs.core.lang.FieldUtil;
import org.codelibs.core.misc.ValueHolder;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.es.config.exentity.LabelType;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.DocumentHelper;
import org.codelibs.fess.helper.FileTypeHelper;
import org.codelibs.fess.helper.LabelTypeHelper;
import org.codelibs.fess.helper.LabelTypeHelper.LabelTypePattern;
import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.MemoryUtil;
import org.cyberneko.html.parsers.DOMParser;
import org.lastaflute.di.core.exception.ComponentNotFoundException;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
public class FessXpathTransformerTest extends UnitFessTestCase {
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformerTest.class);
/**
 * Runs {@code FessXpathTransformer.transform} 10,000 times against a small HTML page and
 * asserts that the used memory reported by {@code MemoryUtil} is below 100,000,000
 * once the loop has finished.
 *
 * @throws Exception if component registration, encoding, or the transformation fails
 */
public void test_transform() throws Exception {
    final String data = "<html><head><title>Test</title></head><body><h1>Header1</h1><p>This is a pen.</p></body></html>";
    // Encode once, with the charset the response declares below, instead of relying on the
    // platform default charset on every iteration.
    final byte[] body = data.getBytes("UTF-8");

    final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
    fessXpathTransformer.init();

    // Register the helper components under the names used by this test setup.
    SingletonLaContainerFactory.getContainer().register(CrawlingInfoHelper.class, "crawlingInfoHelper");
    SingletonLaContainerFactory.getContainer().register(PathMappingHelper.class, "pathMappingHelper");
    SingletonLaContainerFactory.getContainer().register(CrawlingConfigHelper.class, "crawlingConfigHelper");
    SingletonLaContainerFactory.getContainer().register(SystemHelper.class, "systemHelper");
    SingletonLaContainerFactory.getContainer().register(FileTypeHelper.class, "fileTypeHelper");
    SingletonLaContainerFactory.getContainer().register(DocumentHelper.class, "documentHelper");
    SingletonLaContainerFactory.getContainer().register(LabelTypeHelper.class, "labelTypeHelper");

    // Store a web config with an empty label list under the session id prefix "test".
    final WebConfig webConfig = new WebConfig();
    setValueToObject(webConfig, "labelTypeList", new ArrayList<LabelType>());
    ComponentUtil.getCrawlingConfigHelper().store("test", webConfig);
    setValueToObject(ComponentUtil.getLabelTypeHelper(), "labelTypePatternList", new ArrayList<LabelTypePattern>());

    for (int i = 0; i < 10000; i++) {
        if (i % 1000 == 0) {
            // Log memory usage and request a GC every 1000 iterations.
            logger.info("{}:{}", MemoryUtil.getMemoryUsageLog(), i);
            System.gc();
        }
        final ResponseData responseData = new ResponseData();
        responseData.setCharSet("UTF-8");
        // Content length is the number of encoded bytes, not the number of chars.
        responseData.setContentLength(body.length);
        responseData.setExecutionTime(1000L);
        responseData.setHttpStatusCode(200);
        responseData.setLastModified(new Date());
        responseData.setMethod("GET");
        responseData.setMimeType("text/html");
        responseData.setParentUrl("http://fess.codelibs.org/");
        responseData.setResponseBody(body);
        responseData.setSessionId("test-1");
        responseData.setStatus(0);
        responseData.setUrl("http://fess.codelibs.org/test.html");
        // The result itself is not inspected; only the memory footprint matters here.
        fessXpathTransformer.transform(responseData);
    }

    System.gc();
    Thread.sleep(1000L);
    logger.info(MemoryUtil.getMemoryUsageLog());
    assertTrue(MemoryUtil.getUsedMemory() < 100000000L);
}
/**
 * Assigns {@code value} to the field called {@code name} that is declared on the runtime
 * class of {@code obj}, after making that field accessible.
 *
 * @param obj the object whose field is written
 * @param name the declared field name
 * @param value the value to store in the field
 */
private void setValueToObject(Object obj, String name, Object value) {
    final Class<?> ownerType = obj.getClass();
    final Field target = ClassUtil.getDeclaredField(ownerType, name);
    target.setAccessible(true);
    FieldUtil.set(target, obj, value);
}
/**
 * Checks that {@code pruneNode} returns a node that serializes identically to the source
 * document when the configured pruned-tags value is an empty string.
 *
 * @throws Exception if parsing or serialization fails
 */
public void test_pruneNode() throws Exception {
    final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
    final Document document = getDocument(data);
    ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "";
        }
    });
    try {
        final FessXpathTransformer transformer = new FessXpathTransformer();
        transformer.init();
        // Prune a deep copy so the original document stays available for comparison.
        final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
        assertEquals(getXmlString(document), getXmlString(pruneNode));
    } finally {
        // Reset even when an assertion fails so the stub config cannot leak into other tests.
        ComponentUtil.setFessConfig(null);
    }
}
/**
 * Checks that with the pruned-tags value {@code "noscript"} the pruned copy keeps the
 * {@code SCRIPT} element and its text but no longer contains the {@code NOSCRIPT}
 * element or its text, while the source document keeps both.
 *
 * @throws Exception if parsing or serialization fails
 */
public void test_pruneNode_removeNoScript() throws Exception {
    final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
    final Document document = getDocument(data);
    ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "noscript";
        }
    });
    try {
        final FessXpathTransformer transformer = new FessXpathTransformer();
        transformer.init();
        // Prune a deep copy so the original document can be checked for being untouched.
        final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
        final String docString = getXmlString(document);
        final String pnString = getXmlString(pruneNode);

        // The source document still has everything.
        assertTrue(docString.contains("<SCRIPT>"));
        assertTrue(docString.contains("foo"));
        assertTrue(docString.contains("<NOSCRIPT>"));
        assertTrue(docString.contains("bar"));

        // The pruned copy lost only the noscript part.
        assertTrue(pnString.contains("<SCRIPT>"));
        assertTrue(pnString.contains("foo"));
        assertFalse(pnString.contains("<NOSCRIPT>"));
        assertFalse(pnString.contains("bar"));
    } finally {
        // Reset even when an assertion fails so the stub config cannot leak into other tests.
        ComponentUtil.setFessConfig(null);
    }
}
/**
 * Checks that with the pruned-tags value {@code "script,noscript"} the pruned copy contains
 * neither the {@code SCRIPT} nor the {@code NOSCRIPT} element (nor their text), while the
 * source document keeps both.
 *
 * @throws Exception if parsing or serialization fails
 */
public void test_pruneNode_removeScriptAndNoscript() throws Exception {
    final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
    final Document document = getDocument(data);
    ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "script,noscript";
        }
    });
    try {
        final FessXpathTransformer transformer = new FessXpathTransformer();
        transformer.init();
        // Prune a deep copy so the original document can be checked for being untouched.
        final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
        final String docString = getXmlString(document);
        final String pnString = getXmlString(pruneNode);

        // The source document still has everything.
        assertTrue(docString.contains("<SCRIPT>"));
        assertTrue(docString.contains("foo"));
        assertTrue(docString.contains("<NOSCRIPT>"));
        assertTrue(docString.contains("bar"));

        // Both elements are gone from the pruned copy.
        assertFalse(pnString.contains("<SCRIPT>"));
        assertFalse(pnString.contains("foo"));
        assertFalse(pnString.contains("<NOSCRIPT>"));
        assertFalse(pnString.contains("bar"));
    } finally {
        // Reset even when an assertion fails so the stub config cannot leak into other tests.
        ComponentUtil.setFessConfig(null);
    }
}
/**
 * Checks that with the pruned-tags value {@code "div#barid"} the pruned copy keeps the
 * plain {@code DIV} but no longer contains the {@code DIV} with {@code id="barid"} or its
 * text, while the source document keeps both.
 *
 * @throws Exception if parsing or serialization fails
 */
public void test_pruneNode_removeDivId() throws Exception {
    final String data = "<html><body><br/><div>foo</div><div id=\"barid\">bar</div></body></html>";
    final Document document = getDocument(data);
    ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "div#barid";
        }
    });
    try {
        final FessXpathTransformer transformer = new FessXpathTransformer();
        transformer.init();
        // Prune a deep copy so the original document can be checked for being untouched.
        final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
        final String docString = getXmlString(document);
        final String pnString = getXmlString(pruneNode);

        // The source document still has both divs.
        assertTrue(docString.contains("<DIV>"));
        assertTrue(docString.contains("foo"));
        assertTrue(docString.contains("<DIV id=\"barid\">"));
        assertTrue(docString.contains("bar"));

        // Only the div matching the id selector was removed from the copy.
        assertTrue(pnString.contains("<DIV>"));
        assertTrue(pnString.contains("foo"));
        assertFalse(pnString.contains("<DIV id=\"barid\">"));
        assertFalse(pnString.contains("bar"));
    } finally {
        // Reset even when an assertion fails so the stub config cannot leak into other tests.
        ComponentUtil.setFessConfig(null);
    }
}
/**
 * Checks that with the pruned-tags value {@code "div.barcls"} the pruned copy keeps the
 * plain {@code DIV} but no longer contains the {@code DIV} with {@code class="barcls"} or
 * its text, while the source document keeps both.
 *
 * @throws Exception if parsing or serialization fails
 */
public void test_pruneNode_removeDivClass() throws Exception {
    final String data = "<html><body><br/><div>foo</div><div class=\"barcls\">bar</div></body></html>";
    final Document document = getDocument(data);
    ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "div.barcls";
        }
    });
    try {
        final FessXpathTransformer transformer = new FessXpathTransformer();
        transformer.init();
        // Prune a deep copy so the original document can be checked for being untouched.
        final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
        final String docString = getXmlString(document);
        final String pnString = getXmlString(pruneNode);

        // The source document still has both divs.
        assertTrue(docString.contains("<DIV>"));
        assertTrue(docString.contains("foo"));
        assertTrue(docString.contains("<DIV class=\"barcls\">"));
        assertTrue(docString.contains("bar"));

        // Only the div matching the class selector was removed from the copy.
        assertTrue(pnString.contains("<DIV>"));
        assertTrue(pnString.contains("foo"));
        assertFalse(pnString.contains("<DIV class=\"barcls\">"));
        assertFalse(pnString.contains("bar"));
    } finally {
        // Reset even when an assertion fails so the stub config cannot leak into other tests.
        ComponentUtil.setFessConfig(null);
    }
}
/**
 * Feeds a body containing a {@code googleoff: index} / {@code googleon: index} comment pair
 * to {@code processGoogleOffOn} and compares the serialized body content with the expected
 * markup, in which the text between the two comments is gone but the anchor element and
 * both comments remain.
 *
 * @throws Exception if parsing or serialization fails
 */
public void test_processGoogleOffOn() throws Exception {
    final String html =
            "<html><body>foo1<!--googleoff: index-->foo2<a href=\"index.html\">foo3</a>foo4<!--googleon: index-->foo5</body></html>";
    final Document parsed = getDocument(html);
    final FessXpathTransformer transformer = new FessXpathTransformer();

    final Node processed = transformer.processGoogleOffOn(parsed, new ValueHolder<>(true));

    // Strip everything up to the opening BODY tag and from the closing BODY tag onwards.
    final String serialized = getXmlString(processed);
    final String withoutPrefix = serialized.replaceAll(".*<BODY>", "");
    final String bodyContent = withoutPrefix.replaceAll("</BODY>.*", "");

    assertEquals("foo1<!--googleoff: index--><A href=\"index.html\"></A><!--googleon: index-->foo5", bodyContent);
}
/**
 * Processes a document without any robots meta tag and expects the response not to be
 * flagged as no-follow.
 *
 * @throws Exception if parsing fails
 */
public void test_processMetaRobots_no() throws Exception {
    final Document parsed = getDocument("<html><body>foo</body></html>");
    final FessXpathTransformer transformer = new FessXpathTransformer();

    final ResponseData response = new ResponseData();
    response.setUrl("http://example.com/");
    final ResultData result = new ResultData();

    transformer.processMetaRobots(response, result, parsed);

    assertFalse(response.isNoFollow());
}
/**
 * Processes a document whose robots meta tag has the content {@code none} and expects a
 * {@code ChildUrlsException} carrying an empty child URL list.
 *
 * @throws Exception if parsing fails or {@code processMetaRobots} throws anything other than
 *         the expected {@code ChildUrlsException}
 */
public void test_processMetaRobots_none() throws Exception {
    final String data = "<meta name=\"robots\" content=\"none\" />";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer();
    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means the expected ChildUrlsException was not thrown.
        fail();
    } catch (ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // Any other exception propagates so its type and stack trace show up in the test report
    // instead of being replaced by a bare fail().
}
/**
 * Processes a document whose upper-case {@code ROBOTS} meta tag has the content
 * {@code NOINDEX,NOFOLLOW} and expects a {@code ChildUrlsException} carrying an empty child
 * URL list.
 *
 * @throws Exception if parsing fails or {@code processMetaRobots} throws anything other than
 *         the expected {@code ChildUrlsException}
 */
public void test_processMetaRobots_noindexnofollow() throws Exception {
    final String data = "<meta name=\"ROBOTS\" content=\"NOINDEX,NOFOLLOW\" />";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer();
    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means the expected ChildUrlsException was not thrown.
        fail();
    } catch (ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // Any other exception propagates so its type and stack trace show up in the test report
    // instead of being replaced by a bare fail().
}
/**
 * Processes a document that has a robots meta tag with the content {@code noindex} followed
 * by an anchor, and expects a {@code ChildUrlsException} whose child URL list is empty.
 *
 * @throws Exception if parsing or encoding fails, or {@code processMetaRobots} throws
 *         anything other than the expected {@code ChildUrlsException}
 */
public void test_processMetaRobots_noindex() throws Exception {
    final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer();
    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    // Use the same explicit charset as getDocument rather than the platform default.
    responseData.setResponseBody(data.getBytes("UTF-8"));
    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means the expected ChildUrlsException was not thrown.
        fail();
    } catch (ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // Any other exception propagates so its type and stack trace show up in the test report
    // instead of being replaced by a bare fail().
}
/**
 * Processes a document whose robots meta tag has the content {@code nofollow} and expects
 * the response to be flagged as no-follow afterwards.
 *
 * @throws Exception if parsing fails
 */
public void test_processMetaRobots_nofollow() throws Exception {
    final Document parsed = getDocument("<meta name=\"robots\" content=\"nofollow\" />");
    final FessXpathTransformer transformer = new FessXpathTransformer();

    final ResponseData response = new ResponseData();
    response.setUrl("http://example.com/");
    final ResultData result = new ResultData();

    transformer.processMetaRobots(response, result, parsed);

    assertTrue(response.isNoFollow());
}
/**
 * Parses the given markup with a {@code DOMParser} and returns the resulting DOM document.
 * The markup is handed to the parser as UTF-8 encoded bytes.
 *
 * @param data the markup to parse
 * @return the document produced by the parser
 * @throws Exception if encoding or parsing fails
 */
private Document getDocument(final String data) throws Exception {
    final DOMParser htmlParser = new DOMParser();
    final byte[] encoded = data.getBytes("UTF-8");
    final InputSource source = new InputSource(new ByteArrayInputStream(encoded));
    htmlParser.parse(source);
    return htmlParser.getDocument();
}
/**
 * Serializes a DOM node to a string using a default JAXP transformer configured with
 * UTF-8 encoding and indentation switched off.
 *
 * @param node the node to serialize
 * @return the serialized markup
 * @throws Exception if the transformer cannot be created or the transformation fails
 */
private String getXmlString(final Node node) throws Exception {
    final javax.xml.transform.Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "no");

    final StringWriter buffer = new StringWriter();
    serializer.transform(new DOMSource(node), new StreamResult(buffer));
    return buffer.toString();
}
/**
 * Asserts that {@code isValidPath} accepts relative paths, an absolute path, an http URL and
 * a value in which {@code javascript:} is not at the start.
 */
public void test_isValidPath_valid() {
    final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
    fessXpathTransformer.init();
    fessXpathTransformer.convertUrlMap.put("feed:", "http:");

    final String[] acceptedPaths = { //
            "foo.html", //
            "./foo.html", //
            "/foo.html", //
            "http://www.seasar.org/foo.html", //
            "a javascript:..." //
    };
    for (final String path : acceptedPaths) {
        assertTrue(fessXpathTransformer.isValidPath(path));
    }
}
/**
 * Asserts that {@code isValidPath} rejects {@code javascript:}, {@code mailto:} and
 * {@code irc:} values (plain, with a leading space, and in upper case) as well as a
 * {@code skype:} value.
 */
public void test_isValidPath_invalid() {
    final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
    fessXpathTransformer.init();
    fessXpathTransformer.convertUrlMap.put("feed:", "http:");

    final String[] rejectedPaths = { //
            "javascript:...", //
            "mailto:...", //
            "irc:...", //
            " javascript:...", //
            " mailto:...", //
            " irc:...", //
            "JAVASCRIPT:...", //
            "MAILTO:...", //
            "IRC:...", //
            "skype:..." //
    };
    for (final String path : rejectedPaths) {
        assertFalse(fessXpathTransformer.isValidPath(path));
    }
}
/**
 * Exercises {@code convertChildUrlList} with a {@code feed:} to {@code http:} mapping:
 * an empty list, one http URL, two http URLs, and a {@code feed://} URL that is expected
 * to come back as {@code http://}.
 */
public void test_convertChildUrlList() {
    final String exampleUrl = "http://www.example.com";
    final String testUrl = "http://www.test.com";

    final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
    fessXpathTransformer.init();
    fessXpathTransformer.convertUrlMap.put("feed:", "http:");

    // Empty input.
    List<RequestData> requests = new ArrayList<>();
    requests = fessXpathTransformer.convertChildUrlList(requests);
    assertEquals(0, requests.size());

    // A single URL that needs no conversion.
    requests.clear();
    requests.add(RequestDataBuilder.newRequestData().get().url("http://www.example.com").build());
    requests = fessXpathTransformer.convertChildUrlList(requests);
    assertEquals(1, requests.size());
    assertEquals(exampleUrl, requests.get(0).getUrl());

    // Two URLs that need no conversion; order is expected to be kept.
    requests.clear();
    requests.add(RequestDataBuilder.newRequestData().get().url("http://www.example.com").build());
    requests.add(RequestDataBuilder.newRequestData().get().url("http://www.test.com").build());
    requests = fessXpathTransformer.convertChildUrlList(requests);
    assertEquals(2, requests.size());
    assertEquals(exampleUrl, requests.get(0).getUrl());
    assertEquals(testUrl, requests.get(1).getUrl());

    // The feed: scheme is expected to be rewritten to http:.
    requests.clear();
    requests.add(RequestDataBuilder.newRequestData().get().url("feed://www.example.com").build());
    requests.add(RequestDataBuilder.newRequestData().get().url("http://www.test.com").build());
    requests = fessXpathTransformer.convertChildUrlList(requests);
    assertEquals(2, requests.size());
    assertEquals(exampleUrl, requests.get(0).getUrl());
    assertEquals(testUrl, requests.get(1).getUrl());
}
/**
 * Table-driven check of {@code removeCommentTag}: each row pairs the expected output with
 * the input, covering empty input, single and multiple comments, comments spanning a line
 * break, an empty comment, an unterminated comment, and nested comment openers.
 */
public void test_removeCommentTag() {
    final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
    fessXpathTransformer.init();
    fessXpathTransformer.convertUrlMap.put("feed:", "http:");

    // { expected, input }
    final String[][] cases = { //
            { "", "" }, //
            { " ", "<!-- - -->" }, //
            { "abc", "abc" }, //
            { "abc ", "abc<!-- foo -->" }, //
            { "abc 123", "abc<!-- fo\no -->123" }, //
            { "abc 123", "abc<!--\n foo -->123" }, //
            { "abc 123", "abc<!-- foo -->123" }, //
            { "abc 123 ", "abc<!-- foo1 -->123<!-- foo2 -->" }, //
            { "abc 123 xyz", "abc<!-- foo1 -->123<!-- foo2 -->xyz" }, //
            { "abc ", "abc<!---->" }, //
            { "abc -->", "abc<!-- foo-->-->" }, //
            { "abc<!-- foo", "abc<!-- foo" }, //
            { "abc -->123", "abc<!-- <!-- foo --> -->123" } //
    };
    for (final String[] row : cases) {
        assertEquals(row[0], fessXpathTransformer.removeCommentTag(row[1]));
    }
}
/**
 * Exercises {@code putAdditionalData} with four documents for the response URL
 * {@code http://example.com/}: no canonical link, a canonical link equal to the response
 * URL, a differing canonical link inside {@code head}, and a differing canonical link
 * placed directly under {@code html}. The same {@code dataMap} and {@code responseData}
 * are reused for all four calls.
 *
 * @throws Exception if parsing fails or an unexpected exception is thrown
 */
public void test_canonicalXpath() throws Exception {
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Map<String, Object> dataMap = new HashMap<String, Object>();
final ResponseData responseData = new ResponseData();
responseData.setUrl("http://example.com/");
// Case 1: no canonical link at all.
String data = "<html><body>aaa</body></html>";
Document document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
fail();
} catch (final ComponentNotFoundException e) {
// Expected here. NOTE(review): presumably the call continues past the canonical check
// and then looks up a component this test never registers - confirm.
}
// Case 2: canonical link identical to the response URL.
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/\"></head><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
fail();
} catch (final ComponentNotFoundException e) {
// Expected here as well; same assumption as in case 1 - confirm.
}
// Case 3: canonical link inside head that differs from the response URL.
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/foo\"></head><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
fail();
} catch (final ChildUrlsException e) {
// The canonical URL must be reported as the only child URL.
final Set<RequestData> childUrlList = e.getChildUrlList();
assertEquals(1, childUrlList.size());
assertEquals("http://example.com/foo", childUrlList.iterator().next().getUrl());
}
// Case 4: same differing canonical link, but written without an explicit head element.
data = "<html><link rel=\"canonical\" href=\"http://example.com/foo\"><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
fail();
} catch (final ChildUrlsException e) {
// Same expectation as in case 3.
final Set<RequestData> childUrlList = e.getChildUrlList();
assertEquals(1, childUrlList.size());
assertEquals("http://example.com/foo", childUrlList.iterator().next().getUrl());
}
}
/**
 * Evaluates {@code //BODY} with {@code getSingleNodeValue} on four pages and expects the
 * text {@code "aaa bbb ccc"} each time. The pages contain a style element, nested elements
 * with surrounding whitespace, a mismatched custom tag pair, and HTML comments.
 *
 * @throws Exception if parsing fails
 */
public void test_getSingleNodeValue() throws Exception {
    final FessXpathTransformer transformer = new FessXpathTransformer();

    final String[] pages = { //
            "<html><body>aaa<style>bbb</style>ccc</body></html>", //
            "<html><body> aaa <p> bbb <b>ccc</b> </p> </body></html>", //
            "<html><body> aaa <p> bbb <aaa>ccc</bbb> </p> </body></html>", //
            "<html><body> aaa <p> bbb <!-- test -->ccc<!-- test --> </p> </body></html>" //
    };
    for (final String page : pages) {
        final Document parsed = getDocument(page);
        final String bodyText = transformer.getSingleNodeValue(parsed, "//BODY", false);
        assertEquals("aaa bbb ccc", bodyText);
    }
}
/**
 * Evaluates three XPath expressions with {@code getSingleNodeValue} on one page: the body,
 * the content attribute of the keywords meta tag, and the union of both.
 *
 * @throws Exception if parsing fails
 */
public void test_contentXpath() throws Exception {
    final FessXpathTransformer transformer = new FessXpathTransformer();
    final Document parsed = getDocument("<html><head><meta name=\"keywords\" content=\"bbb\"></head><body>aaa</body></html>");

    final String bodyText = transformer.getSingleNodeValue(parsed, "//BODY", false);
    assertEquals("aaa", bodyText);

    final String keywords = transformer.getSingleNodeValue(parsed, "//META[@name='keywords']/@content", false);
    assertEquals("bbb", keywords);

    final String combined = transformer.getSingleNodeValue(parsed, "//META[@name='keywords']/@content|//BODY", false);
    assertEquals("bbb aaa", combined);
}
/**
 * Table-driven check of {@code normalizeCanonicalUrl}: each row holds the base URL, the
 * canonical value, and the expected normalized URL, covering relative and root-relative
 * canonical values against base URLs with and without a trailing slash.
 *
 * @throws Exception declared for consistency with the other tests in this class
 */
public void test_normalizeCanonicalUrl() throws Exception {
    final FessXpathTransformer transformer = new FessXpathTransformer();

    // { base URL, canonical value, expected result }
    final String[][] cases = { //
            { "http://hoge.com/", "a", "http://hoge.com/a" }, //
            { "http://hoge.com/", "aaa", "http://hoge.com/aaa" }, //
            { "http://hoge.com/", "/aaa", "http://hoge.com/aaa" }, //
            { "http://hoge.com/bbb", "aaa", "http://hoge.com/aaa" }, //
            { "http://hoge.com/bbb/", "aaa", "http://hoge.com/bbb/aaa" }, //
            { "http://hoge.com/bbb/", "/aaa", "http://hoge.com/aaa" }, //
            { "http://hoge.com/bbb", "/aaa", "http://hoge.com/aaa" } //
    };
    for (final String[] row : cases) {
        final String normalized = transformer.normalizeCanonicalUrl(row[0], row[1]);
        assertEquals(row[2], normalized);
    }
}
}