/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler.tests;
import org.apache.manifoldcf.agents.interfaces.IOutputConnection;
import org.apache.manifoldcf.agents.interfaces.IOutputConnectionManager;
import org.apache.manifoldcf.agents.interfaces.OutputConnectionManagerFactory;
import org.apache.manifoldcf.core.interfaces.ConfigParams;
import org.apache.manifoldcf.core.interfaces.IResultRow;
import org.apache.manifoldcf.core.interfaces.IResultSet;
import org.apache.manifoldcf.core.interfaces.IThreadContext;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.SpecificationNode;
import org.apache.manifoldcf.core.interfaces.ThreadContextFactory;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig;
import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.server.ServerConnector;
import org.eclipse.jetty.servlet.ServletContextHandler;
import org.eclipse.jetty.util.thread.QueuedThreadPool;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import static org.hamcrest.core.Is.is;
import static org.hamcrest.core.IsNull.nullValue;
import static org.junit.Assert.assertThat;
public class DocumentContentExclusionHSQLDBIT extends BaseITHSQLDB {
private static final int MAX_DOC_COUNT = 3;
public static final String CONTENTFILTER_SERVLET_PATH = "/contentexclusiontest";
private static final int PORT = 8191;
public static final long MAX_WAIT_TIME = 60 * 1000L;
public static final String WEB_CONNECTION = "Web Connection";
static String baseUrl = "http://127.0.0.1:" + PORT + CONTENTFILTER_SERVLET_PATH + "?page=";
private Server server = null;
private IJobManager jobManager;
private IOutputConnectionManager outputConnectionManager;
private IRepositoryConnectionManager repoConnectionManager;
@Before
public void beforeDocumentContentFilterTest() throws Exception {
server = new Server(new QueuedThreadPool(20));
ServerConnector connector = new ServerConnector(server);
connector.setPort(PORT);
connector.setIdleTimeout(60000);// important for Http KeepAlive
server.addConnector(connector);
ServletContextHandler context = new ServletContextHandler(ServletContextHandler.SESSIONS);
context.addServlet(ContentFilterTestServlet.class, CONTENTFILTER_SERVLET_PATH);
server.setHandler(context);
server.start();
IThreadContext tc = ThreadContextFactory.make();
repoConnectionManager = RepositoryConnectionManagerFactory.make(tc);
outputConnectionManager = OutputConnectionManagerFactory.make(tc);
jobManager = JobManagerFactory.make(tc);
createRepoConnector();
createOutputConnector();
}
@Test
public void testDocumentContentExclusion() throws Exception {
//No content exclusion rule
IJobDescription job = setupContentFilterJob();
runContentFilterJob(job);
checkContentFilterHistory(false);
cleanupContentFilterJobs(job);
//With exclusion rule
job = setupContentFilterJob();
//add content exclusion rule
addContentExclusionRule(job);
runContentFilterJob(job);
checkContentFilterHistory(true);
cleanupContentFilterJobs(job);
}
private void checkContentFilterHistory(boolean hasContentExcluded) throws Exception {
FilterCriteria filter = new FilterCriteria(new String[]{"process"}, 0l, Long.MAX_VALUE, new RegExpCriteria(".*\\" + CONTENTFILTER_SERVLET_PATH + ".*", true), null);
SortOrder sortOrderValue = new SortOrder();
sortOrderValue.addCriteria("entityid", SortOrder.SORT_ASCENDING);
IResultSet result = repoConnectionManager.genHistorySimple(WEB_CONNECTION, filter, sortOrderValue, 0, 20);
assertThat(result.getRowCount(), is(MAX_DOC_COUNT));
for (int i = 0; i < MAX_DOC_COUNT; i++) {
IResultRow row = result.getRow(i);
assertThat((String) row.getValue("identifier"), is(baseUrl + i));
if (hasContentExcluded && i == 1) {
//if excluding, only page 1 will be excluded
assertThat((String) row.getValue("resultcode"), is("EXCLUDEDCONTENT"));
assertThat((String) row.getValue("resultdesc"), is("Rejected due to content exclusion rule"));
} else {
assertThat((String) row.getValue("resultcode"), is("OK"));
assertThat(row.getValue("resultdesc"), is(nullValue()));
}
}
}
@After
public void tearDownDocumentContentFilterTest() throws Exception {
if (server != null) {
server.stop();
}
}
private IJobDescription setupContentFilterJob() throws Exception {
// Create a job.
IJobDescription job = jobManager.createJob();
job.setDescription("Test Job");
job.setConnectionName(WEB_CONNECTION);
job.addPipelineStage(-1, true, "Null Connection", "");
job.setType(job.TYPE_SPECIFIED);
job.setStartMethod(job.START_DISABLE);
job.setHopcountMode(job.HOPCOUNT_NEVERDELETE);
Specification jobSpec = job.getSpecification();
// 3 seeds only
SpecificationNode sn = new SpecificationNode(WebcrawlerConfig.NODE_SEEDS);
StringBuilder sb = new StringBuilder();
for (int i = 0; i < MAX_DOC_COUNT; i++) {
sb.append(baseUrl + i + "\n");
}
sn.setValue(sb.toString());
jobSpec.addChild(jobSpec.getChildCount(), sn);
sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDES);
sn.setValue(".*\n");
jobSpec.addChild(jobSpec.getChildCount(), sn);
sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDESINDEX);
sn.setValue(".*\n");
jobSpec.addChild(jobSpec.getChildCount(), sn);
// Save the job.
jobManager.save(job);
return job;
}
private void addContentExclusionRule(IJobDescription job) throws ManifoldCFException {
Specification jobSpec = job.getSpecification();
SpecificationNode sn;
sn = new SpecificationNode(WebcrawlerConfig.NODE_EXCLUDESCONTENTINDEX);
sn.setValue(".*expired.*\n");
jobSpec.addChild(jobSpec.getChildCount(), sn);
jobManager.save(job);
}
private IOutputConnection createOutputConnector() throws ManifoldCFException {
// Create a basic null output connection, and save it.
IOutputConnection outputConn = outputConnectionManager.create();
outputConn.setName("Null Connection");
outputConn.setDescription("Null Connection");
outputConn.setClassName("org.apache.manifoldcf.agents.tests.TestingOutputConnector");
outputConn.setMaxConnections(10);
// Now, save
outputConnectionManager.save(outputConn);
return outputConn;
}
private IRepositoryConnection createRepoConnector() throws ManifoldCFException {
//TODO: This is a copy/paste: Could we have common method for creating test jobs???
IRepositoryConnection repoConnection = repoConnectionManager.create();
repoConnection.setName("Web Connection");
repoConnection.setDescription("Web Connection");
repoConnection.setClassName("org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector");
repoConnection.setMaxConnections(50);
ConfigParams cp = repoConnection.getConfigParams();
cp.setParameter(WebcrawlerConfig.PARAMETER_EMAIL, "someone@somewhere.com");
cp.setParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE, "none");
repoConnectionManager.save(repoConnection);
return repoConnection;
}
private void cleanupContentFilterJobs(IJobDescription job) throws ManifoldCFException, InterruptedException {
repoConnectionManager.cleanUpHistoryData(WEB_CONNECTION);
jobManager.deleteJob(job.getID());
mcfInstance.waitJobDeletedNative(jobManager, job.getID(), MAX_WAIT_TIME);
}
private void runContentFilterJob(IJobDescription job) throws ManifoldCFException, InterruptedException {
jobManager.manualStart(job.getID());
try {
mcfInstance.waitJobInactiveNative(jobManager, job.getID(), MAX_WAIT_TIME);
} catch (ManifoldCFException e) {
System.err.println("Halting for inspection");
Thread.sleep(1000L);
throw e;
}
// Check to be sure we actually processed the right number of documents.
JobStatus status = jobManager.getStatus(job.getID());
System.err.println("doc processed: " + status.getDocumentsProcessed() + " Job status: " + status.getStatus());
}
public static class ContentFilterTestServlet extends HttpServlet {
@Override
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException,
IOException {
response.setContentType("text/html; charset=utf-8");
//response.setHeader("Keep-Alive", "timeout=5, max=100");
response.setStatus(HttpServletResponse.SC_OK);
String page = request.getParameter("page");
page = (page == null) ? "unkown" : page;
response.getWriter().println("<html><head><title></title></head><body><h1>You are now on page " + page + " </h1>");
if ("1".equals(page)) {
//Only page 1 will contain the keyword "expired"
response.getWriter().println("<h1>Page 1 has expired. bye bye</h1>");
}
response.getWriter().println("</body>");
response.getWriter().flush();
}
}
}