/* $Id$ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.crawler.connectors.webcrawler.tests; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.agents.interfaces.*; import org.apache.manifoldcf.crawler.interfaces.*; import org.apache.manifoldcf.crawler.system.ManifoldCF; import java.io.*; import java.util.*; import org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector; import org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig; /** Run a session-based crawl */ public class SessionTester { protected org.apache.manifoldcf.crawler.tests.ManifoldCFInstance instance; public SessionTester(org.apache.manifoldcf.crawler.tests.ManifoldCFInstance instance) { this.instance = instance; } public void executeTest() throws Exception { // Hey, we were able to install the web connector etc. // Now, create a local test job and run it. IThreadContext tc = ThreadContextFactory.make(); // Create a basic file system connection, and save it. IRepositoryConnectionManager mgr = RepositoryConnectionManagerFactory.make(tc); IRepositoryConnection conn = mgr.create(); conn.setName("Web Connection"); conn.setDescription("Web Connection"); conn.setClassName("org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector"); conn.setMaxConnections(100); ConfigParams cp = conn.getConfigParams(); cp.setParameter(WebcrawlerConfig.PARAMETER_EMAIL,"someone@somewhere.com"); cp.setParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE,"none"); // Set session auth settings ConfigurationNode accessCredential = new ConfigurationNode(WebcrawlerConfig.NODE_ACCESSCREDENTIAL); accessCredential.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_SESSION); accessCredential.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/web/"); // Add auth pages to accessCredential node // Redirection to login page ConfigurationNode redirectToLogin = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE); redirectToLogin.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/protectedcontent\\.html\\?"); redirectToLogin.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_REDIRECTION); redirectToLogin.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/loginpage\\.html\\?"); accessCredential.addChild(accessCredential.getChildCount(),redirectToLogin); // Redirection to login page from index ConfigurationNode redirectFromIndex = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE); redirectFromIndex.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/index\\.html$"); redirectFromIndex.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_REDIRECTION); redirectFromIndex.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/loginpage\\.html$"); accessCredential.addChild(accessCredential.getChildCount(),redirectFromIndex); // Login page ConfigurationNode loginPage = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE); loginPage.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/loginpage\\.html(\\?|$)"); loginPage.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_FORM); loginPage.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,""); // Set credentials ConfigurationNode userParameter = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPARAMETER); userParameter.setAttribute(WebcrawlerConfig.ATTR_NAMEREGEXP,"user"); userParameter.setAttribute(WebcrawlerConfig.ATTR_VALUE,"foo"); loginPage.addChild(loginPage.getChildCount(),userParameter); ConfigurationNode passwordParameter = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPARAMETER); passwordParameter.setAttribute(WebcrawlerConfig.ATTR_NAMEREGEXP,"password"); passwordParameter.setAttribute(WebcrawlerConfig.ATTR_VALUE,"bar"); loginPage.addChild(loginPage.getChildCount(),passwordParameter); accessCredential.addChild(accessCredential.getChildCount(),loginPage); // Redirection from login page to content ConfigurationNode redirectFromLogin = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE); redirectFromLogin.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/loginpage\\.html\\?"); redirectFromLogin.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_REDIRECTION); redirectFromLogin.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/protectedcontent\\.html\\?"); accessCredential.addChild(accessCredential.getChildCount(),redirectFromLogin); // Redirection from login page to index ConfigurationNode redirectToIndexFromLogin = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE); redirectToIndexFromLogin.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/loginpage\\.html$"); redirectToIndexFromLogin.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_REDIRECTION); redirectToIndexFromLogin.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/index\\.html$"); accessCredential.addChild(accessCredential.getChildCount(),redirectToIndexFromLogin); cp.addChild(cp.getChildCount(),accessCredential); // Now, save mgr.save(conn); // Create a basic null output connection, and save it. IOutputConnectionManager outputMgr = OutputConnectionManagerFactory.make(tc); IOutputConnection outputConn = outputMgr.create(); outputConn.setName("Null Connection"); outputConn.setDescription("Null Connection"); outputConn.setClassName("org.apache.manifoldcf.agents.tests.TestingOutputConnector"); outputConn.setMaxConnections(100); // Now, save outputMgr.save(outputConn); // Create a job. IJobManager jobManager = JobManagerFactory.make(tc); IJobDescription job = jobManager.createJob(); job.setDescription("Test Job"); job.setConnectionName("Web Connection"); job.addPipelineStage(-1,true,"Null Connection",""); job.setType(job.TYPE_SPECIFIED); job.setStartMethod(job.START_DISABLE); job.setHopcountMode(job.HOPCOUNT_NEVERDELETE); // Now, set up the document specification. Specification ds = job.getSpecification(); // Set up the seed SpecificationNode sn = new SpecificationNode(WebcrawlerConfig.NODE_SEEDS); sn.setValue("http://localhost:8191/web/index.html\n"); ds.addChild(ds.getChildCount(),sn); sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDES); sn.setValue(".*\n"); ds.addChild(ds.getChildCount(),sn); sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDESINDEX); sn.setValue(".*\n"); ds.addChild(ds.getChildCount(),sn); // Save the job. jobManager.save(job); // Now, start the job, and wait until it completes. long startTime = System.currentTimeMillis(); jobManager.manualStart(job.getID()); instance.waitJobInactiveNative(jobManager,job.getID(),600000L); System.err.println("Crawl required "+new Long(System.currentTimeMillis()-startTime).toString()+" milliseconds"); // Check to be sure we actually processed the right number of documents. JobStatus status = jobManager.getStatus(job.getID()); if (status.getDocumentsProcessed() != 101) { throw new ManifoldCFException("Wrong number of documents processed - expected 101, saw "+new Long(status.getDocumentsProcessed()).toString()); } // Now, delete the job. jobManager.deleteJob(job.getID()); instance.waitJobDeletedNative(jobManager,job.getID(),600000L); // Cleanup is automatic by the base class, so we can feel free to leave jobs and connections lying around. } }