/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.collection; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.util.Collection; import org.apache.nutch.util.NutchConfiguration; import junit.framework.TestCase; public class TestSubcollection extends TestCase { /**Test filtering logic * * @throws Exception */ public void testFilter() throws Exception { Subcollection sc=new Subcollection(NutchConfiguration.create()); sc.setWhiteList("www.nutch.org\nwww.apache.org"); sc.setBlackList("jpg\nwww.apache.org/zecret/"); //matches whitelist assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html")); //matches blacklist assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html")); assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); //no match assertEquals(null, sc.filter("http://www.google.com/")); } public void testInput(){ StringBuffer xml=new StringBuffer(); xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); xml.append("<!-- just a comment -->"); xml.append("<subcollections>"); xml.append("<subcollection>"); xml.append("<name>nutch collection</name>"); xml.append("<id>nutch</id>"); xml.append("<whitelist>"); xml.append("http://lucene.apache.org/nutch/\n"); xml.append("http://wiki.apache.org/nutch/\n"); xml.append("</whitelist>"); xml.append("<blacklist>"); xml.append("http://www.xxx.yyy\n"); xml.append("</blacklist>"); xml.append("</subcollection>"); xml.append("</subcollections>"); InputStream is=new ByteArrayInputStream(xml.toString().getBytes()); CollectionManager cm=new CollectionManager(); cm.parse(is); Collection c=cm.getAll(); // test that size matches assertEquals(1,c.size()); Subcollection collection=(Subcollection)c.toArray()[0]; //test collection id assertEquals("nutch", collection.getId()); //test collection name assertEquals("nutch collection", collection.getName()); //test whitelist assertEquals(2,collection.whiteList.size()); String wlUrl=(String)collection.whiteList.get(0); assertEquals("http://lucene.apache.org/nutch/", wlUrl); wlUrl=(String)collection.whiteList.get(1); assertEquals("http://wiki.apache.org/nutch/", wlUrl); //matches whitelist assertEquals("http://lucene.apache.org/nutch/", collection.filter("http://lucene.apache.org/nutch/")); //test blacklist assertEquals(1,collection.blackList.size()); String blUrl=(String)collection.blackList.get(0); assertEquals("http://www.xxx.yyy", blUrl); //no match assertEquals(null, collection.filter("http://www.google.com/")); } }