/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.collection; import java.util.ArrayList; import java.util.Iterator; import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.nutch.net.URLFilter; import org.apache.xerces.util.DOMUtil; import org.w3c.dom.Element; /** * SubCollection represents a subset of index, you can define url patterns that * will indicate that particular page (url) is part of SubCollection. */ public class Subcollection extends Configured implements URLFilter{ public static final String TAG_COLLECTIONS="subcollections"; public static final String TAG_COLLECTION="subcollection"; public static final String TAG_WHITELIST="whitelist"; public static final String TAG_BLACKLIST="blacklist"; public static final String TAG_NAME="name"; public static final String TAG_ID="id"; ArrayList blackList = new ArrayList(); ArrayList whiteList = new ArrayList(); /** * SubCollection identifier */ String id; /** * SubCollection name */ String name; /** * SubCollection whitelist as String */ String wlString; /** * SubCollection blacklist as String */ String blString; /** public Constructor * * @param id id of SubCollection * @param name name of SubCollection */ public Subcollection(String id, String name, Configuration conf) { this(conf); this.id=id; this.name = name; } public Subcollection(Configuration conf){ super(conf); } /** * @return Returns the name */ public String getName() { return name; } /** * @return Returns the id */ public String getId() { return id; } /** * Returns whitelist * * @return Whitelist entries */ public ArrayList getWhiteList() { return whiteList; } /** * Returns whitelist String * * @return Whitelist String */ public String getWhiteListString() { return wlString; } /** * Returns blacklist String * * @return Blacklist String */ public String getBlackListString() { return blString; } /** * @param whiteList * The whiteList to set. */ public void setWhiteList(ArrayList whiteList) { this.whiteList = whiteList; } /** * Simple "indexOf" currentFilter for matching patterns. * * <pre> * rules for evaluation are as follows: * 1. if pattern matches in blacklist then url is rejected * 2. if pattern matches in whitelist then url is allowed * 3. url is rejected * </pre> * * @see org.apache.nutch.net.URLFilter#filter(java.lang.String) */ public String filter(String urlString) { // first the blacklist Iterator i = blackList.iterator(); while (i.hasNext()) { String row = (String) i.next(); if (urlString.indexOf(row) != -1) return null; } // then whitelist i = whiteList.iterator(); while (i.hasNext()) { String row = (String) i.next(); if (urlString.indexOf(row) != -1) return urlString; } return null; } /** * Initialize Subcollection from dom element * * @param collection */ public void initialize(Element collection) { this.id = DOMUtil.getChildText( collection.getElementsByTagName(TAG_ID).item(0)).trim(); this.name = DOMUtil.getChildText( collection.getElementsByTagName(TAG_NAME).item(0)).trim(); this.wlString = DOMUtil.getChildText( collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim(); this.blString = DOMUtil.getChildText( collection.getElementsByTagName(TAG_BLACKLIST).item(0)).trim(); parseList(this.whiteList, wlString); parseList(this.blackList, blString); } /** * Create a list of patterns from chunk of text, patterns are separated with * newline * * @param list * @param text */ protected void parseList(ArrayList list, String text) { list.clear(); StringTokenizer st = new StringTokenizer(text, "\n\r"); while (st.hasMoreElements()) { String line = (String) st.nextElement(); list.add(line.trim()); } } /** * Set contents of blacklist from String * * @param list the blacklist contents */ public void setBlackList(String list) { this.blString = list; parseList(blackList, list); } /** * Set contents of whitelist from String * * @param list the whitelist contents */ public void setWhiteList(String list) { this.wlString = list; parseList(whiteList, list); } }