/*
 * Created on Nov 2, 2004
 * Author: Andrzej Bialecki <ab@getopt.org>
 *
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.BitSet;
import java.util.StringTokenizer;
import java.util.Vector;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * This tool prunes existing Nutch indexes of unwanted content. The main method
 * accepts a list of segment directories (containing indexes). These indexes
 * will be pruned of any content that matches one or more queries from a list
 * of Lucene queries read from a file (defined in the standard config file, or
 * explicitly overridden from the command line). Segments should already be
 * indexed; any segments missing their indexes will be skipped.
 *
 * <p>NOTE 1: Queries are expressed in Lucene's QueryParser syntax, so
 * knowledge of the available Lucene document fields is required. This can be
 * obtained by reading the sources of the <code>index-basic</code> and
 * <code>index-more</code> plugins, or by using tools like
 * <a href="http://www.getopt.org/luke">Luke</a>. During query parsing a
 * WhitespaceAnalyzer is used; this choice minimizes the analyzer's side
 * effects on the final set of query terms. You can use the
 * {@link org.apache.nutch.searcher.Query#main(String[])} method to translate
 * queries from Nutch syntax to Lucene syntax.<br>
 * If an additional level of control is required, an instance of
 * {@link PruneChecker} can be provided to check each document before it is
 * deleted. The results of all checkers are logically AND-ed, which means that
 * any checker in the chain can veto the deletion of the current document. Two
 * example checker implementations are provided: PrintFieldsChecker prints the
 * values of selected index fields, and StoreUrlsChecker stores the URLs of
 * deleted documents to a file.
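 * <p>For illustration, a custom checker could be sketched as follows (a
 * hypothetical example, not part of this class; it merely logs each matching
 * document and never vetoes its deletion):
 * <pre>
 *   PruneChecker logChecker = new PruneChecker() {
 *     public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
 *       System.out.println("match for " + q + ": doc #" + docNum);
 *       return true; // never veto the deletion
 *     }
 *     public void close() { }
 *   };
 * </pre></p>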
 * Either of these bundled checkers can be activated by providing the
 * respective command-line options.
 * </p>
 * <p>The typical command-line usage is as follows:<br>
 * <blockquote>
 * <code>PruneIndexTool index_dir -dryrun -queries queries.txt -showfields url,title</code><br>
 * This command will just print out the fields of matching documents.<br>
 * <code>PruneIndexTool index_dir -queries queries.txt</code><br>
 * This command will actually remove all matching entries, according to the
 * queries read from the <code>queries.txt</code> file.
 * </blockquote></p>
 * <p>NOTE 2: This tool removes matching documents ONLY from segment indexes
 * (or from a merged index). In particular, it does NOT remove the pages and
 * links from the WebDB. This means that unwanted URLs may pop up again when
 * new segments are created. To prevent this, use your own
 * {@link org.apache.nutch.net.URLFilter}, or PruneDBTool (under
 * construction...).</p>
 * <p>NOTE 3: This tool uses a low-level Lucene interface to collect all
 * matching documents. For large indexes and broad queries this may result in
 * high memory consumption. If you encounter OutOfMemory exceptions, try to
 * narrow down your queries, or increase the heap size.</p>
 *
 * @author Andrzej Bialecki <ab@getopt.org>
 */
public class PruneIndexTool implements Runnable {
  public static final Log LOG = LogFactory.getLog(PruneIndexTool.class);
  /** Log the progress every LOG_STEP number of processed documents. */
  public static int LOG_STEP = 50000;

  /**
   * This interface can be used to implement additional checking on matching
   * documents.
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static interface PruneChecker {
    /**
     * Check whether this document should be pruned. NOTE: this method
     * MUST NOT modify the IndexReader.
     * @param q the pruning query that matched this document
     * @param reader index reader to read documents from
     * @param docNum document ID
     * @return true if the document should be deleted, false otherwise.
     */
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception;

    /**
     * Close the checker; this could involve flushing output files or similar
     * cleanup.
     */
    public void close();
  }

  /**
   * This checker's main function is to print out selected field values from
   * each document, just before it is deleted.
   *
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static class PrintFieldsChecker implements PruneChecker {
    private PrintStream ps = null;
    private String[] fields = null;

    /**
     * @param ps an instance of PrintStream to print the information to
     * @param fields a list of Lucene index field names. Values from these
     * fields will be printed for every matching document.
     */
    public PrintFieldsChecker(PrintStream ps, String[] fields) {
      this.ps = ps;
      this.fields = fields;
    }

    public void close() {
      ps.flush();
    }

    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
      Document doc = reader.document(docNum);
      StringBuffer sb = new StringBuffer("#" + docNum + ":");
      for (int i = 0; i < fields.length; i++) {
        String[] values = doc.getValues(fields[i]);
        sb.append(" " + fields[i] + "=");
        if (values != null) {
          for (int k = 0; k < values.length; k++) {
            sb.append("[" + values[k] + "]");
          }
        } else sb.append("[null]");
      }
      ps.println(sb.toString());
      return true;
    }
  }

  /**
   * This checker's main function is to store the URL of each document to be
   * deleted in a text file.
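   * <p>For example (an illustrative URL, not taken from the source): with
   * <code>storeHomeUrl</code> enabled, pruning
   * <code>http://www.example.com/some/page.html</code> records both that URL
   * and <code>http://www.example.com/</code> in the output file.</p>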
   *
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static class StoreUrlsChecker implements PruneChecker {
    private BufferedWriter output = null;
    private boolean storeHomeUrl = false;

    /**
     * Store the list of URLs in a file.
     * @param out name of the output file
     * @param storeHomeUrl if true, also store the home URL of each deleted
     * document's site
     */
    public StoreUrlsChecker(File out, boolean storeHomeUrl) throws Exception {
      this.output = new BufferedWriter(
              new OutputStreamWriter(new FileOutputStream(out), "UTF-8"));
      this.storeHomeUrl = storeHomeUrl;
    }

    public void close() {
      try {
        output.flush();
        output.close();
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Error closing: " + e.getMessage());
        }
      }
    }

    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
      Document doc = reader.document(docNum);
      String url = doc.get("url");
      output.write(url);
      output.write('\n');
      if (storeHomeUrl) {
        // also store the home URL of the site
        int idx = url.indexOf("://");
        if (idx != -1) {
          idx = url.indexOf('/', idx + 3);
          if (idx != -1) {
            output.write(url.substring(0, idx + 1) + "\n");
          }
        }
      }
      return true;
    }
  }

  private Query[] queries = null;
  private IndexReader reader = null;
  private IndexSearcher searcher = null;
  private PruneChecker[] checkers = null;
  private boolean dryrun = false;
  private String dr = "";

  /**
   * Create an instance of the tool, and open all input indexes.
   * @param indexDirs directories with input indexes. At least one valid index
   * must exist, otherwise an Exception is thrown.
   * @param queries pruning queries. Each query will be processed in turn, and
   * the length of the array must be at least one, otherwise an Exception is
   * thrown.
   * @param checkers if not null, they will be used to perform additional
   * checks on matching documents - each checker's
   * {@link PruneChecker#isPrunable(Query, IndexReader, int)} method will be
   * called in turn, for each matching document, and a return value of true
   * means that the document should be deleted. A logical AND is performed on
   * the results returned by all checkers (which means that if one of them
   * returns false, the document will not be deleted).
   * @param unlock if true, and if any of the input indexes is locked, forcibly
   * unlock it. Use with care, only when you are sure that other processes
   * don't modify the index at the same time.
   * @param dryrun if set to true, don't change the index, just show what would
   * be done. If false, perform all actions, changing indexes as needed. Note:
   * dryrun doesn't prevent PruneCheckers from performing changes or causing
   * any other side-effects.
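   * <p>A minimal programmatic invocation might look like this (an illustrative
   * sketch; the index path and queries file name are hypothetical):
   * <pre>
   *   Query[] queries = PruneIndexTool.parseQueries(new FileInputStream("queries.txt"));
   *   File[] indexes = new File[] { new File("crawl/index") };
   *   PruneIndexTool pit = new PruneIndexTool(indexes, queries, null, false, true);
   *   pit.run(); // dry run: only reports what would be deleted
   * </pre></p>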
   * @throws Exception
   */
  public PruneIndexTool(File[] indexDirs, Query[] queries, PruneChecker[] checkers,
          boolean unlock, boolean dryrun) throws Exception {
    if (indexDirs == null || queries == null) throw new Exception("Invalid arguments.");
    if (indexDirs.length == 0 || queries.length == 0) throw new Exception("Nothing to do.");
    this.queries = queries;
    this.checkers = checkers;
    this.dryrun = dryrun;
    if (dryrun) dr = "[DRY RUN] ";
    int numIdx = 0;
    if (indexDirs.length == 1) {
      Directory dir = FSDirectory.getDirectory(indexDirs[0], false);
      if (IndexReader.isLocked(dir)) {
        if (!unlock) {
          throw new Exception("Index " + indexDirs[0] + " is locked.");
        }
        if (!dryrun) {
          IndexReader.unlock(dir);
          if (LOG.isDebugEnabled()) {
            LOG.debug(" - had to unlock index in " + dir);
          }
        }
      }
      reader = IndexReader.open(dir);
      numIdx = 1;
    } else {
      Directory dir;
      Vector<IndexReader> indexes = new Vector<IndexReader>(indexDirs.length);
      for (int i = 0; i < indexDirs.length; i++) {
        try {
          dir = FSDirectory.getDirectory(indexDirs[i], false);
          if (IndexReader.isLocked(dir)) {
            if (!unlock) {
              if (LOG.isWarnEnabled()) {
                LOG.warn(dr + "Index " + indexDirs[i] + " is locked. Skipping...");
              }
              continue;
            }
            if (!dryrun) {
              IndexReader.unlock(dir);
              if (LOG.isDebugEnabled()) {
                LOG.debug(" - had to unlock index in " + dir);
              }
            }
          }
          IndexReader r = IndexReader.open(dir);
          indexes.add(r);
          numIdx++;
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(dr + "Invalid index in " + indexDirs[i] + " - skipping...");
          }
        }
      }
      if (indexes.size() == 0) throw new Exception("No input indexes.");
      IndexReader[] readers = indexes.toArray(new IndexReader[0]);
      reader = new MultiReader(readers);
    }
    if (LOG.isInfoEnabled()) {
      LOG.info(dr + "Opened " + numIdx + " index(es) with total " + reader.numDocs() + " documents.");
    }
    searcher = new IndexSearcher(reader);
  }

  /**
   * This class collects all matching document IDs in a BitSet.
   * <p>NOTE: the reason to use this API is that the most common way of
   * performing Lucene queries (Searcher.search(Query)::Hits) does NOT
   * return all matching documents, because it skips very low scoring hits.</p>
   *
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  private static class AllHitsCollector extends HitCollector {
    private BitSet bits;

    public AllHitsCollector(BitSet bits) {
      this.bits = bits;
    }

    public void collect(int doc, float score) {
      bits.set(doc);
    }
  }

  /**
   * For each query, find all matching documents and delete them from all input
   * indexes. Optionally, an additional check can be performed by using
   * {@link PruneChecker} implementations.
   */
  public void run() {
    BitSet bits = new BitSet(reader.maxDoc());
    AllHitsCollector ahc = new AllHitsCollector(bits);
    boolean doDelete = false;
    for (int i = 0; i < queries.length; i++) {
      if (LOG.isInfoEnabled()) {
        LOG.info(dr + "Processing query: " + queries[i].toString());
      }
      bits.clear();
      try {
        searcher.search(queries[i], ahc);
      } catch (IOException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn(dr + " - failed: " + e.getMessage());
        }
        continue;
      }
      if (bits.cardinality() == 0) {
        if (LOG.isInfoEnabled()) {
          LOG.info(dr + " - no matching documents.");
        }
        continue;
      }
      if (LOG.isInfoEnabled()) {
        LOG.info(dr + " - found " + bits.cardinality() + " document(s).");
      }
      // Now delete all matching documents
      int docNum = -1, start = 0, cnt = 0;
      // probably faster than looping sequentially through all index values
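      // nextSetBit(start) returns the next matching doc ID at or after
      // 'start', or -1 when none remain; advancing 'start' past each returned
      // ID before any 'continue' guarantees the loop terminates.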
      while ((docNum = bits.nextSetBit(start)) != -1) {
        start = docNum + 1;
        // don't delete the same document multiple times
        if (reader.isDeleted(docNum)) continue;
        try {
          if (checkers != null && checkers.length > 0) {
            boolean check = true;
            for (int k = 0; k < checkers.length; k++) {
              // fail if any checker returns false
              check &= checkers[k].isPrunable(queries[i], reader, docNum);
            }
            doDelete = check;
          } else doDelete = true;
          if (doDelete) {
            if (!dryrun) reader.deleteDocument(docNum);
            cnt++;
          }
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(dr + " - failed to delete doc #" + docNum);
          }
        }
      }
      if (LOG.isInfoEnabled()) {
        LOG.info(dr + " - deleted " + cnt + " document(s).");
      }
    }
    // close checkers
    if (checkers != null) {
      for (int i = 0; i < checkers.length; i++) {
        checkers[i].close();
      }
    }
    try {
      reader.close();
    } catch (IOException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn(dr + "Exception when closing reader(s): " + e.getMessage());
      }
    }
  }

  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      usage();
      if (LOG.isFatalEnabled()) {
        LOG.fatal("Missing arguments");
      }
      return;
    }
    File idx = new File(args[0]);
    if (!idx.isDirectory()) {
      usage();
      if (LOG.isFatalEnabled()) {
        LOG.fatal("Not a directory: " + idx);
      }
      return;
    }
    Vector<File> paths = new Vector<File>();
    if (IndexReader.indexExists(idx)) {
      paths.add(idx);
    } else {
      // try and see if there are segments inside, with index dirs
      File[] dirs = idx.listFiles(new FileFilter() {
        public boolean accept(File f) {
          return f.isDirectory();
        }
      });
      if (dirs == null || dirs.length == 0) {
        usage();
        if (LOG.isFatalEnabled()) {
          LOG.fatal("No indexes in " + idx);
        }
        return;
      }
      for (int i = 0; i < dirs.length; i++) {
        File sidx = new File(dirs[i], "index");
        if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
          paths.add(sidx);
        }
      }
      if (paths.size() == 0) {
        usage();
        if (LOG.isFatalEnabled()) {
          LOG.fatal("No indexes in " + idx + " or its subdirs.");
        }
        return;
      }
    }
    File[] indexes = paths.toArray(new File[0]);
    boolean force = false;
    boolean dryrun = false;
    String qPath = null;
    String outPath = null;
    String fList = null;
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-force")) {
        force = true;
      } else if (args[i].equals("-queries")) {
        qPath = args[++i];
      } else if (args[i].equals("-output")) {
        outPath = args[++i];
      } else if (args[i].equals("-showfields")) {
        fList = args[++i];
      } else if (args[i].equals("-dryrun")) {
        dryrun = true;
      } else {
        usage();
        if (LOG.isFatalEnabled()) {
          LOG.fatal("Unrecognized option: " + args[i]);
        }
        return;
      }
    }
    Vector<PruneChecker> cv = new Vector<PruneChecker>();
    if (fList != null) {
      StringTokenizer st = new StringTokenizer(fList, ",");
      Vector<String> tokens = new Vector<String>();
      while (st.hasMoreTokens()) tokens.add(st.nextToken());
      String[] fields = tokens.toArray(new String[0]);
      PruneChecker pc = new PrintFieldsChecker(System.out, fields);
      cv.add(pc);
    }
    if (outPath != null) {
      StoreUrlsChecker luc = new StoreUrlsChecker(new File(outPath), false);
      cv.add(luc);
    }
    PruneChecker[] checkers = null;
    if (cv.size() > 0) {
      checkers = cv.toArray(new PruneChecker[0]);
    }
    Query[] queries = null;
    InputStream is = null;
    if (qPath != null) {
      is = new FileInputStream(qPath);
    } else {
      Configuration conf = NutchConfiguration.create();
      qPath = conf.get("prune.index.tool.queries");
      is = conf.getConfResourceAsInputStream(qPath);
    }
    if (is == null) {
      if (LOG.isFatalEnabled()) {
        LOG.fatal("Can't load queries from " + qPath);
      }
      return;
    }
    try {
      queries = parseQueries(is);
    } catch (Exception e) {
      if (LOG.isFatalEnabled()) {
LOG.fatal("Error parsing queries: " + e.getMessage()); } return; } try { PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, force, dryrun); pit.run(); } catch (Exception e) { if (LOG.isFatalEnabled()) { LOG.fatal("Error running PruneIndexTool: " + e.getMessage()); } return; } } /** * Read a list of Lucene queries from the stream (UTF-8 encoding is assumed). * There should be a single Lucene query per line. Blank lines and comments * starting with '#' are allowed. * <p>NOTE: you may wish to use {@link org.apache.nutch.searcher.Query#main(String[])} * method to translate queries from Nutch format to Lucene format.</p> * @param is InputStream to read from * @return array of Lucene queries * @throws Exception */ public static Query[] parseQueries(InputStream is) throws Exception { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8")); String line = null; QueryParser qp = new QueryParser("url", new WhitespaceAnalyzer()); Vector<Query> queries = new Vector<Query>(); while ((line = br.readLine()) != null) { line = line.trim(); //skip blanks and comments if (line.length() == 0 || line.charAt(0) == '#') continue; Query q = qp.parse(line); queries.add(q); } return queries.toArray(new Query[0]); } private static void usage() { System.err.println("PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]"); System.err.println("\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n"); System.err.println("\t-dryrun\t\t\tdon't do anything, just show what would be done."); System.err.println("\t-force\t\t\tforce index unlock, if locked. Use with caution!"); System.err.println("\t-queries filename\tread pruning queries from this file, instead of the"); System.err.println("\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n"); System.err.println("\t-output filename\tstore pruned URLs in a text file"); System.err.println("\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields."); System.err.println("\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude."); System.err.println("\t\t\t\tNOTE 2: only values of stored fields will be shown."); } }
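
// For reference, an illustrative pruning-queries file (hypothetical contents;
// the "site" field name is an assumption based on the index-basic plugin, and
// the default field is "url", as configured in parseQueries() above):
//
//   # prune all documents from one site
//   site:www.example.com
//   # prune a single document by its exact URL (':' escaped for QueryParser)
//   url:http\://www.example.com/old-page.html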