/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutchbase.searcher; import java.io.*; import java.util.*; import java.util.concurrent.Future; import java.util.concurrent.Callable; import java.util.concurrent.Executors; import java.util.concurrent.ExecutorService; import javax.servlet.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.conf.*; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.util.Bytes; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.searcher.*; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.Inlink; import org.apache.nutchbase.util.hbase.RowPart; import org.apache.nutchbase.util.hbase.ImmutableRowPart; import org.apache.nutchbase.util.hbase.TableUtil; import org.apache.nutchbase.util.hbase.TableColumns; /** * One stop shopping for search-related functionality. * * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $ */ public class NutchBeanHbase implements SearchBean, HitSummarizer, HitInlinks, Closeable { public static final Log LOG = LogFactory.getLog(NutchBean.class); public static final String KEY = "nutchBean"; // static { // LogFormatter.setShowThreadIDs(true); // } private final Summarizer summarizer; private SearchBean searchBean; private HTable table; /** * BooleanQuery won't permit more than 32 required/prohibited clauses. We * don't want to use too many of those. */ private static final int MAX_PROHIBITED_TERMS = 20; private final Configuration conf; private final FileSystem fs; /** Returns the cached instance in the servlet context. * @see NutchBeanConstructor*/ public static NutchBeanHbase get(ServletContext app, Configuration conf) throws IOException { final NutchBeanHbase bean = (NutchBeanHbase)app.getAttribute(KEY); return bean; } /** * @param conf * @throws IOException */ public NutchBeanHbase(Configuration conf, String tablename) throws IOException { this(conf, null, tablename); } /** * Construct in a named directory. * * @param conf * @param dir * @throws IOException */ public NutchBeanHbase(Configuration conf, Path dir, String tablename) throws IOException { if (tablename == null || "".equals(tablename)) { throw new java.lang.IllegalArgumentException("tablename cannot be null or empty!"); } this.conf = conf; this.summarizer = new SummarizerFactory(this.conf).getSummarizer(); this.fs = FileSystem.get(this.conf); if (dir == null) { dir = new Path(this.conf.get("searcher.dir", "crawl")); } final Path luceneConfig = new Path(dir, "search-servers.txt"); final Path solrConfig = new Path(dir, "solr-servers.txt"); if (fs.exists(luceneConfig) || fs.exists(solrConfig)) { searchBean = new DistributedSearchBean(conf, luceneConfig, solrConfig); } else { final Path indexDir = new Path(dir, "index_merged"); final Path indexesDir = new Path(dir, "index"); searchBean = new LuceneSearchBean(conf, indexDir, indexesDir); } table = new HTable(tablename); } public Hits search(Query query, int numHits) throws IOException { return search(query, numHits, null, null, false); } public Hits search(Query query, int numHits, String dedupField, String sortField, boolean reverse) throws IOException { return searchBean.search(query, numHits, dedupField, sortField, reverse); } @Override public boolean ping() throws IOException { return true; } @SuppressWarnings("serial") private class DupHits extends ArrayList<Hit> { private boolean maxSizeExceeded; } /** * Search for pages matching a query, eliminating excessive hits from the * same site. Hits after the first <code>maxHitsPerDup</code> from the same * site are removed from results. The remaining hits have {@link * Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero then all * hits are returned. * * @param query query * @param numHits number of requested hits * @param maxHitsPerDup the maximum hits returned with matching values, or zero * @return Hits the matching hits * @throws IOException */ public Hits search(Query query, int numHits, int maxHitsPerDup) throws IOException { return search(query, numHits, maxHitsPerDup, "site", null, false); } /** * Search for pages matching a query, eliminating excessive hits with * matching values for a named field. Hits after the first * <code>maxHitsPerDup</code> are removed from results. The remaining hits * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero * then all hits are returned. * * @param query query * @param numHits number of requested hits * @param maxHitsPerDup the maximum hits returned with matching values, or zero * @param dedupField field name to check for duplicates * @return Hits the matching hits * @throws IOException */ public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField) throws IOException { return search(query, numHits, maxHitsPerDup, dedupField, null, false); } /** * Search for pages matching a query, eliminating excessive hits with * matching values for a named field. Hits after the first * <code>maxHitsPerDup</code> are removed from results. The remaining hits * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero * then all hits are returned. * * @param query query * @param numHits number of requested hits * @param maxHitsPerDup the maximum hits returned with matching values, or zero * @param dedupField field name to check for duplicates * @param sortField Field to sort on (or null if no sorting). * @param reverse True if we are to reverse sort by <code>sortField</code>. * @return Hits the matching hits * @throws IOException */ public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField, boolean reverse) throws IOException { if (maxHitsPerDup <= 0) // disable dup checking return search(query, numHits, dedupField, sortField, reverse); final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f); int numHitsRaw = (int) (numHits * rawHitsFactor); if (LOG.isInfoEnabled()) { LOG.info("searching for " + numHitsRaw + " raw hits"); } Hits hits = searchBean.search(query, numHitsRaw, dedupField, sortField, reverse); final long total = hits.getTotal(); final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>(); final List<Hit> resultList = new ArrayList<Hit>(); final Set<Hit> seen = new HashSet<Hit>(); final List<String> excludedValues = new ArrayList<String>(); boolean totalIsExact = true; for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) { // get the next raw hit if (rawHitNum >= hits.getLength()) { // optimize query by prohibiting more matches on some excluded values final Query optQuery = (Query) query.clone(); for (int i = 0; i < excludedValues.size(); i++) { if (i == MAX_PROHIBITED_TERMS) break; optQuery.addProhibitedTerm(excludedValues.get(i), dedupField); } numHitsRaw = (int) (numHitsRaw * rawHitsFactor); if (LOG.isInfoEnabled()) { LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery); } hits = searchBean.search(optQuery, numHitsRaw, dedupField, sortField, reverse); if (LOG.isInfoEnabled()) { LOG.info("found " + hits.getTotal() + " raw hits"); } rawHitNum = -1; continue; } final Hit hit = hits.getHit(rawHitNum); if (seen.contains(hit)) continue; seen.add(hit); // get dup hits for its value final String value = hit.getDedupValue(); DupHits dupHits = dupToHits.get(value); if (dupHits == null) dupToHits.put(value, dupHits = new DupHits()); // does this hit exceed maxHitsPerDup? if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit if (!dupHits.maxSizeExceeded) { // mark prior hits with moreFromDupExcluded for (int i = 0; i < dupHits.size(); i++) { dupHits.get(i).setMoreFromDupExcluded(true); } dupHits.maxSizeExceeded = true; excludedValues.add(value); // exclude dup } totalIsExact = false; } else { // no -- collect the hit resultList.add(hit); dupHits.add(hit); // are we done? // we need to find one more than asked for, so that we can tell if // there are more hits to be shown if (resultList.size() > numHits) break; } } final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()])); results.setTotalIsExact(totalIsExact); return results; } @Override public String[] getAnchors(HitDetails hitDetails) throws IOException { return getInlinks(hitDetails).getAnchors(); } @Override public Inlinks getInlinks(HitDetails hitDetails) throws IOException { ImmutableRowPart row = getRow(hitDetails); Inlinks inLinks = new Inlinks(); for (Inlink inlink : row.getInlinks()) { inLinks.add(inlink); } return inLinks; } private static final ExecutorService executor = Executors.newCachedThreadPool(); private class SummaryTask implements Callable<Summary> { private final HitDetails details; private final Query query; public SummaryTask(HitDetails details, Query query) { this.details = details; this.query = query; } public Summary call() throws Exception { return getSummary(details, query); } } @Override public Summary getSummary(HitDetails hitDetails, Query query) throws IOException { ImmutableRowPart row = getRow(hitDetails); return this.summarizer.getSummary(row.getText(), query); } @Override public Summary[] getSummary(HitDetails[] details, Query query) throws IOException { final List<Callable<Summary>> tasks = new ArrayList<Callable<Summary>>(details.length); for (int i = 0; i < details.length; i++) { tasks.add(new SummaryTask(details[i], query)); } List<Future<Summary>> summaries; try { summaries = executor.invokeAll(tasks); } catch (final InterruptedException e) { throw new RuntimeException(e); } final Summary[] results = new Summary[details.length]; for (int i = 0; i < details.length; i++) { final Future<Summary> f = summaries.get(i); Summary summary; try { summary = f.get(); } catch (final Exception e) { if (e.getCause() instanceof IOException) { throw (IOException) e.getCause(); } throw new RuntimeException(e); } results[i] = summary; } return results; } public String getExplanation(Query query, Hit hit) throws IOException { return searchBean.getExplanation(query, hit); } public HitDetails getDetails(Hit hit) throws IOException { return searchBean.getDetails(hit); } public HitDetails[] getDetails(Hit[] hits) throws IOException { return searchBean.getDetails(hits); } public ImmutableRowPart getRow(HitDetails details) throws IOException { String url = details.getValue("url"); String rowKey = TableUtil.reverseUrl(url); return new ImmutableRowPart(table.getRow(Bytes.toBytes(rowKey), new byte[][]{TableColumns.TEXT, TableColumns.CONTENT, TableColumns.PAGERANK, TableColumns.VOTES, TableColumns.INLINKS, TableColumns.CONTENT_TYPE})); } public void close() throws IOException { if (searchBean != null) { searchBean.close(); } if (fs != null) { fs.close(); } } /** * For debugging. */ public static void main(String[] args) throws Exception { final String usage = "NutchBean webtable query"; if (args.length != 2) { System.err.println(usage); System.exit(-1); } final Configuration conf = NutchConfiguration.create(); final String webtable = args[0]; final NutchBeanHbase bean = new NutchBeanHbase(conf, webtable); final Query query = Query.parse(args[1], conf); final Hits hits = bean.search(query, 10); System.out.println("Total hits: " + hits.getTotal()); final int length = (int) Math.min(hits.getTotal(), 10); final Hit[] show = hits.getHits(0, length); final HitDetails[] details = bean.getDetails(show); final Summary[] summaries = bean.getSummary(details, query); for (int i = 0; i < hits.getLength(); i++) { System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]); } } /** * Responsible for constructing a NutchBean singleton instance and * caching it in the servlet context. This class should be registered in * the deployment descriptor as a listener */ public static class NutchBeanConstructor implements ServletContextListener { public void contextDestroyed(ServletContextEvent sce) { } public void contextInitialized(ServletContextEvent sce) { final ServletContext app = sce.getServletContext(); final Configuration conf = NutchConfiguration.get(app); final String tablename = (String) app.getInitParameter("org.apache.nutchbase.webtable"); LOG.info("creating new bean"); NutchBeanHbase bean = null; try { bean = new NutchBeanHbase(conf, tablename); app.setAttribute(KEY, bean); } catch (final IOException ex) { LOG.error(StringUtils.stringifyException(ex)); } } } }