/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.searcher; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; import org.apache.nutch.protocol.*; import org.apache.nutch.parse.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapred.*; import org.apache.hadoop.mapred.lib.*; import org.apache.nutch.crawl.*; /** Implements {@link HitSummarizer} and {@link HitContent} for a set of * fetched segments. */ public class FetchedSegments implements HitSummarizer, HitContent { private static class Segment implements Closeable { private static final Partitioner PARTITIONER = new HashPartitioner(); private FileSystem fs; private Path segmentDir; private MapFile.Reader[] content; private MapFile.Reader[] parseText; private MapFile.Reader[] parseData; private MapFile.Reader[] crawl; private Configuration conf; public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException { this.fs = fs; this.segmentDir = segmentDir; this.conf = conf; } public CrawlDatum getCrawlDatum(Text url) throws IOException { synchronized (this) { if (crawl == null) crawl = getReaders(CrawlDatum.FETCH_DIR_NAME); } return (CrawlDatum)getEntry(crawl, url, new CrawlDatum()); } public byte[] getContent(Text url) throws IOException { synchronized (this) { if (content == null) content = getReaders(Content.DIR_NAME); } return ((Content)getEntry(content, url, new Content())).getContent(); } public ParseData getParseData(Text url) throws IOException { synchronized (this) { if (parseData == null) parseData = getReaders(ParseData.DIR_NAME); } return (ParseData)getEntry(parseData, url, new ParseData()); } public ParseText getParseText(Text url) throws IOException { synchronized (this) { if (parseText == null) parseText = getReaders(ParseText.DIR_NAME); } return (ParseText)getEntry(parseText, url, new ParseText()); } private MapFile.Reader[] getReaders(String subDir) throws IOException { return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf); } private Writable getEntry(MapFile.Reader[] readers, Text url, Writable entry) throws IOException { return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry); } public void close() throws IOException { if (content != null) { closeReaders(content); } if (parseText != null) { closeReaders(parseText); } if (parseData != null) { closeReaders(parseData); } if (crawl != null) { closeReaders(crawl); } } private void closeReaders(MapFile.Reader[] readers) throws IOException { for (int i = 0; i < readers.length; i++) { readers[i].close(); } } } private HashMap segments = new HashMap(); private Summarizer summarizer; /** Construct given a directory containing fetcher output. */ public FetchedSegments(FileSystem fs, String segmentsDir, Configuration conf) throws IOException { Path[] segmentDirs = fs.listPaths(new Path(segmentsDir)); this.summarizer = new SummarizerFactory(conf).getSummarizer(); if (segmentDirs != null) { for (int i = 0; i < segmentDirs.length; i++) { Path segmentDir = segmentDirs[i]; // Path indexdone = new Path(segmentDir, IndexSegment.DONE_NAME); // if (fs.exists(indexdone) && fs.isFile(indexdone)) { // segments.put(segmentDir.getName(), new Segment(fs, segmentDir)); // } segments.put(segmentDir.getName(), new Segment(fs, segmentDir, conf)); } } } public String[] getSegmentNames() { return (String[])segments.keySet().toArray(new String[segments.size()]); } public byte[] getContent(HitDetails details) throws IOException { return getSegment(details).getContent(getUrl(details)); } public ParseData getParseData(HitDetails details) throws IOException { return getSegment(details).getParseData(getUrl(details)); } public long getFetchDate(HitDetails details) throws IOException { return getSegment(details).getCrawlDatum(getUrl(details)) .getFetchTime(); } public ParseText getParseText(HitDetails details) throws IOException { return getSegment(details).getParseText(getUrl(details)); } public Summary getSummary(HitDetails details, Query query) throws IOException { if (this.summarizer == null) { return new Summary(); } Segment segment = getSegment(details); ParseText parseText = segment.getParseText(getUrl(details)); String text = (parseText != null) ? parseText.getText() : ""; return this.summarizer.getSummary(text, query); } private class SummaryThread extends Thread { private HitDetails details; private Query query; private Summary summary; private Throwable throwable; public SummaryThread(HitDetails details, Query query) { this.details = details; this.query = query; } public void run() { try { this.summary = getSummary(details, query); } catch (Throwable throwable) { this.throwable = throwable; } } } public Summary[] getSummary(HitDetails[] details, Query query) throws IOException { SummaryThread[] threads = new SummaryThread[details.length]; for (int i = 0; i < threads.length; i++) { threads[i] = new SummaryThread(details[i], query); threads[i].start(); } Summary[] results = new Summary[details.length]; for (int i = 0; i < threads.length; i++) { try { threads[i].join(); } catch (InterruptedException e) { throw new RuntimeException(e); } if (threads[i].throwable instanceof IOException) { throw (IOException)threads[i].throwable; } else if (threads[i].throwable != null) { throw new RuntimeException(threads[i].throwable); } results[i] = threads[i].summary; } return results; } private Segment getSegment(HitDetails details) { return (Segment)segments.get(details.getValue("segment")); } private Text getUrl(HitDetails details) { String url = details.getValue("orig"); if (StringUtils.isBlank(url)) { url = details.getValue("url"); } return new Text(url); } public void close() throws IOException { Iterator iterator = segments.values().iterator(); while (iterator.hasNext()) { ((Segment) iterator.next()).close(); } } }