/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.searcher;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.*;
import org.apache.nutch.crawl.*;
/**
 * Implements {@link HitSummarizer} and {@link HitContent} for a set of
 * fetched segments.
 *
 * <p>Every sub-directory of the segments directory is treated as one segment
 * and registered under its directory name. A background thread rescans the
 * directory periodically, dropping segments whose directory no longer exists
 * and registering newly created ones. Call {@link #close()} to stop that
 * thread and to release the segment readers.</p>
 */
public class FetchedSegments implements RPCSegmentBean {

  /** Version reported by {@link #getProtocolVersion(String, long)}. */
  public static final long VERSION = 1L;

  /** Pause between two scans of the segments directory, in milliseconds. */
  private static final long SEGMENT_REFRESH_INTERVAL_MS = 60000L;

  /** Pool shared by all instances; runs the per-hit summary tasks. */
  private static final ExecutorService executor =
    Executors.newCachedThreadPool();

  /** Computes the summary of one hit so that it can run on the executor. */
  private class SummaryTask implements Callable<Summary> {
    private final HitDetails details;
    private final Query query;

    public SummaryTask(HitDetails details, Query query) {
      this.details = details;
      this.query = query;
    }

    public Summary call() throws Exception {
      return getSummary(details, query);
    }
  }

  /**
   * Background thread that keeps {@code segments} in sync with the contents
   * of the segments directory. It runs until it is interrupted.
   */
  private class SegmentUpdater extends Thread {

    /** Set once a stop has been requested; never reset. */
    private volatile boolean stopRequested = false;

    /**
     * Requests the thread to stop. The request is recorded in a flag as well,
     * so it is not lost if blocking code clears the interrupt status.
     */
    @Override
    public void interrupt() {
      stopRequested = true;
      super.interrupt();
    }

    @Override
    public void run() {
      while (!stopRequested) {
        try {
          refreshSegments();
        } catch (final IOException e) {
          // Best effort: the scan is simply retried on the next cycle.
        }
        if (stopRequested) {
          break;
        }
        // Sleep outside of the I/O try block so that a failing file system
        // is polled once per interval instead of in a tight loop.
        try {
          Thread.sleep(SEGMENT_REFRESH_INTERVAL_MS);
        } catch (final InterruptedException e) {
          break;
        }
      }
    }

    /**
     * Performs one synchronization pass: lists the segment directories,
     * removes registered segments whose directory is gone, then registers
     * directories that are not known yet.
     *
     * @throws IOException if the file system cannot be queried
     */
    private void refreshSegments() throws IOException {
      final FileStatus[] fstats = fs.listStatus(segmentsDir,
          HadoopFSUtil.getPassDirectoriesFilter(fs));
      final Path[] segmentDirs = HadoopFSUtil.getPaths(fstats);

      final Iterator<Map.Entry<String, Segment>> i =
        segments.entrySet().iterator();
      while (i.hasNext()) {
        final Segment seg = i.next().getValue();
        if (!fs.exists(seg.segmentDir)) {
          try {
            seg.close();
          } catch (final Exception e) {
            /* A segment may fail to close since it may already be deleted
             * from the file system. So we just ignore the exception and
             * remove the mapping from 'segments'.
             */
          } finally {
            i.remove();
          }
        }
      }

      if (segmentDirs != null) {
        for (final Path segmentDir : segmentDirs) {
          segments.putIfAbsent(segmentDir.getName(),
              new Segment(fs, segmentDir, conf));
        }
      }
    }
  }

  /**
   * One fetched segment. The readers of each sub-directory are opened on
   * first use, under the segment's monitor, and kept until {@link #close()}.
   */
  private static class Segment implements java.io.Closeable {

    private static final Partitioner<Text, Writable> PARTITIONER =
      new HashPartitioner<Text, Writable>();

    private final FileSystem fs;
    private final Path segmentDir;
    private MapFile.Reader[] content;
    private MapFile.Reader[] parseText;
    private MapFile.Reader[] parseData;
    private MapFile.Reader[] crawl;
    private final Configuration conf;

    public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {
      this.fs = fs;
      this.segmentDir = segmentDir;
      this.conf = conf;
    }

    /** Looks up the crawl datum stored for {@code url} in the fetch directory. */
    public CrawlDatum getCrawlDatum(Text url) throws IOException {
      synchronized (this) {
        if (crawl == null) {
          crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
        }
      }
      return (CrawlDatum) getEntry(crawl, url, new CrawlDatum());
    }

    /**
     * Returns the raw content stored for {@code url}, or {@code null} when
     * the lookup yields no entry.
     */
    public byte[] getContent(Text url) throws IOException {
      synchronized (this) {
        if (content == null) {
          content = getReaders(Content.DIR_NAME);
        }
      }
      final Content found = (Content) getEntry(content, url, new Content());
      return (found == null) ? null : found.getContent();
    }

    /** Looks up the parse data stored for {@code url}. */
    public ParseData getParseData(Text url) throws IOException {
      synchronized (this) {
        if (parseData == null) {
          parseData = getReaders(ParseData.DIR_NAME);
        }
      }
      return (ParseData) getEntry(parseData, url, new ParseData());
    }

    /** Looks up the parse text stored for {@code url}. */
    public ParseText getParseText(Text url) throws IOException {
      synchronized (this) {
        if (parseText == null) {
          parseText = getReaders(ParseText.DIR_NAME);
        }
      }
      return (ParseText) getEntry(parseText, url, new ParseText());
    }

    /** Opens the readers of the given sub-directory of this segment. */
    private MapFile.Reader[] getReaders(String subDir) throws IOException {
      return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf);
    }

    /** Reads the entry for {@code url} through the shared partitioner. */
    private Writable getEntry(MapFile.Reader[] readers, Text url,
                              Writable entry) throws IOException {
      return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
    }

    /**
     * Closes every reader that has been opened so far. All readers are
     * attempted even if one of them fails; the first failure is rethrown
     * once the others have been closed.
     *
     * @throws IOException the first error raised while closing a reader
     */
    public synchronized void close() throws IOException {
      final MapFile.Reader[][] opened = { content, parseText, parseData, crawl };
      IOException firstFailure = null;
      for (final MapFile.Reader[] readers : opened) {
        if (readers == null) {
          continue;
        }
        for (final MapFile.Reader reader : readers) {
          try {
            reader.close();
          } catch (final IOException e) {
            if (firstFailure == null) {
              firstFailure = e;
            }
          }
        }
      }
      if (firstFailure != null) {
        throw firstFailure;
      }
    }
  }

  /** Registered segments, keyed by segment directory name. */
  private final ConcurrentMap<String, Segment> segments =
    new ConcurrentHashMap<String, Segment>();
  private final FileSystem fs;
  private final Configuration conf;
  private final Path segmentsDir;
  private final SegmentUpdater segUpdater;
  private final Summarizer summarizer;

  /**
   * Construct given a directory containing fetcher output. Registers every
   * sub-directory of {@code segmentsDir} as a segment and starts the
   * background updater thread.
   *
   * @param conf configuration used to obtain the file system, the
   *        summarizer and the segment readers
   * @param segmentsDir directory whose sub-directories are the segments
   * @throws IOException if the segments directory cannot be listed
   */
  public FetchedSegments(Configuration conf, Path segmentsDir)
    throws IOException {
    this.conf = conf;
    this.fs = FileSystem.get(this.conf);
    final FileStatus[] fstats = fs.listStatus(segmentsDir,
        HadoopFSUtil.getPassDirectoriesFilter(fs));
    final Path[] segmentDirs = HadoopFSUtil.getPaths(fstats);
    this.summarizer = new SummarizerFactory(this.conf).getSummarizer();
    this.segmentsDir = segmentsDir;
    this.segUpdater = new SegmentUpdater();

    if (segmentDirs != null) {
      for (final Path segmentDir : segmentDirs) {
        segments.put(segmentDir.getName(),
            new Segment(this.fs, segmentDir, this.conf));
      }
    }
    this.segUpdater.start();
  }

  /** Returns the names of the currently registered segments. */
  public String[] getSegmentNames() {
    // Let toArray size the result itself: the updater thread may shrink the
    // map concurrently, and a pre-sized array could then keep null entries.
    return segments.keySet().toArray(new String[0]);
  }

  /**
   * Returns the raw content of the hit, or {@code null} when the segment
   * lookup yields no entry.
   *
   * @throws IOException if the segment is unknown or cannot be read
   */
  public byte[] getContent(HitDetails details) throws IOException {
    return getSegment(details).getContent(getUrl(details));
  }

  /**
   * Returns the parse data of the hit.
   *
   * @throws IOException if the segment is unknown or cannot be read
   */
  public ParseData getParseData(HitDetails details) throws IOException {
    return getSegment(details).getParseData(getUrl(details));
  }

  /**
   * Returns the fetch time recorded in the crawl datum of the hit.
   *
   * @throws IOException if the segment is unknown, cannot be read, or holds
   *         no crawl datum for the hit
   */
  public long getFetchDate(HitDetails details) throws IOException {
    final Segment segment = getSegment(details);
    final Text url = getUrl(details);
    final CrawlDatum datum = segment.getCrawlDatum(url);
    if (datum == null) {
      throw new IOException("No crawl datum for " + url + " in segment "
          + details.getValue("segment"));
    }
    return datum.getFetchTime();
  }

  /**
   * Returns the parse text of the hit.
   *
   * @throws IOException if the segment is unknown or cannot be read
   */
  public ParseText getParseText(HitDetails details) throws IOException {
    return getSegment(details).getParseText(getUrl(details));
  }

  /**
   * Summarizes one hit for the given query. An empty summary is returned
   * when no summarizer is configured; an empty text is summarized when the
   * segment has no parse text for the hit.
   *
   * @throws IOException if the segment is unknown or cannot be read
   */
  public Summary getSummary(HitDetails details, Query query)
    throws IOException {
    if (this.summarizer == null) {
      return new Summary();
    }
    final Segment segment = getSegment(details);
    final ParseText parseText = segment.getParseText(getUrl(details));
    final String text = (parseText != null) ? parseText.getText() : "";
    return this.summarizer.getSummary(text, query);
  }

  /** Returns {@link #VERSION} regardless of the arguments. */
  public long getProtocolVersion(String protocol, long clientVersion)
    throws IOException {
    return VERSION;
  }

  /**
   * Summarizes several hits in parallel on the shared executor. The result
   * has one summary per element of {@code details}, in the same order.
   *
   * @throws IOException if summarizing one of the hits failed with an
   *         {@code IOException}
   * @throws RuntimeException if the calling thread is interrupted (its
   *         interrupt status is restored first) or a task fails otherwise
   */
  public Summary[] getSummary(HitDetails[] details, Query query)
    throws IOException {
    final List<Callable<Summary>> tasks =
      new ArrayList<Callable<Summary>>(details.length);
    for (int i = 0; i < details.length; i++) {
      tasks.add(new SummaryTask(details[i], query));
    }

    List<Future<Summary>> summaries;
    try {
      summaries = executor.invokeAll(tasks);
    } catch (final InterruptedException e) {
      // Keep the interrupt visible to the caller's thread.
      Thread.currentThread().interrupt();
      throw new RuntimeException(e);
    }

    final Summary[] results = new Summary[details.length];
    for (int i = 0; i < details.length; i++) {
      try {
        results[i] = summaries.get(i).get();
      } catch (final Exception e) {
        if (e instanceof InterruptedException) {
          Thread.currentThread().interrupt();
        }
        if (e.getCause() instanceof IOException) {
          throw (IOException) e.getCause();
        }
        throw new RuntimeException(e);
      }
    }
    return results;
  }

  /**
   * Resolves the segment named by the "segment" field of the hit.
   *
   * @throws IOException if the hit names no segment or the segment is not
   *         (or no longer) registered
   */
  private Segment getSegment(HitDetails details) throws IOException {
    final String name = details.getValue("segment");
    // ConcurrentHashMap rejects null keys, so guard the lookup.
    final Segment segment = (name == null) ? null : segments.get(name);
    if (segment == null) {
      throw new IOException("Unknown segment: " + name);
    }
    return segment;
  }

  /** Returns the "orig" field of the hit, or its "url" field if "orig" is blank. */
  private Text getUrl(HitDetails details) {
    String url = details.getValue("orig");
    if (StringUtils.isBlank(url)) {
      url = details.getValue("url");
    }
    return new Text(url);
  }

  /**
   * Stops the background updater thread and closes the readers of every
   * registered segment. All segments are attempted even if one of them
   * fails; the first failure is rethrown at the end.
   *
   * @throws IOException the first error raised while closing a segment
   */
  public void close() throws IOException {
    // Ask the updater to stop so that it does not outlive this bean.
    segUpdater.interrupt();

    IOException firstFailure = null;
    final Iterator<Segment> iterator = segments.values().iterator();
    while (iterator.hasNext()) {
      try {
        iterator.next().close();
      } catch (final IOException e) {
        if (firstFailure == null) {
          firstFailure = e;
        }
      }
    }
    if (firstFailure != null) {
      throw firstFailure;
    }
  }
}