/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.searcher;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.reflect.Method;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.VersionedProtocol;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.util.NutchConfiguration;
/** Implements the search API over IPC connections. */
public class DistributedSearch {
public static final Log LOG = LogFactory.getLog(DistributedSearch.class);
private DistributedSearch() {} // no public ctor
/** The distributed search protocol: the complete set of search operations a
 * search server exposes over Hadoop IPC (search, details, summaries, content,
 * inlinks), plus discovery of which segments the server holds. */
public static interface Protocol
extends Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks, VersionedProtocol {
/** The name of the segments searched by this node. */
String[] getSegmentNames();
}
/** The search server. */
public static class Server {

  /** Utility holder for the server entry point; not instantiable. */
  private Server() {}

  /** Runs a search server.
   *
   * Usage: {@code DistributedSearch$Server <port> <index dir>}
   */
  public static void main(String[] args) throws Exception {
    String usage = "DistributedSearch$Server <port> <index dir>";

    // Both arguments are required.  The original check accepted a single
    // argument and then crashed on args[1] with ArrayIndexOutOfBoundsException.
    if (args.length != 2) {
      System.err.println(usage);
      System.exit(-1);
    }

    int port = Integer.parseInt(args[0]);
    Path directory = new Path(args[1]);

    Configuration conf = NutchConfiguration.create();

    org.apache.hadoop.ipc.Server server = getServer(conf, directory, port);
    server.start();
    server.join();  // block forever, serving requests
  }

  /** Creates (but does not start) an IPC server exposing a NutchBean
   * over the given port on all interfaces. */
  static org.apache.hadoop.ipc.Server getServer(Configuration conf, Path directory, int port) throws IOException {
    NutchBean bean = new NutchBean(conf, directory);
    // Handler thread count is tunable via searcher.num.handlers (default 10).
    int numHandlers = conf.getInt("searcher.num.handlers", 10);
    return RPC.getServer(bean, "0.0.0.0", port, numHandlers, true, conf);
  }
}
/** The search client. */
public static class Client extends Thread
implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
Runnable {
// All configured server addresses (live or not), in configuration order.
private InetSocketAddress[] defaultAddresses;
// Parallel to defaultAddresses: true if the server answered the last
// getSegmentNames() poll.  Replaced wholesale by updateSegments().
private boolean[] liveServer;
// Maps segment name -> InetSocketAddress of the server holding it.
private HashMap segmentToAddress = new HashMap();
// Watchdog loop flag; cleared by close().
private boolean running = true;
private Configuration conf;
// Search-servers file; null when addresses were passed in directly.
private Path file;
// Last observed modification time of 'file'; see isFileModified().
private long timestamp;
private FileSystem fs;
/** Construct a client talking to servers listed in the named file.
 * Each line in the file lists a server hostname and port, separated by
 * whitespace.
 *
 * @param file path (on the default FileSystem) to the server-list file
 * @param conf Hadoop/Nutch configuration
 * @throws IOException if the file cannot be read or its status fetched
 */
public Client(Path file, Configuration conf)
throws IOException {
// Delegate to the address-array constructor, which also initializes this.fs.
this(readConfig(file, conf), conf);
this.file = file;
// Record the file's mtime so updateSegments() can detect later edits.
this.timestamp = fs.getFileStatus(file).getModificationTime();
}
/** Reads the server list from the given file.  Each non-empty line names
 * one server as "host port"; lines missing a port token are skipped,
 * and each accepted server is logged at info level. */
private static InetSocketAddress[] readConfig(Path path, Configuration conf)
    throws IOException {
  FileSystem fs = FileSystem.get(conf);
  BufferedReader in =
      new BufferedReader(new InputStreamReader(fs.open(path)));
  try {
    ArrayList servers = new ArrayList();
    for (String line = in.readLine(); line != null; line = in.readLine()) {
      StringTokenizer fields = new StringTokenizer(line);
      if (!fields.hasMoreTokens()) {
        continue;  // blank line
      }
      String host = fields.nextToken();
      if (!fields.hasMoreTokens()) {
        continue;  // host with no port: ignore
      }
      String port = fields.nextToken();
      servers.add(new InetSocketAddress(host, Integer.parseInt(port)));
      if (LOG.isInfoEnabled()) {
        LOG.info("Client adding server " + host + ":" + port);
      }
    }
    return (InetSocketAddress[])
        servers.toArray(new InetSocketAddress[servers.size()]);
  } finally {
    in.close();
  }
}
/** Construct a client talking to the named servers.
 *
 * NOTE(review): this constructor starts the watchdog thread before the
 * object is fully constructed ("this" escapes).  Safe in practice only
 * because the class is not designed for subclassing — confirm before
 * extending.
 *
 * @param addresses the search servers to talk to
 * @param conf Hadoop/Nutch configuration
 * @throws IOException if the initial segment poll fails entirely
 */
public Client(InetSocketAddress[] addresses, Configuration conf) throws IOException {
this.conf = conf;
this.defaultAddresses = addresses;
this.liveServer = new boolean[addresses.length];
this.fs = FileSystem.get(conf);
// Populate segmentToAddress and liveServer once, synchronously, ...
updateSegments();
// ... then keep them fresh from the daemon watchdog thread (see run()).
setDaemon(true);
start();
}
// Pre-resolved java.lang.reflect.Method handles for the parallel
// RPC.call() fan-outs below; resolved once at class-load time.
private static final Method GET_SEGMENTS;
private static final Method SEARCH;
private static final Method DETAILS;
private static final Method SUMMARY;
static {
try {
GET_SEGMENTS = Protocol.class.getMethod
("getSegmentNames", new Class[] {});
SEARCH = Protocol.class.getMethod
("search", new Class[] { Query.class, Integer.TYPE, String.class,
String.class, Boolean.TYPE});
DETAILS = Protocol.class.getMethod
("getDetails", new Class[] { Hit.class});
SUMMARY = Protocol.class.getMethod
("getSummary", new Class[] { HitDetails.class, Query.class});
} catch (NoSuchMethodException e) {
// Signature drift in Protocol is a programming error: fail fast at load.
throw new RuntimeException(e);
}
}
/**
 * Checks whether the search-servers file has been modified since we last
 * looked, updating the cached timestamp when it has.  Always false when
 * the client was constructed from an address array (file == null).
 *
 * @return true if the file's modification time advanced past the cached one
 * @throws IOException if the file status cannot be read
 */
public boolean isFileModified()
    throws IOException {
  if (file != null) {
    long modTime = fs.getFileStatus(file).getModificationTime();
    if (timestamp < modTime) {
      // Reuse the mtime already fetched above; the original issued a
      // second getFileStatus() call, which was redundant and racy.
      this.timestamp = modTime;
      return true;
    }
  }
  return false;
}
/** Updates segment names.
 *
 * Polls every configured server in parallel for its segment list,
 * extends the segment-to-server map, and recomputes the live flags.
 * Re-reads the server file first if it has changed on disk.
 *
 * @throws IOException if the parallel RPC fails outright
 */
public void updateSegments() throws IOException {
int liveServers = 0;
int liveSegments = 0;
// Pick up edits to the search-servers file before polling.
if (isFileModified()) {
defaultAddresses = readConfig(file, conf);
}
// Create new array of flags so they can all be updated at once.
boolean[] updatedLiveServer = new boolean[defaultAddresses.length];
// build segmentToAddress map
// Zero-arg getSegmentNames() call, fanned out to every server in parallel.
Object[][] params = new Object[defaultAddresses.length][0];
String[][] results =
(String[][])RPC.call(GET_SEGMENTS, params, defaultAddresses, this.conf);
for (int i = 0; i < results.length; i++) { // process results of call
InetSocketAddress addr = defaultAddresses[i];
String[] segments = results[i];
if (segments == null) {
// No answer: mark the server dead, but keep polling it next round.
updatedLiveServer[i] = false;
if (LOG.isWarnEnabled()) {
LOG.warn("Client: no segments from: " + addr);
}
continue;
}
for (int j = 0; j < segments.length; j++) {
if (LOG.isTraceEnabled()) {
LOG.trace("Client: segment "+segments[j]+" at "+addr);
}
segmentToAddress.put(segments[j], addr);
}
updatedLiveServer[i] = true;
liveServers++;
liveSegments += segments.length;
}
// Now update live server flags.
// NOTE(review): entries for segments that moved or disappeared are never
// removed from segmentToAddress — confirm this staleness is acceptable.
this.liveServer = updatedLiveServer;
if (LOG.isInfoEnabled()) {
LOG.info("STATS: "+liveServers+" servers, "+liveSegments+" segments.");
}
}
/** Returns the names of all segments currently known to this client. */
public String[] getSegmentNames() {
  java.util.Set names = segmentToAddress.keySet();
  return (String[]) names.toArray(new String[names.size()]);
}
/** Performs a distributed search: fans the query out to every currently
 * live server in parallel, then merges the per-server hit lists into a
 * single top-numHits result set.
 *
 * @param query      the parsed query
 * @param numHits    maximum number of hits to return
 * @param dedupField field each server de-duplicates on
 * @param sortField  field to sort by, or null for score order
 * @param reverse    whether to reverse the sort order
 * @return the merged hits, with a total across all responding servers
 * @throws IOException if the parallel RPC fails outright
 */
public Hits search(final Query query, final int numHits,
final String dedupField, final String sortField,
final boolean reverse) throws IOException {
// Get the list of live servers. It would be nice to build this
// list in updateSegments(), but that would create concurrency issues.
// We grab a local reference to the live server flags in case it
// is updated while we are building our list of liveAddresses.
boolean[] savedLiveServer = this.liveServer;
int numLive = 0;
for (int i = 0; i < savedLiveServer.length; i++) {
if (savedLiveServer[i])
numLive++;
}
// liveIndexNos maps positions in liveAddresses back to indexes in
// defaultAddresses, which is what Hit.getIndexNo() must refer to so
// that later getDetails()/getRemote() calls find the right server.
InetSocketAddress[] liveAddresses = new InetSocketAddress[numLive];
int[] liveIndexNos = new int[numLive];
int k = 0;
for (int i = 0; i < savedLiveServer.length; i++) {
if (savedLiveServer[i]) {
liveAddresses[k] = defaultAddresses[i];
liveIndexNos[k] = i;
k++;
}
}
// Identical parameter tuple for every server; RPC.call runs in parallel.
Object[][] params = new Object[liveAddresses.length][5];
for (int i = 0; i < params.length; i++) {
params[i][0] = query;
params[i][1] = new Integer(numHits);
params[i][2] = dedupField;
params[i][3] = sortField;
params[i][4] = Boolean.valueOf(reverse);
}
Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses, this.conf);
TreeSet queue; // cull top hits from results
if (sortField == null || reverse) {
queue = new TreeSet(new Comparator() {
public int compare(Object o1, Object o2) {
return ((Comparable)o2).compareTo(o1); // reverse natural order
}
});
} else {
queue = new TreeSet();
}
long totalHits = 0;
// maxValue tracks the sort value of the worst hit currently kept; hits
// that cannot beat it are skipped without touching the queue.
Comparable maxValue = null;
for (int i = 0; i < results.length; i++) {
Hits hits = results[i];
if (hits == null) continue; // server failed to answer; skip it
totalHits += hits.getTotal();
for (int j = 0; j < hits.getLength(); j++) {
Hit h = hits.getHit(j);
if (maxValue == null ||
((reverse || sortField == null)
? h.getSortValue().compareTo(maxValue) >= 0
: h.getSortValue().compareTo(maxValue) <= 0)) {
// Re-wrap with the defaultAddresses index of the originating server.
queue.add(new Hit(liveIndexNos[i], h.getIndexDocNo(),
h.getSortValue(), h.getDedupValue()));
if (queue.size() > numHits) { // if hit queue overfull
queue.remove(queue.last()); // remove lowest in hit queue
maxValue = ((Hit)queue.last()).getSortValue(); // reset maxValue
}
}
}
}
return new Hits(totalHits, (Hit[])queue.toArray(new Hit[queue.size()]));
}
// version for hadoop-0.5.0.jar
// RPC protocol version handed to RPC.getProxy; bump if Protocol changes.
public static final long versionID = 1L;
/** Returns an RPC proxy to the server whose index produced this hit. */
private Protocol getRemote(Hit hit) throws IOException {
return (Protocol)
RPC.getProxy(Protocol.class, versionID, defaultAddresses[hit.getIndexNo()], conf);
}
/** Looks up the server holding this hit's segment and returns an RPC
 * proxy to it. */
private Protocol getRemote(HitDetails hit) throws IOException {
  Object segment = hit.getValue("segment");
  InetSocketAddress where = (InetSocketAddress) segmentToAddress.get(segment);
  return (Protocol) RPC.getProxy(Protocol.class, versionID, where, conf);
}
/** Asks the server that indexed the hit to explain its score. */
public String getExplanation(Query query, Hit hit) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getExplanation(query, hit);
}

/** Fetches the details of a single hit from its home server. */
public HitDetails getDetails(Hit hit) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getDetails(hit);
}
/** Fetches details for many hits at once — one parallel RPC per hit,
 * each routed to the server that produced that hit. */
public HitDetails[] getDetails(Hit[] hits) throws IOException {
  int n = hits.length;
  InetSocketAddress[] addrs = new InetSocketAddress[n];
  Object[][] params = new Object[n][1];
  for (int i = 0; i < n; i++) {
    addrs[i] = defaultAddresses[hits[i].getIndexNo()];
    params[i][0] = hits[i];
  }
  return (HitDetails[]) RPC.call(DETAILS, params, addrs, conf);
}
/** Fetches the summary for one hit from the server holding its segment. */
public Summary getSummary(HitDetails hit, Query query) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getSummary(hit, query);
}
/** Fetches summaries for many hits at once — one parallel RPC per hit,
 * each routed to the server holding that hit's segment. */
public Summary[] getSummary(HitDetails[] hits, Query query)
    throws IOException {
  int n = hits.length;
  InetSocketAddress[] addrs = new InetSocketAddress[n];
  Object[][] params = new Object[n][2];
  for (int i = 0; i < n; i++) {
    HitDetails details = hits[i];
    addrs[i] =
        (InetSocketAddress) segmentToAddress.get(details.getValue("segment"));
    params[i][0] = details;
    params[i][1] = query;
  }
  return (Summary[]) RPC.call(SUMMARY, params, addrs, conf);
}
/** Fetches the raw fetched content of the page behind this hit. */
public byte[] getContent(HitDetails hit) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getContent(hit);
}

/** Fetches the ParseData of the page behind this hit. */
public ParseData getParseData(HitDetails hit) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getParseData(hit);
}

/** Fetches the ParseText of the page behind this hit. */
public ParseText getParseText(HitDetails hit) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getParseText(hit);
}

/** Fetches the anchor texts pointing at the page behind this hit. */
public String[] getAnchors(HitDetails hit) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getAnchors(hit);
}

/** Fetches the inlinks of the page behind this hit. */
public Inlinks getInlinks(HitDetails hit) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getInlinks(hit);
}

/** Fetches the fetch date of the page behind this hit. */
public long getFetchDate(HitDetails hit) throws IOException {
  Protocol remote = getRemote(hit);
  return remote.getFetchDate(hit);
}
/** Command-line driver: parses args[0] as a query, searches the given
 * servers, and prints the details of the top ten hits.
 *
 * Usage: {@code DistributedSearch$Client query <host> <port> ...}
 */
public static void main(String[] args) throws Exception {
  String usage = "DistributedSearch$Client query <host> <port> ...";

  // Require the query plus at least one complete host/port pair.  The
  // original only checked args.length == 0, silently dropping a trailing
  // host given without a port and accepting a query with no servers.
  if (args.length < 3 || args.length % 2 == 0) {
    System.err.println(usage);
    System.exit(-1);
  }

  Query query = Query.parse(args[0], NutchConfiguration.create());

  // Remaining arguments come in (host, port) pairs.
  InetSocketAddress[] addresses = new InetSocketAddress[(args.length - 1) / 2];
  for (int i = 0; i < (args.length - 1) / 2; i++) {
    addresses[i] =
        new InetSocketAddress(args[i * 2 + 1], Integer.parseInt(args[i * 2 + 2]));
  }

  Client client = new Client(addresses, NutchConfiguration.create());
  Hits hits = client.search(query, 10, null, null, false);
  System.out.println("Total hits: " + hits.getTotal());
  for (int i = 0; i < hits.getLength(); i++) {
    System.out.println(" " + i + " " + client.getDetails(hits.getHit(i)));
  }
}
/**
 * Watchdog loop: every 10 seconds re-polls all search servers for their
 * segment lists so dead servers are noticed and revived ones picked up.
 * Terminated by close(), which clears 'running' and interrupts the sleep.
 */
public void run() {
  while (running) {
    try {
      Thread.sleep(10000);
    } catch (InterruptedException ie) {
      // close() interrupts us: exit promptly instead of issuing one more
      // round of RPCs against the servers, as the original code did.
      if (!running) {
        break;
      }
      if (LOG.isInfoEnabled()) {
        LOG.info("Thread sleep interrupted.");
      }
    }
    try {
      if (LOG.isInfoEnabled()) {
        LOG.info("Querying segments from search servers...");
      }
      updateSegments();
    } catch (IOException ioe) {
      if (LOG.isWarnEnabled()) { LOG.warn("No search servers available!"); }
      // Mark every server dead until the next successful poll.
      liveServer = new boolean[defaultAddresses.length];
    }
  }
}
/**
 * Stops the watchdog thread.
 */
public void close() {
running = false;
// Wake the thread out of its 10-second sleep so it can notice 'running'.
interrupt();
}
/** Returns the live/dead flags, parallel to the configured server list. */
public boolean[] getLiveServer() {
return liveServer;
}
}
}