/*
* RemoteQueryRunner.java
*
* Copyright (c) 2007-2011, The University of Sheffield.
*
* This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
* and is free software, licenced under the GNU Lesser General Public License,
* Version 3, June 2007 (also included with this distribution as file
* LICENCE-LGPL3.html).
*
* Valentin Tablan, 08 Dec 2011
*
* $Id$
*/
package gate.mimir.search;
import gate.mimir.index.DocumentData;
import gate.mimir.index.IndexException;
import gate.mimir.search.query.Binding;
import gate.mimir.search.query.QueryNode;
import gate.mimir.tool.WebUtils;
import it.unimi.dsi.fastutil.doubles.DoubleBigArrayBigList;
import it.unimi.dsi.fastutil.doubles.DoubleBigList;
import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongBigArrayBigList;
import it.unimi.dsi.fastutil.longs.LongBigList;
import java.io.IOException;
import java.io.Serializable;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executor;
import java.util.concurrent.FutureTask;
import org.apache.log4j.Logger;
/**
* A {@link QueryRunner} implementation that proxies a QueryRunner running on
* a remote Mímir server.
*/
public class RemoteQueryRunner implements QueryRunner {
/**
* Name of the remote service; {@link #getActionBaseUrl(String)} appends it
* to the index URL, followed by a slash and the action name.
*/
protected static final String SERVICE_SEARCH = "search";
/**
* Action used by the constructors to submit the query; the value returned by
* the remote endpoint is used as the query ID.
*/
protected static final String ACTION_POST_QUERY_BIN = "postQueryBin";
/**
* Action polled by the background updater to obtain the current number of
* documents while the total is not yet known.
*/
protected static final String ACTION_CURRENT_DOC_COUNT_BIN = "documentsCurrentCountBin";
/**
* Action polled by the background updater to obtain the total number of
* documents; a negative value is treated as "not finished yet".
*/
protected static final String ACTION_DOC_COUNT_BIN = "documentsCountBin";
/**
* Action used by {@link #downloadDocIdScores(long)} to download a block of
* document IDs.
*/
protected static final String ACTION_DOC_IDS_BIN = "documentIdsBin";
/**
* Action used by {@link #downloadDocIdScores(long)} to download a block of
* document scores.
*/
protected static final String ACTION_DOC_SCORES_BIN = "documentsScoresBin";
/**
* Action used by {@link #getDocumentHits(long)} to obtain the hits for the
* document at a given rank.
*/
protected static final String ACTION_DOC_HITS_BIN = "documentHitsBin";
/**
* Action used by {@link #getDocumentData(long)} to obtain the
* {@link DocumentData} for the document at a given rank.
*/
protected static final String ACTION_DOC_DATA_BIN = "documentDataBin";
/**
* Action used by {@link #renderDocument(long, Appendable)}.
*/
protected static final String ACTION_RENDER_DOCUMENT = "renderDocument";
/**
* Action used by {@link #close()}.
*/
protected static final String ACTION_CLOSE = "close";
/**
* The maximum number of documents to be stored in the local document cache.
*/
protected static final int DOCUMENT_CACHE_SIZE = 1000;
/**
* Action run in a background thread, used to update the document data
* (document ID, document score) from the remote endpoint.
* This runs once, started during the creation of the query runner.
*/
protected class DocumentDataUpdater implements Runnable {
@Override
public void run() {
int failuresAllowed = 10;
// wait for the first pass to complete
while(documentsCount < 0) {
if(closed) return;
// update document counts
try {
long newDocumentsCount = webUtils.getLong(
getActionBaseUrl(ACTION_DOC_COUNT_BIN), "queryId",
URLEncoder.encode(queryId, "UTF-8"));
if(newDocumentsCount < 0) {
// still not finished-> update current count
currentDocumentsCount = webUtils.getLong(
getActionBaseUrl(ACTION_CURRENT_DOC_COUNT_BIN), "queryId",
URLEncoder.encode(queryId, "UTF-8"));
// ... and wait a while before asking again
Thread.sleep(500);
} else {
// remote side has finished enumerating all documents
// download the first block of IDs and scores
downloadDocIdScores(0);
// ...and we're done!
documentsCount = newDocumentsCount;
}
} catch(InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Interrupted while waiting", e);
} catch (Exception e) {
if(failuresAllowed > 0) {
failuresAllowed --;
logger.error("Exception while obtaining remote document data (will retry)", e);
try {
Thread.sleep(100);
} catch(InterruptedException e1) {
Thread.currentThread().interrupt();
}
} else {
logger.error("Exception while obtaining remote document data.", e);
exceptionInBackgroundThread = e;
return;
}
}
}
}
}
/**
* The size of the document block (the number of documents for which the IDs
* and scores are downloaded in one operation).
*/
private int docBlockSize = 1000;
/**
* A cache of {@link DocumentData} values, keyed by document rank, used for
* returning the document text, URI, title and metadata fields. Its size is
* limited to {@link #DOCUMENT_CACHE_SIZE} entries.
*/
private Long2ObjectLinkedOpenHashMap<DocumentData> documentCache;
/**
* The WebUtils instance we use to communicate with the remote
* index.
*/
private WebUtils webUtils;
/**
* The URL to the server hosting the remote index we're searching. The
* constructors make sure this value always ends with a slash.
*/
private String remoteUrl;
/**
* The query ID for the actual query runner, local to the remote index.
*/
protected String queryId;
/**
* The total number of result documents (or -1 if not yet known).
*/
private volatile long documentsCount;
/**
* The current number of documents. While {@link #documentsCount} is not yet
* known, this is the value to use as the number of documents found so far
* (see {@link #getDocumentsCurrentCount()}).
*/
private volatile long currentDocumentsCount;
/**
* The task that's working on obtaining the documents count from the remote
* endpoint. When this activity has finished normally, the precise documents
* count is known.
*/
private volatile FutureTask<Object> docDataUpdaterFuture;
/**
* Set to <code>true</code> by {@link #close()}; the background updater checks
* it on every iteration and stops when it is set.
*/
private volatile boolean closed;
/**
* Shared Logger
*/
private static Logger logger = Logger.getLogger(RemoteQueryRunner.class);
/**
 * If the background thread encounters an exception, it
 * will save it here. As the background thread cannot report it itself, it is
 * the job of any of the interactive methods to report it.
 *
 * The field is volatile because it is written by the background thread and
 * read (and reset) by the threads calling the interactive methods.
 */
private volatile Exception exceptionInBackgroundThread;
/**
* The document IDs in ranking order. If ranking is not performed, then the
* document IDs are in the order they are returned by the index. The list is
* filled block by block by {@link #downloadDocIdScores(long)}.
*/
protected LongBigList documentIds;
/**
* The document scores. This list is aligned to {@link #documentIds}.
*/
protected DoubleBigList documentScores;
/**
* Creates a new remote query runner instance which executes a search on a
* Mímir server and makes the results available locally.
*
* @param remoteUrl the index URL for the index being searched. This can be
* obtained from the admin interface of the remote Mímir server.
*
* @param queryString the Mímir query to be executed, represented as a string.
*
* @param threadSource a source of threads (such as a thread pool) used for
* background processes. If <code>null</code> is given then new threads are
* started as required.
*
* @param webUtils an instance of {@link WebUtils}. If the remote server
* requires authentication, the correct user name and password should be set
* on the WebUtils instance before being used for the creation of remote query
* runners. WebUtils instances can be reused for multiple query runners.
*
* @throws IOException
*/
public RemoteQueryRunner(String remoteUrl, String queryString,
Executor threadSource, WebUtils webUtils) throws IOException {
this.remoteUrl = remoteUrl.endsWith("/") ? remoteUrl : (remoteUrl + "/");
this.webUtils = webUtils;
this.closed = false;
// submit the remote query
try {
init((String) webUtils.getObject(
getActionBaseUrl(ACTION_POST_QUERY_BIN),
"queryString",
URLEncoder.encode(queryString, "UTF-8")), threadSource);
} catch(ClassNotFoundException e) {
//we were expecting a String but got some object of unknown class
throw (IOException) new IOException(
"Was expecting a String query ID value, but got " +
"an unknown object type!").initCause(e);
}
}
/**
* Creates a new remote query runner instance which executes a search on a
* Mímir server and makes the results available locally.
*
* @param remoteUrl the index URL for the index being searched. This can be
* obtained from the admin interface of the remote Mímir server.
*
* @param query the query to be executed. This constructor variant takes a
* {@link QueryNode} value; for queries expressed as strings, use the other
* constructor.
*
* @param threadSource a source of threads (such as a thread pool) used for
* background processes. If <code>null</code> is given then new threads are
* started as required.
*
* @param webUtils an instance of {@link WebUtils}. If the remote server
* requires authentication, the correct user name and password should be set
* on the WebUtils instance before being used for the creation of remote query
* runners. WebUtils instances can be reused for multiple query runners.
*
* @throws IOException
*/
public RemoteQueryRunner(String remoteUrl, QueryNode query,
Executor threadSource, WebUtils webUtils) throws IOException {
this.remoteUrl = remoteUrl.endsWith("/") ? remoteUrl : (remoteUrl + "/");
this.webUtils = webUtils;
this.closed = false;
// submit the remote query
try {
init((String) webUtils.rpcCall(
getActionBaseUrl(ACTION_POST_QUERY_BIN),
query), threadSource);
} catch(ClassNotFoundException e) {
//we were expecting a String but got some object of unknown class
throw (IOException) new IOException(
"Was expecting a String query ID value, but got " +
"an unknown object type!").initCause(e);
}
}
/**
 * Completes the set-up of this query runner once the remote query ID is
 * known: creates the (empty) local caches, resets the document counts and
 * starts the background {@link DocumentDataUpdater}.
 *
 * @param queryId the ID of the query, as supplied by the remote endpoint.
 * @param threadSource the executor used to run the background updater; if
 * <code>null</code>, a new thread is created and started for it.
 */
protected void init(String queryId, Executor threadSource) {
  this.queryId = queryId;
  // local caches, all empty to begin with
  documentIds = new LongBigArrayBigList();
  documentScores = new DoubleBigArrayBigList();
  documentCache = new Long2ObjectLinkedOpenHashMap<DocumentData>();
  // nothing is known about the result set yet
  documentsCount = -1;
  currentDocumentsCount = 0;
  // wrap the updater in a future, so its completion can be waited for
  FutureTask<Object> updaterTask =
      new FutureTask<Object>(new DocumentDataUpdater(), null);
  docDataUpdaterFuture = updaterTask;
  if(threadSource == null) {
    // no executor supplied -> use a dedicated thread
    Thread updaterThread = new Thread(updaterTask,
        DocumentDataUpdater.class.getCanonicalName());
    updaterThread.start();
  } else {
    threadSource.execute(updaterTask);
  }
}
/**
 * Builds the URL for an action of the remote search service, without any
 * query parameters. An example of a complete URL (parameters included) is:
 * http://localhost:8080/mimir/bf25398ff0874224/search/documentsCountBin?queryId=c4da799e-9ca2-46ae-8ded-30bdc37ad607
 *
 * As the interactive methods all go through this method, it is also the
 * place where a problem recorded by the background thread gets reported: if
 * an exception is pending, it is cleared and re-thrown wrapped in an
 * {@link IOException}.
 *
 * @param action the name of the action.
 * @return the index URL followed by the service name, a slash and the action
 * name.
 * @throws IOException if the background thread has recorded an exception.
 */
protected String getActionBaseUrl(String action) throws IOException{
  Exception pending = exceptionInBackgroundThread;
  if(pending != null) {
    // report the problem once only
    exceptionInBackgroundThread = null;
    throw new IOException(
        "Problem communicating with the remote index", pending);
  }
  return new StringBuilder(remoteUrl)
      .append(SERVICE_SEARCH)
      .append('/')
      .append(action)
      .toString();
}
/**
 * Obtains the {@link DocumentData} for the document at the specified rank.
 * The local cache is consulted first; on a miss the value is requested from
 * the remote endpoint and stored in the cache, dropping the last entry of the
 * cache when the cache grows beyond {@link #DOCUMENT_CACHE_SIZE}.
 *
 * @param rank the rank of the document.
 * @return the document data, from the cache or from the remote endpoint.
 * @throws IndexException if the remote call fails, or if the value received
 * is of an unknown class.
 * @throws IndexOutOfBoundsException
 * @throws IOException
 */
protected DocumentData getDocumentData(long rank) throws IndexException,
    IndexOutOfBoundsException, IOException {
  DocumentData cached = documentCache.getAndMoveToFirst(rank);
  if(cached != null) {
    // cache hit
    return cached;
  }
  // cache miss -> remote retrieve
  try {
    String url = getActionBaseUrl(ACTION_DOC_DATA_BIN);
    String encodedQueryId = URLEncoder.encode(queryId, "UTF-8");
    DocumentData fetched = (DocumentData) webUtils.getObject(url,
        "queryId", encodedQueryId,
        "documentRank", Long.toString(rank));
    documentCache.putAndMoveToFirst(rank, fetched);
    // keep the cache within its size limit
    if(documentCache.size() > DOCUMENT_CACHE_SIZE) {
      documentCache.removeLast();
    }
    return fetched;
  } catch(IOException e) {
    throw new IndexException(e);
  } catch(ClassNotFoundException e) {
    throw new IndexException("Was expecting a DocumentData value, " +
        "but got an unknown object type!", e);
  }
}
/**
 * Returns the total number of result documents, as last stored by the
 * background updater; the value is negative while the total is not known.
 *
 * @see gate.mimir.search.QueryRunner#getDocumentsCount()
 */
@Override
public long getDocumentsCount() {
  return this.documentsCount;
}
/**
 * Waits for the background updater task to finish, then returns the value of
 * {@link #getDocumentsCount()}. The returned value can still be negative if
 * the updater ended without finding out the total (for example because this
 * query runner was closed).
 *
 * @return the documents count, once the background task has completed.
 * @throws IllegalStateException if waiting for the background task fails or
 * is interrupted; in the latter case the interrupt status of the calling
 * thread is restored.
 * @see gate.mimir.search.QueryRunner#getDocumentsCountSync()
 */
@Override
public long getDocumentsCountSync() {
  try {
    docDataUpdaterFuture.get();
  } catch(InterruptedException e) {
    // do not swallow the interruption: callers further up may rely on it
    Thread.currentThread().interrupt();
    logger.error("Exception while getting all document IDs", e);
    throw new IllegalStateException(
        "Exception while getting all document IDs", e);
  } catch(Exception e) {
    logger.error("Exception while getting all document IDs", e);
    throw new IllegalStateException(
        "Exception while getting all document IDs", e);
  }
  return getDocumentsCount();
}
/**
 * Returns the number of documents known so far: the total count once that is
 * available (non-negative), or the current count otherwise.
 *
 * @see gate.mimir.search.QueryRunner#getDocumentsCurrentCount()
 */
@Override
public long getDocumentsCurrentCount() {
  if(documentsCount >= 0) {
    // the total is known
    return documentsCount;
  }
  return currentDocumentsCount;
}
/**
 * Returns the ID of the document at the given rank, downloading further
 * blocks of document IDs and scores from the remote endpoint if the rank is
 * beyond what is available locally.
 *
 * @param rank the rank of the document.
 * @return the document ID.
 * @throws IndexOutOfBoundsException if the rank is still not available after
 * the download.
 * @throws IOException if the download fails.
 * @see gate.mimir.search.QueryRunner#getDocumentID(long)
 */
@Override
public long getDocumentID(long rank) throws IndexOutOfBoundsException,
    IOException {
  // size64() is the long-valued size of a big list; the int-valued size()
  // cannot represent sizes beyond Integer.MAX_VALUE
  if(rank >= documentIds.size64()) {
    // we need to get more document IDs&scores
    downloadDocIdScores(rank);
  }
  return documentIds.getLong(rank);
}
/**
 * Returns the score of the document at the given rank, downloading further
 * blocks of document IDs and scores from the remote endpoint if the rank is
 * beyond what is available locally.
 *
 * @see gate.mimir.search.QueryRunner#getDocumentScore(long)
 */
@Override
public double getDocumentScore(long rank) throws IndexOutOfBoundsException,
    IOException {
  long availableScores = documentScores.size64();
  if(availableScores <= rank) {
    // not downloaded yet -> fetch more IDs and scores
    downloadDocIdScores(rank);
  }
  return documentScores.get(rank);
}
/**
 * Obtains from the remote endpoint the hits for the document at the given
 * rank. The result is not cached locally.
 *
 * @param rank the rank of the document.
 * @return the hits, as received from the remote endpoint.
 * @throws IOException if the remote call fails.
 * @throws RuntimeException if the value received is of an unknown class.
 * @see gate.mimir.search.QueryRunner#getDocumentHits(long)
 */
@SuppressWarnings("unchecked")
@Override
public List<Binding> getDocumentHits(long rank)
    throws IndexOutOfBoundsException, IOException {
  try {
    // the query ID is URL-encoded, like in the other binary calls of this
    // class
    return (List<Binding>) webUtils.getObject(
        getActionBaseUrl(ACTION_DOC_HITS_BIN),
        "queryId", URLEncoder.encode(queryId, "UTF-8"),
        "documentRank", Long.toString(rank));
  } catch(ClassNotFoundException e) {
    throw new RuntimeException("Got wrong value type from remote endpoint",
        e);
  }
}
/**
 * Returns a segment of the text of the document at the given rank, taken
 * from its {@link DocumentData} (see {@link #getDocumentData(long)}).
 *
 * @see gate.mimir.search.QueryRunner#getDocumentText(long, int, int)
 */
@Override
public String[][] getDocumentText(long rank, int termPosition, int length)
    throws IndexException, IndexOutOfBoundsException, IOException {
  DocumentData docData = getDocumentData(rank);
  return docData.getText(termPosition, length);
}
/**
 * Returns the URI of the document at the given rank, taken from its
 * {@link DocumentData} (see {@link #getDocumentData(long)}).
 *
 * @see gate.mimir.search.QueryRunner#getDocumentURI(long)
 */
@Override
public String getDocumentURI(long rank) throws IndexException,
    IndexOutOfBoundsException, IOException {
  DocumentData docData = getDocumentData(rank);
  return docData.getDocumentURI();
}
/**
 * Returns the title of the document at the given rank, taken from its
 * {@link DocumentData} (see {@link #getDocumentData(long)}).
 *
 * @see gate.mimir.search.QueryRunner#getDocumentTitle(long)
 */
@Override
public String getDocumentTitle(long rank) throws IndexException,
    IndexOutOfBoundsException, IOException {
  DocumentData docData = getDocumentData(rank);
  return docData.getDocumentTitle();
}
/**
 * Returns the value of a metadata field of the document at the given rank,
 * taken from its {@link DocumentData} (see {@link #getDocumentData(long)}).
 *
 * @see gate.mimir.search.QueryRunner#getDocumentMetadataField(long, java.lang.String)
 */
@Override
public Serializable getDocumentMetadataField(long rank, String fieldName)
    throws IndexException, IndexOutOfBoundsException, IOException {
  DocumentData docData = getDocumentData(rank);
  return docData.getMetadataField(fieldName);
}
/**
 * Collects the values of several metadata fields of the document at the
 * given rank, by calling {@link #getDocumentMetadataField(long, String)} for
 * each field name. Fields for which the value is <code>null</code> are left
 * out of the result.
 *
 * @see gate.mimir.search.QueryRunner#getDocumentMetadataFields(long, java.util.Set)
 */
@Override
public Map<String, Serializable> getDocumentMetadataFields(long rank,
    Set<String> fieldNames) throws IndexException, IndexOutOfBoundsException,
    IOException {
  Map<String, Serializable> valuesByField =
      new HashMap<String, Serializable>();
  for(String aFieldName : fieldNames) {
    Serializable fieldValue = getDocumentMetadataField(rank, aFieldName);
    if(fieldValue == null) {
      // no value for this field -> nothing to store
      continue;
    }
    valuesByField.put(aFieldName, fieldValue);
  }
  return valuesByField;
}
/**
 * Asks the remote endpoint to render the document at the given rank, and
 * writes the text received to the supplied {@link Appendable}.
 *
 * @param rank the rank of the document.
 * @param out the destination for the rendered document.
 * @throws IOException if the remote call fails.
 * @see gate.mimir.search.QueryRunner#renderDocument(long, java.lang.Appendable)
 */
@Override
public void renderDocument(long rank, Appendable out) throws IOException,
    IndexException {
  // the query ID is URL-encoded, like in the binary calls of this class
  webUtils.getText(out, getActionBaseUrl(ACTION_RENDER_DOCUMENT),
      "queryId", URLEncoder.encode(queryId, "UTF-8"),
      "rank", Long.toString(rank));
}
/**
 * Closes this query runner: marks it as closed (which makes the background
 * updater stop), asks the remote endpoint to close the remote query, and
 * empties the local document cache.
 *
 * The local clean-up is performed even when the remote call fails; the
 * failure is still reported to the caller.
 *
 * @throws IOException if the remote call fails, or if the background thread
 * had recorded a problem that has not yet been reported.
 * @see gate.mimir.search.QueryRunner#close()
 */
@Override
public void close() throws IOException {
  // set the flag first, so the background updater stops polling regardless
  // of what happens with the remote call
  closed = true;
  try {
    // the query ID is URL-encoded, like in the binary calls of this class
    webUtils.getVoid(getActionBaseUrl(ACTION_CLOSE),
        "queryId", URLEncoder.encode(queryId, "UTF-8"));
  } finally {
    documentCache.clear();
  }
}
/**
 * Gets from the remote end point a range of document IDs and document scores,
 * and appends them to the local lists. The range starts at the first rank not
 * yet available locally and its requested size is large enough to include the
 * document at the given rank, but never smaller than the document block size.
 *
 * Both arrays are downloaded before either list is modified, so a failed
 * download leaves the two lists aligned.
 *
 * NOTE(review): this method is called both by the background updater and by
 * the interactive methods, and it is not synchronized - confirm whether
 * callers can actually overlap.
 *
 * @param rank a rank which should be covered by the downloaded range.
 * @throws IOException if the communication with the remote endpoint fails, or
 * if no values are received.
 * @throws IllegalStateException if the local lists of IDs and scores have
 * different sizes.
 */
protected void downloadDocIdScores(long rank) throws IOException {
  long firstRank = documentIds.size64();
  if(firstRank != documentScores.size64()) {
    throw new IllegalStateException("Document IDs and scores out of sync.");
  }
  // ranks firstRank..rank inclusive are needed, i.e. (rank - firstRank + 1)
  // values; requesting only (rank - firstRank) would stop one short of rank
  long size = rank - firstRank + 1;
  if(size < docBlockSize) size = docBlockSize;
  long[] newDocIds;
  double[] newDocScores;
  try {
    String encodedQueryId = URLEncoder.encode(queryId, "UTF-8");
    String firstRankParam = Long.toString(firstRank);
    String sizeParam = Long.toString(size);
    newDocIds = (long[]) webUtils.getObject(
        getActionBaseUrl(ACTION_DOC_IDS_BIN),
        "queryId", encodedQueryId,
        "firstRank", firstRankParam,
        "size", sizeParam);
    newDocScores = (double[]) webUtils.getObject(
        getActionBaseUrl(ACTION_DOC_SCORES_BIN),
        "queryId", encodedQueryId,
        "firstRank", firstRankParam,
        "size", sizeParam);
  } catch(ClassNotFoundException e) {
    // this should really not happen (the values are primitive arrays)
    throw new RuntimeException("Error communicating to remote endpoint", e);
  }
  if(newDocIds == null || newDocScores == null) {
    throw new IOException(
        "Remote endpoint returned no document IDs or no document scores");
  }
  // only append as many values as are present in both arrays, to keep the
  // two lists aligned
  long commonLength = Math.min(newDocIds.length, newDocScores.length);
  if(newDocIds.length != newDocScores.length) {
    logger.warn("Received " + newDocIds.length + " document IDs but " +
        newDocScores.length + " document scores; using the first " +
        commonLength + " of each");
  }
  if(commonLength > 0) {
    documentIds.addElements(firstRank, new long[][]{newDocIds},
        0, commonLength);
    documentScores.addElements(firstRank, new double[][]{newDocScores},
        0, commonLength);
  }
}
/**
 * Gets the ID of the remote query that this query runner is a proxy for.
 *
 * @return the query ID, as stored when this query runner was initialised.
 */
public String getQueryId() {
  return this.queryId;
}
}