/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.resourceindex;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.archive.wayback.ResourceIndex;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.UrlSearchResult;
import org.archive.wayback.core.UrlSearchResults;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ConfigurationException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.resourceindex.filterfactory.ClosestTrackingCaptureFilterGroup;
import org.archive.wayback.resourceindex.filters.ConditionalGetAnnotationFilter;
import org.archive.wayback.resourceindex.filters.SelfRedirectFilter;
import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter;
import org.archive.wayback.util.ObjectFilter;
import org.archive.wayback.util.ObjectFilterChain;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * ResourceIndex implementation that relays a query to a remote index
 * implementation over HTTP. The XMLQueryUI is assumed to be active on the
 * remote server, and the query is sent over as-is, formulated as an OpenSearch
 * query. Results are also returned as-is -- this class attempts to be as
 * transparent as possible.
 *
 * <p>The XML response is parsed into a DOM; an {@code <error>} element is
 * translated back into the matching Wayback exception, {@code <result>}
 * elements become search results, and the children of the single
 * {@code <request>} element are copied onto the results as filters.</p>
 *
 * <p>Each thread parses with its own {@link DocumentBuilder}, held in a
 * {@link ThreadLocal}.</p>
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class RemoteResourceIndex implements ResourceIndex {
    private static final Logger LOGGER = Logger.getLogger(RemoteResourceIndex
            .class.getName());

    private static final String WB_XML_REQUEST_TAGNAME = "request";
    private static final String WB_XML_RESULT_TAGNAME = "result";
    private static final String WB_XML_ERROR_TAGNAME = "error";
    private static final String WB_XML_ERROR_TITLE = "title";
    private static final String WB_XML_ERROR_MESSAGE = "message";

    private String searchUrlBase;
    // Both timeouts are handed straight to URLConnection (milliseconds).
    private int connectTimeout = 10000;
    private int readTimeout = 10000;
    // NOTE(review): the factory is not hardened against DTDs / external
    // entities (XXE). If the remote index is not fully trusted, consider
    // disabling DOCTYPE declarations here -- confirm the remote XMLQueryUI
    // never emits a DOCTYPE before doing so.
    private DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    private UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();

    /**
     * Per-thread DocumentBuilder. The initial value is {@code null} when the
     * factory is missing or cannot create a builder; callers must go through
     * {@link #getDocumentBuilder()} which turns that into an exception.
     */
    private final ThreadLocal<DocumentBuilder> tl =
            new ThreadLocal<DocumentBuilder>() {
        // synchronized: all threads share the single factory instance.
        @Override
        protected synchronized DocumentBuilder initialValue() {
            DocumentBuilder builder = null;
            try {
                if (factory != null) {
                    builder = factory.newDocumentBuilder();
                    // NOTE(review): init() explicitly configures the factory
                    // as NOT namespace aware, so this message is logged for
                    // every builder created -- confirm whether it is wanted.
                    if (!builder.isNamespaceAware()) {
                        LOGGER.severe("Builder is not namespace aware.");
                    }
                }
            } catch (ParserConfigurationException e) {
                // Reported (with stack trace) here; surfaced to callers by
                // getDocumentBuilder() as an IOException.
                LOGGER.log(Level.SEVERE,
                        "Unable to create XML DocumentBuilder", e);
            }
            return builder;
        }
    };

    /**
     * Returns the calling thread's DocumentBuilder.
     *
     * @return a non-null DocumentBuilder
     * @throws IOException if no builder could be created for this thread;
     *         the cached failure is cleared so a later call retries
     */
    private DocumentBuilder getDocumentBuilder() throws IOException {
        DocumentBuilder builder = tl.get();
        if (builder == null) {
            // Drop the cached null so the next request tries again.
            tl.remove();
            throw new IOException("Unable to create an XML DocumentBuilder");
        }
        return builder;
    }

    /**
     * Configures the XML parser factory (namespace awareness off) and logs
     * the configured base search URL.
     *
     * @throws ConfigurationException declared for callers; not thrown by the
     *         current implementation
     */
    public void init() throws ConfigurationException {
        LOGGER.info("initializing RemoteCDXIndex...");
        this.factory.setNamespaceAware(false);
        LOGGER.info("Using base search url " + this.searchUrlBase);
    }

    /*
     * (non-Javadoc)
     *
     * @see org.archive.wayback.ResourceIndex#query(org.archive.wayback.core.WaybackRequest)
     */
    public SearchResults query(WaybackRequest wbRequest)
            throws ResourceIndexNotAvailableException,
            ResourceNotInArchiveException, BadQueryException,
            AccessControlException {
        ClosestTrackingCaptureFilterGroup closestGroup =
                new ClosestTrackingCaptureFilterGroup(wbRequest, canonicalizer);
        SearchResults results = urlToSearchResults(getRequestUrl(wbRequest),
                getSearchResultFilters(wbRequest, closestGroup));
        closestGroup.annotateResults(results);
        return results;
    }

    /**
     * Fetches and parses the remote index XML, converts any embedded error
     * into an exception, and builds the search results.
     *
     * @param requestUrl full URL of the remote query
     * @param filter filter applied to capture results, or null for none
     * @return the parsed search results
     * @throws ResourceIndexNotAvailableException if the HTTP request or XML
     *         parse fails, or the remote side reports it (or an unknown error)
     * @throws ResourceNotInArchiveException if the remote side reports it, or
     *         no capture result passes the filter
     * @throws BadQueryException if the remote side reports it
     * @throws AccessControlException if the remote side reports it
     */
    protected SearchResults urlToSearchResults(String requestUrl,
            ObjectFilter<CaptureSearchResult> filter)
            throws ResourceIndexNotAvailableException,
            ResourceNotInArchiveException, BadQueryException,
            AccessControlException {
        Document document = null;
        try {
            // HTTP Request + parse
            LOGGER.info("Getting index XML from ("+requestUrl+")");
            document = getHttpDocument(requestUrl);
        } catch (IOException e) {
            // TODO: better error for user:
            LOGGER.log(Level.WARNING,
                    "I/O failure fetching index XML from " + requestUrl, e);
            throw new ResourceIndexNotAvailableException(e.getMessage());
        } catch (SAXException e) {
            LOGGER.log(Level.WARNING,
                    "Unparseable index XML from " + requestUrl, e);
            throw new ResourceIndexNotAvailableException("Unexpected SAX: "
                    + e.getMessage());
        }
        checkDocumentForExceptions(document);
        return documentToSearchResults(document, filter);
    }

    /**
     * Looks for an {@code <error>} element in the response and, if one is
     * present, throws the exception matching its {@code <title>} text, using
     * the {@code <message>} text as the exception message. A missing or
     * unrecognized title yields a ResourceIndexNotAvailableException.
     *
     * @param document parsed response from the remote index
     * @throws ResourceIndexNotAvailableException see above
     * @throws ResourceNotInArchiveException see above
     * @throws BadQueryException see above
     * @throws AccessControlException see above
     */
    protected void checkDocumentForExceptions(Document document)
            throws ResourceIndexNotAvailableException,
            ResourceNotInArchiveException, BadQueryException,
            AccessControlException {
        NodeList errors = document.getElementsByTagName(WB_XML_ERROR_TAGNAME);
        if (errors.getLength() == 0) {
            return;
        }
        // Only the first reported error is considered.
        Element error = (Element) errors.item(0);
        String errTitle = getNodeContent(error, WB_XML_ERROR_TITLE);
        String errMessage = getNodeContent(error, WB_XML_ERROR_MESSAGE);

        // TODO: Localization Problems.. Think of something clever.
        if (errTitle == null) {
            throw new ResourceIndexNotAvailableException("Unknown error!");
        } else if (errTitle.equals("Resource Not In Archive")) {
            throw new ResourceNotInArchiveException(errMessage);
        } else if (errTitle.equals("Bad Query Exception")) {
            throw new BadQueryException(errMessage);
        } else if (errTitle.equals("Resource Index Not Available Exception")) {
            throw new ResourceIndexNotAvailableException(errMessage);
        } else if (errTitle.equals("Access Control Exception")) {
            throw new AccessControlException(errMessage);
        } else {
            throw new ResourceIndexNotAvailableException("Unknown error!");
        }
    }

    /**
     * Reads the results type from the document; defaults to the capture type
     * unless exactly one results-type element is present.
     */
    private String getResultsType(Document document) {
        NodeList list = document.getElementsByTagName(
                SearchResults.RESULTS_TYPE);
        if (list.getLength() == 1) {
            return getNodeTextValue(list.item(0));
        }
        return SearchResults.RESULTS_TYPE_CAPTURE;
    }

    /**
     * Returns the value of the node's first child when that child is a text
     * node, otherwise the empty string.
     */
    private String getNodeTextValue(Node n) {
        if (n.hasChildNodes()) {
            Node first = n.getFirstChild();
            if (first.getNodeName().equals("#text")) {
                return first.getNodeValue();
            }
        }
        return "";
    }

    /**
     * Builds the filter applied to capture results.
     *
     * @param wbRequest the incoming request
     * @param closestGroup group whose filter is appended for replay requests
     * @return for replay requests, a chain of self-redirect, WARC-revisit
     *         annotation, conditional-GET annotation and closest-tracking
     *         filters; null (no filtering) for all other requests
     */
    protected ObjectFilter<CaptureSearchResult> getSearchResultFilters(
            WaybackRequest wbRequest,
            ClosestTrackingCaptureFilterGroup closestGroup) {
        if (!wbRequest.isReplayRequest()) {
            // no filters for now
            return null;
        }
        ObjectFilterChain<CaptureSearchResult> filters =
                new ObjectFilterChain<CaptureSearchResult>();
        SelfRedirectFilter selfRedirectFilter = new SelfRedirectFilter();
        selfRedirectFilter.setCanonicalizer(canonicalizer);
        filters.addFilter(selfRedirectFilter);
        filters.addFilter(new WARCRevisitAnnotationFilter());
        filters.addFilter(new ConditionalGetAnnotationFilter());
        filters.addFilter(closestGroup.getFilter());
        return filters;
    }

    /**
     * Converts a parsed response into capture or URL search results
     * (depending on the document's results type) and copies the request
     * filters from the {@code <request>} element onto the results.
     *
     * @param document parsed response from the remote index
     * @param filter filter applied to capture results, or null for none
     * @return the populated results
     * @throws ResourceNotInArchiveException if the results are captures and
     *         none was included
     */
    protected SearchResults documentToSearchResults(Document document,
            ObjectFilter<CaptureSearchResult> filter)
            throws ResourceNotInArchiveException {
        SearchResults results;
        if (getResultsType(document).equals(
                SearchResults.RESULTS_TYPE_CAPTURE)) {
            results = documentToCaptureSearchResults(document, filter);
        } else {
            results = documentToUrlSearchResults(document);
        }

        // getRequestFilters() returns null when there is no single <request>
        // element; in that case there is simply nothing to copy.
        NodeList filters = getRequestFilters(document);
        int filterCount = (filters == null) ? 0 : filters.getLength();
        for (int i = 0; i < filterCount; i++) {
            Node filterNode = filters.item(i);
            String key = filterNode.getNodeName();
            // Skip whitespace/text nodes between the child elements.
            if (!key.equals("#text")) {
                results.putFilter(key, getNodeTextValue(filterNode));
            }
        }
        return results;
    }

    /**
     * Builds URL search results from every {@code <result>} element; returns
     * an empty result set when the document has none.
     */
    private UrlSearchResults documentToUrlSearchResults(
            Document document) {
        UrlSearchResults results = new UrlSearchResults();
        // null means "no <result> elements".
        NodeList xresults = getSearchResults(document);
        int total = (xresults == null) ? 0 : xresults.getLength();
        for (int i = 0; i < total; i++) {
            UrlSearchResult result =
                    searchElementToUrlSearchResult(xresults.item(i));
            results.addSearchResult(result, true);
        }
        return results;
    }

    /**
     * Builds capture search results from the {@code <result>} elements,
     * running each through the optional filter. Processing stops at the
     * first FILTER_ABORT ruling.
     *
     * @throws ResourceNotInArchiveException if no result was included,
     *         including when the document has no results at all
     */
    private CaptureSearchResults documentToCaptureSearchResults(
            Document document, ObjectFilter<CaptureSearchResult> filter)
            throws ResourceNotInArchiveException {
        CaptureSearchResults results = new CaptureSearchResults();
        // null means "no <result> elements"; treated as zero results so the
        // caller gets ResourceNotInArchiveException rather than an NPE.
        NodeList xresults = getSearchResults(document);
        int total = (xresults == null) ? 0 : xresults.getLength();
        int numAdded = 0;
        for (int i = 0; i < total; i++) {
            CaptureSearchResult result =
                    searchElementToCaptureSearchResult(xresults.item(i));
            int ruling = (filter == null)
                    ? ObjectFilter.FILTER_INCLUDE
                    : filter.filterObject(result);
            if (ruling == ObjectFilter.FILTER_ABORT) {
                break;
            }
            if (ruling == ObjectFilter.FILTER_INCLUDE) {
                numAdded++;
                results.addSearchResult(result, true);
            }
        }
        if (numAdded == 0) {
            throw new ResourceNotInArchiveException("No documents matching" +
                    " filter");
        }
        return results;
    }

    /** Creates a UrlSearchResult populated from the node's child elements. */
    private UrlSearchResult searchElementToUrlSearchResult(Node e) {
        UrlSearchResult result = new UrlSearchResult();
        addNodeDataToSearchResult(e, result);
        return result;
    }

    /** Creates a CaptureSearchResult populated from the node's child elements. */
    private CaptureSearchResult searchElementToCaptureSearchResult(Node e) {
        CaptureSearchResult result = new CaptureSearchResult();
        addNodeDataToSearchResult(e, result);
        return result;
    }

    /**
     * Copies each non-text child of the node onto the result as a custom
     * key/value pair (child node name to child text value).
     */
    private void addNodeDataToSearchResult(Node e, SearchResult result) {
        NodeList children = e.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String key = child.getNodeName();
            if (!key.equals("#text")) {
                result.putCustom(key, getNodeTextValue(child));
            }
        }
    }

    /**
     * Returns the child nodes of the document's {@code <request>} element.
     *
     * @param d parsed response, may be null
     * @return the child nodes, or null if {@code d} is null or the document
     *         does not contain exactly one {@code <request>} element
     */
    protected NodeList getRequestFilters(Document d) {
        if (d == null) {
            return null;
        }
        // Jump to the search item list.
        NodeList nodes = d.getElementsByTagName(WB_XML_REQUEST_TAGNAME);
        if (nodes.getLength() != 1) {
            // TODO: warning?
            return null;
        }
        return nodes.item(0).getChildNodes();
    }

    /**
     * Returns all {@code <result>} elements of the document.
     *
     * @param d parsed response, may be null
     * @return the result elements, or null if {@code d} is null or there are
     *         no such elements
     */
    protected NodeList getSearchResults(Document d) {
        if (d == null) {
            return null;
        }
        NodeList nodes = d.getElementsByTagName(WB_XML_RESULT_TAGNAME);
        return (nodes.getLength() <= 0) ? null : nodes;
    }

    /**
     * Builds the URL sent to the remote index: the base search URL, a
     * {@code ?}, and the query arguments of a copy of the request. Replay
     * requests are converted to capture queries on the copy first, so the
     * caller's request is left untouched.
     *
     * @param wbRequest the incoming request
     * @return the remote query URL
     * @throws BadQueryException declared for the request handling calls made
     *         here
     */
    protected String getRequestUrl(WaybackRequest wbRequest)
            throws BadQueryException {
        WaybackRequest tmp = wbRequest.clone();
        if (tmp.isReplayRequest()) {
            tmp.setCaptureQueryRequest();
        }
        return this.searchUrlBase + "?" + tmp.getQueryArguments();
    }

    /**
     * Extracts the text content of the first descendant element with the
     * given tag name.
     *
     * @param e element to search under
     * @param key tag name to look for
     * @return the text, or null if the tag is absent or its text is empty
     */
    protected String getNodeContent(Element e, String key) {
        NodeList nodes = e.getElementsByTagName(key);
        String result = null;
        if (nodes != null && nodes.getLength() > 0) {
            result = getNodeTextValue(nodes.item(0));
        }
        return (result == null || result.length() == 0) ? null : result;
    }

    /**
     * Performs an HTTP request and parses the response body into an XML DOM.
     * The configured connect and read timeouts are applied, and the response
     * stream is always closed.
     *
     * @param url URL to fetch; also used as the parser's system id
     * @return the parsed document
     * @throws IOException on connection/read failure, or if no XML parser is
     *         available
     * @throws SAXException if the response is not well-formed XML
     */
    protected Document getHttpDocument(String url)
            throws IOException, SAXException {
        DocumentBuilder builder = getDocumentBuilder();
        URL u = new URL(url);
        URLConnection conn = u.openConnection();
        conn.setConnectTimeout(connectTimeout);
        conn.setReadTimeout(readTimeout);
        InputStream in = conn.getInputStream();
        try {
            return builder.parse(in, url);
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                // Never mask a parse result/failure with a close failure.
                LOGGER.log(Level.FINE,
                        "Failed to close response stream for " + url, e);
            }
        }
    }

    /**
     * Parses a local file into an XML DOM.
     *
     * @param f file to parse
     * @return the parsed document
     * @throws IOException on read failure, or if no XML parser is available
     * @throws SAXException if the file is not well-formed XML
     */
    protected Document getFileDocument(File f)
            throws IOException, SAXException {
        return getDocumentBuilder().parse(f);
    }

    /**
     * @return the searchUrlBase
     */
    public String getSearchUrlBase() {
        return searchUrlBase;
    }

    /**
     * @param searchUrlBase the searchUrlBase to set
     */
    public void setSearchUrlBase(String searchUrlBase) {
        this.searchUrlBase = searchUrlBase;
    }

    /**
     * No-op; this index holds no resources that need releasing.
     *
     * @throws IOException declared for callers; not thrown here
     */
    public void shutdown() throws IOException {
        // No-op
    }

    /**
     * @return the canonicalizer used by the replay filters
     */
    public UrlCanonicalizer getCanonicalizer() {
        return canonicalizer;
    }

    /**
     * @param canonicalizer the canonicalizer to use for the replay filters
     */
    public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
        this.canonicalizer = canonicalizer;
    }

    /**
     * @return the HTTP connect timeout passed to the URLConnection
     */
    public int getConnectTimeout() {
        return connectTimeout;
    }

    /**
     * @param connectTimeout the HTTP connect timeout to pass to the
     *        URLConnection
     */
    public void setConnectTimeout(int connectTimeout) {
        this.connectTimeout = connectTimeout;
    }

    /**
     * @return the HTTP read timeout passed to the URLConnection
     */
    public int getReadTimeout() {
        return readTimeout;
    }

    /**
     * @param readTimeout the HTTP read timeout to pass to the URLConnection
     */
    public void setReadTimeout(int readTimeout) {
        this.readTimeout = readTimeout;
    }
}