/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.resourceindex;
import it.unimi.dsi.lang.MutableString;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.archive.wayback.ResourceIndex;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ConfigurationException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.util.Timestamp;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
 * {@link ResourceIndex} implementation that answers capture lookups by
 * sending an HTTP query to a Nutch OpenSearch endpoint and converting the
 * returned XML document into {@link CaptureSearchResults}.
 *
 * <p>Only replay and capture-query requests are supported; any other request
 * type is rejected with a {@link BadQueryException}.
 *
 * <p>{@link #init()} must be called after {@link #setSearchUrlBase(String)}
 * and before the first {@link #query(WaybackRequest)}.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class NutchResourceIndex implements ResourceIndex {
    private static final Logger LOGGER =
        Logger.getLogger(NutchResourceIndex.class.getName());

    /** Default upper bound on the number of hits a single request may ask for. */
    private final static int MAX_RECORDS = 1000;
    private int maxRecords = MAX_RECORDS;

    /** XML namespace of the nutch-specific elements inside each result item. */
    private static final String NUTCH_NS =
        "http://www.nutch.org/opensearchrss/1.0/";

    private String searchUrlBase;
    private DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    private DocumentBuilder builder;

    // Element names (in the nutch namespace) read from each result item.
    // Names that were considered at some point but are not currently read:
    // arcname, arcoffset, arcdate, primaryType, subType, site.
    private static final String NUTCH_FILENAME = "filename";
    private static final String NUTCH_FILEOFFSET = "fileoffset";
    private static final String NUTCH_ARCDATE = "date";
    private static final String NUTCH_DIGEST = "digest";
    private static final String NUTCH_MIME_TYPE = "type";

    // Element names looked up by their plain (qualified) tag name.
    private static final String NUTCH_CAPTURE_URL = "link";
    private static final String NUTCH_SEARCH_RESULT_TAG = "item";
    private static final String NUTCH_SEARCH_RESULTS_TAG = "channel";
    private static final String NUTCH_FIRST_RESULT = "opensearch:startIndex";
    private static final String NUTCH_NUM_RESULTS = "opensearch:totalResults";
    private static final String NUTCH_NUM_RETURNED = "opensearch:itemsPerPage";

    // Values used for fields that are not read from the response.
    private static final String NUTCH_DEFAULT_HTTP_CODE = "200";
    private static final String NUTCH_DEFAULT_REDIRECT_URL = "-";

    /**
     * Prepares the namespace-aware XML parser used to read search responses.
     *
     * @throws ConfigurationException if no XML parser can be created
     */
    public void init() throws ConfigurationException {
        LOGGER.info("initializing NutchResourceIndex...");
        LOGGER.info("Using base search url " + this.searchUrlBase);
        this.factory.setNamespaceAware(true);
        try {
            this.builder = this.factory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            // Log through the configured logger (with the stack trace)
            // instead of dumping to stderr.
            LOGGER.log(Level.SEVERE,
                    "Unable to create XML DocumentBuilder", e);
            throw new ConfigurationException(e.getMessage());
        }
    }

    /**
     * Runs the request against the remote search endpoint and converts the
     * response into capture results.
     *
     * @param wbRequest the request to satisfy
     * @return the captures found, plus paging/date filter metadata
     * @throws ResourceIndexNotAvailableException if the endpoint cannot be
     *         reached, its response cannot be parsed, or a result item
     *         carries a malformed file offset
     * @throws ResourceNotInArchiveException if the response contains no
     *         (or more than one) channel, or no result items
     * @throws BadQueryException if the request parameters are invalid or the
     *         request is neither a replay nor a capture-query request
     * @throws AccessControlException declared by the interface; not thrown
     *         by the code in this method
     * @see org.archive.wayback.ResourceIndex#query(org.archive.wayback.core.WaybackRequest)
     */
    public SearchResults query(WaybackRequest wbRequest)
            throws ResourceIndexNotAvailableException,
            ResourceNotInArchiveException, BadQueryException,
            AccessControlException {

        // Validates the request parameters and builds the search URL.
        String requestUrl = getRequestUrl(wbRequest);

        // Reject unsupported request types before paying for the HTTP
        // round-trip and XML parse.
        // TODO: this is wrong, but needs exploration into what NutchWax
        // can actually do.
        if (!wbRequest.isReplayRequest()
                && !wbRequest.isCaptureQueryRequest()) {
            throw new BadQueryException("Unable to perform path " +
                    "prefix requests with this index type");
        }

        Document document = fetchDocument(requestUrl);

        NodeList channel = getSearchChannel(document);
        NodeList nodes = getSearchItems(document);
        if (channel == null || channel.getLength() != 1 || nodes == null) {
            // TODO: better error for user:
            throw new ResourceNotInArchiveException("No results for " +
                    requestUrl);
        }

        CaptureSearchResults results = new CaptureSearchResults();
        for (int i = 0; i < nodes.getLength(); i++) {
            Element item = (Element) nodes.item(i);
            for (CaptureSearchResult result : itemToSearchResults(item)) {
                results.addSearchResult(result);
            }
        }

        Element channelElement = (Element) channel.item(0);
        results.putFilter(SearchResults.RESULTS_FIRST_RETURNED,
                getNodeContent(channelElement, NUTCH_FIRST_RESULT));
        results.putFilter(SearchResults.RESULTS_NUM_RESULTS,
                getNodeContent(channelElement, NUTCH_NUM_RESULTS));
        results.putFilter(SearchResults.RESULTS_NUM_RETURNED,
                getNodeContent(channelElement, NUTCH_NUM_RETURNED));
        results.putFilter(SearchResults.RESULTS_REQUESTED,
                String.valueOf(wbRequest.getResultsPerPage()));
        results.putFilter(WaybackRequest.REQUEST_START_DATE,
                Timestamp.earliestTimestamp().getDateStr());
        results.putFilter(WaybackRequest.REQUEST_END_DATE,
                Timestamp.latestTimestamp().getDateStr());
        return results;
    }

    /**
     * Fetches and parses the search response, mapping transport and parse
     * failures onto {@link ResourceIndexNotAvailableException}.
     */
    private Document fetchDocument(String requestUrl)
            throws ResourceIndexNotAvailableException {
        try {
            LOGGER.info("Requesting OpenSearch: " + requestUrl);
            return getHttpDocument(requestUrl);
        } catch (IOException e) {
            // TODO: better error for user:
            LOGGER.log(Level.WARNING,
                    "I/O failure requesting " + requestUrl, e);
            throw new ResourceIndexNotAvailableException(e.getMessage());
        } catch (SAXException e) {
            LOGGER.log(Level.WARNING,
                    "Unparseable response from " + requestUrl, e);
            throw new ResourceIndexNotAvailableException("Unexpected SAX: " +
                    e.getMessage());
        }
    }

    /**
     * Converts one result item into capture results, one per nutch
     * {@code date} element found under the item. All results produced from
     * the same item share file, offset, digest, mime type and URL.
     *
     * @param e the result item element
     * @return the captures for this item; empty (never null) when the item
     *         has no date elements
     * @throws ResourceIndexNotAvailableException if the item's file offset
     *         is present but is not a valid long
     */
    private List<CaptureSearchResult> itemToSearchResults(Element e)
            throws ResourceIndexNotAvailableException {

        String fileName = getNodeNutchContent(e, NUTCH_FILENAME);
        String digest = getNodeNutchContent(e, NUTCH_DIGEST);
        String mimeType = getNodeNutchContent(e, NUTCH_MIME_TYPE);
        String originalUrl = getNodeContent(e, NUTCH_CAPTURE_URL);
        // The original URL doubles as the URL key.
        String urlKey = originalUrl;

        // A missing offset is treated as 0.
        long offset = 0;
        String offsetStr = getNodeNutchContent(e, NUTCH_FILEOFFSET);
        if (offsetStr != null) {
            try {
                offset = Long.parseLong(offsetStr);
            } catch (NumberFormatException nfe) {
                // Surface corrupt index data as the declared checked
                // exception rather than leaking an unchecked one.
                LOGGER.log(Level.WARNING, "Malformed " + NUTCH_FILEOFFSET
                        + " in search result: " + offsetStr, nfe);
                throw new ResourceIndexNotAvailableException(
                        "Invalid file offset in index response: "
                        + offsetStr);
            }
        }

        NodeList dates = e.getElementsByTagNameNS(NUTCH_NS, NUTCH_ARCDATE);
        int numDates = dates.getLength();
        List<CaptureSearchResult> results =
            new ArrayList<CaptureSearchResult>(numDates);
        for (int i = 0; i < numDates; i++) {
            CaptureSearchResult result = new CaptureSearchResult();
            result.setFile(fileName);
            result.setCaptureTimestamp(getNodeTextValue(dates.item(i)));
            result.setHttpCode(NUTCH_DEFAULT_HTTP_CODE);
            result.setDigest(digest);
            result.setMimeType(mimeType);
            result.setOffset(offset);
            result.setRedirectUrl(NUTCH_DEFAULT_REDIRECT_URL);
            result.setOriginalUrl(originalUrl);
            result.setUrlKey(urlKey);
            results.add(result);
        }
        return results;
    }

    /**
     * Finds the {@code channel} elements of a search response.
     *
     * @param d the parsed response, may be null
     * @return the matching elements, or null if {@code d} is null or there
     *         are none
     */
    protected NodeList getSearchChannel(Document d) {
        if (d == null) {
            return null;
        }
        NodeList nodes = d.getElementsByTagName(NUTCH_SEARCH_RESULTS_TAG);
        return (nodes.getLength() <= 0) ? null : nodes;
    }

    /**
     * Finds the {@code item} elements (individual hits) of a search response.
     *
     * @param d the parsed response, may be null
     * @return the matching elements, or null if {@code d} is null or there
     *         are none
     */
    protected NodeList getSearchItems(Document d) {
        if (d == null) {
            return null;
        }
        NodeList nodes = d.getElementsByTagName(NUTCH_SEARCH_RESULT_TAG);
        return (nodes.getLength() <= 0) ? null : nodes;
    }

    /**
     * Builds the search URL for a request. Missing start/end timestamps
     * default to the earliest/latest timestamps known to {@link Timestamp}.
     *
     * @param wbRequest the request to translate
     * @return the complete URL to fetch
     * @throws BadQueryException if the results-per-page value is below 1 or
     *         above {@link #getMaxRecords()}, if the request URL is null or
     *         empty, or if UTF-8 URL encoding is unavailable
     */
    protected String getRequestUrl(WaybackRequest wbRequest)
            throws BadQueryException {

        String urlStr = wbRequest.getRequestUrl();

        String endDateStr = wbRequest.getEndTimestamp();
        if (endDateStr == null || endDateStr.length() == 0) {
            endDateStr = Timestamp.latestTimestamp().getDateStr();
        }
        String startDateStr = wbRequest.getStartTimestamp();
        if (startDateStr == null || startDateStr.length() == 0) {
            startDateStr = Timestamp.earliestTimestamp().getDateStr();
        }

        int hitsPerPage = wbRequest.getResultsPerPage();
        if (hitsPerPage < 1) {
            throw new BadQueryException("Hits per page must be positive");
        }
        if (hitsPerPage > maxRecords) {
            throw new BadQueryException("Hits per page must be less than " +
                    maxRecords);
        }
        // Page numbers are 1-based; the search start offset is 0-based.
        int start = (wbRequest.getPageNum() - 1) * hitsPerPage;

        if (urlStr == null || urlStr.length() <= 0) {
            throw new BadQueryException("Url is empty.");
        }

        // Construct the search url.
        MutableString ms = new MutableString(this.searchUrlBase)
            .append("?query=");
        // Add 'date:START-END+' to the query string ("%3A" is an encoded ':').
        ms.append("date%3A").append(startDateStr).append('-').append(endDateStr);
        ms.append('+');
        // Add 'url:URL' or 'exacturl:URL'.
        if (wbRequest.isUrlQueryRequest()) {
            ms.append("url%3A");
        } else {
            ms.append("exacturl%3A");
        }
        try {
            // The URL is sent as a quoted phrase.
            ms.append(java.net.URLEncoder.encode("\"" + urlStr + "\"", "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new BadQueryException(e.toString());
        }
        ms.append("&hitsPerPage=").append(hitsPerPage);
        ms.append("&start=").append(start);
        ms.append("&dedupField=site");
        // As we are always searching against a url, a higher
        // hitsPerDup/hitsPerSite just returns more versions.
        ms.append("&hitsPerDup=").append(hitsPerPage);
        ms.append("&hitsPerSite=").append(hitsPerPage);
        return ms.toString();
    }

    /**
     * Extracts the text of the first descendant of {@code e} with the given
     * local name in the nutch namespace.
     *
     * @param e the element to search under
     * @param key local name of the nutch element
     * @return the text, or null if the element is absent or has no text
     */
    protected String getNodeNutchContent(Element e, String key) {
        return firstNodeText(e.getElementsByTagNameNS(NUTCH_NS, key));
    }

    /**
     * Extracts the text of the first descendant of {@code e} with the given
     * tag name.
     *
     * @param e the element to search under
     * @param key tag name to look for
     * @return the text, or null if the element is absent or has no text
     */
    protected String getNodeContent(Element e, String key) {
        return firstNodeText(e.getElementsByTagName(key));
    }

    /**
     * Returns the text value of the first node in the list, or null when the
     * list is null or empty or the text is empty.
     */
    private String firstNodeText(NodeList nodes) {
        if (nodes == null || nodes.getLength() == 0) {
            return null;
        }
        String text = getNodeTextValue(nodes.item(0));
        return (text == null || text.length() == 0) ? null : text;
    }

    /**
     * Returns the value of the node's first child when that child is a text
     * node, otherwise the empty string.
     */
    private String getNodeTextValue(Node n) {
        if (!n.hasChildNodes()) {
            return "";
        }
        Node first = n.getFirstChild();
        return first.getNodeName().equals("#text") ? first.getNodeValue() : "";
    }

    /**
     * Fetches the given URL and parses the response into an XML DOM.
     * Synchronized so that the shared {@link DocumentBuilder} is only used
     * by one thread at a time.
     *
     * @param url the URL to fetch and parse
     * @return the parsed document
     * @throws IOException if the URL cannot be read, or if {@link #init()}
     *         has not been called yet
     * @throws SAXException if the response is not well-formed XML
     */
    protected synchronized Document getHttpDocument(String url)
            throws IOException, SAXException {
        if (this.builder == null) {
            // Fail with a diagnosable error instead of a bare
            // NullPointerException when init() was never invoked.
            throw new IOException(
                    "NutchResourceIndex not initialized: init() was not called");
        }
        return this.builder.parse(url);
    }

    /**
     * @return the searchUrlBase
     */
    public String getSearchUrlBase() {
        return searchUrlBase;
    }

    /**
     * @param searchUrlBase the searchUrlBase to set
     */
    public void setSearchUrlBase(String searchUrlBase) {
        this.searchUrlBase = searchUrlBase;
    }

    /**
     * @return the maxRecords
     */
    public int getMaxRecords() {
        return maxRecords;
    }

    /**
     * @param maxRecords the maxRecords to set
     */
    public void setMaxRecords(int maxRecords) {
        this.maxRecords = maxRecords;
    }

    /**
     * No-op: this index performs no cleanup on shutdown.
     *
     * @throws IOException never thrown by this implementation
     */
    public void shutdown() throws IOException {
    }
}