/**
*
*/
package org.archive.hadoop.fs;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.archive.petabox.PetaboxClient;
import org.mortbay.util.ajax.JSON;
/**
* Searches for items in a given collection using a private custom web app that indexes
* individual collections for faster lookup.
*
* @author kenji
*
*/
public class CollectionIndexItemSearcher implements ItemSearcher {
    private static final Log LOG = LogFactory.getLog(CollectionIndexItemSearcher.class);

    /** File system this searcher was initialized with; supplies the {@link PetaboxClient}. */
    protected PetaboxFileSystem fs;
    /** URI of the file system; used as the parent of every returned item path. */
    protected URI fsUri;
    //String serviceUri = "http://crawl400.us.archive.org/crawling/wide/getitems.py/";
    /**
     * Prefix of the search service URL; the collection id is appended to it verbatim.
     * Overridable through the configuration key
     * {@code <fully-qualified class name>.serviceUri}.
     */
    String serviceUri = "http://archive.org/~kenji/getitems.php?c=";
    /** Number of retries allowed after the first attempt before giving up. */
    protected int maxRetries = 10;
    /** Pause between attempts, in milliseconds. */
    protected int retryDelay = 2000;

    /**
     * Remembers the file system and its URI, and picks up an optional
     * {@code serviceUri} override from {@code conf}.
     *
     * @param fs file system that provides the HTTP client
     * @param fsUri URI of {@code fs}
     * @param conf configuration to read the override from; may be {@code null}
     */
    @Override
    public void initialize(PetaboxFileSystem fs, URI fsUri, Configuration conf) {
        this.fs = fs;
        this.fsUri = fsUri;
        if (conf != null) {
            serviceUri = conf.get(CollectionIndexItemSearcher.class.getName() + ".serviceUri", serviceUri);
        }
    }

    /**
     * Builds the search URI by appending {@code itemid} to {@link #serviceUri}.
     *
     * @param itemid collection identifier (appended as is, without escaping)
     * @return URI to query
     * @throws URISyntaxException if the concatenated string is not a valid URI
     */
    protected URI buildSearchURI(String itemid) throws URISyntaxException {
        // new URI(...) rather than URI.create(...): the latter reports a malformed
        // string as an unchecked IllegalArgumentException, which would bypass the
        // URISyntaxException handling in searchItems().
        return new URI(serviceUri + itemid);
    }

    /**
     * Queries the search service for the items of collection {@code itemid} and
     * returns one directory-type {@link FileStatus} per item.
     * <p>
     * Connection failures, 502/503/504 responses, and 200 responses whose body
     * cannot be read or parsed are retried up to {@link #maxRetries} times with
     * {@link #retryDelay} milliseconds between attempts. Any other status code
     * fails immediately.
     *
     * @param itemid collection identifier
     * @return statuses of the items found; never {@code null}
     * @throws IOException if the URI cannot be built, retries are exhausted, the
     *         thread is interrupted while waiting, or the service answers with a
     *         non-retryable status
     */
    @Override
    public FileStatus[] searchItems(String itemid) throws IOException {
        final URI uri;
        try {
            uri = buildSearchURI(itemid);
        } catch (URISyntaxException ex) {
            throw new IOException("failed to build URI for itemid=" + itemid, ex);
        }
        LOG.debug("search uri=" + uri);

        PetaboxClient pbclient = fs.getPetaboxClient();
        List<FileStatus> result = null;
        int retries = 0;
        // result is assigned only by a fully successful parse, so every failure
        // path below falls back to the top of the loop with result still null.
        while (result == null) {
            if (retries > 0) {
                waitBeforeRetry(uri, retries);
            }
            HttpResponse resp;
            try {
                resp = pbclient.doGet(uri);
            } catch (IOException ex) {
                LOG.warn("connection to " + uri + " failed", ex);
                ++retries;
                continue;
            }
            StatusLine st = resp.getStatusLine();
            HttpEntity entity = resp.getEntity();
            switch (st.getStatusCode()) {
            case 200:
                // it appears search engine often fails to return JSON formatted output
                // despite status code 200. treat an unreadable body as retryable.
                if (entity == null) {
                    LOG.warn(uri + ": 200 response without a body");
                    ++retries;
                    break;
                }
                try {
                    result = readItems(entity, uri, itemid);
                } catch (IOException ex) {
                    LOG.warn(uri + ": error reading response", ex);
                    ++retries;
                    break;
                }
                if (retries > 0) {
                    LOG.info(uri + ": succeeded after " + retries + " retry(ies)");
                }
                break;
            case 502:
            case 503:
            case 504:
                closeQuietly(entity);
                ++retries;
                LOG.warn(uri + " failed " + st.getStatusCode() + " "
                        + st.getReasonPhrase() + ", retry " + retries);
                break;
            default:
                closeQuietly(entity);
                throw new IOException(st.getStatusCode() + " " + st.getReasonPhrase());
            }
        }
        LOG.info(String.format("searchItems(collection=%s): returning %d items", itemid, result.size()));
        return result.toArray(new FileStatus[result.size()]);
    }

    /**
     * Enforces the retry limit and sleeps for {@link #retryDelay} milliseconds.
     *
     * @param uri URI being fetched, for messages
     * @param retries number of failed attempts so far
     * @throws IOException if {@code retries} exceeds {@link #maxRetries}, or the
     *         thread is interrupted while sleeping
     */
    private void waitBeforeRetry(URI uri, int retries) throws IOException {
        if (retries > maxRetries) {
            throw new IOException(uri + ": retry exhausted, giving up.");
        }
        try {
            Thread.sleep(retryDelay);
        } catch (InterruptedException ex) {
            // keep the interrupt visible to callers instead of swallowing it
            Thread.currentThread().interrupt();
            throw new IOException(uri + ": interrupted while waiting to retry", ex);
        }
    }

    /**
     * Reads the response body line by line and converts each line to a
     * {@link FileStatus}. A line is either a JSON object with members {@code id}
     * and {@code m} (modification time in seconds), or a bare identifier
     * optionally followed by a space and further text. The content stream is
     * always closed before returning.
     *
     * @param entity response entity to read; must not be {@code null}
     * @param uri URI the response came from, for messages
     * @param itemid collection identifier, for messages
     * @return statuses for all valid lines
     * @throws IOException on read failure or if a JSON line cannot be parsed
     */
    private List<FileStatus> readItems(HttpEntity entity, URI uri, String itemid) throws IOException {
        BufferedReader lines = new BufferedReader(new InputStreamReader(entity.getContent(), "UTF-8"));
        try {
            List<FileStatus> items = new ArrayList<FileStatus>();
            String line;
            int ln = 0;
            while ((line = lines.readLine()) != null) {
                ln++;
                if (line.length() == 0) {
                    // blank line: nothing to parse (charAt(0) below would throw)
                    continue;
                }
                String iid;
                Long mtime;
                if (line.startsWith("{")) {
                    Map<String, Object> jo = parseJsonObject(line, ln);
                    Object id = jo.get("id");
                    Object m = jo.get("m");
                    // a non-string id / non-numeric m is handled like a missing one
                    iid = (id instanceof String) ? (String)id : null;
                    // m is in seconds. be sure to multiply it by 1000 for FileStatus.
                    mtime = (m instanceof Number) ? Long.valueOf(((Number)m).longValue()) : null;
                } else if (Character.isLetterOrDigit(line.charAt(0))) {
                    int p = line.indexOf(' ');
                    iid = (p < 0) ? line : line.substring(0, p);
                    mtime = 0L;
                } else {
                    LOG.warn(uri + ": invalid line (neither JSON nor identifier) at " + ln);
                    continue;
                }
                if (iid == null) {
                    LOG.warn(uri + ": id undefined or null at line " + ln);
                    continue;
                }
                if (mtime == null) {
                    LOG.warn(uri + ": m undefined or null at line " + ln);
                    mtime = 0L;
                }
                Path qf = new Path(fsUri.toString(), "/" + iid);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("collection:" + itemid + " qf=" + qf);
                }
                items.add(new FileStatus(0, true, 2, 4096, mtime * 1000, qf));
            }
            return items;
        } finally {
            lines.close();
        }
    }

    /**
     * Parses one line as a JSON object.
     *
     * @param line text to parse
     * @param ln 1-based line number, for the error message
     * @return parsed object as a map
     * @throws IOException if {@code JSON.parse} rejects the line
     */
    @SuppressWarnings("unchecked")
    private static Map<String, Object> parseJsonObject(String line, int ln) throws IOException {
        try {
            return (Map<String, Object>)JSON.parse(line);
        } catch (IllegalStateException ex) {
            // JSON.parse throws this for parse error.
            throw new IOException("JSON.parse failed at line " + ln, ex);
        }
    }

    /**
     * Closes the content stream of {@code entity}, if any, logging instead of
     * propagating failures so they cannot mask the error being handled.
     *
     * @param entity entity to discard; may be {@code null}
     */
    private static void closeQuietly(HttpEntity entity) {
        if (entity == null) {
            return;
        }
        try {
            entity.getContent().close();
        } catch (IOException ex) {
            LOG.debug("failed to close response content", ex);
        } catch (IllegalStateException ex) {
            LOG.debug("response content not available for closing", ex);
        }
    }

    // main method for quick test against production service.
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        URI fsUri = URI.create("petabox://archive.org/");
        PetaboxFileSystem fs = new PetaboxFileSystem();
        fs.initialize(fsUri, conf);
        CollectionIndexItemSearcher searcher = new CollectionIndexItemSearcher();
        searcher.initialize(fs, fsUri, conf);
        FileStatus[] items = searcher.searchItems("wide00005");
        for (FileStatus item : items) {
            System.out.println(item.getPath());
        }
    }
}