package org.archive.hadoop.fs; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.net.URI; import java.net.URISyntaxException; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Locale; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.StatusLine; import org.archive.petabox.PetaboxClient; import org.mortbay.util.ajax.JSON; /** * Searches items in given collection with IA's search engine. * search engine is publicly accessible, but can only find items indexed by search engine. * those items marked "NoIndex" will not be returned by this implementation. * @author kenji * */ public class SearchEngineItemSearcher implements ItemSearcher { private static Log LOG = LogFactory.getLog(SearchEngineItemSearcher.class); protected PetaboxFileSystem fs; protected URI fsUri; protected int maxRetries = 10; protected int retryDelay = 2000; // milliseconds protected int connectionTimeout = 60*1000; protected int socketTimeout = 0; // milliseconds, 0=infinite public final static int SEARCH_ROWS_PER_PAGE = 200; public SearchEngineItemSearcher() { } @Override public void initialize(PetaboxFileSystem fs, URI fsUri, Configuration conf) { this.fs = fs; this.fsUri = fsUri; String confbase = "fs." + fsUri.getScheme(); maxRetries = conf.getInt(confbase + ".max-retries", 10); } protected static long isodatetime2timestamp(String isodatetime) { if (isodatetime == null) return 0; DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ENGLISH); try { Date date = df.parse(isodatetime); return date.getTime(); } catch (ParseException ex) { return 0; } } protected URI buildSearchURI(String itemid, int start) throws URISyntaxException { StringBuilder params = new StringBuilder(); params.append("q=collection:").append(itemid); params.append("&fl[]=identifier&fl[]=publicdate"); params.append("&sort[]=publicdate+asc"); params.append("&indent=&start=").append(start); params.append("&rows=").append(SEARCH_ROWS_PER_PAGE); params.append("&output=json"); return new URI("http", fsUri.getAuthority(), "/advancedsearch.php", params.toString(), null); } /* (non-Javadoc) * @see org.archive.crawler.hadoop.ItemSearcher#searchItems(java.lang.String) */ @SuppressWarnings("unchecked") @Override public FileStatus[] searchItems(String itemid) throws IOException { List<FileStatus> result = new ArrayList<FileStatus>(); int start = 0; long numresults = Long.MAX_VALUE; while (start < numresults) { URI uri; try { uri = buildSearchURI(itemid, start); LOG.debug("search uri=" + uri); } catch (URISyntaxException ex) { throw new IOException("failed to build URI for itemid=" + itemid + ", start=" + start, ex); } PetaboxClient pbclient = fs.getPetaboxClient(); // HttpClient client = fs.getHttpClient(); // HttpGet get = fs.createHttpGet(uri); HttpEntity entity = null; Map<String, Object> jo = null; int retries = 0; do { if (retries > 0) { try { Thread.sleep(retryDelay); } catch (InterruptedException ex) { } } HttpResponse resp; try { //resp = client.execute(get); resp = pbclient.doGet(uri); } catch (IOException ex) { LOG.warn("connection to " + uri + " failed", ex); if (++retries > maxRetries) { throw new IOException(uri + ": retry exhausted trying to connect"); } continue; } StatusLine st = resp.getStatusLine(); entity = resp.getEntity(); switch (st.getStatusCode()) { case 200: if (retries > 0) { LOG.info(uri + ": succeeded after " + retries + " retry(ies)"); } // it appears search engine often fails to return JSON formatted output despite // status code 200. detect it here. try { Reader reader = new InputStreamReader(entity.getContent(), "UTF-8"); jo = (Map<String, Object>)JSON.parse(reader); reader.close(); } catch (IOException ex) { LOG.warn(uri + " error reading 200 response: " + ex.getMessage()); if (++retries > maxRetries) { throw new IOException(uri + ": retry exhausted"); } continue; } if (jo == null) { LOG.warn(uri + " returned 200, but JSON parser failed on entity"); if (++retries > maxRetries) { throw new IOException(uri + ": retry exhausted"); } continue; } break; case 502: case 503: case 504: if (entity != null) entity.getContent().close(); if (++retries > maxRetries) { throw new IOException(uri + ": retry exhausted on " + st.getStatusCode() + " " + st.getReasonPhrase()); } LOG.warn(uri + " failed " + st.getStatusCode() + " " + st.getReasonPhrase() + ", retry " + retries); entity = null; continue; default: entity.getContent().close(); throw new IOException(st.getStatusCode() + " " + st.getReasonPhrase()); } } while (jo == null); Map<String, Object> jresp = (Map<String, Object>)jo.get("response"); // is this a failure scenario that should be retried? if (jresp == null) break; Long numfound = (Long)jresp.get("numFound"); numresults = numfound != null ? numfound : 0; Object[] jdocs = (Object[])jresp.get("docs"); // TODO: log warning? if (jdocs == null || jdocs.length == 0) break; for (int i = 0; i < jdocs.length; i++) { Map<String, Object> jdoc = (Map<String, Object>)jdocs[i]; if (jdoc != null) { String iid = (String)jdoc.get("identifier"); if (iid == null) continue; String publicdate = (String)jdoc.get("publicdate"); // ISO format long mtime = isodatetime2timestamp(publicdate); Path qf = new Path(fsUri.toString(), "/" + iid); LOG.debug("collection:" + itemid + " qf=" + qf); FileStatus fst = new FileStatus(0, true, 2, 4096, mtime, qf); result.add(fst); } } start += jdocs.length; } LOG.info(String.format("searchItems(collection=%s): returning %d items", itemid, result.size())); return result.toArray(new FileStatus[result.size()]); } }