package org.archive.hadoop.fs; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringWriter; import java.net.URI; import java.net.URISyntaxException; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Locale; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.StatusLine; import org.apache.http.client.methods.HttpGet; import org.archive.petabox.PetaboxClient; import org.mortbay.util.ajax.JSON; /** * Searches items in given collection with MetaManager (metamgr.php). * MetaManager is restricted to authenticated users, can lookup all items with complex * query, but its JSON API has critical issues that makes it almost useless for item lookup. * this code is here just in case new service similar to current MetaManager replaces it. * @author Kenji Nagahashi * */ public class MetaManagerItemSearcher implements ItemSearcher { private static Log LOG = LogFactory.getLog(MetaManagerItemSearcher.class); protected PetaboxFileSystem fs; protected URI fsUri; protected int maxRetries = 10; protected int retryDelay = 2000; // milliseconds protected int connectionTimeout = 60*1000; protected int socketTimeout = 0; // milliseconds, 0=infinite public final static int SEARCH_ROWS_PER_PAGE = 200; public MetaManagerItemSearcher() { } @Override public void initialize(PetaboxFileSystem fs, URI fsUri, Configuration conf) { this.fs = fs; this.fsUri = fsUri; String confbase = "fs." + fsUri.getScheme(); maxRetries = conf.getInt(confbase + ".max-retries", 10); } protected static long sqldatetime2timestamp(String sqldatetime) { if (sqldatetime == null) return 0; DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); try { Date date = df.parse(sqldatetime); return date.getTime(); } catch (ParseException ex) { return 0; } } protected static boolean inCollection(String iid, String collections) { int s = collections.indexOf(iid); if (s < 0) return false; int e = s + iid.length(); if (s == 0) { return e >= collections.length() || collections.charAt(e) == ';'; } else { return collections.charAt(s - 1) == ';' && (e >= collections.length() || collections.charAt(e) == ';'); } } public final static int METAMGR_ROWS_PER_PAGE = 200; protected URI buildMetaManagerURI(String itemid, int start) throws URISyntaxException { StringBuilder params = new StringBuilder(); params.append("srt=identifier"); params.append("&ord=asc"); params.append("&w_collection=*").append(itemid).append("*"); params.append("&fs_identifier=on&fs_mediatype=on&fs_collection=on"); params.append("&off=").append(Integer.toString(start)); // getting all often results in 504 error for big collection. //params.append("&lim=0"); // "all" params.append("&lim=").append(Integer.toString(METAMGR_ROWS_PER_PAGE)); params.append("&output_format=json"); //return new URI("http", fsUri.getAuthority(), "/metamgr.php", params.toString(), null); return new URI("http", "www.us.archive.org", "/metamgr.php", params.toString(), null); } /* (non-Javadoc) * @see org.archive.crawler.hadoop.ItemSearcher#searchItems(java.lang.String) */ @SuppressWarnings("unchecked") @Override public FileStatus[] searchItems(String itemid) throws IOException { LOG.info("looking up items in collection " + itemid + " with metamgr"); List<FileStatus> result = new ArrayList<FileStatus>(); int start = 0; // total number of results is not available in metamgr's JSON response. long numresults = Long.MAX_VALUE; while (start < numresults) { URI uri; try { uri = buildMetaManagerURI(itemid, start); LOG.info("search uri=" + uri); } catch (URISyntaxException ex) { throw new IOException("failed to build URI for itemid=" + itemid + ", start=" + start, ex); } PetaboxClient pbclient = fs.getPetaboxClient(); // HttpGet get = fs.createHttpGet(uri); HttpEntity entity = null; Map<String, Object> jo = null; int retries = 0; do { if (retries > 0) { try { Thread.sleep(retryDelay); } catch (InterruptedException ex) { } } HttpResponse resp; try { // resp = fs.getHttpClient().execute(get); resp = pbclient.doGet(uri); } catch (IOException ex) { LOG.warn("connection to " + uri + " failed", ex); if (++retries > maxRetries) { throw new IOException(uri + ": retry exhausted trying to connect"); } continue; } StatusLine st = resp.getStatusLine(); entity = resp.getEntity(); switch (st.getStatusCode()) { case 200: if (retries > 0) { LOG.info(uri + ": succeeded after " + retries + " retry(ies)"); } // it appears search engine often fails to return JSON formatted output despite // status code 200. detect it here. Reader reader = new InputStreamReader(entity.getContent(), "UTF-8"); try { jo = (Map<String, Object>)JSON.parse(reader); } catch (IllegalStateException ex) { LOG.error("JSON.parse failed", ex); StringWriter w = new StringWriter(); int c; while ((c = reader.read()) != -1) { w.write(c); } LOG.error("rest of response:" + w.toString()); } reader.close(); if (jo == null) { LOG.warn(uri + " returned 200, but JSON parser failed on entity"); if (++retries > maxRetries) { throw new IOException(uri + ": retry exhausted on " + uri); } continue; } break; case 502: case 503: case 504: if (entity != null) entity.getContent().close(); if (++retries > maxRetries) { throw new IOException(uri + ": retry exhausted on " + st.getStatusCode() + " " + st.getReasonPhrase()); } LOG.warn(uri + " failed " + st.getStatusCode() + " " + st.getReasonPhrase() + ", retry " + retries); entity = null; continue; default: entity.getContent().close(); throw new IOException(uri + ": " + st.getStatusCode() + " " + st.getReasonPhrase()); } } while (jo == null); // fields are returned in an array. we assume they are always in the same // order as fs_* parameters appears in query URL. // TODO: we could at least put a check of field names here. Object[] rows = (Object[])jo.get("rows"); if (rows == null) { break; //? } for (int i = 0; i < rows.length; i++) { Object[] row = (Object[])rows[i]; if (row == null) continue; // just in case... String iid = (String)row[0]; if (iid == null) continue; // exclude collection items String mediatype = (String)row[1]; if ("collection".equals(mediatype)) continue; // collection query pattern is not specific enough. check whether item // really have itemid as its collection. if (!inCollection(itemid, (String)row[2])) continue; String publicdate = row.length > 3 ? (String)row[3] : null; long mtime = sqldatetime2timestamp(publicdate); Path qf = new Path(fsUri.toString(), "/" + iid); LOG.debug("collection:" + itemid + " qf=" + qf); FileStatus fst = new FileStatus(0, true, 2, 4096, mtime, qf); result.add(fst); } start += rows.length; } LOG.info(String.format("searchItems(collection=%s): returning %d items", itemid, result.size())); return result.toArray(new FileStatus[result.size()]); } }