package org.archive.petabox;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.InterruptedIOException;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.http.ConnectionClosedException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpMessage;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.archive.hadoop.fs.PetaboxFileSystem;

/**
 * HTTP client for Petabox: fetches item metadata through the {@code /metadata/}
 * endpoint and opens item files as seekable streams through {@code /download/}
 * or {@code /serve/} URLs. Requests that fail with transient errors are retried
 * up to {@link #maxRetries} times, pausing {@link #retryDelay} milliseconds
 * between attempts.
 */
public class PetaboxClient {
  private static final Log LOG = LogFactory.getLog(PetaboxClient.class);

  protected String petaboxProtocol = "http";
  protected String petaboxHost = "archive.org";

  public void setPetaboxHost(String petaboxHost) {
    this.petaboxHost = petaboxHost;
  }

  /**
   * text set to Referer header of each request.
   * for helping petabox admin identify source of requests.
   */
  protected String referer;

  public void setReferer(String referer) {
    this.referer = referer;
  }

  protected int maxRetries = 10;
  protected int retryDelay = 2000; // milliseconds
  protected int connectionTimeout = 60 * 1000; // milliseconds
  protected int socketTimeout = 60 * 1000; // milliseconds, 0=infinite
  // socket parameters for metadata API
  protected int metadataConnectionTimeout = 10 * 1000; // milliseconds
  protected int metadataSocketTimeout = 5 * 1000; // milliseconds

  protected int bufferSize = 8192;

  /**
   * if true, PetaboxFileSystem makes up empty item when Metadata API tells it's non-existent,
   * instead of throwing FileNotFoundException.
   */
  protected boolean ignoreMissingItems = false;

  public static final String VERSION = "0.0.2";
  public static final String USER_AGENT = PetaboxFileSystem.class.getName() + "/" + VERSION;

  protected PetaboxAuthProvider authProvider;

  /**
   * set {@link PetaboxCredentialProvider} for authenticating requests.
   * @deprecated use {@link #setAuthProvider(PetaboxAuthProvider)}.
   * @param credentialProvider provider stored as the auth provider
   */
  @Deprecated
  public void setCredentialProvider(PetaboxCredentialProvider credentialProvider) {
    this.authProvider = credentialProvider;
  }

  /**
   * set {@link PetaboxAuthProvider} for authenticating requests.
   * @param authProvider provider asked to add auth cookies to each request
   */
  public void setAuthProvider(PetaboxAuthProvider authProvider) {
    this.authProvider = authProvider;
  }

  protected HttpClient client;

  /**
   * Creates a client backed by a thread-safe connection manager. Every setting
   * read from {@code conf} falls back to the field's built-in default when the
   * property is absent.
   * @param conf configuration source for pool sizes, retry policy and timeouts
   */
  public PetaboxClient(PetaboxClientConfig conf) {
    // ClientConnectionManager properties can be configured by config properties.
    ThreadSafeClientConnManager connman = new ThreadSafeClientConnManager();
    int maxPerRoute = conf.getInt("max-per-route", connman.getDefaultMaxPerRoute());
    int maxTotal = conf.getInt("max-total", connman.getMaxTotal());
    connman.setDefaultMaxPerRoute(maxPerRoute);
    connman.setMaxTotal(maxTotal);
    this.client = new DefaultHttpClient(connman);

    this.maxRetries = conf.getInt("max-retries", this.maxRetries);
    this.retryDelay = conf.getInt("retry-delay", this.retryDelay);
    this.connectionTimeout = conf.getInt("connection-timeout", this.connectionTimeout);
    this.socketTimeout = conf.getInt("socket-timeout", this.socketTimeout);
    this.metadataConnectionTimeout =
        conf.getInt("metadata.connection-timeout", this.metadataConnectionTimeout);
    this.metadataSocketTimeout =
        conf.getInt("metadata.socket-timeout", this.metadataSocketTimeout);
    this.ignoreMissingItems = conf.getBoolean("ignore-missing-items", this.ignoreMissingItems);
  }

  /**
   * Sleeps for {@link #retryDelay} milliseconds before a retry.
   * @throws InterruptedIOException if the thread is interrupted while waiting;
   *     the interrupt flag is restored first so callers up the stack can see it
   */
  private void sleepBeforeRetry() throws InterruptedIOException {
    try {
      Thread.sleep(retryDelay);
    } catch (InterruptedException ex) {
      Thread.currentThread().interrupt();
      InterruptedIOException iioe =
          new InterruptedIOException("interrupted while waiting to retry");
      iioe.initCause(ex);
      throw iioe;
    }
  }

  /**
   * Closes the content stream of a response that is not going to be read.
   * Tolerates responses that carry no entity.
   * @param entity response entity, may be null
   * @throws IOException if obtaining or closing the content stream fails
   */
  private static void discardEntity(HttpEntity entity) throws IOException {
    if (entity == null) {
      return;
    }
    InputStream content = entity.getContent();
    if (content != null) {
      content.close();
    }
  }

  /**
   * Fetches metadata of item {@code itemid} from the Metadata API.
   * Connection failures, 502/503/504 responses, body read errors and responses
   * lacking server information are retried.
   * @param itemid item identifier, must be non-null and non-empty
   * @return parsed metadata; an empty metadata object if the item is reported
   *     non-existent after all retries and {@link #ignoreMissingItems} is set
   * @throws FileNotFoundException if the item is still reported non-existent
   *     after all retries and {@link #ignoreMissingItems} is not set
   * @throws IOException on invalid {@code itemid}, non-retryable HTTP status,
   *     unparsable response, exhausted retries, or interruption while waiting
   */
  public ItemMetadata getItemMetadata(String itemid) throws IOException {
    if (itemid == null) {
      throw new IOException("invalid itemid: null");
    }
    if (itemid.equals("")) {
      throw new IOException("invalid itemid \"" + itemid + "\"");
    }
    URI uri;
    try {
      uri = new URI(petaboxProtocol, petaboxHost, "/metadata/" + itemid, null);
    } catch (URISyntaxException ex) {
      throw new IOException(ex);
    }
    HttpGet get = new HttpGet(uri);
    HttpParams params = get.getParams();
    HttpConnectionParams.setConnectionTimeout(params, metadataConnectionTimeout);
    HttpConnectionParams.setSoTimeout(params, metadataSocketTimeout);
    LOG.debug("fetching metadata for item '" + itemid + "'");

    int retries = 0;
    ItemMetadata md = null;
    do {
      if (retries > 0) {
        if (retries > maxRetries) {
          throw new IOException(uri + ": retry exhausted");
        }
        sleepBeforeRetry();
      }
      HttpResponse resp;
      try {
        resp = client.execute(get);
      } catch (IOException ex) {
        // although getItemMetadata is declared as throws IOException, throwing IOException
        // will kill hadoop job. Request should be retried upon errors like "connection refused".
        LOG.warn(uri + " failed: " + ex.getMessage());
        ++retries;
        continue;
      }
      StatusLine st = resp.getStatusLine();
      HttpEntity entity = resp.getEntity();
      switch (st.getStatusCode()) {
        case 200:
          if (retries > 0) {
            LOG.info(uri + ": succeeded after " + retries + " retry(ies)");
          }
          break;
        case 502:
        case 503:
        case 504:
          discardEntity(entity);
          LOG.warn(uri + " failed " + st.getStatusCode() + " " + st.getReasonPhrase()
              + ", try " + retries);
          ++retries;
          continue;
        default:
          discardEntity(entity);
          throw new IOException(uri + ": failed " + st.getStatusCode() + " "
              + st.getReasonPhrase());
      }

      // The whole body is buffered first so that it can be dumped to the log
      // when parsing fails.
      ByteArrayOutputStream bao = new ByteArrayOutputStream();
      InputStream is = entity.getContent();
      try {
        byte[] chunk = new byte[4096];
        int n;
        while ((n = is.read(chunk)) != -1) {
          bao.write(chunk, 0, n);
        }
      } catch (IOException ex) {
        LOG.warn("error reading metadata response (" + ex.getMessage() + ")");
        ++retries;
        continue;
      } finally {
        is.close();
      }

      // XXX assuming JSON is in UTF-8 encoding. Decode explicitly rather than
      // with the platform default charset.
      Reader reader =
          new InputStreamReader(new ByteArrayInputStream(bao.toByteArray()), "UTF-8");
      try {
        md = new ItemMetadata(reader);
      } catch (Throwable ex) {
        LOG.error("failed to parse matadata API response for item " + itemid
            + "(" + bao.size() + " bytes):\n" + bao.toString("UTF-8"), ex);
        throw new IOException("failed to parse metadata API response for item " + itemid, ex);
      } finally {
        reader.close();
      }

      if (md.server == null) {
        if (md.dir == null) {
          // assume metadata API returned "{}", i.e. non-existent item.
          if (++retries > maxRetries) {
            // if ignore-missing-items flag is set, return with empty metadata. don't add it
            // to the metadataCache.
            if (ignoreMissingItems) {
              break;
            }
            // throw specific exception for non-existent item case.
            throw new FileNotFoundException("/" + itemid + ": non-existent item, retry exhausted");
          }
          LOG.warn("metadata API says item non-existent, retrying");
          md = null;
          continue;
        } else {
          LOG.warn("metadata API failed (no server info) for item " + itemid
              + ", try " + retries);
          LOG.warn("entity=" + new String(bao.toByteArray(), "UTF-8"));
          ++retries;
          md = null;
          continue;
        }
      }
    } while (md == null);
    return md;
  }

  /**
   * Adds the User-Agent header, the Referer header (if configured) and
   * authentication cookies (if an auth provider is configured) to {@code msg}.
   * @param msg request to decorate
   */
  public void setupRequest(HttpMessage msg) {
    msg.addHeader("User-Agent", USER_AGENT);
    if (referer != null) {
      msg.addHeader("Referer", referer);
    }
    if (authProvider != null) {
      authProvider.addAuthCookies(msg);
    }
  }

  /**
   * return HttpGet object properly setup with connection parameters and headers.
   * @param uri request URI
   * @return configured HttpGet object
   */
  public HttpGet createHttpGet(URI uri) {
    HttpGet get = new HttpGet(uri);
    HttpParams params = get.getParams();
    HttpConnectionParams.setConnectionTimeout(params, connectionTimeout);
    HttpConnectionParams.setSoTimeout(params, socketTimeout);
    setupRequest(get);
    return get;
  }

  /**
   * HttpInputStream implements MapReduce compatible InputStream on top of HTTP-based
   * access to files on Petabox. It makes best effort to encapsulate retries often
   * necessary to deal with transient problems.
   * HttpInputStream implements Seekable and PositionedReadable interfaces for HTTP
   * resources efficiently with Range requests.
   * Actual open is delayed until read operation is performed.
   */
  public class HttpInputStream extends InputStream implements Seekable, PositionedReadable {
    protected URI uri;
    /** offset within the resource of the next byte {@link #read()} returns. */
    protected long pos;
    /** offset at which the current response body ends, or -1 when unknown. */
    protected long endpos;
    /** content stream of the current response; null while not connected. */
    protected InputStream in;
    /** read buffer; null when buffering is disabled. */
    protected byte[] buffer;
    /** index in {@link #buffer} of the next byte to hand out. */
    protected int bufpos;
    /** number of valid bytes in {@link #buffer} (result of the last fill). */
    protected int bufend;

    /**
     * maximum length of seeking by reading off instead of re-opening resource
     * with new Range request. optimal value depends on the relative cost of
     * Range request.
     */
    public final int SMALL_GAP = 1000000;

    /**
     * @param uri actual HTTP URL to open.
     * @param bufferSize size of the internal read buffer in bytes; buffering is
     *     disabled when this is zero or negative.
     */
    public HttpInputStream(URI uri, int bufferSize) {
      this(uri, bufferSize, 0);
    }

    /**
     * @param uri actual HTTP URL to open.
     * @param bufferSize size of the internal read buffer in bytes; buffering is
     *     disabled when this is zero or negative.
     * @param offset position in the resource at which reading starts.
     */
    public HttpInputStream(URI uri, int bufferSize, long offset) {
      this.uri = uri;
      this.pos = offset;
      this.endpos = -1;
      this.in = null;
      if (bufferSize > 0) {
        buffer = new byte[bufferSize];
      }
    }

    // it is critical to override this method for GZIP decompression
    // to always work correctly on block-compressed (concatenated) file.
    // InputStream.available() always returns 0, which makes GZIP decompression
    // to assume there's no more concatenated blocks when there are <= 26
    // bytes left in its decompression buffer. see GZIPInputStream.readTrailer()
    // for details.
    @Override
    public int available() throws IOException {
      // as long as it is > 0, return value itself doesn't mean much.
      return pos < endpos ? 1 : 0;
    }

    // Seekable
    public long getPos() throws IOException {
      return pos;
    }

    /**
     * Moves the read position to {@code pos}. While connected, a short forward
     * move (at most {@link #SMALL_GAP} bytes) is done by consuming bytes from
     * the current response; any other move closes the connection so that the
     * next read re-opens the resource at the new position.
     * @param pos new position, must not be negative
     * @throws IOException if {@code pos} is negative or reading/closing fails
     */
    public void seek(long pos) throws IOException {
      if (pos < 0) {
        throw new IOException(uri + ": cannot seek to negative position " + pos);
      }
      if (this.pos == pos) {
        return;
      }
      if (in != null) {
        if (pos >= this.pos && pos <= this.pos + SMALL_GAP) {
          long skiplen = pos - this.pos;
          if (buffer != null) {
            int buffered = bufend > bufpos ? bufend - bufpos : 0;
            if (skiplen <= buffered) {
              // new position is within the buffer
              bufpos += (int) skiplen;
              skiplen = 0;
            } else {
              // Everything buffered is skipped. Mark the buffer empty so the
              // next read() refills it instead of handing out stale bytes.
              skiplen -= buffered;
              bufpos = 0;
              bufend = 0;
            }
          }
          byte[] scratch = new byte[4096];
          while (skiplen > 0) {
            // TODO: should we let IOException from read to bubble up, or catch it and have
            // read retry? decision depends on how Hadoop react to IOException from seek.
            int n = in.read(scratch, 0, (int) Math.min((long) scratch.length, skiplen));
            if (n < 0) {
              break;
            }
            skiplen -= n;
          }
        } else {
          // moving backward or gap is larger than adequate for seek-by-reading
          close();
        }
      }
      this.pos = pos;
    }

    public boolean seekToNewSource(long targetPos) throws IOException {
      // TODO: check the detailed requirements for this method.
      return false;
    }

    /**
     * open actual InputStream for the resource, offset {@code pos}.
     * must set {@code in} to non-null if returning without throwing exception.
     * Connection failures and 403/404/500/502/503/504 responses are retried up
     * to {@code maxRetries} times.
     * @throws IOException exception from underlying HTTP protocol, exhausted
     *     retries, or interruption while waiting to retry
     */
    protected void open() throws IOException {
      HttpGet get = createHttpGet(uri);
      if (pos > 0) {
        get.addHeader("Range", "bytes=" + pos + "-");
      }
      LOG.info("HttpInputStream.open:" + uri + "(pos=" + pos + ")");
      int retries = 0;
      HttpEntity entity = null;
      do {
        if (retries > 0) {
          sleepBeforeRetry();
        }
        HttpResponse resp;
        try {
          resp = client.execute(get);
        } catch (IOException ex) {
          LOG.warn("connection to " + uri + " failed", ex);
          if (++retries > maxRetries) {
            throw new IOException(uri + ": retry exhausted trying to connect", ex);
          }
          continue;
        }
        StatusLine st = resp.getStatusLine();
        entity = resp.getEntity();
        // TODO: detect failed Range request and report it. I know Range request is supported
        // on most resources, but It is catastrophic to ignore when it fails.
        switch (st.getStatusCode()) {
          case 200:
          case 206: // Partial Content
            if (retries > 0) {
              LOG.info(uri + ": succeeded after " + retries + " retry(ies)");
            }
            long clen = entity.getContentLength();
            if (clen < 0) {
              LOG.info("content-length is unavailable - no auto resume will be attempted.");
              endpos = -1;
            } else {
              endpos = pos + clen;
            }
            break;
          case 404: // even 404 may be retried :-)
          case 403: // petabox sometimes return false 403...
          case 500: // Internal Server Error
          case 502: // Bad Gateway
          case 503: // Service Unavailable
          case 504: // Gateway Timeout
            // these happen when paired storage is overloaded, or having infrastructure-level
            // problems and not rare.
            discardEntity(entity);
            if (++retries > maxRetries) {
              throw new IOException(uri + ": retry exhausted on " + st.getStatusCode() + " "
                  + st.getReasonPhrase());
            }
            LOG.warn(uri + ": " + st.getStatusCode() + " " + st.getReasonPhrase()
                + ", retry " + retries + "/" + maxRetries);
            entity = null;
            if (st.getStatusCode() == 403) {
              // when we get false 403, it often 'sticks' to HttpClient because of
              // PHPSESSID cookie. Remove it before retrying.
              if (client instanceof DefaultHttpClient) {
                ((DefaultHttpClient) client).getCookieStore().clear();
              }
            }
            continue;
          default:
            discardEntity(entity);
            throw new IOException(uri + ": " + st.getStatusCode() + " " + st.getReasonPhrase());
        }
      } while (entity == null);
      in = entity.getContent();
      if (buffer != null) {
        // start with an empty buffer; the first read() fills it.
        bufpos = 0;
        bufend = 0;
      }
    }

    @Override
    public void close() throws IOException {
      if (in != null) {
        in.close();
        in = null;
      }
    }

    /**
     * Reads one byte, opening the resource first if necessary. When the body
     * ends before the position implied by Content-Length is reached, the
     * resource is re-opened at the current position and reading continues.
     * @return the byte read (0-255), or -1 at end of resource
     * @throws IOException on unrecoverable I/O failure
     */
    @Override
    public int read() throws IOException {
      while (true) {
        if (in == null) {
          open();
        }
        int b;
        try {
          if (buffer == null) {
            b = in.read();
          } else {
            // Refill whenever the valid region is used up. A fill may return
            // fewer bytes than the buffer holds without the stream being at
            // its end, so the test must be against bufend, not buffer.length.
            if (bufpos >= bufend) {
              bufend = in.read(buffer);
              bufpos = 0;
            }
            if (bufpos < bufend) {
              b = (buffer[bufpos++] & 0xff);
            } else {
              b = -1;
            }
          }
        } catch (ConnectionClosedException ex) {
          // sender closed socket, probably for long idle period.
          LOG.info("connection closed unexpectedly", ex);
          // TODO: not sure if we can assume connection has been returned
          // to the pool and there's no need to call in.close(). assuming
          // we need to call in.close().
          b = -1;
        }
        if (b == -1) {
          // if receiver/sender closed socket prematurely, try reopening.
          if (endpos >= 0 && pos < endpos) {
            LOG.info("socket closed prematurely. rereading from " + pos);
            close();
            continue;
          }
          return b;
        }
        pos++;
        return b;
      }
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
      //LOG.info("read("+b+","+off+","+len+")");
      return super.read(b, off, len);
    }

    // PositionedReadable
    /**
     * Reads up to {@code length} bytes starting at {@code position} of the
     * resource without touching the stream's own position. Implemented as a
     * separate one-shot HTTP Range request.
     * @param position offset in the resource to read from
     * @param buffer destination array
     * @param offset index in {@code buffer} at which to store the first byte
     * @param length maximum number of bytes to read
     * @return number of bytes read, 0 if nothing was requested, or -1 if the
     *     response body was empty
     * @throws IOException on non-retryable HTTP status, exhausted retries,
     *     read failure, or interruption while waiting to retry
     */
    public int read(long position, byte[] buffer, int offset, int length) throws IOException {
      LOG.info("HttpInputStream.read(" + position + "," + buffer.length + ")");
      // As PositionedReadable interface dictates this method does not change
      // file offset, and thread-safe, it is implemented as one-shot HTTP request.
      if (buffer.length == 0 || length == 0) {
        return 0;
      }
      HttpGet get = createHttpGet(uri);
      get.addHeader("Range", "bytes=" + position + "-" + (position + length - 1));
      HttpEntity entity = null;
      int retries = 0;
      do {
        if (retries > 0) {
          sleepBeforeRetry();
        }
        HttpResponse resp;
        try {
          resp = client.execute(get);
        } catch (IOException ex) {
          LOG.warn("connection to " + uri + " failed", ex);
          if (++retries > maxRetries) {
            throw new IOException(uri + ": retry exhausted trying to connect", ex);
          }
          continue;
        }
        StatusLine st = resp.getStatusLine();
        entity = resp.getEntity();
        // TODO: detect failed Range request and report it. I know Range request is supported
        // on most resources, but It is catastrophic to ignore when it fails.
        switch (st.getStatusCode()) {
          case 200:
          case 206: // Partial Content - the regular reply to a Range request
            if (retries > 0) {
              LOG.info(uri + ": succeeded after " + retries + " retry(ies)");
            }
            break;
          case 404:
          case 403: // petabox often returns false 403...
          case 502:
          case 503:
          case 504:
            discardEntity(entity);
            if (++retries > maxRetries) {
              throw new IOException(uri + ": retry exhausted on " + st.getStatusCode() + " "
                  + st.getReasonPhrase());
            }
            LOG.warn(uri + ": " + st.getStatusCode() + " " + st.getReasonPhrase()
                + ", retry " + retries);
            entity = null;
            continue;
          default:
            discardEntity(entity);
            throw new IOException(st.getReasonPhrase());
        }
      } while (entity == null);

      InputStream tin = entity.getContent();
      try {
        // A single read() may deliver only part of the body; keep reading
        // until the request is satisfied or the body ends.
        int total = 0;
        while (total < length) {
          int n = tin.read(buffer, offset + total, length - total);
          if (n < 0) {
            break;
          }
          total += n;
        }
        return total > 0 ? total : -1;
      } finally {
        tin.close();
      }
    }

    public void readFully(long position, byte[] buffer) throws IOException {
      readFully(position, buffer, 0, buffer.length);
    }

    /**
     * Reads exactly {@code length} bytes starting at {@code position} of the
     * resource, issuing further positioned reads until the request is filled.
     * @param position offset in the resource to read from
     * @param buffer destination array
     * @param offset index in {@code buffer} at which to store the first byte
     * @param length number of bytes to read
     * @throws EOFException if a positioned read delivers no data before
     *     {@code length} bytes have been read
     * @throws IOException on any failure of the underlying positioned read
     */
    public void readFully(long position, byte[] buffer, int offset, int length)
        throws IOException {
      int done = 0;
      while (done < length) {
        int n = read(position + done, buffer, offset + done, length - done);
        if (n <= 0) {
          throw new EOFException(uri + ": end of resource reached after " + done + " of "
              + length + " bytes at position " + position);
        }
        done += n;
      }
    }
  }

  /**
   * Opens a stream on {@code uri}, filling in the configured protocol and host
   * when the URI lacks a scheme or an authority.
   * @param uri resource to read
   * @param offset position in the resource at which reading starts
   * @return HttpInputStream (not yet connected)
   * @throws URISyntaxException if the completed URI is not valid
   */
  protected HttpInputStream openURI(URI uri, long offset) throws URISyntaxException {
    String scheme = uri.getScheme();
    String authority = uri.getAuthority();
    if (scheme == null || authority == null) {
      if (scheme == null) {
        scheme = petaboxProtocol;
      }
      if (authority == null) {
        authority = petaboxHost;
      }
      uri = new URI(scheme, authority, uri.getPath(), uri.getQuery(), uri.getFragment());
    }
    return new HttpInputStream(uri, bufferSize, offset);
  }

  public HttpInputStream openURI(URI uri) throws URISyntaxException {
    return openURI(uri, 0);
  }

  /**
   * return HttpInputStream for reading {@code urlpath} reliably.
   * @param urlpath path part of an HTTP resource to read.
   * @param offset position in the resource at which reading starts
   * @return HttpInputStream
   * @throws URISyntaxException if urlpath is illegal as URI path
   */
  protected HttpInputStream openPath(String urlpath, long offset) throws URISyntaxException {
    URI uri = new URI(petaboxProtocol, petaboxHost, urlpath, null);
    return new HttpInputStream(uri, bufferSize, offset);
  }

  protected HttpInputStream openPath(String urlpath) throws URISyntaxException {
    return openPath(urlpath, 0);
  }

  /**
   * open HttpInputStream for reading an item file through {@code /download} URL.
   * @param path designates a file to download in /IDENTIFIER/FILE format.
   * @return HttpInputStream
   * @throws URISyntaxException if path is illegal as URI path
   */
  public HttpInputStream openDownload(String path) throws URISyntaxException {
    return openDownload(path, 0);
  }

  public HttpInputStream openDownload(String path, long offset) throws URISyntaxException {
    return openPath((path.startsWith("/") ? "/download" : "/download/") + path, offset);
  }

  /**
   * open HttpInputStream for reading an item file {@code path} reliably. in contrast to
   * {@link #openDownload(String, long)}, this method opens the file through {@code /serve}
   * URL so as not to increment download counter.
   * @param path designates a file to download in /IDENTIFIER/FILE format.
   * @return HttpInputStream
   * @throws URISyntaxException if path is illegal as URI path.
   */
  public HttpInputStream openServe(String path) throws URISyntaxException {
    return openServe(path, 0);
  }

  public HttpInputStream openServe(String path, long offset) throws URISyntaxException {
    return openPath((path.startsWith("/") ? "/serve" : "/serve/") + path, offset);
  }

  // TEMPORARY - subject to refactoring in the near future.
  public HttpResponse doGet(URI uri) throws IOException {
    HttpGet get = createHttpGet(uri);
    return client.execute(get);
  }
}