/**
 * This file is part of muCommander, http://www.mucommander.com
 * Copyright (C) 2002-2016 Maxence Bernard
 *
 * muCommander is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * muCommander is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package com.mucommander.commons.file.protocol.http;

import com.mucommander.commons.file.*;
import com.mucommander.commons.file.protocol.FileProtocols;
import com.mucommander.commons.file.protocol.ProtocolFile;
import com.mucommander.commons.io.BlockRandomInputStream;
import com.mucommander.commons.io.RandomAccessInputStream;
import com.mucommander.commons.io.RandomAccessOutputStream;
import com.mucommander.commons.io.base64.Base64Encoder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * HTTPFile provides access to files located on an HTTP/HTTPS server.
 *
 * <p>The associated {@link FileURL} schemes are {@link FileProtocols#HTTP} and {@link FileProtocols#HTTPS}.
 * The host part of the URL designates the HTTP server. Credentials can be specified in the login and password parts
 * and will be used for HTTP Basic Authentication.</p>
 *
 * <p>Here are a few examples of valid HTTP URLs:
 * <code>
 * http://www.mucommander.com/index.html<br>
 * http://www.mucommander.com/index.php?<br>
 * http://john:p4sswd@www.mucommander.com/restricted_area/<br>
 * </code>
 * </p>
 *
 * <p>
 * A notable feature of HTTPFile is that it handles HTML/XHTML files as archives: when any of the {@link #ls()} methods
 * is called, the HTML file is parsed and any link found in the code is considered as a file:
 * <ul>
 *  <li>If the link looks like a link to an HTML file, the child HTTPFile will be 'browsable' ({@link #isBrowsable()}
 * will return <code>true</code>).
 *  <li>If not, the file will just be a regular file.
 * </ul>
 * </p>
 *
 * <p>In order to avoid the cost of having to perform a HEAD request for each file, some guessing based on the URL and
 * its filename is performed to determine if the file is an HTML/XHTML file or not.
 * In practice, this works quite well for most sites but the algorithm will be confused by some non-conventional
 * file naming, for instance if an HTML file ends with the '.gif' extension.
 * <br>
 * A HEAD request is then issued only for non-HTML files, to determine their size and last modified date.
 * HTML files will thus have a size returned by {@link #getSize()} of <code>-1</code> (undetermined), and a date
 * returned by {@link #getDate()} corresponding to 'now' (current time).</p>
 *
 * <p>Access to HTTP files is provided by the <code>java.net</code> API. The {@link #getUnderlyingFileObject()} method
 * allows to retrieve a <code>java.net.URL</code> instance corresponding to this HTTPFile.</p>
 *
 * @author Maxence Bernard
 */
public class HTTPFile extends ProtocolFile {
    private static final Logger LOGGER = LoggerFactory.getLogger(HTTPFile.class);

    /** java.net.URL corresponding to this file */
    private final URL url;

    /** Contains the attributes of the remote HTTP resource. Contains default values until the file has been resolved */
    private final SimpleFileAttributes attributes;

    /** True if the file should be resolved on the remote HTTP server to fetch attribute values, false if these are
     * guessed. */
    private final boolean resolve;

    /** True if file has been resolved on the remote HTTP server, either successfully or unsuccessfully */
    private boolean fileResolved;

    /** True once {@link #parent} has been computed or explicitly set (the parent itself may legitimately be null) */
    private boolean parentValSet;

    /** Parent of this file, lazily created by {@link #getParent()} or injected with {@link #setParent(AbstractFile)} */
    protected AbstractFile parent;

    /** Permissions for HTTP files: r-- (400 octal = 256). Only the 'user' permissions bits (700 octal = 448) are supported. */
    private final static FilePermissions PERMISSIONS = new SimpleFilePermissions(256, 448);

    /** Maximum number of consecutive redirections followed by {@link #ls()} before giving up */
    private final static int MAX_REDIRECTS = 20;

    /** User agent used for all HTTP connections made by HTTPFile */
    // TODO: add file API version, like muCommander-file-API/1.0
    public static final String USER_AGENT = "muCommander-file-API (Java "+System.getProperty("java.vm.version")
            + "; " + System.getProperty("os.name") + " " + System.getProperty("os.version")
            + " " + System.getProperty("os.arch") + ")";

    /** Matches HTML and XHTML attribute key/value pairs, where the value is surrounded by Single Quotes */
    private final static Pattern linkAttributePatternSQ = Pattern.compile("(src|href|SRC|HREF)=\\\'.*?\\\'");

    /** Matches HTML and XHTML attribute key/value pairs, where the value is surrounded by Double Quotes */
    private final static Pattern linkAttributePatternDQ = Pattern.compile("(src|href|SRC|HREF)=\\\".*?\\\"");


    /**
     * Creates a new HTTPFile for the given FileURL, building the corresponding <code>java.net.URL</code> from it.
     *
     * @param fileURL location of the HTTP resource
     * @throws IOException if the URL is malformed, or is not a valid HTTP/HTTPS URL
     */
    protected HTTPFile(FileURL fileURL) throws IOException {
        // TODO: optimize this
        this(fileURL, new URL(fileURL.toString(false)));
    }

    /**
     * Creates a new HTTPFile for the given FileURL, using the supplied <code>java.net.URL</code> for connections.
     *
     * @param fileURL location of the HTTP resource
     * @param url the java.net.URL used to open connections to the resource
     * @throws IOException if the scheme is neither HTTP nor HTTPS, or if the URL has no host
     */
    protected HTTPFile(FileURL fileURL, URL url) throws IOException {
        super(fileURL);

        String scheme = fileURL.getScheme().toLowerCase();
        if((!scheme.equals(FileProtocols.HTTP) && !scheme.equals(FileProtocols.HTTPS)) || fileURL.getHost()==null)
            throw new IOException("Not a valid HTTP/HTTPS URL");

        this.url = url;
        this.attributes = getDefaultAttributes();

        String filename = fileURL.getFilename();

        // Simple/fuzzy heuristic to avoid file resolution (HEAD) in cases where we have good reasons to believe that
        // the URL denotes a HTML/XHTML document:
        // - URL's path has no filename (e.g. http://www.mucommander.com/) or path ends with '/' (e.g. http://www.mucommander.com/download/)
        // - URL has a query part (works most of the time, but not always)
        // - URL has an extension that is registered with an HTML/XHTML mime type
        boolean looksLikeHtml = filename==null
                || fileURL.getPath().endsWith("/")
                || fileURL.getQuery()!=null
                || isParsableMimeType(MimeTypes.getMimeType(this));

        if(looksLikeHtml)
            attributes.setDirectory(true);

        this.resolve = !looksLikeHtml;
    }


    /**
     * Creates the attributes used until (or instead of) the file being resolved: current time as date,
     * unknown size (-1) and read-only permissions.
     *
     * @return a new attributes instance holding default values
     */
    private static SimpleFileAttributes getDefaultAttributes() {
        SimpleFileAttributes attributes = new SimpleFileAttributes();

        attributes.setDate(System.currentTimeMillis());
        attributes.setSize(-1);     // Unknown
        attributes.setPermissions(PERMISSIONS);
        // exist = false
        // isDirectory = false
        // path = null (unused)

        return attributes;
    }

    /**
     * Returns <code>true</code> if the given mime type corresponds to HTML or XHTML and can be parsed.
     *
     * @param mimeType a MIME type / content type, may be <code>null</code>
     * @return <code>true</code> if the given mime type corresponds to HTML or XHTML and can be parsed
     */
    private boolean isParsableMimeType(String mimeType) {
        return mimeType!=null
                && (mimeType.startsWith("text/html")
                    || mimeType.startsWith("application/xhtml+xml")
                    || mimeType.startsWith("application/xml"));
    }

    /**
     * Extracts the value of the <code>charset</code> parameter from a Content-Type header value, e.g. returns
     * <code>UTF-8</code> for <code>text/html; charset="UTF-8"</code>. The parameter name is matched
     * case-insensitively and surrounding double quotes are removed.
     *
     * @param contentType a Content-Type header value, may be <code>null</code>
     * @return the charset name, or <code>null</code> if none is specified
     */
    private static String extractCharset(String contentType) {
        if(contentType==null)
            return null;

        StringTokenizer params = new StringTokenizer(contentType, ";");
        while(params.hasMoreTokens()) {
            String param = params.nextToken().trim();

            // The media type itself (e.g. text/html) has no '=' and is skipped here
            int eqPos = param.indexOf('=');
            if(eqPos==-1 || !param.substring(0, eqPos).trim().equalsIgnoreCase("charset"))
                continue;

            String value = param.substring(eqPos+1).trim();
            if(value.length()>=2 && value.charAt(0)=='"' && value.charAt(value.length()-1)=='"')
                value = value.substring(1, value.length()-1).trim();

            return value.length()==0 ? null : value;
        }

        return null;
    }

    /**
     * Returns the value of the Content-Length header as a <code>long</code>, so that resources larger than 2GB
     * are reported properly (<code>URLConnection#getContentLength()</code> is limited to <code>int</code>).
     *
     * @param conn a connected HttpURLConnection
     * @return the content length, <code>-1</code> if the header is missing or malformed
     */
    private static long getContentLength(HttpURLConnection conn) {
        String header = conn.getHeaderField("Content-Length");
        if(header==null)
            return -1;

        try {
            long contentLength = Long.parseLong(header.trim());
            return contentLength<0 ? -1 : contentLength;
        }
        catch(NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Skips exactly <code>n</code> bytes of the given stream.
     *
     * @param in the stream to skip bytes from
     * @param n number of bytes to skip
     * @throws IOException if an I/O error occurred or if the end of the stream was reached prematurely
     */
    private static void skipFully(InputStream in, long n) throws IOException {
        long remaining = n;
        while(remaining>0) {
            long skipped = in.skip(remaining);
            if(skipped>0) {
                remaining -= skipped;
                continue;
            }

            // skip() is allowed to return 0 without being at EOF: read a single byte to find out
            if(in.read()==-1)
                throw new EOFException("End of stream reached before offset "+n);
            remaining--;
        }
    }

    /**
     * Performs a HEAD request on the HTTP server to retrieve the file's attributes (date, size, directory flag and
     * existence) and marks this file as resolved.
     *
     * <p>Implementation note: a failure of the HEAD request (e.g. 404 or 401) is logged and <b>not</b> propagated:
     * the file is still marked as resolved and retains its default attributes (in particular, it does not exist).
     * <code>IOException</code> is kept in the signature for the sake of existing callers.</p>
     *
     * @throws IOException declared for compatibility; request failures are handled internally
     */
    private void resolveFile() throws IOException {
        try {
            LOGGER.info("Resolving {}", url);

            // Get URLConnection instance
            HttpURLConnection conn = getHttpURLConnection(url);

            // Use HEAD instead of GET as we don't need the body
            conn.setRequestMethod("HEAD");

            // Establish connection, check HTTP response code and throw appropriate IOException if request failed
            connectAndCheck(conn);

            // Resolve date: use last-modified header, if not set use date header, and if still not set use System.currentTimeMillis
            long date = conn.getLastModified();
            if(date==0) {
                date = conn.getDate();
                if(date==0)
                    date = System.currentTimeMillis();
            }
            attributes.setDate(date);

            // Resolve size with content-length header (-1 if not available)
            attributes.setSize(getContentLength(conn));

            // Test if content is HTML
            if(isParsableMimeType(conn.getContentType()))
                attributes.setDirectory(true);

            // File was successfully resolved on the remote HTTP server and thus exists
            attributes.setExists(true);
        }
        catch(IOException e) {
            LOGGER.info("Failed to resolve file {}", url, e);
        }
        finally {
            // Mark the file as resolved, even if the request failed
            fileResolved = true;
        }
    }

    /**
     * Opens and returns a <code>HttpURLConnection</code> to the resource denoted by the specified URL.
     * If the {@link FileURL} contained by this HTTPFile contains {@link Credentials}, these will be used as credentials
     * for <i>HTTP Basic Authentication</i>, but only if the specified URL designates the same host as this file:
     * sending them to another host (e.g. the target of a redirection) would be unsafe.
     *
     * @param url the URL to open
     * @return a HttpURLConnection to the resource denoted by the specified URL
     * @throws IOException if the HttpURLConnection could not be opened
     */
    private HttpURLConnection getHttpURLConnection(URL url) throws IOException {
        // Get URLConnection instance
        HttpURLConnection conn = (HttpURLConnection)url.openConnection();

        // If credentials are contained in this HTTPFile's FileURL, use them for Basic HTTP Authentication
        Credentials credentials = fileURL.getCredentials();
        String ownHost = this.url.getHost();
        boolean sameHost = ownHost!=null && ownHost.equalsIgnoreCase(url.getHost());
        if(credentials!=null && sameHost) {
            conn.setRequestProperty(
                "Authorization",
                "Basic "+ Base64Encoder.encode(credentials.getLogin()+":"+credentials.getPassword())
            );
        }

        // Set user-agent header.
        conn.setRequestProperty("User-Agent", USER_AGENT);

        return conn;
    }

    /**
     * Establishes the given connection and checks its response code with
     * {@link #checkHTTPResponse(HttpURLConnection)}. If either step fails, the connection is disconnected before the
     * exception is propagated so that it is not left dangling.
     *
     * @param conn the connection to establish
     * @throws AuthException if the response code is 401 (Unauthorized)
     * @throws IOException if the connection could not be established or if the response is not a positive one
     */
    private void connectAndCheck(HttpURLConnection conn) throws AuthException, IOException {
        try {
            // Establish connection
            conn.connect();

            // Check HTTP response code and throw appropriate IOException if request failed
            checkHTTPResponse(conn);
        }
        catch(IOException e) {
            conn.disconnect();
            throw e;
        }
    }

    /**
     * Checks the response code of the given HttpURLConnection and :
     * <ul>
     *  <li>throws an {@link AuthException} if the response code is 401 (Unauthorized)
     *  <li>throws an IOException if the response code is not in the 2xx - 3xx range (not a positive response)
     *  <li>does nothing otherwise
     * </ul>
     *
     * @param conn the HttpURLConnection connection to examine
     * @throws AuthException if the response code is 401 (Unauthorized)
     * @throws IOException if the response code is not in the 2xx - 3xx range (not a positive response)
     */
    private void checkHTTPResponse(HttpURLConnection conn) throws AuthException, IOException {
        int responseCode = conn.getResponseCode();
        LOGGER.info("response code = {}", responseCode);

        // If we got a 401 (Unauthorized) response, throw an AuthException to ask for credentials
        if(responseCode==401)
            throw new AuthException(fileURL, conn.getResponseMessage());

        if(responseCode<200 || responseCode>=400)
            throw new IOException(conn.getResponseMessage());
    }

    /**
     * Resolves this file on the remote server if its attributes are not guessed and if it has not been resolved yet.
     * The file is resolved only once, even if the request failed.
     */
    private void checkResolveFile() {
        if(resolve && !fileResolved) {
            try {
                resolveFile();
            }
            catch(IOException e) {
                LOGGER.info("Failed to resolve {}", url, e);
                // file will be considered as resolved
            }
        }
    }


    /////////////////////////////////////////
    // AbstractFile methods implementation //
    /////////////////////////////////////////

    @Override
    public long getDate() {
        checkResolveFile();

        return attributes.getDate();
    }

    /**
     * Implementation notes: always throws {@link UnsupportedFileOperationException}.
     *
     * @throws UnsupportedFileOperationException always.
     */
    @Override
    @UnsupportedFileOperation
    public void changeDate(long date) throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.CHANGE_DATE);
    }

    @Override
    public long getSize() {
        checkResolveFile();

        return attributes.getSize();    // Size == -1 if not known
    }

    @Override
    public AbstractFile getParent() {
        if(!parentValSet) {
            FileURL parentURL = fileURL.getParent();
            this.parent = parentURL==null ? null : FileFactory.getFile(parentURL);
            this.parentValSet = true;
        }

        return this.parent;
    }

    @Override
    public void setParent(AbstractFile parent) {
        this.parent = parent;
        this.parentValSet = true;
    }

    @Override
    public boolean exists() {
        // Existence cannot be guessed: resolve the file even if its other attributes are guessed.
        // Note: file will only be resolved once, even if the request failed
        if(!fileResolved) {
            try {
                resolveFile();
            }
            catch(IOException e) {
                // Deliberately ignored: an unresolvable file is reported as non-existent
            }
        }

        return attributes.exists();
    }

    @Override
    public FilePermissions getPermissions() {
        return attributes.getPermissions();
    }

    @Override
    public PermissionBits getChangeablePermissions() {
        return PermissionBits.EMPTY_PERMISSION_BITS;
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void changePermission(PermissionAccess access, PermissionType permission, boolean enabled) throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.CHANGE_PERMISSION);
    }

    @Override
    public String getOwner() {
        return null;
    }

    @Override
    public boolean canGetOwner() {
        return false;
    }

    @Override
    public String getGroup() {
        return null;
    }

    @Override
    public boolean canGetGroup() {
        return false;
    }

    @Override
    public boolean isDirectory() {
        checkResolveFile();

        return attributes.isDirectory();
    }

    @Override
    public boolean isSymlink() {
        return false;
    }

    @Override
    public boolean isSystem() {
        return false;
    }

    @Override
    public InputStream getInputStream() throws IOException {
        HttpURLConnection conn = getHttpURLConnection(this.url);

        // Establish connection, check HTTP response code and throw appropriate IOException if request failed
        connectAndCheck(conn);

        return conn.getInputStream();
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public OutputStream getOutputStream() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.WRITE_FILE);
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public OutputStream getAppendOutputStream() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.APPEND_FILE);
    }

    @Override
    public RandomAccessInputStream getRandomAccessInputStream() throws IOException {
        return new HTTPRandomAccessInputStream();
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public RandomAccessOutputStream getRandomAccessOutputStream() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.RANDOM_WRITE_FILE);
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void delete() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.DELETE);
    }

    /**
     * Always throws {@link UnsupportedFileOperationException} when called.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void copyRemotelyTo(AbstractFile destFile) throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.COPY_REMOTELY);
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void renameTo(AbstractFile destFile) throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.RENAME);
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void mkdir() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.CREATE_DIRECTORY);
    }

    /**
     * Always throws {@link UnsupportedFileOperationException} when called.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public long getFreeSpace() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.GET_FREE_SPACE);
    }

    /**
     * Always throws {@link UnsupportedFileOperationException} when called.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public long getTotalSpace() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.GET_TOTAL_SPACE);
    }

    /**
     * Returns a <code>java.net.URL</code> instance corresponding to this file.
     */
    @Override
    public Object getUnderlyingFileObject() {
        return url;
    }

    /**
     * Fetches the HTML/XHTML document denoted by this file (following up to {@link #MAX_REDIRECTS} redirections)
     * and returns one child file per distinct <code>src</code>/<code>href</code> link found in it.
     *
     * @return the files linked from this document, in order of appearance
     * @throws AuthException if the server replied with a 401 (Unauthorized) response
     * @throws IOException if the document could not be retrieved, is not HTML/XHTML, or could not be parsed
     */
    @Override
    public AbstractFile[] ls() throws IOException {
        // Implementation note: javax.swing.text.html.HTMLEditorKit isn't quite powerful enough to be used

        BufferedReader br = null;
        HttpURLConnection conn = null;
        try {
            URL contextURL = this.url;
            int redirectCount = 0;

            while(true) {
                // Get a connection instance
                conn = getHttpURLConnection(contextURL);

                // Disable automatic redirections to track URL change
                conn.setInstanceFollowRedirects(false);

                // Establish connection, check HTTP response code and throw appropriate IOException if request failed
                connectAndCheck(conn);

                int responseCode = conn.getResponseCode();

                // Test if response code is in the 3xx range (redirection) and if 'Location' field is set
                String locationHeader = conn.getHeaderField("Location");
                if(responseCode<300 || responseCode>=400 || locationHeader==null)
                    break;

                // The body of the redirection response is of no use: release the connection
                conn.disconnect();

                // Protect against redirection loops
                if(++redirectCount>MAX_REDIRECTS)
                    throw new IOException("Too many redirects");

                // Redirect to Location field and remember context url
                LOGGER.info("Location header = {}", locationHeader);
                contextURL = new URL(contextURL, locationHeader);
            }

            // Retrieve content type and throw an IOException if doesn't correspond to a parsable type (HTML/XHTML)
            String contentType = conn.getContentType();
            if(contentType==null || !isParsableMimeType(contentType))
                throw new IOException("Document cannot be parsed (not HTML or XHTML)");  // Todo: localize this message

            // Use the encoding reported in HTTP header if there was one, otherwise just use the default encoding
            String enc = extractCharset(contentType);
            InputStream in = conn.getInputStream();
            InputStreamReader ir;
            if(enc==null) {
                ir = new InputStreamReader(in);
            }
            else {
                try {
                    ir = new InputStreamReader(in, enc);
                }
                catch(UnsupportedEncodingException e) {
                    ir = new InputStreamReader(in);
                }
            }

            br = new BufferedReader(ir);

            Vector<AbstractFile> children = new Vector<AbstractFile>();
            // Links already turned into children, a TreeSet for fast (log(n)) search operations
            TreeSet<String> childrenURL = new TreeSet<String>();

            Credentials credentials = fileURL.getCredentials();
            String parentHost = fileURL.getHost();

            // Each line is scanned for single-quoted values first, then for double-quoted ones
            Pattern[] patterns = {linkAttributePatternSQ, linkAttributePatternDQ};
            char[] quotes = {'\'', '\"'};

            String line;
            while((line=br.readLine())!=null) {
                for(int i=0; i<patterns.length; i++) {
                    Matcher matcher = patterns[i].matcher(line);
                    while(matcher.find()) {
                        // The link is what lies between the first quote and the trailing one
                        String match = matcher.group();
                        String link = match.substring(match.indexOf(quotes[i])+1, match.length()-1);

                        // These are not proper URLs, skip them
                        if(link.startsWith("mailto") || link.startsWith("MAILTO") || link.startsWith("#") || link.startsWith("javascript:"))
                            continue;

                        // Don't add the same link more than once
                        if(childrenURL.contains(link))
                            continue;

                        try {
                            LOGGER.trace("creating child {} context={}", link, contextURL);
                            URL childURL = new URL(contextURL, link);

                            // Create the child FileURL instance
                            FileURL childFileURL = FileURL.getFileURL(childURL.toExternalForm());
                            // Keep the parent's credentials (HTTP basic authentication), only if the host is the same.
                            // It would otherwise be unsafe.
                            if(parentHost.equals(childFileURL.getHost()))
                                childFileURL.setCredentials(credentials);

                            // TODO: resolve file here instead of in the constructor, and multiplex requests just like a browser
                            children.add(FileFactory.getFile(childFileURL, null, childURL, childURL.toString()));
                            childrenURL.add(link);
                        }
                        catch(IOException e) {
                            LOGGER.info("Cannot create child: {}", link, e);
                        }
                    }
                }
            }

            AbstractFile childrenArray[] = new AbstractFile[children.size()];
            children.toArray(childrenArray);
            return childrenArray;
        }
        catch(IOException e) {
            LOGGER.info("Exception caught while parsing HTML, throwing IOException", e);
            throw e;
        }
        catch(RuntimeException e) {
            LOGGER.info("Exception caught while parsing HTML, throwing IOException", e);
            // Callers only expect IOException: wrap the exception but preserve it as the cause
            throw new IOException("Error while parsing HTML document", e);
        }
        finally {
            if(br!=null) {
                try {
                    // Closing the reader closes the connection's input stream
                    br.close();
                }
                catch(IOException e) {
                    // Nothing more can be done
                }
            }
            else if(conn!=null) {
                // The body was never read: release the connection
                conn.disconnect();
            }
        }
    }


    ////////////////////////
    // Overridden methods //
    ////////////////////////

    @Override
    public boolean isHidden() {
        return false;
    }

    /**
     * Returns the URL-decoded (UTF-8) name of this file, or the raw name if it cannot be decoded.
     */
    @Override
    public String getName() {
        String rawName = super.getName();
        try {
            return java.net.URLDecoder.decode(rawName, "utf-8");
        }
        catch(Exception e) {
            // Broad catch on purpose: malformed escape sequences raise unchecked exceptions
            return rawName;
        }
    }

    /**
     * Overrides AbstractFile's getInputStream(long) method to provide a more efficient implementation:
     * use the HTTP 1.1 'Range' header to start the transfer at the given offset.
     * If the server ignores the header and sends the whole resource (anything but a 206 response), the bytes
     * preceding the offset are skipped so that the returned stream is always positioned at the requested offset.
     *
     * @param offset offset in bytes from the beginning of the resource
     * @return a stream positioned at the given offset
     * @throws IOException if the request failed or if the resource is shorter than the given offset
     */
    @Override
    public InputStream getInputStream(long offset) throws IOException {
        HttpURLConnection conn = getHttpURLConnection(this.url);

        // Set header that allows to resume transfer
        conn.setRequestProperty("Range", "bytes="+offset+"-");

        // Establish connection, check HTTP response code and throw appropriate IOException if request failed
        connectAndCheck(conn);

        InputStream in = conn.getInputStream();

        // 'Range' is optional for servers: a non-partial response starts at the beginning of the resource
        if(offset>0 && conn.getResponseCode()!=HttpURLConnection.HTTP_PARTIAL) {
            try {
                skipFully(in, offset);
            }
            catch(IOException e) {
                try {
                    in.close();
                }
                catch(IOException e2) {
                    // Report the original failure
                }
                throw e;
            }
        }

        return in;
    }


    ///////////////////
    // Inner classes //
    ///////////////////

    /**
     * HTTPRandomAccessInputStream extends BlockRandomInputStream to provide random read access to an HTTPFile.
     * It uses the 'Range' request header to read the HTTP resource partially, chunk by chunk and reposition the offset
     * when {@link #seek(long)} is called.
     */
    private class HTTPRandomAccessInputStream extends BlockRandomInputStream {

        /** Amount of data returned */
        private final static int CHUNK_SIZE = 1024;

        /** Length of the HTTP resource */
        private long length;

        /**
         * Creates a new stream, resolving the file first if needed as its length must be known.
         *
         * @throws IOException if the length of the resource could not be determined
         */
        private HTTPRandomAccessInputStream() throws IOException {
            super(CHUNK_SIZE);

            // HEAD the HTTP resource to get its length
            if(!fileResolved)
                resolveFile();

            length = getSize();
            if(length == -1)    // Knowing the content length is required
                throw new IOException("Unable to determine the length of the HTTP resource");
        }


        ///////////////////////////////////////////
        // BlockRandomInputStream implementation //
        ///////////////////////////////////////////

        /**
         * Reads a block with a ranged GET request.
         *
         * @throws IOException if the request failed, or if the server does not honor the 'Range' header
         */
        @Override
        protected int readBlock(long fileOffset, byte block[], int blockLen) throws IOException {
            HttpURLConnection conn = getHttpURLConnection(url);

            // Byte ranges are inclusive on both ends: the last requested byte is offset+len-1, capped to the
            // last byte of the resource
            long lastByte = Math.min(fileOffset+blockLen, length) - 1;
            conn.setRequestProperty("Range", "bytes="+fileOffset+"-"+lastByte);

            connectAndCheck(conn);

            // 'Range' may not be supported by the HTTP server, in which case the whole resource is sent starting
            // from offset 0: reading it as if it started at fileOffset would silently return the wrong data
            if(fileOffset>0 && conn.getResponseCode()!=HttpURLConnection.HTTP_PARTIAL) {
                conn.disconnect();
                throw new IOException("Server does not support partial (Range) requests");
            }

            // Read up to blockLen bytes
            InputStream in = conn.getInputStream();
            try {
                int totalRead = 0;
                while(totalRead<blockLen) {
                    int read = in.read(block, totalRead, blockLen-totalRead);
                    if(read==-1)
                        break;

                    totalRead += read;
                }

                return totalRead;
            }
            finally {
                in.close();
            }
        }

        public long getLength() throws IOException {
            return length;
        }

        @Override
        public void close() throws IOException {
            // No-op, the underlying stream is already closed
        }
    }
}