/**
* This file is part of muCommander, http://www.mucommander.com
* Copyright (C) 2002-2016 Maxence Bernard
*
* muCommander is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* muCommander is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.mucommander.commons.file.protocol.http;
import com.mucommander.commons.file.*;
import com.mucommander.commons.file.protocol.FileProtocols;
import com.mucommander.commons.file.protocol.ProtocolFile;
import com.mucommander.commons.io.BlockRandomInputStream;
import com.mucommander.commons.io.RandomAccessInputStream;
import com.mucommander.commons.io.RandomAccessOutputStream;
import com.mucommander.commons.io.base64.Base64Encoder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* HTTPFile provides access to files located on an HTTP/HTTPS server.
*
* <p>The associated {@link FileURL} schemes are {@link FileProtocols#HTTP} and {@link FileProtocols#HTTPS}.
* The host part of the URL designates the HTTP server. Credentials can be specified in the login and password parts
* and will be used for HTTP Basic Authentication.</p>
*
* <p>Here are a few examples of valid HTTP URLs:
* <code>
* http://www.mucommander.com/index.html<br>
* http://www.mucommander.com/index.php?<br>
* http://john:p4sswd@www.mucommander.com/restricted_area/<br>
* </code>
* </p>
*
* <p>
* A notable feature of HTTPFile is that it handles HTML/XHTML files as archives: when any of the {@link #ls()} methods
* is called, the HTML file is parsed and any link found in the code is considered as a file:
* <ul>
* <li>If the link looks like a link to an HTML file, the child HTTPFile will be 'browsable' ({@link #isBrowsable()}
* will return <code>true</code>).
* <li>If not, the file will just be a regular file.
* </ul>
* </p>
*
* <p>In order to avoid the cost of having to perform a HEAD request for each file, some guessing based on the URL and
* its filename is performed to determine if the file is an HTML/XHTML file or not.
* In practice, this works quite well for most sites but the algorithm will be confused by some non-conventional
* file naming, for instance if an HTML file ends with the '.gif' extension.
* <br>
* A HEAD request is then issued only for non-HTML files, to determine their size and last modified date.
* HTML files will thus have a size returned by {@link #getSize()} of <code>-1</code> (undetermined), and a date
* returned by {@link #getDate()} corresponding to 'now' (current time).</p>
*
* <p>Access to HTTP files is provided by the <code>java.net</code> API. The {@link #getUnderlyingFileObject()} method
* returns the <code>java.net.URL</code> instance corresponding to this HTTPFile.</p>
*
* @author Maxence Bernard
*/
public class HTTPFile extends ProtocolFile {

    private static final Logger LOGGER = LoggerFactory.getLogger(HTTPFile.class);

    /** Maximum number of consecutive HTTP redirections followed by {@link #ls()} before giving up */
    private final static int MAX_REDIRECTS = 20;

    /** java.net.URL corresponding to this file */
    private final URL url;

    /** Contains the attributes of the remote HTTP resource. Contains default values until the file has been resolved */
    private final SimpleFileAttributes attributes;

    /** True if the file should be resolved on the remote HTTP server to fetch attribute values, false if these are
     * guessed. */
    private final boolean resolve;

    /** True if file has been resolved on the remote HTTP server, either successfully or unsuccessfully */
    private boolean fileResolved;

    /** True once {@link #parent} has been determined, either lazily by {@link #getParent()} or explicitly by
     * {@link #setParent(AbstractFile)} */
    private boolean parentValSet;

    /** Parent file, meaningful only once <code>parentValSet</code> is true. May be <code>null</code>. */
    protected AbstractFile parent;

    /** Permissions for HTTP files: r-- (400 octal). Only the 'user' permissions bits are supported. */
    private final static FilePermissions PERMISSIONS = new SimpleFilePermissions(256, 448);

    /** User agent used for all HTTP connections made by HTTPFile */
    // TODO: add file API version, like muCommander-file-API/1.0
    public static final String USER_AGENT = "muCommander-file-API (Java "+System.getProperty("java.vm.version")
                                          + "; " + System.getProperty("os.name") + " " +
                                          System.getProperty("os.version") + " " + System.getProperty("os.arch") + ")";

    /** Matches HTML and XHTML attribute key/value pairs, where the value is surrounded by Single Quotes */
    private final static Pattern linkAttributePatternSQ = Pattern.compile("(src|href|SRC|HREF)=\\\'.*?\\\'");

    /** Matches HTML and XHTML attribute key/value pairs, where the value is surrounded by Double Quotes */
    private final static Pattern linkAttributePatternDQ = Pattern.compile("(src|href|SRC|HREF)=\\\".*?\\\"");

    /** Link patterns, in the order in which they are applied to each line by {@link #ls()} */
    private final static Pattern[] LINK_ATTRIBUTE_PATTERNS = {linkAttributePatternSQ, linkAttributePatternDQ};


    /**
     * Creates a new HTTPFile for the given FileURL, deriving the <code>java.net.URL</code> from it.
     *
     * @param fileURL location of the HTTP resource
     * @throws IOException if the URL is malformed, its scheme is neither HTTP nor HTTPS, or it has no host
     */
    protected HTTPFile(FileURL fileURL) throws IOException {
        // TODO: optimize this
        this(fileURL, new URL(fileURL.toString(false)));
    }

    /**
     * Creates a new HTTPFile for the given FileURL, using an already created <code>java.net.URL</code>.
     *
     * @param fileURL location of the HTTP resource
     * @param url the <code>java.net.URL</code> used to connect to the resource
     * @throws IOException if the scheme is neither HTTP nor HTTPS, or if the URL has no host
     */
    protected HTTPFile(FileURL fileURL, URL url) throws IOException {
        super(fileURL);

        String scheme = fileURL.getScheme().toLowerCase();
        boolean httpScheme = scheme.equals(FileProtocols.HTTP) || scheme.equals(FileProtocols.HTTPS);
        if(!httpScheme || fileURL.getHost()==null)
            throw new IOException("Not a valid HTTP/HTTPS location: unsupported scheme or missing host");

        this.url = url;
        attributes = getDefaultAttributes();

        // Simple/fuzzy heuristic to avoid file resolution (HEAD) in cases where we have good reasons to believe that
        // the URL denotes a HTML/XHTML document:
        //  - URL's path has no filename (e.g. http://www.mucommander.com/) or path ends with '/' (e.g. http://www.mucommander.com/download/)
        //  - URL has a query part (works most of the time, but not always)
        //  - URL has an extension that is registered with an HTML/XHTML mime type
        boolean looksLikeHTML = fileURL.getFilename()==null
                             || fileURL.getPath().endsWith("/")
                             || fileURL.getQuery()!=null
                             || isParsableMimeType(MimeTypes.getMimeType(this));

        if(looksLikeHTML)
            attributes.setDirectory(true);

        // Attributes of presumed HTML documents are guessed, all other files are resolved with a HEAD request
        resolve = !looksLikeHTML;
    }


    /**
     * Returns the attributes a file has before it is resolved: current date, unknown size (-1) and read-only
     * permissions. The other attributes are left to their default values.
     *
     * @return a new set of default attributes
     */
    private static SimpleFileAttributes getDefaultAttributes() {
        SimpleFileAttributes attributes = new SimpleFileAttributes();

        attributes.setDate(System.currentTimeMillis());
        attributes.setSize(-1);     // Unknown
        attributes.setPermissions(PERMISSIONS);
        // exist = false
        // isDirectory = false
        // path = null (unused)

        return attributes;
    }

    /**
     * Returns <code>true</code> if the given mime type corresponds to HTML or XHTML and can be parsed.
     *
     * @param mimeType a MIME type / content type, may be <code>null</code>
     * @return <code>true</code> if the given mime type corresponds to HTML or XHTML and can be parsed
     */
    private static boolean isParsableMimeType(String mimeType) {
        return mimeType!=null
            && (mimeType.startsWith("text/html") || mimeType.startsWith("application/xhtml+xml") || mimeType.startsWith("application/xml"));
    }

    /**
     * Returns the value of the <code>charset</code> parameter of the given <code>Content-Type</code> header value,
     * e.g. <code>UTF-8</code> for <code>text/html; charset=UTF-8</code>. Surrounding double quotes are removed.
     *
     * @param contentType a Content-Type header value, may be <code>null</code>
     * @return the charset name, <code>null</code> if the content type does not specify one
     */
    private static String extractCharset(String contentType) {
        if(contentType==null)
            return null;

        // Parameters are separated by ';', the first token is the media type itself and has no '='
        StringTokenizer parameters = new StringTokenizer(contentType, ";");
        while(parameters.hasMoreTokens()) {
            String parameter = parameters.nextToken().trim();
            int equalsPos = parameter.indexOf('=');
            if(equalsPos==-1 || !parameter.substring(0, equalsPos).trim().equalsIgnoreCase("charset"))
                continue;

            String value = parameter.substring(equalsPos+1).trim();
            if(value.length()>=2 && value.startsWith("\"") && value.endsWith("\""))
                value = value.substring(1, value.length()-1).trim();

            return value.length()==0?null:value;
        }

        return null;
    }

    /**
     * Returns the value of the <code>Content-Length</code> header of the given connection as a <code>long</code>,
     * so that resources larger than <code>Integer.MAX_VALUE</code> bytes are properly reported.
     *
     * @param conn a connected HttpURLConnection
     * @return the content length, <code>-1</code> if the header is missing or is not a valid positive number
     */
    private static long parseContentLength(HttpURLConnection conn) {
        String headerValue = conn.getHeaderField("Content-Length");
        if(headerValue==null)
            return -1;

        try {
            long contentLength = Long.parseLong(headerValue.trim());
            return contentLength<0?-1:contentLength;
        }
        catch(NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Skips exactly <code>n</code> bytes of the given stream.
     *
     * @param in the stream to skip bytes from
     * @param n number of bytes to skip
     * @throws IOException if the end of the stream is reached before <code>n</code> bytes could be skipped, or if an
     * I/O error occurs
     */
    private static void skipFully(InputStream in, long n) throws IOException {
        long remaining = n;
        while(remaining>0) {
            long skipped = in.skip(remaining);
            if(skipped>0) {
                remaining -= skipped;
                continue;
            }

            // skip() is allowed to return 0 without EOF being reached: read a single byte to tell the cases apart
            if(in.read()==-1)
                throw new EOFException("End of stream reached with "+remaining+" bytes left to skip");
            remaining--;
        }
    }

    /**
     * Performs a HEAD request on the HTTP server to retrieve the file's attributes.
     *
     * <p>This method never throws: if the request fails for any reason (including a 404 or 401 response), the failure
     * is logged, the attributes keep their current values and the file is still marked as resolved so that the
     * request is not issued again.</p>
     */
    private void resolveFile() {
        try {
            LOGGER.info("Resolving {}", url);

            // Get URLConnection instance
            HttpURLConnection conn = getHttpURLConnection(url);

            // Use HEAD instead of GET as we don't need the body
            conn.setRequestMethod("HEAD");

            // Establish connection
            conn.connect();

            // Check HTTP response code and throw appropriate IOException if request failed
            checkHTTPResponse(conn);

            // Resolve date: use last-modified header, if not set use date header, and if still not set use System.currentTimeMillis
            long date = conn.getLastModified();
            if(date==0) {
                date = conn.getDate();
                if(date==0)
                    date = System.currentTimeMillis();
            }
            attributes.setDate(date);

            // Resolve size with content-length header (-1 if not available)
            attributes.setSize(parseContentLength(conn));

            // Test if content is HTML
            if(isParsableMimeType(conn.getContentType()))
                attributes.setDirectory(true);

            // File was successfully resolved on the remote HTTP server and thus exists
            attributes.setExists(true);
        }
        catch(IOException e) {
            LOGGER.info("Failed to resolve file {}", url, e);
        }
        finally {
            // Mark the file as resolved, even if the request failed
            fileResolved = true;
        }
    }

    /**
     * Opens and returns a <code>HttpURLConnection</code> to the resource denoted by the specified URL.
     * If the {@link FileURL} contained by this HTTPFile contains {@link Credentials}, these will be used as credentials
     * for <i>HTTP Basic Authentication</i>, but only if the specified URL is this file's own URL or designates the same
     * host: credentials must not be disclosed to another host, for instance when following a redirection.
     *
     * @param url the URL to open
     * @return a HttpURLConnection to the resource denoted by the specified URL
     * @throws IOException if the HttpURLConnection could not be opened
     */
    private HttpURLConnection getHttpURLConnection(URL url) throws IOException {
        // Get URLConnection instance
        HttpURLConnection conn = (HttpURLConnection)url.openConnection();

        // If credentials are contained in this HTTPFile's FileURL, use them for Basic HTTP Authentication
        Credentials credentials = fileURL.getCredentials();
        if(credentials!=null) {
            boolean sameHost = url==this.url || fileURL.getHost().equalsIgnoreCase(url.getHost());
            if(sameHost)
                conn.setRequestProperty(
                    "Authorization",
                    "Basic "+ Base64Encoder.encode(credentials.getLogin()+":"+credentials.getPassword())
                );
        }

        // Set user-agent header.
        conn.setRequestProperty("User-Agent", USER_AGENT);

        return conn;
    }

    /**
     * Checks the response code of the given HttpURLConnection and :
     * <ul>
     *  <li>throws an {@link AuthException} if the response code is 401 (Unauthorized)
     *  <li>throws an IOException if the response code is not in the 2xx - 3xx range (not a positive response)
     *  <li>does nothing otherwise
     * </ul>
     *
     * @param conn the HttpURLConnection connection to examine
     * @throws AuthException if the response code is 401 (Unauthorized)
     * @throws IOException if the response code is not in the 2xx - 3xx range (not a positive response)
     */
    private void checkHTTPResponse(HttpURLConnection conn) throws AuthException, IOException {
        int responseCode = conn.getResponseCode();
        LOGGER.info("response code = {}", responseCode);

        // If we got a 401 (Unauthorized) response, throw an AuthException to ask for credentials
        if(responseCode==401)
            throw new AuthException(fileURL, conn.getResponseMessage());

        if(responseCode<200 || responseCode>=400)
            throw new IOException(conn.getResponseMessage());
    }

    /**
     * Resolves the file on the remote server if its attributes are not guessed and it has not been resolved yet.
     */
    private void checkResolveFile() {
        if(resolve && !fileResolved)
            resolveFile();      // the file is considered as resolved, even if the request failed
    }


    /////////////////////////////////////////
    // AbstractFile methods implementation //
    /////////////////////////////////////////

    @Override
    public long getDate() {
        checkResolveFile();

        return attributes.getDate();
    }

    /**
     * Implementation notes: always throws {@link UnsupportedFileOperationException}.
     *
     * @throws UnsupportedFileOperationException always.
     */
    @Override
    @UnsupportedFileOperation
    public void changeDate(long date) throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.CHANGE_DATE);
    }

    @Override
    public long getSize() {
        checkResolveFile();

        return attributes.getSize();    // Size == -1 if not known
    }

    /**
     * Returns the parent of this file. Unless a parent was explicitly specified with {@link #setParent(AbstractFile)},
     * it is created from the parent of this file's URL the first time this method is called, and then cached.
     *
     * @return the parent file, <code>null</code> if this file's URL has no parent
     */
    @Override
    public AbstractFile getParent() {
        if(!parentValSet) {
            FileURL parentURL = fileURL.getParent();
            this.parent = parentURL==null?null:FileFactory.getFile(parentURL);
            this.parentValSet = true;
        }

        return this.parent;
    }

    @Override
    public void setParent(AbstractFile parent) {
        this.parent = parent;
        this.parentValSet = true;
    }

    /**
     * Returns <code>true</code> if a HEAD request on this file succeeded. The request is issued the first time the
     * file's existence is requested, even if the other attributes of this file are guessed.
     */
    @Override
    public boolean exists() {
        // Note: file will only be resolved once, even if the request failed
        if(!fileResolved)
            resolveFile();

        return attributes.exists();
    }

    @Override
    public FilePermissions getPermissions() {
        return attributes.getPermissions();
    }

    @Override
    public PermissionBits getChangeablePermissions() {
        return PermissionBits.EMPTY_PERMISSION_BITS;
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void changePermission(PermissionAccess access, PermissionType permission, boolean enabled) throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.CHANGE_PERMISSION);
    }

    @Override
    public String getOwner() {
        return null;
    }

    @Override
    public boolean canGetOwner() {
        return false;
    }

    @Override
    public String getGroup() {
        return null;
    }

    @Override
    public boolean canGetGroup() {
        return false;
    }

    @Override
    public boolean isDirectory() {
        checkResolveFile();

        return attributes.isDirectory();
    }

    @Override
    public boolean isSymlink() {
        return false;
    }

    @Override
    public boolean isSystem() {
        return false;
    }

    @Override
    public InputStream getInputStream() throws IOException {
        HttpURLConnection conn = getHttpURLConnection(this.url);

        // Establish connection
        conn.connect();

        // Check HTTP response code and throw appropriate IOException if request failed
        checkHTTPResponse(conn);

        return conn.getInputStream();
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public OutputStream getOutputStream() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.WRITE_FILE);
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public OutputStream getAppendOutputStream() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.APPEND_FILE);
    }

    /**
     * Returns a stream that reads this file chunk by chunk using HTTP range requests.
     *
     * @throws IOException if the length of the resource could not be determined
     */
    @Override
    public RandomAccessInputStream getRandomAccessInputStream() throws IOException {
        return new HTTPRandomAccessInputStream();
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public RandomAccessOutputStream getRandomAccessOutputStream() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.RANDOM_WRITE_FILE);
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void delete() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.DELETE);
    }

    /**
     * Always throws {@link UnsupportedFileOperationException} when called.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void copyRemotelyTo(AbstractFile destFile) throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.COPY_REMOTELY);
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void renameTo(AbstractFile destFile) throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.RENAME);
    }

    /**
     * Always throws an {@link UnsupportedFileOperationException}: HTTP files are read-only.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public void mkdir() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.CREATE_DIRECTORY);
    }

    /**
     * Always throws {@link UnsupportedFileOperationException} when called.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public long getFreeSpace() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.GET_FREE_SPACE);
    }

    /**
     * Always throws {@link UnsupportedFileOperationException} when called.
     *
     * @throws UnsupportedFileOperationException always
     */
    @Override
    @UnsupportedFileOperation
    public long getTotalSpace() throws UnsupportedFileOperationException {
        throw new UnsupportedFileOperationException(FileOperation.GET_TOTAL_SPACE);
    }

    /**
     * Returns a <code>java.net.URL</code> instance corresponding to this file.
     */
    @Override
    public Object getUnderlyingFileObject() {
        return url;
    }

    /**
     * Fetches this file with a GET request, follows redirections (up to {@link #MAX_REDIRECTS}) and returns one child
     * file per distinct <code>src</code>/<code>href</code> link found in the HTML/XHTML document.
     * <code>mailto</code>, <code>javascript:</code> and anchor-only (<code>#</code>) links are ignored. Links are
     * resolved against the URL the document was eventually served from.
     *
     * @return the files linked by this document, in the order in which they were found
     * @throws AuthException if the server replied with 401 (Unauthorized)
     * @throws IOException if the request failed, if there were too many redirections, if the document is not
     * HTML/XHTML or if it could not be read
     */
    @Override
    public AbstractFile[] ls() throws IOException {
        // Implementation note: javax.swing.text.html.HTMLEditorKit isn't quite powerful enough to be used

        BufferedReader br = null;
        HttpURLConnection conn = null;
        try {
            URL contextURL = this.url;
            int nbRedirects = 0;
            while(true) {
                // Get a connection instance
                conn = getHttpURLConnection(contextURL);

                // Disable automatic redirections to track URL change
                conn.setInstanceFollowRedirects(false);

                // Establish connection
                conn.connect();

                // Check HTTP response code and throw appropriate IOException if request failed
                checkHTTPResponse(conn);

                // Stop unless response code is in the 3xx range (redirection) and 'Location' field is set
                int responseCode = conn.getResponseCode();
                String locationHeader = conn.getHeaderField("Location");
                if(responseCode<300 || responseCode>=400 || locationHeader==null)
                    break;

                // Release the connection to the redirecting resource, its body is of no interest
                conn.disconnect();

                // Protect against redirection loops
                if(++nbRedirects>MAX_REDIRECTS)
                    throw new IOException("Too many redirections (more than "+MAX_REDIRECTS+")");

                // Redirect to Location field and remember context url
                LOGGER.info("Location header = {}", locationHeader);
                contextURL = new URL(contextURL, locationHeader);
            }

            // Retrieve content type and throw an IOException if doesn't correspond to a parsable type (HTML/XHTML)
            String contentType = conn.getContentType();
            if(!isParsableMimeType(contentType))
                throw new IOException("Document cannot be parsed (not HTML or XHTML)");  // Todo: localize this message

            // Use the encoding reported in HTTP header if there was one, otherwise just use the default encoding
            String enc = extractCharset(contentType);
            InputStream in = conn.getInputStream();
            InputStreamReader ir;
            if(enc==null) {
                ir = new InputStreamReader(in);
            }
            else {
                try {
                    ir = new InputStreamReader(in, enc);
                }
                catch(UnsupportedEncodingException e) {
                    // Unknown or invalid charset name: fall back to the default encoding
                    ir = new InputStreamReader(in);
                }
            }
            br = new BufferedReader(ir);

            Vector<AbstractFile> children = new Vector<AbstractFile>();
            // Links that have already been turned into a child, a TreeSet for fast (log(n)) search operations
            TreeSet<String> childrenURL = new TreeSet<String>();

            Credentials credentials = fileURL.getCredentials();
            String parentHost = fileURL.getHost();

            String line;
            while((line=br.readLine())!=null) {
                for(Pattern pattern : LINK_ATTRIBUTE_PATTERNS) {
                    Matcher matcher = pattern.matcher(line);
                    while(matcher.find()) {
                        // The match has the form key=Qvalue Q where Q is the quote character: the value starts 2
                        // characters after '=' and ends right before the last character
                        String match = matcher.group();
                        String link = match.substring(match.indexOf('=')+2, match.length()-1);

                        // These are not proper URLs, skip them
                        if(link.startsWith("mailto") || link.startsWith("MAILTO")
                        || link.startsWith("#")
                        || link.startsWith("javascript:"))
                            continue;

                        // Don't add the same link more than once
                        if(childrenURL.contains(link))
                            continue;

                        try {
                            LOGGER.trace("creating child {} context={}", link, contextURL);
                            URL childURL = new URL(contextURL, link);

                            // Create the child FileURL instance
                            FileURL childFileURL = FileURL.getFileURL(childURL.toExternalForm());
                            // Keep the parent's credentials (HTTP basic authentication), only if the host is the same.
                            // It would otherwise be unsafe.
                            if(parentHost.equals(childFileURL.getHost()))
                                childFileURL.setCredentials(credentials);

                            // TODO: resolve file here instead of in the constructor, and multiplex requests just like a browser
                            children.add(FileFactory.getFile(childFileURL, null, childURL, childURL.toString()));
                            childrenURL.add(link);
                        }
                        catch(IOException e) {
                            LOGGER.info("Cannot create child: {}", link, e);
                        }
                    }
                }
            }

            return children.toArray(new AbstractFile[children.size()]);
        }
        catch (Exception e) {
            LOGGER.info("Exception caught while parsing HTML, throwing IOException", e);

            if(e instanceof IOException)
                throw (IOException)e;

            // Keep the original exception as the cause so that the failure can be diagnosed
            throw new IOException("Failed to list the links of the HTML document", e);
        }
        finally {
            if(br!=null) {
                // Closing the reader closes the connection's input stream
                try {
                    br.close();
                }
                catch(IOException e) {
                    // Nothing more can be done
                }
            }
            else if(conn!=null) {
                // The body was never opened: release the connection explicitly
                conn.disconnect();
            }
        }
    }


    ////////////////////////
    // Overridden methods //
    ////////////////////////

    @Override
    public boolean isHidden() {
        return false;
    }

    /**
     * Returns the URL-decoded (UTF-8) name of this file, or the raw name if it cannot be decoded.
     */
    @Override
    public String getName() {
        try {
            return java.net.URLDecoder.decode(super.getName(), "utf-8");
        }
        catch(Exception e) {
            // Name contains malformed escape sequences: use it as is
            return super.getName();
        }
    }

    /**
     * Overrides AbstractFile's getInputStream(long) method to provide a more efficient implementation:
     * use the HTTP 1.1 <code>Range</code> header to start the transfer at the given offset.
     *
     * <p>If the server ignores the <code>Range</code> header and sends the whole resource (any response other than
     * 206 Partial Content), the first <code>offset</code> bytes are skipped so that the returned stream is always
     * positioned at the requested offset.</p>
     *
     * @param offset the offset in bytes from the beginning of the resource
     * @throws IOException if the request failed, or if the resource is shorter than the specified offset
     */
    @Override
    public InputStream getInputStream(long offset) throws IOException {
        HttpURLConnection conn = getHttpURLConnection(this.url);

        // Set header that allows to resume transfer
        conn.setRequestProperty("Range", "bytes="+offset+"-");

        // Establish connection
        conn.connect();

        // Check HTTP response code and throw appropriate IOException if request failed
        checkHTTPResponse(conn);

        InputStream in = conn.getInputStream();

        // The server did not honour the Range header: the stream starts at the beginning of the resource
        if(offset>0 && conn.getResponseCode()!=HttpURLConnection.HTTP_PARTIAL) {
            try {
                skipFully(in, offset);
            }
            catch(IOException e) {
                try {
                    in.close();
                }
                catch(IOException e2) {
                    // Report the original failure
                }
                throw e;
            }
        }

        return in;
    }


    ///////////////////
    // Inner classes //
    ///////////////////

    /**
     * HTTPRandomAccessInputStream extends BlockRandomInputStream to provide random read access to an HTTPFile.
     * It uses the 'Range' request header to read the HTTP resource partially, chunk by chunk and reposition the offset
     * when {@link #seek(long)} is called.
     */
    private class HTTPRandomAccessInputStream extends BlockRandomInputStream {

        /** Amount of data returned */
        private final static int CHUNK_SIZE = 1024;

        /** Length of the HTTP resource */
        private final long length;

        /**
         * Creates a new stream, resolving the enclosing file first if needed in order to know its length.
         *
         * @throws IOException if the length of the resource is not known
         */
        private HTTPRandomAccessInputStream() throws IOException {
            super(CHUNK_SIZE);

            // HEAD the HTTP resource to get its length
            if(!fileResolved)
                resolveFile();

            length = getSize();
            if(length == -1)    // Knowing the content length is required
                throw new IOException("Content length is unknown, random access is not possible");
        }


        ///////////////////////////////////////////
        // BlockRandomInputStream implementation //
        ///////////////////////////////////////////

        /**
         * Reads up to <code>blockLen</code> bytes starting at <code>fileOffset</code> with a range request.
         *
         * @return the number of bytes read into <code>block</code>
         * @throws IOException if the request failed, or if the server ignored the requested range
         */
        @Override
        protected int readBlock(long fileOffset, byte block[], int blockLen) throws IOException {
            HttpURLConnection conn = getHttpURLConnection(url);

            // Byte ranges are inclusive: the last requested byte is at fileOffset+blockLen-1, capped to the last
            // byte of the resource
            long lastOffset = Math.min(fileOffset+blockLen, length)-1;
            conn.setRequestProperty("Range", "bytes="+fileOffset+"-"+lastOffset);

            conn.connect();
            checkHTTPResponse(conn);

            // 'Range' may not be supported by the HTTP server: in that case the whole resource is returned and the
            // data would not start at the requested offset
            if(fileOffset>0 && conn.getResponseCode()!=HttpURLConnection.HTTP_PARTIAL) {
                conn.disconnect();
                throw new IOException("HTTP server does not support range requests");
            }

            // Read up to blockLen bytes
            InputStream in = conn.getInputStream();
            try {
                int totalRead = 0;
                while(totalRead<blockLen) {
                    int read = in.read(block, totalRead, blockLen-totalRead);
                    if(read==-1)
                        break;

                    totalRead += read;
                }

                return totalRead;
            }
            finally {
                in.close();
            }
        }

        public long getLength() throws IOException {
            return length;
        }

        @Override
        public void close() throws IOException {
            // No-op, the underlying stream is already closed
        }
    }
}