/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.file;
// JDK imports
import java.net.URL;
import java.util.Date;
import java.util.TreeMap;
import java.io.IOException;
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
// Tika imports
import org.apache.tika.mime.MimeType;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
/************************************
* FileResponse.java mimics file replies as http response.
* It tries its best to follow http's way for headers, response codes
* as well as exceptions.
*
* Comments:
* (1) java.net.URL and java.net.URLConnection can handle file: scheme.
* However they are not flexible enough, so not used in this implementation.
*
* (2) java.io.File is used for its abstractness across platforms.
* Warning:
* java.io.File API (1.4.2) does not elaborate on how special files,
* such as /dev/* in unix and /proc/* on linux, are treated. Tests show
* (a) java.io.File.isFile() returns false for /dev/*
* (b) java.io.File.isFile() returns true for /proc/*
* (c) java.io.File.length() returns 0 for /proc/*
* We are probably okay for now. Could be buggy here.
* How about special files on windows?
*
* (3) java.io.File API (1.4.2) does not seem to know unix hard link files.
* They are just treated as individual files.
*
* (4) No fancy POSIX file attributes yet. May never need?
*
* @author John Xing
***********************************/
public class FileResponse {

  /** Zero-length body handed out by {@link #toContent()} when no content was captured. */
  private static final byte[] EMPTY_CONTENT = new byte[0];

  /** String form of the requested url. */
  private String orig;
  /** Base url of the response; set to the same string as {@link #orig}. */
  private String base;
  /** Captured body; stays null for every response that returns before content is read. */
  private byte[] content;
  /** Http-style response code. */
  private int code;
  /** Http-style response headers. */
  private Metadata headers = new Metadata();
  /** Owning protocol handler; supplies maxContentLength and crawlParents. */
  private final File file;
  private Configuration conf;
  private MimeUtil MIME;

  /** Returns the response code. */
  public int getCode() {
    return code;
  }

  /** Returns the value of a named header. */
  public String getHeader(String name) {
    return headers.get(name);
  }

  /** Returns the captured content, or null when none was captured. */
  public byte[] getContent() {
    return content;
  }

  /**
   * Wraps this response into a {@link Content} object. An empty byte array is
   * substituted when no content was captured.
   */
  public Content toContent() {
    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
        getHeader(Response.CONTENT_TYPE),
        headers, this.conf);
  }

  /**
   * Resolves a file: url against the local file system and records the
   * outcome as an http-like response:
   * <ul>
   * <li>404 when the path does not exist,</li>
   * <li>401 when it cannot be read,</li>
   * <li>300 plus a Location header when the path differs from its canonical
   * form,</li>
   * <li>304 plus a Last-Modified header when the file's modification time is
   * not later than the modified time stored in <code>datum</code>,</li>
   * <li>200 with content and headers for a directory or a regular file,</li>
   * <li>500 for anything that is neither a directory nor a regular file.</li>
   * </ul>
   *
   * @param url the url to fetch; its protocol must be "file"
   * @param datum crawl datum whose modified time drives the 304 check
   * @param file the protocol handler this response belongs to
   * @param conf configuration handed to {@link MimeUtil} and {@link Content}
   * @throws FileException if the url is not a file: url, or the file is
   *         larger than Integer.MAX_VALUE bytes
   * @throws IOException if the path cannot be canonicalized, the file cannot
   *         be read, or a directory cannot be listed
   */
  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
      throws FileException, IOException {

    this.orig = url.toString();
    this.base = url.toString();
    this.file = file;
    this.conf = conf;
    MIME = new MimeUtil(conf);

    if (!"file".equals(url.getProtocol())) {
      throw new FileException("Not a file url:" + url);
    }

    if (File.LOG.isTraceEnabled()) {
      File.LOG.trace("fetching " + url);
    }

    // Compare by value: the previous reference comparison (!=) depended on
    // whether the two getters happened to return the same String instance.
    if (!url.getPath().equals(url.getFile())) {
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("url.getPath() != url.getFile(): " + url);
      }
    }

    String path = "".equals(url.getPath()) ? "/" : url.getPath();

    // url.toURI() is only in j2se 1.5.0
    java.io.File f = new java.io.File(path);

    if (!f.exists()) {
      this.code = 404; // http Not Found
      return;
    }

    if (!f.canRead()) {
      this.code = 401; // http Unauthorized
      return;
    }

    // symbolic link or relative path on unix
    // fix me: what's the consequence on windows platform
    // where case is insensitive
    java.io.File canonical = f.getCanonicalFile();
    if (!f.equals(canonical)) {
      headers.set(Response.LOCATION, canonical.toURL().toString());
      this.code = 300; // http redirect
      return;
    }

    long lastModified = f.lastModified();
    if (lastModified <= datum.getModifiedTime()) {
      this.code = 304; // http Not Modified
      this.headers.set(Response.LAST_MODIFIED,
          HttpDateFormat.toString(lastModified));
      return;
    }

    if (f.isDirectory()) {
      getDirAsHttpResponse(f);
    } else if (f.isFile()) {
      getFileAsHttpResponse(f);
    } else {
      this.code = 500; // http Internal Server Error
    }
  }

  /**
   * Reads a regular file into {@link #content} (at most maxContentLength
   * bytes when that limit is non-negative) and sets the Content-Length,
   * Last-Modified and Content-Type headers and response code 200.
   *
   * @param f the file to read
   * @throws FileException if the file is larger than Integer.MAX_VALUE bytes
   * @throws IOException if the file cannot be opened or read
   */
  private void getFileAsHttpResponse(java.io.File f)
      throws FileException, IOException {

    // ignore file of size larger than
    // Integer.MAX_VALUE = 2^31-1 = 2147483647
    long size = f.length();
    if (size > Integer.MAX_VALUE) {
      throw new FileException("file is too large, size: " + size);
    }

    // capture content, truncated to the configured limit if there is one
    int len = (int) size;
    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) {
      len = this.file.maxContentLength;
    }

    this.content = new byte[len];

    int offset = 0;
    java.io.InputStream is = new java.io.FileInputStream(f);
    try {
      int n;
      while (offset < len
          && (n = is.read(this.content, offset, len - offset)) >= 0) {
        offset += n;
      }
    } finally {
      // always release the file handle, even when read() fails
      is.close();
    }

    // keep whatever already have, but issue a warning
    // NOTE(review): on a short read the unread tail of the buffer stays zero.
    if (offset < len) {
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("not enough bytes read from file: " + f.getPath());
      }
    }

    // set headers; Content-Length reports the full file size, not the
    // possibly truncated number of bytes captured
    headers.set(Response.CONTENT_LENGTH, Long.toString(size));
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));
    MimeType mimeType = MIME.getMimeType(f);
    String mimeTypeString = mimeType != null ? mimeType.getName() : "";
    headers.set(Response.CONTENT_TYPE, mimeTypeString);

    // response code
    this.code = 200; // http OK
  }

  /**
   * Renders a directory listing as an html page into {@link #content} and
   * sets the Content-Length, Content-Type and Last-Modified headers and
   * response code 200.
   *
   * @param f the directory to list
   * @throws IOException if the directory entries cannot be obtained
   */
  private void getDirAsHttpResponse(java.io.File f)
      throws IOException {

    String path = f.toString();

    // listFiles() yields null when the listing fails; report that as an
    // IOException instead of failing later with a NullPointerException
    java.io.File[] entries = f.listFiles();
    if (entries == null) {
      throw new IOException("cannot list directory: " + path);
    }

    // a link to the parent is only emitted when parents are to be crawled
    // and the directory is not the root
    boolean includeDotDot = this.file.crawlParents && !"/".equals(path);
    this.content = list2html(entries, path, includeDotDot);

    // set headers
    headers.set(Response.CONTENT_LENGTH,
        Integer.toString(this.content.length));
    headers.set(Response.CONTENT_TYPE, "text/html");
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));

    // response code
    this.code = 200; // http OK
  }

  /**
   * Generates an html index page from a directory listing. Directories and
   * regular files get one link line each; any other entry is skipped.
   *
   * @param list the directory entries; must not be null
   * @param path the directory path shown in the title and heading
   * @param includeDotDot whether to emit a link to the parent directory
   * @return the page as bytes in the platform default charset
   */
  private byte[] list2html(java.io.File[] list,
      String path, boolean includeDotDot) {

    StringBuilder x = new StringBuilder("<html><head>");
    x.append("<title>Index of " + path + "</title></head>\n");
    x.append("<body><h1>Index of " + path + "</h1><pre>\n");

    if (includeDotDot) {
      x.append("<a href='../'>../</a>\t-\t-\t-\n");
    }

    // fix me: we might want to sort list here! but not now.
    for (int i = 0; i < list.length; i++) {
      java.io.File f = list[i];
      String name = f.getName();
      String time = HttpDateFormat.toString(f.lastModified());
      if (f.isDirectory()) {
        // java 1.4.2 api says dir itself and parent dir are not listed,
        // so "." and ".." need no special handling here
        x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
        x.append(time + "\t-\n");
      } else if (f.isFile()) {
        x.append("<a href='" + name + "'>" + name + "</a>\t");
        x.append(time + "\t" + f.length() + "\n");
      }
      // anything that is neither a directory nor a regular file is ignored
    }

    x.append("</pre></body></html>\n");

    return x.toString().getBytes();
  }
}