/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.protocol.file; // JDK imports import java.net.URL; import java.net.URI; import java.util.Date; import java.util.TreeMap; import java.io.IOException; import java.io.UnsupportedEncodingException; // Nutch imports import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.HttpDateFormat; import org.apache.nutch.net.protocols.Response; // Tika imports import org.apache.tika.Tika; // Hadoop imports import org.apache.hadoop.conf.Configuration; /************************************ * FileResponse.java mimics file replies as http response. It tries its best to * follow http's way for headers, response codes as well as exceptions. * * Comments: (1) java.net.URL and java.net.URLConnection can handle file: * scheme. However they are not flexible enough, so not used in this * implementation. * * (2) java.io.File is used for its abstractness across platforms. Warning: * java.io.File API (1.4.2) does not elaborate on how special files, such as * /dev/* in unix and /proc/* on linux, are treated. Tests show (a) * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile() * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are * probably oaky for now. Could be buggy here. How about special files on * windows? * * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They * are just treated as individual files. * * (4) No funcy POSIX file attributes yet. May never need? * * @author John Xing ***********************************/ public class FileResponse { private String orig; private String base; private byte[] content; private static final byte[] EMPTY_CONTENT = new byte[0]; private int code; private Metadata headers = new Metadata(); private final File file; private Configuration conf; private MimeUtil MIME; private Tika tika; /** Returns the response code. */ public int getCode() { return code; } /** Returns the value of a named header. */ public String getHeader(String name) { return headers.get(name); } public byte[] getContent() { return content; } public Content toContent() { return new Content(orig, base, (content != null ? content : EMPTY_CONTENT), getHeader(Response.CONTENT_TYPE), headers, this.conf); } public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) throws FileException, IOException { this.orig = url.toString(); this.base = url.toString(); this.file = file; this.conf = conf; MIME = new MimeUtil(conf); tika = new Tika(); if (!"file".equals(url.getProtocol())) throw new FileException("Not a file url:" + url); if (File.LOG.isTraceEnabled()) { File.LOG.trace("fetching " + url); } if (url.getPath() != url.getFile()) { if (File.LOG.isWarnEnabled()) { File.LOG.warn("url.getPath() != url.getFile(): " + url); } } String path = "".equals(url.getPath()) ? "/" : url.getPath(); try { // specify the encoding via the config later? path = java.net.URLDecoder.decode(path, "UTF-8"); } catch (UnsupportedEncodingException ex) { } try { this.content = null; // url.toURI() is only in j2se 1.5.0 //java.io.File f = new java.io.File(url.toURI()); java.io.File f = new java.io.File(path); if (!f.exists()) { this.code = 404; // http Not Found return; } if (!f.canRead()) { this.code = 401; // http Unauthorized return; } // symbolic link or relative path on unix // fix me: what's the consequence on windows platform // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers //hdrs.put("Location", f.getCanonicalFile().toURI()); // // we want to automatically escape characters that are illegal in URLs. // It is recommended that new code convert an abstract pathname into a URL // by first converting it into a URI, via the toURI method, and then // converting the URI into a URL via the URI.toURL method. headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString()); this.code = 300; // http redirect return; } if (f.lastModified() <= datum.getModifiedTime()) { this.code = 304; this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified())); return; } if (f.isDirectory()) { getDirAsHttpResponse(f); } else if (f.isFile()) { getFileAsHttpResponse(f); } else { this.code = 500; // http Internal Server Error return; } } catch (IOException e) { throw e; } } // get file as http response private void getFileAsHttpResponse(java.io.File f) throws FileException, IOException { // ignore file of size larger than // Integer.MAX_VALUE = 2^31-1 = 2147483647 long size = f.length(); if (size > Integer.MAX_VALUE) { throw new FileException("file is too large, size: " + size); // or we can do this? // this.code = 400; // http Bad request // return; } // capture content int len = (int) size; if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) len = this.file.maxContentLength; this.content = new byte[len]; java.io.InputStream is = new java.io.FileInputStream(f); int offset = 0; int n = 0; while (offset < len && (n = is.read(this.content, offset, len - offset)) >= 0) { offset += n; } if (offset < len) { // keep whatever already have, but issue a warning if (File.LOG.isWarnEnabled()) { File.LOG.warn("not enough bytes read from file: " + f.getPath()); } } is.close(); // set headers headers.set(Response.CONTENT_LENGTH, new Long(size).toString()); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); String mimeType = MIME.getMimeType(f); headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : ""); // response code this.code = 200; // http OK } // get dir list as http response private void getDirAsHttpResponse(java.io.File f) throws IOException { String path = f.toString(); if (this.file.crawlParents) this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true); else this.content = list2html(f.listFiles(), path, false); // set headers headers.set(Response.CONTENT_LENGTH, new Integer(this.content.length).toString()); headers.set(Response.CONTENT_TYPE, "text/html"); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); // response code this.code = 200; // http OK } // generate html page from dir list private byte[] list2html(java.io.File[] list, String path, boolean includeDotDot) { StringBuffer x = new StringBuffer("<html><head>"); x.append("<title>Index of " + path + "</title></head>\n"); x.append("<body><h1>Index of " + path + "</h1><pre>\n"); if (includeDotDot) { x.append("<a href='../'>../</a>\t-\t-\t-\n"); } // fix me: we might want to sort list here! but not now. java.io.File f; for (int i = 0; i < list.length; i++) { f = list[i]; String name = f.getName(); String time = HttpDateFormat.toString(f.lastModified()); if (f.isDirectory()) { // java 1.4.2 api says dir itself and parent dir are not listed // so the following is not needed. // if (name.equals(".") || name.equals("..")) // continue; x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t"); x.append(time + "\t-\n"); } else if (f.isFile()) { x.append("<a href='" + name + "'>" + name + "</a>\t"); x.append(time + "\t" + f.length() + "\n"); } else { // ignore any other } } x.append("</pre></body></html>\n"); return new String(x).getBytes(); } }