package us.codecraft.webmagic.downloader;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

import java.io.*;

/**
 * Simulates downloading by reading pages from files cached on the local disk, so that
 * only the extraction step has to run inside the Spider framework.<br>
 *
 * <p>A cache file is looked up at {@code <path>/<task UUID>/<md5 of request URL>}. Its first
 * line must be {@code "url:\t" + url}; the following line is expected to start with
 * {@code "html:\t"} and, together with all remaining lines, forms the page HTML.
 *
 * @author code4crafer@gmail.com
 * Date: 13-6-24
 * Time: 7:24 AM
 */
public class FileDownloader implements Downloader {

    /** Cache root used by the no-argument constructor. */
    private static final String DEFAULT_PATH = "/data/temp/webmagic/";

    /** Prefix of the first line of a cache file, followed by the request URL. */
    private static final String URL_PREFIX = "url:\t";

    /** Prefix stripped from the first HTML line of a cache file. */
    private static final String HTML_PREFIX = "html:\t";

    private final String path;

    /** Optional fallback used when no usable cache file exists; may be {@code null}. */
    private final Downloader downloaderWhenFileMiss;

    private Logger logger = Logger.getLogger(getClass());

    /**
     * Creates a downloader reading from {@code /data/temp/webmagic/} with no fallback downloader.
     */
    public FileDownloader() {
        this(DEFAULT_PATH, null);
    }

    /**
     * Creates a downloader reading from the given directory with no fallback downloader.
     *
     * @param path root directory of the cached files
     */
    public FileDownloader(String path) {
        this(path, null);
    }

    /**
     * Creates a downloader reading from the given directory.
     *
     * @param path                   root directory of the cached files; a trailing {@code "/"} is
     *                               appended unless it already ends with {@code "/"} or {@code "\"}
     * @param downloaderWhenFileMiss downloader to delegate to when the cache file is missing,
     *                               unreadable or does not match the request; may be {@code null}
     */
    public FileDownloader(String path, Downloader downloaderWhenFileMiss) {
        if (!path.endsWith("/") && !path.endsWith("\\")) {
            path += "/";
        }
        this.path = path;
        this.downloaderWhenFileMiss = downloaderWhenFileMiss;
    }

    /**
     * Builds a page from the cache file of the request, delegating to the fallback downloader
     * when no page could be built from the file.
     *
     * @param request the request whose URL identifies the cache file
     * @param task    the task whose UUID names the cache sub-directory
     * @return the cached page, the fallback downloader's result, or {@code null} when the cache
     *         misses and no fallback downloader is configured
     */
    @Override
    public Page download(Request request, Task task) {
        final String url = request.getUrl();
        // Path construction is kept exactly as before (including the extra "/") so that
        // existing cache locations still resolve.
        final String directory = this.path + "/" + task.getUUID() + "/";
        final File file = new File(directory + DigestUtils.md5Hex(url));

        Page page = null;
        try {
            page = readCachedPage(file, request);
        } catch (FileNotFoundException e) {
            logger.info("File not exist for url " + url);
        } catch (IOException e) {
            logger.warn("File read error for url " + url, e);
        }
        if (page == null) {
            page = downloadWhenMiss(request, task);
        }
        return page;
    }

    /**
     * Does nothing; the thread count is ignored by this downloader.
     *
     * @param thread ignored
     */
    @Override
    public void setThread(int thread) {
    }

    /**
     * Reads the cache file and turns it into a page if its header line matches the request URL.
     * The reader is always closed before returning.
     *
     * @return the page, or {@code null} when the file is empty or was stored for another URL
     * @throws IOException if the file cannot be opened or read
     */
    private Page readCachedPage(File file, Request request) throws IOException {
        final BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
        try {
            final String header = bufferedReader.readLine();
            // readLine() returns null for an empty file; comparing from the expected value
            // makes that a plain cache miss instead of a NullPointerException.
            if (!(URL_PREFIX + request.getUrl()).equals(header)) {
                return null;
            }
            final String html = getHtml(bufferedReader);
            final Page page = new Page();
            page.setRequest(request);
            page.setUrl(PlainText.create(request.getUrl()));
            page.setHtml(Html.create(html));
            return page;
        } finally {
            try {
                bufferedReader.close();
            } catch (IOException e) {
                // The content has already been read (or the read already failed), so a failing
                // close must not mask that outcome.
                logger.warn("File close error for url " + request.getUrl(), e);
            }
        }
    }

    /**
     * Reads the HTML part of a cache file from the reader's current position to the end.
     * The {@code "html:\t"} prefix is removed from the first line read. Lines are concatenated
     * without line terminators, because {@code readLine()} strips them and none are re-added.
     *
     * @return the concatenated HTML; an empty string when there is nothing left to read
     * @throws IOException if reading fails
     */
    private String getHtml(BufferedReader bufferedReader) throws IOException {
        final StringBuilder htmlBuilder = new StringBuilder();
        String line = bufferedReader.readLine();
        // Guard against a missing HTML line: appending a null String would add the text "null".
        if (line != null) {
            htmlBuilder.append(StringUtils.removeStart(line, HTML_PREFIX));
        }
        while ((line = bufferedReader.readLine()) != null) {
            htmlBuilder.append(line);
        }
        return htmlBuilder.toString();
    }

    /**
     * Delegates to the fallback downloader, if one was configured.
     *
     * @return the fallback downloader's page, or {@code null} when there is no fallback
     */
    private Page downloadWhenMiss(Request request, Task task) {
        Page page = null;
        if (downloaderWhenFileMiss != null) {
            page = downloaderWhenFileMiss.download(request, task);
        }
        return page;
    }
}