package xyz.anduo.crawler; import java.io.DataOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; public class DownLoadFile { /** * 根据url和网页类型生成需要保存的网页的文件名,去掉url中非文件名字符 * * @param url * @param contentType * @return */ public String getFileNameByUrl(String url, String contentType) { // 移除http:// url = url.substring(7); // text/html类型 if (contentType.indexOf("html") != -1) { url = url.replaceAll("[\\?/:*|<>\"]", "_") + ".html"; return url; } // 如果为application/pdf类型 else { return url = url.replaceAll("[\\?/:*|<>\"]", "_") + "." + contentType.substring(contentType.lastIndexOf("/") + 1); } } /** * 保存网页字节流到本地文件 * * @param data * @param filePath 要保存文件的相对地址 */ private void saveToLocal(byte[] data, String filePath) { try { DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(filePath))); for (int i = 0; i < data.length; i++) { out.write(data[i]); } out.flush(); out.close(); } catch (IOException e) { e.printStackTrace(); } } public String downloadFile(String url) { String filePath = null; // 1.生成httpclient对象,并设置参数 CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpget = new HttpGet(url); HttpResponse response; // 执行http get请求 try { response = httpClient.execute(httpget); // 得到访问的状态码 int statusCode = response.getStatusLine().getStatusCode(); // 判断访问的状态码 if (statusCode != HttpStatus.SC_OK) { System.err.println("Method faild: " + response.getStatusLine()); } HttpEntity entity = response.getEntity(); // 得到相应内容 byte[] responseBody = EntityUtils.toByteArray(entity); // 根据网页url生成保存时的文件名 filePath = "d:\\temp\\" + getFileNameByUrl(url, response.getFirstHeader("Content-Type").getValue()); saveToLocal(responseBody, filePath); } catch (IOException e) { e.printStackTrace(); } return filePath; } }