/** * License Agreement for OpenSearchServer * * Copyright (C) 2012-2014 Emmanuel Keller / Jaeksoft * * http://www.open-search-server.com * * This file is part of OpenSearchServer. * * OpenSearchServer is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * OpenSearchServer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with OpenSearchServer. * If not, see <http://www.gnu.org/licenses/>. **/ package com.jaeksoft.searchlib.crawler.web.spider; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; import java.util.zip.CRC32; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.io.FilenameUtils; import org.apache.http.Header; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import com.jaeksoft.searchlib.SearchLibException.WrongStatusCodeException; import com.jaeksoft.searchlib.util.IOUtils; import com.jaeksoft.searchlib.util.StringUtils; public class DownloadItem { private URI uri; private URI redirectLocation = null; private Long contentLength = null; private String contentDispositionFilename = null; private String contentBaseType = null; private String contentTypeCharset = null; private String contentEncoding = null; private String contentLocation = null; private Long lastModified = null; private Integer statusCode = null; private String reasonPhrase = null; private InputStream contentInputStream = null; private boolean fromCache = false; private List<String> headers = null; private Header[] httpHeaders = null; public DownloadItem(URI uri) { this.uri = uri; } protected final static String KEY_REDIRECT_LOCATION = "KEY_REDIRECT_LOCATION"; protected final static String KEY_CONTENT_DISPOSITION_FILENAME = "KEY_CONTENT_DISPOSITION_FILENAME"; protected final static String KEY_CONTENT_LENGTH = "KEY_CONTENT_LENGTH"; protected final static String KEY_LAST_MODIFIED = "KEY_LAST_MODIFIED"; protected final static String KEY_CONTENT_BASE_TYPE = "KEY_CONTENT_BASE_TYPE"; protected final static String KEY_CONTENT_TYPE_CHARSET = "KEY_CONTENT_TYPE_CHARSET"; protected final static String KEY_CONTENT_ENCODING = "KEY_CONTENT_ENCODING"; protected final static String KEY_CONTENT_LOCATION = "KEY_CONTENT_LOCATION"; protected final static String KEY_STATUS_CODE = "KEY_STATUS_CODE"; protected final static String KEY_REASON_PHRASE = "KEY_REASON_PHRASE"; protected final static String KEY_HEADERS = "KEY_HEADERS"; public String getMetaAsJson() throws JSONException { JSONObject json = new JSONObject(); if (redirectLocation != null) json.put(KEY_REDIRECT_LOCATION, redirectLocation.toASCIIString()); if (contentLength != null) json.put(KEY_CONTENT_LENGTH, contentLength); if (lastModified != null) json.put(KEY_LAST_MODIFIED, lastModified); if (contentDispositionFilename != null) json.put(KEY_CONTENT_DISPOSITION_FILENAME, contentDispositionFilename); if (contentBaseType != null) json.put(KEY_CONTENT_BASE_TYPE, contentBaseType); if (contentTypeCharset != null) json.put(KEY_CONTENT_TYPE_CHARSET, contentTypeCharset); if (contentEncoding != null) json.put(KEY_CONTENT_ENCODING, contentEncoding); if (contentLocation != null) json.put(KEY_CONTENT_LOCATION, contentLocation); if (statusCode != null) json.put(KEY_STATUS_CODE, statusCode); if (reasonPhrase != null) json.put(KEY_REASON_PHRASE, reasonPhrase); if (headers != null) json.put(KEY_HEADERS, headers); return json.toString(); } public void loadMetaFromJson(org.json.JSONObject json) throws URISyntaxException, JSONException { fromCache = true; if (json.has(KEY_REDIRECT_LOCATION)) { String s = json.getString(KEY_REDIRECT_LOCATION); if (s != null) redirectLocation = new URI(s); } if (json.has(KEY_CONTENT_LENGTH)) contentLength = json.getLong(KEY_CONTENT_LENGTH); if (json.has(KEY_LAST_MODIFIED)) lastModified = json.getLong(KEY_LAST_MODIFIED); if (json.has(KEY_CONTENT_DISPOSITION_FILENAME)) contentDispositionFilename = json .getString(KEY_CONTENT_DISPOSITION_FILENAME); if (json.has(KEY_CONTENT_BASE_TYPE)) contentBaseType = json.getString(KEY_CONTENT_BASE_TYPE); if (json.has(KEY_CONTENT_TYPE_CHARSET)) contentTypeCharset = json.getString(KEY_CONTENT_TYPE_CHARSET); if (json.has(KEY_CONTENT_ENCODING)) contentEncoding = json.getString(KEY_CONTENT_ENCODING); if (json.has(KEY_CONTENT_LOCATION)) contentLocation = json.getString(KEY_CONTENT_LOCATION); if (json.has(KEY_STATUS_CODE)) statusCode = json.getInt(KEY_STATUS_CODE); if (json.has(KEY_REASON_PHRASE)) reasonPhrase = json.getString(KEY_REASON_PHRASE); if (json.has(KEY_HEADERS)) { headers = new ArrayList<String>(); JSONArray headerJsonArray = json.getJSONArray(KEY_HEADERS); if (headerJsonArray != null) for (int i = 0; i < headerJsonArray.length(); i++) headers.add(headerJsonArray.get(i).toString()); } } /** * @return the redirectLocation */ public URI getRedirectLocation() { return redirectLocation; } /** * @param redirectLocation * the redirectLocation to set */ public void setRedirectLocation(URI redirectLocation) { this.redirectLocation = redirectLocation; } /** * @return the contentLength */ public Long getContentLength() { return contentLength; } /** * @return the lastModified */ public Long getLastModified() { return lastModified; } /** * @param lastModified * the lastModified to set */ public void setLastModified(Long lastModified) { this.lastModified = lastModified; } /** * @param contentLength * the contentLength to set */ public void setContentLength(Long contentLength) { this.contentLength = contentLength; } /** * @return the contentDispositionFilename */ public String getContentDispositionFilename() { return contentDispositionFilename; } /** * @param contentDispositionFilename * the contentDispositionFilename to set */ public void setContentDispositionFilename(String contentDispositionFilename) { this.contentDispositionFilename = contentDispositionFilename; } public String getFileName() throws MalformedURLException { if (contentDispositionFilename != null) return contentDispositionFilename; if (uri == null) return null; String urlFile = uri.toURL().getPath(); if (urlFile == null) return null; return FilenameUtils.getName(urlFile); } /** * @return the contentBaseType */ public String getContentBaseType() { return contentBaseType; } /** * @param contentBaseType * the contentBaseType to set */ public void setContentBaseType(String contentBaseType) { this.contentBaseType = contentBaseType; } /** * @return the contentTypeCharset */ public String getContentTypeCharset() { return contentTypeCharset; } /** * @param contentTypeCharset * the contentTypeCharset to set */ public void setContentTypeCharset(String contentTypeCharset) { this.contentTypeCharset = contentTypeCharset; } /** * @return the contentEncoding */ public String getContentEncoding() { return contentEncoding; } /** * @param contentEncoding * the contentEncoding to set */ public void setContentEncoding(String contentEncoding) { this.contentEncoding = contentEncoding; } /** * @return the statusCode */ public Integer getStatusCode() { return statusCode; } public void checkNoErrorRange(int fromInclusive, int toExclusive) throws WrongStatusCodeException { if (statusCode == null) throw new WrongStatusCodeException("No status code - ", uri); if (statusCode < fromInclusive || statusCode >= toExclusive) throw new WrongStatusCodeException("Wrong status code: ", statusCode, ' ', reasonPhrase, " - ", uri); } public void checkNoErrorList(int... validCodes) throws WrongStatusCodeException { if (statusCode == null) throw new WrongStatusCodeException("Wrong status code: ", statusCode, ' ', reasonPhrase, " - ", uri); for (int validCode : validCodes) if (statusCode == validCode) return; throw new WrongStatusCodeException("Wrong status code: ", statusCode, ' ', reasonPhrase, " - ", uri); } /** * @param statusCode * the statusCode to set */ public void setStatusCode(Integer statusCode) { this.statusCode = statusCode; } /** * @return the reasonPhrase */ public String getReasonPhrase() { return reasonPhrase; } /** * @param reasonPhrase * the reasonPhrase to set */ public void setReasonPhrase(String reasonPhrase) { this.reasonPhrase = reasonPhrase; } /** * @return the contentInputStream */ public InputStream getContentInputStream() { return contentInputStream; } /** * @param contentInputStream * the inputStream to set */ public void setContentInputStream(InputStream contentInputStream) { this.contentInputStream = contentInputStream; } /** * @return the uri */ public URI getUri() { return uri; } /** * @return the fromCache */ public boolean isFromCache() { return fromCache; } public List<String> getHeaders() { return headers; } public void setHeaders(Header[] headers) { httpHeaders = headers; if (headers == null) return; this.headers = new ArrayList<String>(headers.length); for (Header header : headers) { StringBuilder sb = new StringBuilder(); sb.append(header.getName()); sb.append(": "); sb.append(header.getValue()); this.headers.add(sb.toString()); } } public String getFirstHttpHeader(String name) { if (httpHeaders == null) return null; for (Header header : httpHeaders) if (header.getName().equalsIgnoreCase(name)) return header.getValue(); return null; } public String getContentAsString() throws IOException { if (contentInputStream == null) return null; return IOUtils.toString(contentInputStream); } /** * @return the contentLocation */ public String getContentLocation() { return contentLocation; } /** * @param contentLocation * the contentLocation to set */ public void setContentLocation(String contentLocation) { this.contentLocation = contentLocation; } public void writeToFile(File file) throws IOException { if (contentInputStream == null) return; FileOutputStream fos = null; BufferedOutputStream bos = null; try { fos = new FileOutputStream(file); bos = new BufferedOutputStream(fos); IOUtils.copy(contentInputStream, bos); } finally { IOUtils.close(bos, fos); } } public void writeToZip(ZipArchiveOutputStream zipOutput) throws IOException { if (contentInputStream == null) return; String[] domainParts = StringUtils.split(uri.getHost(), '.'); StringBuilder path = new StringBuilder(); for (int i = domainParts.length - 1; i >= 0; i--) { path.append(domainParts[i]); path.append('/'); } String[] pathParts = StringUtils.split(uri.getPath(), '/'); for (int i = 0; i < pathParts.length - 1; i++) { if (StringUtils.isEmpty(pathParts[i])) continue; path.append(pathParts[i]); path.append('/'); } if (contentDispositionFilename != null) path.append(contentDispositionFilename); else { String lastPart = pathParts == null || pathParts.length == 0 ? null : pathParts[pathParts.length - 1]; if (StringUtils.isEmpty(lastPart)) path.append("index"); else path.append(lastPart); } if (uri.getPath().endsWith("/")) path.append("/_index"); String query = uri.getQuery(); String fragment = uri.getFragment(); if (!StringUtils.isEmpty(query) || !StringUtils.isEmpty(fragment)) { CRC32 crc32 = new CRC32(); if (!StringUtils.isEmpty(query)) crc32.update(query.getBytes()); if (!StringUtils.isEmpty(fragment)) crc32.update(fragment.getBytes()); path.append('.'); path.append(crc32.getValue()); } ZipArchiveEntry zipEntry = new ZipArchiveEntry(path.toString()); zipOutput.putArchiveEntry(zipEntry); BufferedInputStream bis = null; byte[] buffer = new byte[65536]; try { bis = new BufferedInputStream(contentInputStream); int l; while ((l = bis.read(buffer)) != -1) zipOutput.write(buffer, 0, l); zipOutput.closeArchiveEntry(); } finally { IOUtils.close(bis); } } }