package com.constellio.app.modules.es.connectors.http.fetcher; import java.io.IOException; import java.io.InputStream; import java.security.NoSuchAlgorithmException; import java.util.HashSet; import java.util.Set; import java.util.logging.Logger; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.james.mime4j.io.LimitedInputStream; import com.constellio.app.modules.es.connectors.http.fetcher.config.FetcherConfig; import com.constellio.app.modules.es.connectors.http.utils.DigestUtil; import com.constellio.app.modules.es.connectors.http.utils.HtmlAnchorUtils; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.WebResponse; import com.gargoylesoftware.htmlunit.html.HtmlAnchor; import com.gargoylesoftware.htmlunit.html.HtmlPage; public class FetchedHttpDoc implements FetchedDoc { private static final long MAX_CONTENT_LENGTH = 5 * 1024 * 1024; private static final Logger LOG = Logger.getLogger(FetchedHttpDoc.class.getName()); private final FetcherConfig fetcherHttpConfig; private final String url; private final Page fetchedPage; private final Set<String> uniqueAnchorUrls; private byte[] content; private String contentType; private String digest; public FetchedHttpDoc(FetcherConfig fetcherHttpConfig, String url, Page fetchedPage) throws IOException, NoSuchAlgorithmException { this.url = url; this.fetcherHttpConfig = fetcherHttpConfig; this.fetchedPage = fetchedPage; this.uniqueAnchorUrls = getUniqueAnchors(fetchedPage); WebResponse webResponse = fetchedPage.getWebResponse(); this.content = getContent(webResponse); this.contentType = webResponse.getContentType(); if (fetchedPage instanceof HtmlPage) { this.digest = DigestUtil.digest(((HtmlPage) fetchedPage).asText().getBytes()); } else { this.digest = DigestUtil.digest(this.content); } } private Set<String> getUniqueAnchors(final Page fetchedPage) { final Set<String> uniqueAnchorUrls = new HashSet<String>(); if (fetchedPage instanceof HtmlPage) { HtmlPage htmlPage = (HtmlPage) fetchedPage; for (HtmlAnchor anchor : htmlPage.getAnchors()) { if (HtmlAnchorUtils.isMailto(anchor)) { // Ignore } else if (HtmlAnchorUtils.isJavascript(anchor)) { // FIXME Deal with Javascript links // fetchedPage = this.link.getHtmlAnchor().click(); } else { String anchorUrl = null; try { final String unNormalizedAnchorUrl = HtmlAnchorUtils.getUrl(anchor); anchorUrl = this.fetcherHttpConfig.normalize(unNormalizedAnchorUrl); } catch (Exception e) { LOG.fine("Rejected anchor url :" + anchorUrl + " from " + this.url); continue; } if (StringUtils.isNotBlank(anchorUrl) && fetcherHttpConfig.isAccepted(anchorUrl)) { uniqueAnchorUrls.add(anchorUrl); } } } } return uniqueAnchorUrls; } private byte[] getContent(WebResponse response) throws IOException { final String contentLenghtString = response.getResponseHeaderValue("Content-Length"); if (StringUtils.isNotBlank(contentLenghtString)) { final long contentLength = Long.parseLong(contentLenghtString); if (contentLength > MAX_CONTENT_LENGTH) { throw new IOException("Max content length exceeded: " + contentLength); } } InputStream contentStream = null; try { contentStream = new LimitedInputStream(response.getContentAsStream(), MAX_CONTENT_LENGTH); return IOUtils.toByteArray(contentStream); } finally { IOUtils.closeQuietly(contentStream); } } public Page getFetchedPage() { return fetchedPage; } public Set<String> getUniqueAnchorUrls() { return uniqueAnchorUrls; } public byte[] getContent() { return content; } public String getContentType() { return contentType; } public String getDigest() { return digest; } public String getUrl() { return url; } }