package com.constellio.app.modules.es.connectors.http.utils;
import static org.apache.tika.io.IOUtils.toByteArray;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.james.mime4j.io.LimitedInputStream;
import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException;
import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_CannotDownloadDocument;
import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_CannotParseDocument;
import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_DocumentHasNoParsedContent;
import com.constellio.app.modules.es.connectors.http.fetcher.UrlAcceptor;
import com.constellio.app.modules.es.connectors.http.fetcher.config.BasicUrlNormalizer;
import com.constellio.data.utils.ImpossibleRuntimeException;
import com.constellio.data.utils.hashing.HashingService;
import com.constellio.data.utils.hashing.HashingServiceException;
import com.constellio.model.entities.records.ParsedContent;
import com.constellio.model.services.parser.FileParser;
import com.constellio.model.services.parser.FileParserException;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Extracts, from an already fetched {@link HtmlPage}, the information the HTTP connector keeps about it:
 * the parsed text, its digest, the title, the mimetype, the language and the set of accepted outgoing links.
 */
public class HtmlPageParser {

	/** Maximum number of bytes (20 MiB) read from a page's response body. */
	private static final long MAX_CONTENT_LENGTH = 20 * 1024 * 1024;

	HashingService hashingService;
	FileParser fileParser;
	BasicUrlNormalizer urlNormalizer;
	UrlAcceptor urlAcceptor;

	/**
	 * @param urlAcceptor    decides which linked urls are kept in the results
	 * @param fileParser     parser applied to the downloaded page content
	 * @param hashingService service used to compute the digest of the parsed text
	 */
	public HtmlPageParser(UrlAcceptor urlAcceptor, FileParser fileParser, HashingService hashingService) {
		this.urlAcceptor = urlAcceptor;
		this.urlNormalizer = new BasicUrlNormalizer();
		this.hashingService = hashingService;
		this.fileParser = fileParser;
	}

	/**
	 * Parses the given page and collects its links.
	 *
	 * @param url  url of the page, only used to build the exceptions thrown on failure
	 * @param page the fetched page
	 * @return the digest, parsed text, title, linked urls, mimetype and language of the page
	 * @throws ConnectorHttpDocumentFetchException if the content cannot be read
	 *                                             ({@code CannotDownloadDocument}), cannot be parsed
	 *                                             ({@code CannotParseDocument}) or yields no text
	 *                                             ({@code DocumentHasNoParsedContent})
	 */
	public HtmlPageParserResults parse(String url, HtmlPage page)
			throws ConnectorHttpDocumentFetchException {

		Set<String> uniqueAnchors = getUniqueAnchors(page);

		byte[] content;
		try {
			content = getContent(page);
		} catch (IOException e) {
			throw new ConnectorHttpDocumentFetchException_CannotDownloadDocument(url, e);
		}

		ParsedContent parsedContent;
		String title;
		String parsedContentText;
		try {
			parsedContent = fileParser.parse(new ByteArrayInputStream(content), true);
			title = (String) parsedContent.getNormalizedProperty("title");
			parsedContentText = parsedContent.getParsedContent();
		} catch (FileParserException e) {
			throw new ConnectorHttpDocumentFetchException_CannotParseDocument(url, e);
		}

		// A missing text is reported the same way as an empty one instead of failing with a NullPointerException.
		if (parsedContentText == null || parsedContentText.isEmpty()) {
			throw new ConnectorHttpDocumentFetchException_DocumentHasNoParsedContent(url);
		}

		// The digest is computed on the parsed text, not on the raw bytes of the response.
		String digest;
		try {
			digest = hashingService.getHashFromString(parsedContentText);
		} catch (HashingServiceException e) {
			throw new ImpossibleRuntimeException(e);
		}

		return new HtmlPageParserResults(digest, parsedContentText, title, uniqueAnchors,
				parsedContent.getMimetypeWithoutCharset(), parsedContent.getLanguage());
	}

	/**
	 * Reads the response body of the page into memory through a stream limited to
	 * {@link #MAX_CONTENT_LENGTH} bytes.
	 *
	 * @param page the fetched page
	 * @return the bytes read from the response body
	 * @throws IOException if the body cannot be read
	 *                     (NOTE(review): the limited stream is expected to fail with an IOException
	 *                     once the limit is exceeded - confirm against the mime4j version in use)
	 */
	private byte[] getContent(HtmlPage page)
			throws IOException {
		WebResponse webResponse = page.getWebResponse();
		InputStream contentStream = null;
		try {
			contentStream = new LimitedInputStream(webResponse.getContentAsStream(), MAX_CONTENT_LENGTH);
			return toByteArray(contentStream);
		} finally {
			// Closing is best effort: a failure to close must not mask the result or the original error.
			IOUtils.closeQuietly(contentStream);
		}
	}

	/**
	 * Collects the normalized urls of the page's anchors, keeping only those that are not blank,
	 * differ from the page's own url and are accepted by the {@link UrlAcceptor}.
	 *
	 * @param page the fetched page
	 * @return the distinct accepted urls, possibly empty
	 */
	private Set<String> getUniqueAnchors(HtmlPage page) {
		Set<String> uniqueAnchorUrls = new HashSet<>();
		for (HtmlAnchor anchor : page.getAnchors()) {
			String anchorUrl = null;
			try {
				final String unNormalizedAnchorUrl = HtmlAnchorUtils.getUrl(anchor);
				anchorUrl = urlNormalizer.normalize(unNormalizedAnchorUrl);
			} catch (Exception ignored) {
				// Deliberately ignored: an anchor whose url cannot be resolved or normalized is skipped
				// (anchorUrl stays null and is rejected by the blank check below).
			}

			if (StringUtils.isBlank(anchorUrl)) {
				continue;
			}
			if (anchorUrl.equals(page.getUrl().toString())) {
				// Self reference, not a link to another document.
				continue;
			}
			if (urlAcceptor.isAccepted(anchorUrl)) {
				uniqueAnchorUrls.add(anchorUrl);
			}
		}
		return uniqueAnchorUrls;
	}

	/**
	 * Value holder for everything {@link HtmlPageParser#parse(String, HtmlPage)} extracts from a page.
	 */
	public static class HtmlPageParserResults {

		private final String parsedContent;
		private final String title;
		private final Set<String> linkedUrls;
		private final String language;
		private final String digest;
		private final String mimetype;

		/**
		 * @param digest        hash of the parsed text
		 * @param parsedContent text extracted from the page
		 * @param title         title of the page, as provided by the parser
		 * @param linkedUrls    urls linked from the page
		 * @param mimetype      mimetype of the page, without charset
		 * @param language      language reported by the parser
		 */
		public HtmlPageParserResults(String digest, String parsedContent, String title, Set<String> linkedUrls,
				String mimetype, String language) {
			this.digest = digest;
			this.parsedContent = parsedContent;
			this.title = title;
			this.linkedUrls = linkedUrls;
			this.mimetype = mimetype;
			this.language = language;
		}

		/** @return hash of the parsed text */
		public String getDigest() {
			return digest;
		}

		/** @return text extracted from the page */
		public String getParsedContent() {
			return parsedContent;
		}

		/** @return mimetype of the page, without charset */
		public String getMimetype() {
			return mimetype;
		}

		/** @return urls linked from the page (the same set instance given to the constructor) */
		public Set<String> getLinkedUrls() {
			return linkedUrls;
		}

		/** @return title of the page */
		public String getTitle() {
			return title;
		}

		/** @return language reported by the parser */
		public String getLanguage() {
			return language;
		}
	}
}