package com.constellio.app.modules.es.connectors.http; import static com.constellio.data.conf.HashingEncoding.BASE64; import static java.util.Arrays.asList; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Date; import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.exception.ExceptionUtils; import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_CannotDownloadDocument; import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_CannotParseDocument; import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_DocumentHasNoParsedContent; import com.constellio.app.modules.es.connectors.http.fetcher.ConnectorUrlAcceptor; import com.constellio.app.modules.es.connectors.http.fetcher.HttpURLFetchingService; import com.constellio.app.modules.es.connectors.http.fetcher.URLFetchingServiceRuntimeException; import com.constellio.app.modules.es.connectors.http.fetcher.UrlAcceptor; import com.constellio.app.modules.es.connectors.http.utils.HtmlPageParser; import com.constellio.app.modules.es.connectors.http.utils.HtmlPageParser.HtmlPageParserResults; import com.constellio.app.modules.es.connectors.spi.Connector; import com.constellio.app.modules.es.connectors.spi.ConnectorJob; import com.constellio.app.modules.es.connectors.spi.ConnectorLogger; import com.constellio.app.modules.es.model.connectors.ConnectorDocument; import com.constellio.app.modules.es.model.connectors.ConnectorDocumentStatus; import com.constellio.app.modules.es.model.connectors.http.ConnectorHttpDocument; import com.constellio.app.modules.es.model.connectors.http.ConnectorHttpInstance; import com.constellio.app.modules.es.services.ESSchemasRecordsServices; import com.constellio.data.utils.ImpossibleRuntimeException; import 
com.constellio.data.utils.TimeProvider;
import com.constellio.data.utils.hashing.HashingService;
import com.constellio.data.utils.hashing.HashingServiceException;
import com.constellio.model.entities.records.ParsedContent;
import com.constellio.model.entities.records.Record;
import com.constellio.model.services.parser.FileParser;
import com.constellio.model.services.parser.FileParserException;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Connector job that fetches a batch of HTTP documents, parses every page that could be
 * downloaded (HTML pages and any other page type) and hands the resulting documents to the
 * connector's event observer.
 */
class ConnectorHttpFetchJob extends ConnectorJob {

    /**
     * Errors count at which a document that failed to download is no longer pushed to the
     * event observer; {@code deleteEvents} is called for it instead.
     */
    private static final int ERRORS_COUNT_DELETION_THRESHOLD = 3;

    private final ConnectorHttp connectorHttp;

    private final ConnectorHttpContext context;

    private final List<ConnectorHttpDocument> documents;

    private final HtmlPageParser pageParser;

    private final FileParser fileParser;

    private final HashingService hashingService;

    private final ConnectorLogger connectorLogger;

    private final ESSchemasRecordsServices es;

    private final int maxLevel;

    /**
     * Creates a fetch job for the given documents.
     *
     * @param connector       connector owning this job; provides the fetching service, the event
     *                        observer and the schema services
     * @param instance        connector instance; supplies the maximum link level and the
     *                        configuration used to build the URL acceptor
     * @param documents       documents to fetch when the job is executed
     * @param context         crawl context holding the known URLs and document digests
     * @param connectorLogger logger receiving parsing failures
     */
    public ConnectorHttpFetchJob(ConnectorHttp connector, ConnectorHttpInstance instance,
            List<ConnectorHttpDocument> documents, ConnectorHttpContext context, ConnectorLogger connectorLogger) {
        super(connector, "fetch");
        this.connectorHttp = connector;
        this.context = context;
        this.documents = documents;
        this.connectorLogger = connectorLogger;
        this.es = connectorHttp.getEs();
        this.maxLevel = instance.getMaxLevel();

        UrlAcceptor urlAcceptor = new ConnectorUrlAcceptor(instance);
        this.fileParser = es.getModelLayerFactory().newFileParser();
        this.hashingService = es.getModelLayerFactory().getIOServicesFactory().newHashingService(BASE64);
        this.pageParser = new HtmlPageParser(urlAcceptor, fileParser, hashingService);
    }

    /**
     * Fetches every document of the batch with a single fetching service, which is closed when
     * the batch is done. A document whose download fails is handled by
     * {@link #handleFetchException}; a document whose parsing fails is only logged, so that the
     * remaining documents are still processed.
     *
     * @param connector connector passed by the caller; the job works with the connector received
     *                  at construction time
     */
    @Override
    public void execute(Connector connector) {
        //FIXME Same instance of connector ?
        try (HttpURLFetchingService fetchingService = connectorHttp.newFetchingService()) {
            for (ConnectorHttpDocument httpDocument : documents) {
                Page page = fetchPage(fetchingService, httpDocument);
                if (page != null) {
                    try {
                        parse(httpDocument, page);
                    } catch (ConnectorHttpDocumentFetchException e) {
                        connectorLogger.error(e);
                    } catch (Throwable t) {
                        // Deliberately broad: one broken document must not abort the whole batch.
                        connectorLogger.errorUnexpected(t);
                    }
                }
            }
        }
    }

    /**
     * Downloads the page of a document and records how long the download took, whether it
     * succeeded or not.
     *
     * @return the fetched page, or {@code null} when the fetching service reported an error
     *         (the error has then already been recorded on the document)
     */
    private Page fetchPage(HttpURLFetchingService fetchingService, ConnectorHttpDocument httpDocument) {
        long beforeFetch = new Date().getTime();
        try {
            Page page = fetchingService.fetch(httpDocument.getURL());
            recordDownloadTime(httpDocument, beforeFetch);
            return page;
        } catch (URLFetchingServiceRuntimeException e) {
            // The download time is set before the error handler runs, because the handler
            // pushes the document to the event observer.
            recordDownloadTime(httpDocument, beforeFetch);
            handleFetchException(httpDocument, e);
            return null;
        }
    }

    /**
     * Stores on the document the time elapsed, in milliseconds, since {@code beforeFetch}.
     */
    private void recordDownloadTime(ConnectorHttpDocument httpDocument, long beforeFetch) {
        long afterFetch = new Date().getTime();
        httpDocument.setDownloadTime((double) afterFetch - beforeFetch);
    }

    /**
     * Records a download failure on the document. The errors count is reset first when the
     * error code differs from the one already stored on the document, then incremented. Once
     * the count reaches {@link #ERRORS_COUNT_DELETION_THRESHOLD} the observer's
     * {@code deleteEvents} is called for the document, otherwise the document is pushed.
     */
    private void handleFetchException(ConnectorHttpDocument httpDocument, URLFetchingServiceRuntimeException e) {
        httpDocument.setFetched(true);
        httpDocument.setStatus(ConnectorDocumentStatus.ERROR);
        if (!sameErrorCode(e.getErrorCode(), httpDocument.getErrorCode())) {
            httpDocument.resetErrorsCount();
        }
        httpDocument.incrementErrorsCount();
        httpDocument.setErrorCode(e.getErrorCode());
        httpDocument.setErrorMessage(e.getDescription());
        httpDocument.setFetchedDateTime(TimeProvider.getLocalDateTime());

        if (httpDocument.getErrorsCount() >= ERRORS_COUNT_DELETION_THRESHOLD) {
            connectorHttp.getEventObserver().deleteEvents(httpDocument);
        } else {
            connectorHttp.getEventObserver().push(asList((ConnectorDocument) httpDocument));
        }
    }

    /**
     * Null-safe equality of two error codes, so that an exception carrying no error code cannot
     * raise a {@link NullPointerException} while the failure is being recorded.
     */
    private static boolean sameErrorCode(Object newErrorCode, Object previousErrorCode) {
        return newErrorCode == null ? previousErrorCode == null : newErrorCode.equals(previousErrorCode);
    }

    /**
     * Marks the document as fetched with status OK, then parses the page as HTML when it is an
     * {@link HtmlPage} and as binary content otherwise.
     */
    private void parse(ConnectorHttpDocument httpDocument, Page page)
            throws ConnectorHttpDocumentFetchException {

        httpDocument.setFetched(true)
                .setStatus(ConnectorDocumentStatus.OK)
                .setFetchedDateTime(TimeProvider.getLocalDateTime());

        if (page instanceof HtmlPage) {
            parseHtml(httpDocument, (HtmlPage) page);
        } else {
            parseBinary(httpDocument, page);
        }
    }

    /**
     * Parses a non-HTML page with the file parser and copies the extracted content and
     * properties onto the document. Any exception raised while doing so is caught and recorded
     * on the document (error code, message, stack trace, incremented errors count); in every
     * case the digest bookkeeping is run and the document is pushed to the event observer.
     */
    private void parseBinary(ConnectorHttpDocument httpDocument, Page page)
            throws ConnectorHttpDocumentFetchException {

        try {
            InputStream inputStream = null;
            try {
                try {
                    inputStream = page.getWebResponse().getContentAsStream();
                } catch (IOException e) {
                    //TODO Test!
                    throw new ConnectorHttpDocumentFetchException_CannotDownloadDocument(httpDocument.getURL(), e);
                }

                ParsedContent parsedContent = fileParser.parse(inputStream, true);

                if (parsedContent.getParsedContent().isEmpty()) {
                    //TODO Test!
                    throw new ConnectorHttpDocumentFetchException_DocumentHasNoParsedContent(httpDocument.getURL());
                } else {
                    httpDocument.addStringProperty("lastModified",
                            page.getWebResponse().getResponseHeaderValue("Last-Modified"));
                    httpDocument.addStringProperty("charset", page.getWebResponse().getContentCharset());
                    httpDocument.addStringProperty("language", parsedContent.getLanguage());
                    httpDocument.setParsedContent(parsedContent.getParsedContent());
                    httpDocument.setTitle(extractFilename(httpDocument.getURL()));
                    httpDocument.setDigest(hashingService.getHashFromString(parsedContent.getParsedContent()));
                    httpDocument.setMimetype(parsedContent.getMimetypeWithoutCharset());
                }
            } catch (FileParserException e) {
                //TODO Test!
                throw new ConnectorHttpDocumentFetchException_CannotParseDocument(httpDocument.getURL(), e);
            } catch (HashingServiceException e) {
                throw new ImpossibleRuntimeException(e);
            } finally {
                IOUtils.closeQuietly(inputStream);
            }

            // Only reached when parsing succeeded: clear any error left by a previous attempt.
            httpDocument.setErrorCode(null)
                    .setErrorMessage(null)
                    .setErrorStackTrace(null)
                    .resetErrorsCount()
                    .setManualTokens(Record.PUBLIC_TOKEN);

        } catch (Exception e) {
            // Every failure above, including the exceptions thrown on purpose, ends up here and
            // is stored on the document instead of being propagated.
            httpDocument.setErrorCode(ConnectorHttpFetchJob.class.getSimpleName() + ".parseBinary()")
                    .setErrorMessage(ExceptionUtils.getMessage(e))
                    .setErrorStackTrace(ExceptionUtils.getFullStackTrace(e))
                    .incrementErrorsCount();
        }

        saveDocumentDigestAndDetectCopy(httpDocument);
        connectorHttp.getEventObserver().push(asList((ConnectorDocument) httpDocument));
    }

    /**
     * Parses an HTML page, creates an unfetched document for every linked URL the context
     * reports as new (only while the links' level does not exceed the maximum level), updates
     * the fetched document from the parsing results and pushes all of them to the event
     * observer.
     */
    private void parseHtml(ConnectorHttpDocument httpDocument, HtmlPage page)
            throws ConnectorHttpDocumentFetchException {

        HtmlPageParserResults results = pageParser.parse(httpDocument.getURL(), page);

        List<ConnectorDocument> savedDocuments = new ArrayList<>();
        List<String> urls = new ArrayList<>(results.getLinkedUrls());

        int linksLevel = httpDocument.getLevel() + 1;
        if (linksLevel <= maxLevel) {
            for (String url : urls) {
                if (context.isNewUrl(url)) {
                    context.markAsFetched(url);
                    savedDocuments.add(connectorHttp.newUnfetchedURLDocument(url, linksLevel));
                }
            }
        }

        ensureNotStopped();
        setJobStep("Fetching " + httpDocument.getURL());

        // Fall back on the last URL segment when the page has no title.
        String title = results.getTitle() == null ? extractFilename(httpDocument.getURL()) : results.getTitle();

        httpDocument.setManualTokens(Record.PUBLIC_TOKEN);
        savedDocuments.add(httpDocument
                .setTitle(title)
                .setErrorCode(null)
                .setErrorMessage(null)
                .setErrorStackTrace(null)
                .resetErrorsCount()
                .setParsedContent(results.getParsedContent())
                .setDigest(results.getDigest())
                //.setOutlinks(urls)
                .setMimetype(results.getMimetype())
                .addStringProperty("lastModified", page.getWebResponse().getResponseHeaderValue("Last-Modified"))
                .addStringProperty("charset", page.getWebResponse().getContentCharset())
                .addStringProperty("language", results.getLanguage()));

        saveDocumentDigestAndDetectCopy(httpDocument);
        connectorHttp.getEventObserver().push(savedDocuments);
    }

    /**
     * Returns the part of the URL that follows its last {@code '/'}, or the whole URL when it
     * contains no slash. A URL ending with a slash yields an empty string.
     */
    private String extractFilename(String url) {
        int lastSlash = url.lastIndexOf("/");
        if (lastSlash == -1) {
            return url;
        }
        return url.substring(lastSlash + 1);
    }

    /**
     * Keeps the context's digest registry in sync with the document and flags duplicates.
     * <p>
     * The document starts as searchable and not a copy. If its record was already saved with a
     * different digest, the previous digest entry is removed from the context. Then, when the
     * document has a digest: if the context already associates that digest with another URL, the
     * document is marked as a copy of that URL (parsed content cleared, not searchable);
     * otherwise the digest is registered for the document's URL.
     */
    private void saveDocumentDigestAndDetectCopy(ConnectorHttpDocument httpDocument) {
        Record record = httpDocument.getWrappedRecord();
        httpDocument.setSearchable(true);
        httpDocument.setCopyOf(null);

        // Kept as an if statement: the generic get(...) relies on the assignment context to be
        // typed as String.
        String originalDigest = null;
        if (record.isSaved()) {
            originalDigest = record.getCopyOfOriginalRecord().get(es.connectorHttpDocument.digest());
        }

        if (originalDigest != null && !originalDigest.equals(httpDocument.getDigest())) {
            context.removeDocumentDigest(originalDigest, httpDocument.getURL());
        }

        if (httpDocument.getDigest() != null) {
            String documentUrlWithDigest = context.getDocumentUrlWithDigest(httpDocument.getDigest());
            if (documentUrlWithDigest != null && !httpDocument.getURL().equals(documentUrlWithDigest)) {
                httpDocument.setParsedContent(null);
                httpDocument.setCopyOf(documentUrlWithDigest);
                httpDocument.setSearchable(false);
            } else {
                context.addDocumentDigest(httpDocument.getDigest(), httpDocument.getURL());
            }
        }
    }
}