// ConnectorHttpFetchJob.java example
//
// Explorer
// constellio-master
package com.constellio.app.modules.es.connectors.http;

import static com.constellio.data.conf.HashingEncoding.BASE64;
import static java.util.Arrays.asList;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.exception.ExceptionUtils;

import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_CannotDownloadDocument;
import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_CannotParseDocument;
import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException.ConnectorHttpDocumentFetchException_DocumentHasNoParsedContent;
import com.constellio.app.modules.es.connectors.http.fetcher.ConnectorUrlAcceptor;
import com.constellio.app.modules.es.connectors.http.fetcher.HttpURLFetchingService;
import com.constellio.app.modules.es.connectors.http.fetcher.URLFetchingServiceRuntimeException;
import com.constellio.app.modules.es.connectors.http.fetcher.UrlAcceptor;
import com.constellio.app.modules.es.connectors.http.utils.HtmlPageParser;
import com.constellio.app.modules.es.connectors.http.utils.HtmlPageParser.HtmlPageParserResults;
import com.constellio.app.modules.es.connectors.spi.Connector;
import com.constellio.app.modules.es.connectors.spi.ConnectorJob;
import com.constellio.app.modules.es.connectors.spi.ConnectorLogger;
import com.constellio.app.modules.es.model.connectors.ConnectorDocument;
import com.constellio.app.modules.es.model.connectors.ConnectorDocumentStatus;
import com.constellio.app.modules.es.model.connectors.http.ConnectorHttpDocument;
import com.constellio.app.modules.es.model.connectors.http.ConnectorHttpInstance;
import com.constellio.app.modules.es.services.ESSchemasRecordsServices;
import com.constellio.data.utils.ImpossibleRuntimeException;
import com.constellio.data.utils.TimeProvider;
import com.constellio.data.utils.hashing.HashingService;
import com.constellio.data.utils.hashing.HashingServiceException;
import com.constellio.model.entities.records.ParsedContent;
import com.constellio.model.entities.records.Record;
import com.constellio.model.services.parser.FileParser;
import com.constellio.model.services.parser.FileParserException;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Connector job that downloads a batch of {@link ConnectorHttpDocument}s, parses each downloaded page and
 * pushes the resulting documents to the event observer of the {@link ConnectorHttp}.
 * <p>
 * Pages that are {@link HtmlPage}s are parsed with the {@link HtmlPageParser}; the URLs they link to are
 * queued as new unfetched documents while the link level does not exceed the instance's max level.
 * Any other page is parsed as binary content with the {@link FileParser}.
 */
class ConnectorHttpFetchJob extends ConnectorJob {

	/**
	 * Errors count from which a document that failed to download has its events deleted
	 * instead of being pushed to the event observer again.
	 */
	private static final int ERRORS_COUNT_DELETE_THRESHOLD = 3;

	private final ConnectorHttp connectorHttp;

	private final ConnectorHttpContext context;

	private final List<ConnectorHttpDocument> documents;

	private final HtmlPageParser pageParser;

	private final FileParser fileParser;

	private final HashingService hashingService;

	private final ConnectorLogger connectorLogger;

	private final ESSchemasRecordsServices es;

	private final int maxLevel;

	/**
	 * Creates a fetch job for the given documents.
	 *
	 * @param connector       connector owning this job; provides the fetching service, the event observer and
	 *                        the schemas services
	 * @param instance        connector instance; provides the max link level and configures the URL acceptor
	 * @param documents       documents to fetch when the job is executed
	 * @param context         crawl context used to track known URLs and document digests
	 * @param connectorLogger logger receiving the errors raised while parsing pages
	 */
	public ConnectorHttpFetchJob(ConnectorHttp connector, ConnectorHttpInstance instance, List<ConnectorHttpDocument> documents,
			ConnectorHttpContext context, ConnectorLogger connectorLogger) {
		super(connector, "fetch");
		this.connectorHttp = connector;
		this.context = context;
		this.documents = documents;
		this.connectorLogger = connectorLogger;
		this.es = connectorHttp.getEs();
		this.maxLevel = instance.getMaxLevel();
		UrlAcceptor urlAcceptor = new ConnectorUrlAcceptor(instance);
		this.fileParser = es.getModelLayerFactory().newFileParser();
		this.hashingService = es.getModelLayerFactory().getIOServicesFactory().newHashingService(BASE64);
		this.pageParser = new HtmlPageParser(urlAcceptor, fileParser, hashingService);
	}

	/**
	 * Fetches then parses every document of the job, one after the other, using a single fetching service
	 * which is closed when the batch is done.
	 *
	 * @param connector not used by this implementation; the connector received at construction is used instead
	 */
	@Override
	public void execute(Connector connector) {
		//FIXME The connector parameter is ignored: confirm it is the same instance as the one given at construction
		try (HttpURLFetchingService fetchingService = connectorHttp.newFetchingService()) {
			for (ConnectorHttpDocument httpDocument : documents) {
				Page page = fetch(fetchingService, httpDocument);
				if (page != null) {
					parseAndLogErrors(httpDocument, page);
				}
			}
		}
	}

	/**
	 * Downloads the page of a document and records how long the download took, whether it succeeded or not.
	 *
	 * @return the downloaded page, or {@code null} when the download failed (the failure has then already been
	 * handled by {@link #handleFetchException})
	 */
	private Page fetch(HttpURLFetchingService fetchingService, ConnectorHttpDocument httpDocument) {
		long beforeFetch = System.currentTimeMillis();
		try {
			Page page = fetchingService.fetch(httpDocument.getURL());
			recordDownloadTime(httpDocument, beforeFetch);
			return page;

		} catch (URLFetchingServiceRuntimeException e) {
			// The download time is set before handling the failure, since the handler may push the document
			recordDownloadTime(httpDocument, beforeFetch);
			handleFetchException(httpDocument, e);
			return null;
		}
	}

	/**
	 * Sets the download time of the document to the milliseconds elapsed since {@code beforeFetch}.
	 */
	private void recordDownloadTime(ConnectorHttpDocument httpDocument, long beforeFetch) {
		long afterFetch = System.currentTimeMillis();
		httpDocument.setDownloadTime((double) afterFetch - beforeFetch);
	}

	/**
	 * Parses a downloaded page, logging any failure instead of propagating it, so the remaining documents of the
	 * batch are still processed.
	 */
	private void parseAndLogErrors(ConnectorHttpDocument httpDocument, Page page) {
		try {
			parse(httpDocument, page);
		} catch (ConnectorHttpDocumentFetchException e) {
			connectorLogger.error(e);
		} catch (Throwable t) {
			connectorLogger.errorUnexpected(t);
		}
	}

	/**
	 * Records a download failure on the document. The errors count restarts from zero when the error code differs
	 * from the one already on the document. Once the count reaches {@link #ERRORS_COUNT_DELETE_THRESHOLD}, the
	 * events of the document are deleted; otherwise the document is pushed to the event observer.
	 */
	private void handleFetchException(ConnectorHttpDocument httpDocument, URLFetchingServiceRuntimeException e) {

		httpDocument.setFetched(true);
		httpDocument.setStatus(ConnectorDocumentStatus.ERROR);

		// Null-safe comparison: an exception without error code must not abort the whole batch with a NPE
		boolean sameErrorAsBefore = e.getErrorCode() == null
				? httpDocument.getErrorCode() == null
				: e.getErrorCode().equals(httpDocument.getErrorCode());
		if (!sameErrorAsBefore) {
			httpDocument.resetErrorsCount();
		}
		httpDocument.incrementErrorsCount();
		httpDocument.setErrorCode(e.getErrorCode());
		httpDocument.setErrorMessage(e.getDescription());
		httpDocument.setFetchedDateTime(TimeProvider.getLocalDateTime());

		if (httpDocument.getErrorsCount() >= ERRORS_COUNT_DELETE_THRESHOLD) {
			connectorHttp.getEventObserver().deleteEvents(httpDocument);
		} else {
			List<ConnectorDocument> failedDocuments = asList((ConnectorDocument) httpDocument);
			connectorHttp.getEventObserver().push(failedDocuments);
		}
	}

	/**
	 * Marks the document as fetched with an OK status, then parses the page as HTML when it is an
	 * {@link HtmlPage}, or as binary content otherwise.
	 *
	 * @throws ConnectorHttpDocumentFetchException when the parsing of the page fails
	 */
	private void parse(ConnectorHttpDocument httpDocument, Page page)
			throws ConnectorHttpDocumentFetchException {
		httpDocument.setFetched(true)
				.setStatus(ConnectorDocumentStatus.OK)
				.setFetchedDateTime(TimeProvider.getLocalDateTime());
		if (page instanceof HtmlPage) {
			parseHtml(httpDocument, (HtmlPage) page);

		} else {
			parseBinary(httpDocument, page);
		}
	}

	/**
	 * Parses the response body of a non-HTML page with the file parser and fills the document with the parsed
	 * content, its digest, its mimetype and a title extracted from the URL.
	 * <p>
	 * Every exception raised while reading or parsing the content - including the
	 * {@link ConnectorHttpDocumentFetchException}s created in this method - is caught and recorded on the document
	 * (error code, message, stack trace, errors count). The document is pushed to the event observer in both the
	 * success and the failure cases.
	 * <p>
	 * NOTE(review): on failure the status set by {@link #parse} stays OK and nothing is sent to the connector
	 * logger - confirm this is intended.
	 */
	private void parseBinary(ConnectorHttpDocument httpDocument, Page page)
			throws ConnectorHttpDocumentFetchException {

		try {
			InputStream inputStream = null;
			try {
				try {
					inputStream = page.getWebResponse().getContentAsStream();
				} catch (IOException e) {
					//TODO Test!
					throw new ConnectorHttpDocumentFetchException_CannotDownloadDocument(httpDocument.getURL(), e);
				}
				ParsedContent parsedContent = fileParser.parse(inputStream, true);
				if (parsedContent.getParsedContent().isEmpty()) {
					//TODO Test!
					throw new ConnectorHttpDocumentFetchException_DocumentHasNoParsedContent(httpDocument.getURL());
				}

				httpDocument.addStringProperty("lastModified", page.getWebResponse().getResponseHeaderValue("Last-Modified"));
				httpDocument.addStringProperty("charset", page.getWebResponse().getContentCharset());
				httpDocument.addStringProperty("language", parsedContent.getLanguage());
				httpDocument.setParsedContent(parsedContent.getParsedContent());

				httpDocument.setTitle(extractFilename(httpDocument.getURL()));
				httpDocument.setDigest(hashingService.getHashFromString(parsedContent.getParsedContent()));
				httpDocument.setMimetype(parsedContent.getMimetypeWithoutCharset());

			} catch (FileParserException e) {
				//TODO Test!
				throw new ConnectorHttpDocumentFetchException_CannotParseDocument(httpDocument.getURL(), e);

			} catch (HashingServiceException e) {
				throw new ImpossibleRuntimeException(e);

			} finally {
				// inputStream is still null when the stream could not be obtained; closeQuietly accepts null
				IOUtils.closeQuietly(inputStream);
			}

			// Successful parsing: clear the errors left by previous attempts
			httpDocument.setErrorCode(null)
					.setErrorMessage(null)
					.setErrorStackTrace(null)
					.resetErrorsCount()
					.setManualTokens(Record.PUBLIC_TOKEN);

		} catch (Exception e) {
			httpDocument.setErrorCode(ConnectorHttpFetchJob.class.getSimpleName() + ".parseBinary()")
					.setErrorMessage(ExceptionUtils.getMessage(e))
					.setErrorStackTrace(ExceptionUtils.getFullStackTrace(e))
					.incrementErrorsCount();
		}

		saveDocumentDigestAndDetectCopy(httpDocument);
		connectorHttp.getEventObserver().push(asList((ConnectorDocument) httpDocument));
	}

	/**
	 * Parses an HTML page, creates an unfetched document for every linked URL that is new to the context (only
	 * while the link level does not exceed the max level), fills the document with the parsing results and pushes
	 * the document together with the newly created ones to the event observer.
	 *
	 * @throws ConnectorHttpDocumentFetchException when the page parser fails
	 */
	private void parseHtml(ConnectorHttpDocument httpDocument, HtmlPage page)
			throws ConnectorHttpDocumentFetchException {
		HtmlPageParserResults results = pageParser.parse(httpDocument.getURL(), page);

		List<ConnectorDocument> savedDocuments = new ArrayList<>();
		List<String> urls = new ArrayList<>(results.getLinkedUrls());
		int linksLevel = httpDocument.getLevel() + 1;
		if (linksLevel <= maxLevel) {
			for (String url : urls) {
				if (context.isNewUrl(url)) {
					// Marked right away so the same URL is not queued twice
					context.markAsFetched(url);

					savedDocuments.add(connectorHttp.newUnfetchedURLDocument(url, linksLevel));
				}
			}
		}

		// NOTE(review): if ensureNotStopped() interrupts the job here, the URLs marked above are never pushed -
		// confirm whether this check should happen before the links are marked
		ensureNotStopped();
		setJobStep("Fetching " + httpDocument.getURL());

		// Pages without title are titled with the last segment of their URL
		String title = results.getTitle() == null ? extractFilename(httpDocument.getURL()) : results.getTitle();

		httpDocument.setManualTokens(Record.PUBLIC_TOKEN);
		savedDocuments.add(httpDocument
				.setTitle(title)
				.setErrorCode(null)
				.setErrorMessage(null)
				.setErrorStackTrace(null)
				.resetErrorsCount()
				.setParsedContent(results.getParsedContent())
				.setDigest(results.getDigest())
				//.setOutlinks(urls)
				.setMimetype(results.getMimetype())
				.addStringProperty("lastModified", page.getWebResponse().getResponseHeaderValue("Last-Modified"))
				.addStringProperty("charset", page.getWebResponse().getContentCharset())
				.addStringProperty("language", results.getLanguage()));

		saveDocumentDigestAndDetectCopy(httpDocument);
		connectorHttp.getEventObserver().push(savedDocuments);
	}

	/**
	 * Returns the part of the URL following its last '/', or the whole URL when it contains no '/'.
	 * A URL ending with '/' gives an empty string.
	 */
	private String extractFilename(String url) {
		int lastSlash = url.lastIndexOf("/");
		if (lastSlash == -1) {
			return url;
		} else {
			return url.substring(lastSlash + 1);
		}
	}

	/**
	 * Keeps the digests tracked by the context in sync with the document and flags the document as a copy when
	 * the context already associates its digest with another URL.
	 * <p>
	 * The document is first set searchable and not a copy. If the record is already saved and its original digest
	 * differs from the current one, the original digest is removed from the context for this URL. Then, when the
	 * document has a digest: if the context returns a different URL for that digest, the document loses its parsed
	 * content, is set as a copy of that URL and is made non-searchable; otherwise the digest is added to the
	 * context for this URL.
	 */
	private void saveDocumentDigestAndDetectCopy(ConnectorHttpDocument httpDocument) {
		Record record = httpDocument.getWrappedRecord();

		httpDocument.setSearchable(true);
		httpDocument.setCopyOf(null);
		String originalDigest = null;
		if (record.isSaved()) {
			originalDigest = record.getCopyOfOriginalRecord().get(es.connectorHttpDocument.digest());
		}
		if (originalDigest != null && !originalDigest.equals(httpDocument.getDigest())) {
			context.removeDocumentDigest(originalDigest, httpDocument.getURL());
		}

		if (httpDocument.getDigest() != null) {
			String documentUrlWithDigest = context.getDocumentUrlWithDigest(httpDocument.getDigest());
			if (documentUrlWithDigest != null && !httpDocument.getURL().equals(documentUrlWithDigest)) {
				httpDocument.setParsedContent(null);
				httpDocument.setCopyOf(documentUrlWithDigest);
				httpDocument.setSearchable(false);
			} else {
				context.addDocumentDigest(httpDocument.getDigest(), httpDocument.getURL());
			}
		}

	}

}