package fr.openwide.core.jpa.externallinkchecker.business.service; import static fr.openwide.core.jpa.externallinkchecker.property.JpaExternalLinkCheckerPropertyIds.BATCH_SIZE; import static fr.openwide.core.jpa.externallinkchecker.property.JpaExternalLinkCheckerPropertyIds.MAX_REDIRECTS; import static fr.openwide.core.jpa.externallinkchecker.property.JpaExternalLinkCheckerPropertyIds.MIN_DELAY_BETWEEN_TWO_CHECKS_IN_DAYS; import static fr.openwide.core.jpa.externallinkchecker.property.JpaExternalLinkCheckerPropertyIds.RETRY_ATTEMPTS_NUMBER; import static fr.openwide.core.jpa.externallinkchecker.property.JpaExternalLinkCheckerPropertyIds.THREAD_POOL_SIZE; import static fr.openwide.core.jpa.externallinkchecker.property.JpaExternalLinkCheckerPropertyIds.TIMEOUT; import static fr.openwide.core.jpa.externallinkchecker.property.JpaExternalLinkCheckerPropertyIds.USER_AGENT; import io.mola.galimatias.GalimatiasParseException; import java.io.IOException; import java.net.SocketTimeoutException; import java.net.URI; import java.net.URISyntaxException; import java.util.Collection; import java.util.Date; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import javax.annotation.PostConstruct; import javax.annotation.PreDestroy; import javax.net.ssl.SSLHandshakeException; import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.http.HttpStatus; import org.apache.http.StatusLine; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpHead; import org.apache.http.client.methods.HttpRequestBase; import org.apache.http.conn.ConnectionPoolTimeoutException; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.message.BasicHeader; import org.apache.http.protocol.HTTP; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.ConfigurableApplicationContext; import org.springframework.stereotype.Service; import org.springframework.util.StringUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.LinkedListMultimap; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Multimap; import fr.openwide.core.jpa.exception.SecurityServiceException; import fr.openwide.core.jpa.exception.ServiceException; import fr.openwide.core.jpa.externallinkchecker.business.model.ExternalLinkErrorType; import fr.openwide.core.jpa.externallinkchecker.business.model.ExternalLinkStatus; import fr.openwide.core.jpa.externallinkchecker.business.model.ExternalLinkWrapper; import fr.openwide.core.spring.property.service.IPropertyService; @Service("externalLinkCheckerService") public class ExternalLinkCheckerServiceImpl implements IExternalLinkCheckerService { private static final Logger LOGGER = LoggerFactory.getLogger(ExternalLinkCheckerServiceImpl.class); @Autowired private IExternalLinkWrapperService externalLinkWrapperService; @Autowired private ConfigurableApplicationContext applicationContext; @Autowired private IPropertyService propertyService; private CloseableHttpClient httpClient = null; /** * we can put here some URLs known to fail. Otherwise, it's better to do it directly in the application. */ private List<Pattern> ignorePatterns = Lists.newArrayList( //Pattern.compile("^http://translate.googleusercontent.com/.*") ); private int batchSize; private int minDelayBetweenTwoChecks; @PostConstruct private void initialize() { RequestConfig requestConfig = RequestConfig.custom() .setMaxRedirects(propertyService.get(MAX_REDIRECTS)) .setSocketTimeout(propertyService.get(TIMEOUT)) .setConnectionRequestTimeout(propertyService.get(TIMEOUT)) .setConnectTimeout(propertyService.get(TIMEOUT)) .setStaleConnectionCheckEnabled(true) // waiting for this to be resolved: https://issues.apache.org/jira/browse/HTTPCLIENT-1656 .build(); httpClient = HttpClientBuilder.create() .setUserAgent(propertyService.get(USER_AGENT)) .setDefaultRequestConfig(requestConfig) .setDefaultHeaders(Lists.newArrayList( new BasicHeader(HTTP.CONN_DIRECTIVE, HTTP.CONN_CLOSE), new BasicHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"), new BasicHeader("Accept-Language", "fr,en;q=0.8,fr-fr;q=0.6,en-us;q=0.4,en-gb;q=0.2") )) .build(); batchSize = propertyService.get(BATCH_SIZE); minDelayBetweenTwoChecks = propertyService.get(MIN_DELAY_BETWEEN_TWO_CHECKS_IN_DAYS); } @PreDestroy private void destroy() { if (httpClient != null) { try { httpClient.close(); } catch (IOException e) { LOGGER.error("Unable to close the HTTP client", e); } } } // Public methods @Override public void checkBatch() throws ServiceException, SecurityServiceException { List<ExternalLinkWrapper> links = externalLinkWrapperService.listNextCheckingBatch( batchSize, minDelayBetweenTwoChecks); runTasksInParallel(createTasksByDomain(links), 10, TimeUnit.HOURS); } @Override public void checkLink(final ExternalLinkWrapper link) throws ServiceException, SecurityServiceException { try { io.mola.galimatias.URL url = io.mola.galimatias.URL.parse(link.getUrl()); checkLinksWithSameUrl(url, ImmutableList.of(link)); } catch (GalimatiasParseException e) { markAsInvalid(link); } } @Override public void checkLinksWithSameUrl(io.mola.galimatias.URL url, Collection<ExternalLinkWrapper> links) throws ServiceException, SecurityServiceException { StatusLine httpStatus = null; ExternalLinkErrorType errorType = null; // Special casing the ignore patterns for (Pattern pattern : ignorePatterns) { if (pattern.matcher(url.toHumanString()).matches()) { markAsIgnored(links); return; } } URI uri; try { uri = url.toJavaURI(); } catch (URISyntaxException e) { // java.net.URI is buggy and doesn't support domain names with underscores. As galimatias already check // the URL format, if we are here, it's probably because java.net.URI doesn't handle this case very well // so we simply ignore the link instead of marking it as dead. // see http://bugs.java.com/view_bug.do?bug_id=6587184 and https://issues.apache.org/jira/browse/HTTPCLIENT-911 markAsIgnored(links); return; } boolean headIfTrueElseGet = true; boolean retry = false; int numAttempts = 0; do { // Check the URL and update the links try { // If we are retrying, be sure to have these variables at initial values errorType = null; httpStatus = null; // We try a HEAD request if (headIfTrueElseGet) { httpStatus = sendRequest(new HttpHead(uri)); if (httpStatus != null && httpStatus.getStatusCode() != HttpStatus.SC_OK) { // If the result of the HEAD request is not OK, we try a GET request // Using HttpStatus.SC_METHOD_NOT_ALLOWED looked like a clever trick but a lot of sites return // 400 or 500 errors for HEAD requests httpStatus = sendRequest(new HttpGet(uri)); } } else { // If HEAD request didn't work, we try a GET request httpStatus = sendRequest(new HttpGet(uri)); } if (httpStatus == null) { errorType = ExternalLinkErrorType.UNKNOWN_HTTPCLIENT_ERROR; } else if (httpStatus.getStatusCode() != HttpStatus.SC_OK) { errorType = ExternalLinkErrorType.HTTP; } } catch (IllegalArgumentException e) { errorType = ExternalLinkErrorType.INVALID_IDN; LOGGER.debug("IllegalArgumentException while checking external link (" + uri.toString() + ").", e); } catch (SocketTimeoutException | ClientProtocolException e) { // If HEAD request failed with socket timeout, let's try with GET request if (headIfTrueElseGet) { // Sample url : http://myspace.com/ (same results with https) // If we try with curl or wget, it seems to give us the same waiting result. // curl --head 'http://myspace.com/' // wget --method=HEAD 'http://myspace.com/' headIfTrueElseGet = false; retry = true; LOGGER.warn("HEAD request on external link (" + uri.toString() + ") resulted in a timeout" + " or another weird exception, we will try a GET request.", e); } else { errorType = ExternalLinkErrorType.fromException(e); } } catch (SSLHandshakeException e) { // certificate not supported by Java: we ignore the links markAsIgnored(links); return; } catch (ConnectionPoolTimeoutException e) { // If we have connection pool problem, the problem is at our side, we don't affect links status. LOGGER.debug("ConnectionPoolTimeoutException while checking external link (" + uri.toString() + ").", e); return; } catch (IOException e) { errorType = ExternalLinkErrorType.fromException(e); } numAttempts++; // If retry is needed, go back to start of do...while // We never try more than 2 attempts } while (numAttempts < 2 && retry); if (errorType == null) { markAsOnline(links); } else { markAsOfflineOrDead(links, errorType, httpStatus); } } // Private methods private Collection<Callable<Void>> createTasksByDomain(List<ExternalLinkWrapper> links) throws ServiceException, SecurityServiceException { Collection<Callable<Void>> tasks = Lists.newArrayList(); Map<String, Multimap<io.mola.galimatias.URL, Long>> domainToUrlToIds = Maps.newLinkedHashMap(); for (ExternalLinkWrapper link : links) { try { // there's no need to normalize the URL further as galimatias already normalizes the host and it's the // only part we can normalize. io.mola.galimatias.URL url = io.mola.galimatias.URL.parse(link.getUrl()); String domain = url.host().toHumanString(); Multimap<io.mola.galimatias.URL, Long> urlToIds = domainToUrlToIds.get(domain); if (urlToIds == null) { urlToIds = LinkedListMultimap.create(); domainToUrlToIds.put(domain, urlToIds); } urlToIds.put(url, link.getId()); } catch (RuntimeException | GalimatiasParseException e) { // if we cannot parse the URI, there's no need to go further, we mark it as invalid and we ignore it markAsInvalid(link); } } for (Multimap<io.mola.galimatias.URL, Long> urlToIds : domainToUrlToIds.values()) { tasks.add(new ExternalLinkCheckByDomainTask(applicationContext, urlToIds.asMap())); } return tasks; } private void markAsInvalid(ExternalLinkWrapper link) throws ServiceException, SecurityServiceException { link.setLastCheckDate(new Date()); link.setStatus(ExternalLinkStatus.DEAD_LINK); link.setLastErrorType(ExternalLinkErrorType.URI_SYNTAX); externalLinkWrapperService.update(link); } private void markAsIgnored(Collection<ExternalLinkWrapper> links) throws ServiceException, SecurityServiceException { Date checkDate = new Date(); for (ExternalLinkWrapper link : links) { link.setLastCheckDate(checkDate); link.setConsecutiveFailures(0); link.setStatus(ExternalLinkStatus.IGNORED); externalLinkWrapperService.update(link); } } private void markAsOnline(Collection<ExternalLinkWrapper> links) throws ServiceException, SecurityServiceException { Date checkDate = new Date(); for (ExternalLinkWrapper link : links) { link.setLastCheckDate(checkDate); link.setConsecutiveFailures(0); link.setStatus(ExternalLinkStatus.ONLINE); link.setLastStatusCode(HttpStatus.SC_OK); link.setFailureAudit(null); externalLinkWrapperService.update(link); } } private void markAsOfflineOrDead(Collection<ExternalLinkWrapper> links, final ExternalLinkErrorType errorType, final StatusLine httpStatus) throws ServiceException, SecurityServiceException { Date checkDate = new Date(); for (ExternalLinkWrapper link : links) { link.setLastCheckDate(checkDate); link.setLastErrorType(errorType); int consecutiveFailures = link.getConsecutiveFailures() + 1; link.setConsecutiveFailures(consecutiveFailures); Integer statusCode; if (httpStatus != null) { statusCode = httpStatus.getStatusCode(); } else { statusCode = null; } link.setLastStatusCode(statusCode); ExternalLinkStatus status; if (link.getConsecutiveFailures() >= propertyService.get(RETRY_ATTEMPTS_NUMBER)) { status = ExternalLinkStatus.DEAD_LINK; } else { status = ExternalLinkStatus.OFFLINE; } link.setStatus(status); // Failure audit StringBuilder failureAuditBuilder = new StringBuilder(); String failureAudit = link.getFailureAudit(); if (StringUtils.hasText(failureAudit)) { failureAuditBuilder.append(failureAudit).append("\n"); } failureAuditBuilder.append( new ToStringBuilder(null, ToStringStyle.DEFAULT_STYLE) .append("checkDate", checkDate) .append("errorType", errorType) .append("consecutiveFailures", consecutiveFailures) .append("statusCode", statusCode) .append("status", status) .build() ); link.setFailureAudit(failureAuditBuilder.toString()); externalLinkWrapperService.update(link); } } private StatusLine sendRequest(HttpRequestBase request) throws IOException { CloseableHttpResponse response = null; try { response = httpClient.execute(request); return response.getStatusLine(); } finally { if (request != null) { request.reset(); } if (response != null) { try { response.close(); } catch (IOException e) { LOGGER.error("Unable to close the HTTP response", e); } } } } private void runTasksInParallel(Collection<? extends Callable<Void>> tasks, long timeout, TimeUnit timeoutUnit) throws ServiceException { final int threadPoolSize = propertyService.get(THREAD_POOL_SIZE); final ThreadPoolExecutor executor = new ThreadPoolExecutor( threadPoolSize, threadPoolSize, 100, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>()); executor.prestartAllCoreThreads(); try { List<Future<Void>> futures = executor.invokeAll(tasks, timeout, timeoutUnit); for (Future<Void> future : futures) { future.get(); // Check that no error has occurred } } catch (RuntimeException | InterruptedException | ExecutionException e) { throw new ServiceException("Interrupted request", e); } finally { try { executor.shutdown(); } catch (RuntimeException e) { LOGGER.warn("An error occurred while shutting down threads", e); } } } @Override public void addIgnorePattern(Pattern ignorePattern) { this.ignorePatterns.add(ignorePattern); } }