/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.ingestion.google.webmaster; import java.io.IOException; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.Queue; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentLinkedDeque; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import org.apache.commons.lang3.tuple.Pair; import com.google.api.client.googleapis.batch.BatchRequest; import com.google.api.client.googleapis.batch.json.JsonBatchCallback; import com.google.api.client.repackaged.com.google.common.base.Preconditions; import com.google.api.services.webmasters.model.ApiDimensionFilter; import com.google.api.services.webmasters.model.SearchAnalyticsQueryResponse; import com.google.common.base.Optional; import lombok.extern.slf4j.Slf4j; import gobblin.configuration.State; import gobblin.util.ExecutorsUtils; import gobblin.util.limiter.RateBasedLimiter; import static gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.Dimension; import static gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.FilterOperator; import static gobblin.ingestion.google.webmaster.GoogleWebmasterFilter.countryFilterToString; @Slf4j public class GoogleWebmasterDataFetcherImpl extends GoogleWebmasterDataFetcher { private final double API_REQUESTS_PER_SECOND; private final RateBasedLimiter LIMITER; private final int GET_PAGE_SIZE_TIME_OUT; private final int RETRY; private final String _siteProperty; private final GoogleWebmasterClient _client; private final List<ProducerJob> _jobs; GoogleWebmasterDataFetcherImpl(String siteProperty, GoogleWebmasterClient client, State wuState) throws IOException { _siteProperty = siteProperty; Preconditions.checkArgument(_siteProperty.endsWith("/"), "The site property must end in \"/\""); _client = client; _jobs = getHotStartJobs(wuState); API_REQUESTS_PER_SECOND = wuState.getPropAsDouble(GoogleWebMasterSource.KEY_PAGES_TUNING_REQUESTS_PER_SECOND, 5.0); GET_PAGE_SIZE_TIME_OUT = wuState.getPropAsInt(GoogleWebMasterSource.KEY_PAGES_TUNING_TIME_OUT, 2); LIMITER = new RateBasedLimiter(API_REQUESTS_PER_SECOND, TimeUnit.SECONDS); RETRY = wuState.getPropAsInt(GoogleWebMasterSource.KEY_PAGES_TUNING_MAX_RETRIES, 120); } private static List<ProducerJob> getHotStartJobs(State wuState) { String hotStartString = wuState.getProp(GoogleWebMasterSource.KEY_REQUEST_HOT_START, ""); if (!hotStartString.isEmpty()) { return SimpleProducerJob.deserialize(hotStartString); } return new ArrayList<>(); } /** * Due to the limitation of the API, we can get a maximum of 5000 rows at a time. Another limitation is that, results are sorted by click count descending. If two rows have the same click count, they are sorted in an arbitrary way. (Read more at https://developers.google.com/webmaster-tools/v3/searchanalytics). So we try to get all pages by partitions, if a partition has 5000 rows returned. We try partition current partition into more granular levels. * */ @Override public Collection<ProducerJob> getAllPages(String startDate, String endDate, String country, int rowLimit) throws IOException { if (!_jobs.isEmpty()) { log.info("Service got hot started."); return _jobs; } ApiDimensionFilter countryFilter = GoogleWebmasterFilter.countryEqFilter(country); List<GoogleWebmasterFilter.Dimension> requestedDimensions = new ArrayList<>(); requestedDimensions.add(GoogleWebmasterFilter.Dimension.PAGE); Collection<String> allPages = _client .getPages(_siteProperty, startDate, endDate, country, rowLimit, requestedDimensions, Arrays.asList(countryFilter), 0); int actualSize = allPages.size(); if (rowLimit < GoogleWebmasterClient.API_ROW_LIMIT || actualSize < GoogleWebmasterClient.API_ROW_LIMIT) { log.info(String .format("A total of %d pages fetched for property %s at country-%s from %s to %s", actualSize, _siteProperty, country, startDate, endDate)); } else { int expectedSize = getPagesSize(startDate, endDate, country, requestedDimensions, Arrays.asList(countryFilter)); log.info(String.format("Total number of pages is %d for market-%s from %s to %s", expectedSize, GoogleWebmasterFilter.countryFilterToString(countryFilter), startDate, endDate)); Queue<Pair<String, FilterOperator>> jobs = new ArrayDeque<>(); expandJobs(jobs, _siteProperty); allPages = getPages(startDate, endDate, requestedDimensions, countryFilter, jobs); allPages.add(_siteProperty); actualSize = allPages.size(); if (actualSize != expectedSize) { log.warn(String .format("Expected page size for country-%s is %d, but only able to get %d", country, expectedSize, actualSize)); } log.info(String .format("A total of %d pages fetched for property %s at country-%s from %s to %s", actualSize, _siteProperty, country, startDate, endDate)); } ArrayDeque<ProducerJob> jobs = new ArrayDeque<>(actualSize); for (String page : allPages) { jobs.add(new SimpleProducerJob(page, startDate, endDate)); } return jobs; } private int getPagesSize(final String startDate, final String endDate, final String country, final List<Dimension> requestedDimensions, final List<ApiDimensionFilter> apiDimensionFilters) throws IOException { final ExecutorService es = Executors.newCachedThreadPool( ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of(this.getClass().getSimpleName()))); int startRow = 0; long groupSize = Math.max(1, Math.round(API_REQUESTS_PER_SECOND)); List<Future<Integer>> results = new ArrayList<>((int) groupSize); while (true) { for (int i = 0; i < groupSize; ++i) { startRow += GoogleWebmasterClient.API_ROW_LIMIT; final int start = startRow; final String interruptedMsg = String .format("Interrupted while trying to get the size of all pages for %s. Current start row is %d.", country, start); Future<Integer> submit = es.submit(new Callable<Integer>() { @Override public Integer call() { log.info(String.format("Getting page size from %s...", start)); while (true) { try { LIMITER.acquirePermits(1); } catch (InterruptedException e) { log.error("RateBasedLimiter: " + interruptedMsg, e); return -1; } if (Thread.interrupted()) { log.error(interruptedMsg); return -1; } try { List<String> pages = _client .getPages(_siteProperty, startDate, endDate, country, GoogleWebmasterClient.API_ROW_LIMIT, requestedDimensions, apiDimensionFilters, start); if (pages.size() < GoogleWebmasterClient.API_ROW_LIMIT) { return pages.size() + start; //Figured out the size } else { return -1; } } catch (IOException e) { log.info(String.format("Getting page size from %s failed. Retrying...", start)); } } } }); results.add(submit); } //Check the results group in order. The first non-negative count indicates the size of total pages. for (Future<Integer> result : results) { try { Integer integer = result.get(GET_PAGE_SIZE_TIME_OUT, TimeUnit.MINUTES); if (integer >= 0) { es.shutdownNow(); return integer; } } catch (InterruptedException | ExecutionException e) { throw new RuntimeException(e); } catch (TimeoutException e) { throw new RuntimeException(String .format("Exceeding the timeout of %d minutes while getting the total size of all pages.", GET_PAGE_SIZE_TIME_OUT), e); } } results.clear(); } } /** * Get all pages in an async mode. */ private Collection<String> getPages(String startDate, String endDate, List<Dimension> dimensions, ApiDimensionFilter countryFilter, Queue<Pair<String, FilterOperator>> toProcess) throws IOException { String country = GoogleWebmasterFilter.countryFilterToString(countryFilter); ConcurrentLinkedDeque<String> allPages = new ConcurrentLinkedDeque<>(); int r = 0; while (r <= RETRY) { ++r; log.info(String.format("Get pages at round %d with size %d.", r, toProcess.size())); ConcurrentLinkedDeque<Pair<String, FilterOperator>> nextRound = new ConcurrentLinkedDeque<>(); ExecutorService es = Executors.newFixedThreadPool(10, ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of(this.getClass().getSimpleName()))); while (!toProcess.isEmpty()) { submitJob(toProcess.poll(), countryFilter, startDate, endDate, dimensions, es, allPages, nextRound); } //wait for jobs to finish and start next round if necessary. try { es.shutdown(); boolean terminated = es.awaitTermination(5, TimeUnit.MINUTES); if (!terminated) { es.shutdownNow(); log.warn(String .format("Timed out while getting all pages for country-%s at round %d. Next round now has size %d.", country, r, nextRound.size())); } } catch (InterruptedException e) { throw new RuntimeException(e); } if (nextRound.isEmpty()) { break; } toProcess = nextRound; } if (r == RETRY) { throw new RuntimeException(String .format("Getting all pages reaches the maximum number of retires %d. Date range: %s ~ %s. Country: %s.", RETRY, startDate, endDate, country)); } return allPages; } private void submitJob(final Pair<String, FilterOperator> job, final ApiDimensionFilter countryFilter, final String startDate, final String endDate, final List<Dimension> dimensions, ExecutorService es, final ConcurrentLinkedDeque<String> allPages, final ConcurrentLinkedDeque<Pair<String, FilterOperator>> nextRound) { es.submit(new Runnable() { @Override public void run() { try { LIMITER.acquirePermits(1); } catch (InterruptedException e) { throw new RuntimeException("RateBasedLimiter got interrupted.", e); } String countryString = countryFilterToString(countryFilter); List<ApiDimensionFilter> filters = new LinkedList<>(); filters.add(countryFilter); String prefix = job.getLeft(); FilterOperator operator = job.getRight(); String jobString = String.format("job(prefix: %s, operator: %s)", prefix, operator); filters.add(GoogleWebmasterFilter.pageFilter(operator, prefix)); List<String> pages; try { pages = _client .getPages(_siteProperty, startDate, endDate, countryString, GoogleWebmasterClient.API_ROW_LIMIT, dimensions, filters, 0); log.debug(String .format("%d pages fetched for %s market-%s from %s to %s.", pages.size(), jobString, countryString, startDate, endDate)); } catch (IOException e) { log.debug(String.format("%s failed due to %s. Retrying...", jobString, e.getMessage())); nextRound.add(job); return; } //If the number of pages is at the LIMIT, it must be a "CONTAINS" job. //We need to create sub-tasks, and check current page with "EQUALS" if (pages.size() == GoogleWebmasterClient.API_ROW_LIMIT) { log.info(String.format("Expanding the prefix '%s'", prefix)); expandJobs(nextRound, prefix); nextRound.add(Pair.of(prefix, FilterOperator.EQUALS)); } else { //Otherwise, we've done with current job. allPages.addAll(pages); } } }); } private void expandJobs(Queue<Pair<String, FilterOperator>> jobs, String prefix) { for (String expanded : getUrlPartitions(prefix)) { jobs.add(Pair.of(expanded, FilterOperator.CONTAINS)); } } /** * This doesn't cover all cases but more than 99.9% captured. * * According to the standard (RFC-3986), here are possible characters: * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" * reserved = gen-delims / sub-delims * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" * * * Not included: * reserved = gen-delims / sub-delims * gen-delims = "[" / "]" * sub-delims = "(" / ")" / "," / ";" */ private ArrayList<String> getUrlPartitions(String prefix) { ArrayList<String> expanded = new ArrayList<>(); //The page prefix is case insensitive, A-Z is not necessary. for (char c = 'a'; c <= 'z'; ++c) { expanded.add(prefix + c); } for (int num = 0; num <= 9; ++num) { expanded.add(prefix + num); } expanded.add(prefix + "-"); expanded.add(prefix + "."); expanded.add(prefix + "_"); //most important expanded.add(prefix + "~"); expanded.add(prefix + "/"); //most important expanded.add(prefix + "%"); //most important expanded.add(prefix + ":"); expanded.add(prefix + "?"); expanded.add(prefix + "#"); expanded.add(prefix + "@"); expanded.add(prefix + "!"); expanded.add(prefix + "$"); expanded.add(prefix + "&"); expanded.add(prefix + "+"); expanded.add(prefix + "*"); expanded.add(prefix + "'"); expanded.add(prefix + "="); return expanded; } @Override public List<String[]> performSearchAnalyticsQuery(String startDate, String endDate, int rowLimit, List<Dimension> requestedDimensions, List<Metric> requestedMetrics, Collection<ApiDimensionFilter> filters) throws IOException { SearchAnalyticsQueryResponse response = _client .createSearchAnalyticsQuery(_siteProperty, startDate, endDate, requestedDimensions, GoogleWebmasterFilter.andGroupFilters(filters), rowLimit, 0).execute(); return convertResponse(requestedMetrics, response); } @Override public void performSearchAnalyticsQueryInBatch(List<ProducerJob> jobs, List<ArrayList<ApiDimensionFilter>> filterList, List<JsonBatchCallback<SearchAnalyticsQueryResponse>> callbackList, List<Dimension> requestedDimensions, int rowLimit) throws IOException { BatchRequest batchRequest = _client.createBatch(); for (int i = 0; i < jobs.size(); ++i) { ProducerJob job = jobs.get(i); ArrayList<ApiDimensionFilter> filters = filterList.get(i); JsonBatchCallback<SearchAnalyticsQueryResponse> callback = callbackList.get(i); _client.createSearchAnalyticsQuery(_siteProperty, job.getStartDate(), job.getEndDate(), requestedDimensions, GoogleWebmasterFilter.andGroupFilters(filters), rowLimit, 0).queue(batchRequest, callback); } batchRequest.execute(); } @Override public String getSiteProperty() { return _siteProperty; } }