/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.source.extractor.extract.google; import java.io.Closeable; import java.io.IOException; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.codahale.metrics.Timer; import com.github.rholder.retry.RetryException; import com.github.rholder.retry.Retryer; import com.google.api.client.auth.oauth2.Credential; import com.google.api.services.analytics.Analytics; import com.google.api.services.analytics.model.UnsampledReport; import com.google.api.services.analytics.Analytics.Management.UnsampledReports.Insert; import com.google.api.services.drive.Drive; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Stopwatch; import com.google.common.collect.ImmutableMap; import com.google.common.io.Closer; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import static gobblin.retry.RetryerFactory.*; import static gobblin.configuration.ConfigurationKeys.*; import static gobblin.source.extractor.extract.google.GoogleCommonKeys.*; import static gobblin.source.extractor.extract.google.GoogleAnalyticsUnsampledSource.*; import gobblin.config.ConfigBuilder; import gobblin.configuration.State; import gobblin.configuration.WorkUnitState; import gobblin.instrumented.Instrumented; import gobblin.metrics.GobblinMetrics; import gobblin.retry.RetryerFactory; import gobblin.source.extractor.DataRecordException; import gobblin.source.extractor.Extractor; import gobblin.source.extractor.extract.LongWatermark; import gobblin.source.extractor.filebased.CsvFileDownloader; import gobblin.source.workunit.WorkUnit; import gobblin.writer.exception.NonTransientException; /** * Extracts Google Analytics(GA) unsampled report data. * GA provides unsampled report by client requesting it via GA asynchronous api and GA (server) creates unsampled report * on their background and put into Google drive by default. * (GoogleAnalyticsUnsampledExtractor currently does not support use case on Google cloud storage) * * While being created in background, GoogleAnalyticsExtractor will poll for status of the report request. Once report is generated, * GoogleAnalyticsUnsampledExtractor will use GoogleDriveExtractor to extract records. * * @param <S> * @param <D> */ public class GoogleAnalyticsUnsampledExtractor<S, D> implements Extractor<S, D> { private static final Logger LOG = LoggerFactory.getLogger(GoogleAnalyticsUnsampledExtractor.class); static final String GA_UNSAMPLED_REPORT_PREFIX = GA_REPORT_PREFIX + "unsampled."; static final String GA_UNSAMPLED_REPORT_CREATION_TIMER = GA_UNSAMPLED_REPORT_PREFIX + "creation.timer"; static final String REQUEST_RETRY_PREFIX = GA_REPORT_PREFIX + "request_retry."; static final String POLL_RETRY_PREFIX = GA_REPORT_PREFIX + "poll."; static final Config POLL_RETRY_DEFAULTS; static { Map<String, Object> configMap = ImmutableMap.<String, Object>builder() .put(RETRY_TIME_OUT_MS, TimeUnit.HOURS.toMillis(1L)) //Overall try to poll for 1 hour .put(RETRY_INTERVAL_MS, TimeUnit.MINUTES.toMillis(1L)) //Try to poll every 1 minutes .put(RETRY_TYPE, RetryType.FIXED.name()) .build(); POLL_RETRY_DEFAULTS = ConfigFactory.parseMap(configMap); }; static final String WATERMARK_INPUTFORMAT = "yyyyMMddHHmmss"; static final String DELETE_TEMP_UNSAMPLED_REPORT = GA_UNSAMPLED_REPORT_PREFIX + "delete_temp_unsampled_report"; static enum ReportCreationStatus { FAILED, PENDING, COMPLETED } static final String DOWNLOAD_TYPE_GOOGLE_DRIVE = "GOOGLE_DRIVE"; private final Closer closer = Closer.create(); private final Analytics gaService; private final WorkUnitState wuState; private final Extractor<S, D> actualExtractor; private final DateTimeFormatter googleAnalyticsFormatter; private final DateTimeFormatter watermarkFormatter; private final long nextWatermark; /** * For unsampled report, it will call GA service to produce unsampled CSV report into GoogleDrive so that getExtractor will * use Google drive to extract record from CSV file. * * @param wuState * @param sampleRate * @throws IOException */ public GoogleAnalyticsUnsampledExtractor(WorkUnitState wuState) throws IOException { this.wuState = wuState; this.googleAnalyticsFormatter = DateTimeFormat.forPattern(DATE_FORMAT) .withZone(DateTimeZone.forID(wuState.getProp(SOURCE_TIMEZONE, DEFAULT_SOURCE_TIMEZONE))); this.watermarkFormatter = DateTimeFormat.forPattern(WATERMARK_INPUTFORMAT) .withZone(DateTimeZone.forID(wuState.getProp(SOURCE_TIMEZONE, DEFAULT_SOURCE_TIMEZONE))); Credential credential = new GoogleCommon.CredentialBuilder(wuState.getProp(SOURCE_CONN_PRIVATE_KEY), wuState.getPropAsList(API_SCOPES)) .fileSystemUri(wuState.getProp(PRIVATE_KEY_FILESYSTEM_URI)) .proxyUrl(wuState.getProp(SOURCE_CONN_USE_PROXY_URL)) .port(wuState.getProp(SOURCE_CONN_USE_PROXY_PORT)) .serviceAccountId(wuState.getProp(SOURCE_CONN_USERNAME)) .build(); this.gaService = new Analytics.Builder(credential.getTransport(), GoogleCommon.getJsonFactory(), credential) .setApplicationName(Preconditions.checkNotNull(wuState.getProp(APPLICATION_NAME))) .build(); Drive driveClient = new Drive.Builder(credential.getTransport(), GoogleCommon.getJsonFactory(), Preconditions.checkNotNull(credential, "Credential is required")) .setApplicationName(Preconditions.checkNotNull(wuState.getProp(APPLICATION_NAME), "ApplicationName is required")) .build(); GoogleDriveFsHelper fsHelper = closer.register(new GoogleDriveFsHelper(wuState, driveClient)); UnsampledReport request = new UnsampledReport() .setAccountId(Preconditions.checkNotNull(wuState.getProp(ACCOUNT_ID), ACCOUNT_ID + " is required")) .setWebPropertyId(Preconditions.checkNotNull(wuState.getProp(WEB_PROPERTY_ID), WEB_PROPERTY_ID + " is required")) .setProfileId(Preconditions.checkNotNull(wuState.getProp(VIEW_ID), VIEW_ID + " is required")) .setTitle(Preconditions.checkNotNull(wuState.getProp(SOURCE_ENTITY), SOURCE_ENTITY + " is required.")) .setStartDate(convertFormat(wuState.getWorkunit().getLowWatermark(LongWatermark.class).getValue())) .setEndDate(convertFormat(wuState.getWorkunit().getExpectedHighWatermark(LongWatermark.class).getValue())) .setMetrics(Preconditions.checkNotNull(wuState.getProp(METRICS), METRICS + " is required.")) .setDimensions(wuState.getProp(DIMENSIONS)) //Optional .setSegment(wuState.getProp(SEGMENTS)) //Optional .setFilters(wuState.getProp(FILTERS)); //Optional UnsampledReport createdReport = prepareUnsampledReport(request, fsHelper, wuState.getPropAsBoolean(DELETE_TEMP_UNSAMPLED_REPORT, true)); DateTime nextWatermarkDateTime = googleAnalyticsFormatter.parseDateTime(createdReport.getEndDate()).plusDays(1); nextWatermark = Long.parseLong(watermarkFormatter.print(nextWatermarkDateTime)); this.actualExtractor = closer.register(new GoogleDriveExtractor<S, D>(copyOf(wuState), fsHelper)); } @VisibleForTesting GoogleAnalyticsUnsampledExtractor(WorkUnitState state, Extractor<S, D> actualExtractor, Analytics gaService) throws IOException { this.wuState = state; this.googleAnalyticsFormatter = DateTimeFormat.forPattern(DATE_FORMAT) .withZone(DateTimeZone.forID(state.getProp(SOURCE_TIMEZONE, DEFAULT_SOURCE_TIMEZONE))); this.watermarkFormatter = DateTimeFormat.forPattern(WATERMARK_INPUTFORMAT) .withZone(DateTimeZone.forID(state.getProp(SOURCE_TIMEZONE, DEFAULT_SOURCE_TIMEZONE))); this.actualExtractor = actualExtractor; this.gaService = gaService; this.nextWatermark = -1; } /** * Copy WorkUnitState so that work unit also contains job state. FileBasedExtractor needs properties from job state (mostly source.* properties), * where it has been already removed when reached here. * * @param src * @return */ private WorkUnitState copyOf(WorkUnitState src) { WorkUnit copiedWorkUnit = WorkUnit.copyOf(src.getWorkunit()); copiedWorkUnit.addAllIfNotExist(src.getJobState()); WorkUnitState workUnitState = new WorkUnitState(copiedWorkUnit, src.getJobState()); workUnitState.addAll(src); return workUnitState; } /** * Create unsampled report in Google drive and add google drive file id into state so that Google drive extractor * can extract record from it. Also, update the state to use CsvFileDownloader unless other downloader is defined. * * It also register closer to delete the file from Google Drive unless explicitly requested to not deleting it. * @return documentID of unsampled report in Google drive * @throws IOException * */ @VisibleForTesting UnsampledReport prepareUnsampledReport(UnsampledReport request, final GoogleDriveFsHelper fsHelper, boolean isDeleteTempReport) throws IOException { UnsampledReport createdReport = createUnsampledReports(request); final String fileId = createdReport.getDriveDownloadDetails().getDocumentId(); LOG.info("Temporary unsampled report created in Google Drive: " + fileId); if (isDeleteTempReport) { closer.register(new Closeable() { @Override public void close() throws IOException { LOG.info("Deleting created temporary unsampled report from Google drive " + fileId); fsHelper.deleteFile(fileId); } }); } else { LOG.warn("Temporary unsampled report will not be deleted as requested. File ID: " + fileId); } wuState.setProp(SOURCE_FILEBASED_FILES_TO_PULL, fileId); if (!wuState.contains(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS)) { wuState.setProp(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS, CsvFileDownloader.class.getName()); } return createdReport; } @VisibleForTesting UnsampledReport createUnsampledReports(UnsampledReport request) throws IOException { long startTimeInMillis = System.currentTimeMillis(); try { UnsampledReport requestedReport = requestUnsampledReport(request); UnsampledReport createdReport = pollForCompletion(wuState, gaService, requestedReport); createdReport.setEndDate(requestedReport.getEndDate()); return createdReport; } finally { long delta = System.currentTimeMillis() - startTimeInMillis; if (GobblinMetrics.isEnabled(wuState)) { Timer timer = Instrumented.getMetricContext(wuState, getClass()).timer(GA_UNSAMPLED_REPORT_CREATION_TIMER); Instrumented.updateTimer(Optional.of(timer), delta, TimeUnit.MILLISECONDS); } } } @VisibleForTesting UnsampledReport requestUnsampledReport(UnsampledReport request) throws IOException { String accountId = request.getAccountId(); String webPropertyId = request.getWebPropertyId(); String profileId = request.getProfileId(); request.setAccountId(null).setWebPropertyId(null).setProfileId(null); //GA somehow does not allow these values in it. final String endDate = request.getEndDate(); final Insert insertRequest = gaService.management() .unsampledReports() .insert(accountId, webPropertyId, profileId, request); Config config = ConfigBuilder.create().loadProps(wuState.getProperties(), REQUEST_RETRY_PREFIX).build(); Retryer<UnsampledReport> retryer = RetryerFactory.newInstance(config); LOG.info("Requesting to create unsampled report " + request); try { return retryer.call(new Callable<UnsampledReport>() { @Override public UnsampledReport call() throws Exception { UnsampledReport response = insertRequest.execute(); if (ReportCreationStatus.FAILED.name().equals(response.getStatus())) { //No retry if it's explicitly failed from server throw new NonTransientException("Failed to create unsampled report " + response); } response.setEndDate(endDate); //response does not have end date where we need it later for next watermark calculation. return response; } }); } catch (ExecutionException e) { throw new IOException(e); } catch (RetryException e) { throw new RuntimeException(e); } } /** * Converts date format from watermark format to Google analytics format * @param watermark * @return */ private String convertFormat(long watermark) { Preconditions.checkArgument(watermark > 0, "Watermark should be positive number."); return googleAnalyticsFormatter.print(watermarkFormatter.parseDateTime(Long.toString(watermark))); } @VisibleForTesting UnsampledReport pollForCompletion(State state, final Analytics gaService, final UnsampledReport requestedReport) throws IOException { Config config = ConfigBuilder.create() .loadProps(state.getProperties(), POLL_RETRY_PREFIX) .build() .withFallback(POLL_RETRY_DEFAULTS); Retryer<UnsampledReport> retryer = RetryerFactory.newInstance(config); LOG.info("Will poll for completion on unsampled report with retry config: " + config); final Stopwatch stopwatch = Stopwatch.createStarted(); UnsampledReport result = null; try { result = retryer.call(new Callable<UnsampledReport>() { @Override public UnsampledReport call() throws Exception { UnsampledReport response = null; try { response = gaService.management() .unsampledReports() .get(requestedReport.getAccountId(), requestedReport.getWebPropertyId(), requestedReport.getProfileId(), requestedReport.getId()) .execute(); } catch (Exception e) { LOG.warn("Encountered exception while polling for unsampled report. Will keep polling. " + "Elasped so far: " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds", e); throw e; } ReportCreationStatus status = ReportCreationStatus.valueOf(response.getStatus()); switch(status) { case FAILED: //Stop retrying if it explicitly failed from server. throw new NonTransientException("Unsampled report has failed to be generated. " + response); case PENDING: LOG.info("Waiting for report completion. Elasped so far: " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds for unsampled report: " + response); //Throw so that Retryer will retry throw new RuntimeException("Not completed yet. This will be retried. " + response); case COMPLETED: return response; default: throw new NonTransientException(status + " is not supported. " + response); } } }); } catch (ExecutionException e) { throw new IOException(e); } catch (RetryException e) { throw new RuntimeException(e); } LOG.info("Unsampled report creation has been completed. " + result); Preconditions.checkArgument(DOWNLOAD_TYPE_GOOGLE_DRIVE.equals(result.getDownloadType()), result.getDownloadType() + " DownloadType is not supported."); return result; } @Override public void close() throws IOException { LOG.info("Updating the current state high water mark with " + nextWatermark); this.wuState.setActualHighWatermark(new LongWatermark(nextWatermark)); closer.close(); } @Override public S getSchema() throws IOException { return actualExtractor.getSchema(); } @Override public D readRecord(D reuse) throws DataRecordException, IOException { return actualExtractor.readRecord(reuse); } @Override public long getExpectedRecordCount() { return actualExtractor.getExpectedRecordCount(); } @Override public long getHighWatermark() { return actualExtractor.getHighWatermark(); } }