/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.example.wikipedia;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicNameValuePair;
import org.joda.time.DateTime;
import org.joda.time.Period;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.io.Closer;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.http.HttpClientConfigurator;
import gobblin.http.HttpClientConfiguratorLoader;
import gobblin.source.extractor.DataRecordException;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.extract.LongWatermark;


/**
 * An implementation of {@link Extractor} for the Wikipedia example.
 *
 * <p>
 * This extractor uses the MediaWiki web API to retrieve a certain number of the latest revisions
 * for each specified title from Wikipedia. Each revision is returned as a JSON document.
 * </p>
 *
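 * <p>
 * For illustration only (the host below is a hypothetical value; the actual endpoint comes from
 * the {@code wikipedia.api.rooturl} property): a batch request built by this extractor resembles
 * {@code https://en.wikipedia.org/w/api.php?format=json&action=query&prop=revisions&titles=SomePage&rvprop=ids|timestamp|user|userid|size&rvlimit=6&rvdir=newer},
 * and the revisions are read from the {@code query.pages.<pageid>.revisions} array of the JSON
 * response.
 * </p>
 *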
 * @author Ziyang Liu
 */
public class WikipediaExtractor implements Extractor<String, JsonElement> {

  private static final Logger LOG = LoggerFactory.getLogger(WikipediaExtractor.class);

  private static final DateTimeFormatter WIKIPEDIA_TIMESTAMP_FORMAT = DateTimeFormat.forPattern("YYYYMMddHHmmss");

  public static final String CONFIG_PREFIX = "gobblin.wikipediaSource.";
  public static final String MAX_REVISION_PER_PAGE = CONFIG_PREFIX + "maxRevisionsPerPage";
  public static final int DEFAULT_MAX_REVISIONS_PER_PAGE = -1;

  public static final String HTTP_CLIENT_CONFIG_PREFIX = CONFIG_PREFIX + "httpClient.";

  public static final String SOURCE_PAGE_TITLES = "source.page.titles";
  public static final String BOOTSTRAP_PERIOD = "wikipedia.source.bootstrap.lookback";
  public static final String DEFAULT_BOOTSTRAP_PERIOD = "P2D";

  public static final String WIKIPEDIA_API_ROOTURL = "wikipedia.api.rooturl";
  public static final String WIKIPEDIA_AVRO_SCHEMA = "wikipedia.avro.schema";

  private static final String JSON_MEMBER_QUERY = "query";
  private static final String JSON_MEMBER_PAGES = "pages";
  private static final String JSON_MEMBER_REVISIONS = "revisions";
  private static final String JSON_MEMBER_PAGEID = "pageid";
  private static final String JSON_MEMBER_TITLE = "title";

  private static final Gson GSON = new Gson();

  private final WikiResponseReader reader;
  private final String rootUrl;
  private final String schema;
  private final String requestedTitle;
  private final int batchSize;
  private final long lastRevisionId;
  private Queue<JsonElement> currentBatch;
  private final ImmutableMap<String, String> baseQuery;
  private final WorkUnitState workUnitState;
  private final int maxRevisionsPulled;

  private final HttpClientConfigurator httpClientConfigurator;
  private HttpClient httpClient;

  private class WikiResponseReader implements Iterator<JsonElement> {
    private long lastPulledRevision;
    private long revisionsPulled = 0;

    public WikiResponseReader(long latestPulledRevision) {
      this.lastPulledRevision = latestPulledRevision;
    }

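    /*
     * Returns true while more revisions remain for the requested title. When the in-memory
     * batch is exhausted, the next batch is fetched from the MediaWiki API; once the
     * configured maximum number of revisions has been pulled, the actual high watermark is
     * recorded on the work unit state and iteration stops.
     */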
    @Override
    public boolean hasNext() {
      if (WikipediaExtractor.this.maxRevisionsPulled > -1
          && this.revisionsPulled >= WikipediaExtractor.this.maxRevisionsPulled) {
        WikipediaExtractor.this.workUnitState.setActualHighWatermark(new LongWatermark(this.lastPulledRevision));
        LOG.info("Pulled max number of records {}, final revision pulled {}.", this.revisionsPulled,
            this.lastPulledRevision);
        return false;
      }
      if (!WikipediaExtractor.this.currentBatch.isEmpty()) {
        return true;
      }

      /*
       * The current batch is exhausted: retrieve the next batch of revisions for the requested
       * title, or return false if the last revision has already been pulled.
       */
      if (this.lastPulledRevision >= WikipediaExtractor.this.lastRevisionId) {
        return false;
      }
      try {
        WikipediaExtractor.this.currentBatch = retrievePageRevisions(ImmutableMap.<String, String>builder()
            .putAll(WikipediaExtractor.this.baseQuery)
            .put("rvprop", "ids|timestamp|user|userid|size")
            .put("titles", WikipediaExtractor.this.requestedTitle)
            .put("rvlimit", Integer.toString(WikipediaExtractor.this.batchSize + 1))
            .put("rvstartid", Long.toString(this.lastPulledRevision))
            .put("rvendid", Long.toString(WikipediaExtractor.this.lastRevisionId))
            .put("rvdir", "newer")
            .build());
        // The batch starts at the last pulled revision: discard it, as it was already emitted.
        WikipediaExtractor.this.currentBatch.poll();
      } catch (URISyntaxException | IOException e) {
        LOG.error("Could not retrieve more revisions.", e);
        return false;
      }
      return !WikipediaExtractor.this.currentBatch.isEmpty();
    }

    @Override
    public JsonElement next() {
      if (!hasNext()) {
        return null;
      }
      JsonElement element = WikipediaExtractor.this.currentBatch.poll();
      this.lastPulledRevision = parseRevision(element);
      this.revisionsPulled++;
      return element;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

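  /**
   * Creates an extractor for a single page title, taken from the work unit's dataset URN.
   * The latest revision id of the title is fetched up front; the low watermark is then taken
   * from the previous run if present, and otherwise bootstrapped from the configured lookback
   * period.
   */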
  public WikipediaExtractor(WorkUnitState workUnitState) throws IOException {

    this.workUnitState = workUnitState;
    this.rootUrl = readProp(WIKIPEDIA_API_ROOTURL, workUnitState);
    this.schema = readProp(WIKIPEDIA_AVRO_SCHEMA, workUnitState);
    this.batchSize = 5;
    this.requestedTitle = workUnitState.getProp(ConfigurationKeys.DATASET_URN_KEY);

    this.baseQuery =
        ImmutableMap.<String, String>builder().put("format", "json").put("action", "query").put("prop", "revisions")
            .build();

    HttpClientConfiguratorLoader httpClientConfiguratorLoader = new HttpClientConfiguratorLoader(workUnitState);
    this.httpClientConfigurator = httpClientConfiguratorLoader.getConfigurator();
    this.httpClientConfigurator.setStatePropertiesPrefix(HTTP_CLIENT_CONFIG_PREFIX).configure(workUnitState);

    try {
      Queue<JsonElement> lastRevision = retrievePageRevisions(ImmutableMap.<String, String>builder()
          .putAll(this.baseQuery).put("rvprop", "ids").put("titles", this.requestedTitle).put("rvlimit", "1").build());
      this.lastRevisionId = lastRevision.isEmpty() ? -1 : parseRevision(lastRevision.poll());
    } catch (URISyntaxException use) {
      throw new IOException(use);
    }

    long baseRevision = workUnitState.getWorkunit().getLowWatermark(LongWatermark.class, new Gson()).getValue();
    if (baseRevision < 0) {
      try {
        baseRevision = createLowWatermarkForBootstrap(workUnitState);
      } catch (IOException ioe) {
        // If bootstrapping fails, start from the latest revision, i.e. pull nothing in this run.
        baseRevision = this.lastRevisionId;
      }
    }
    this.reader = new WikiResponseReader(baseRevision);

    workUnitState.setActualHighWatermark(new LongWatermark(this.lastRevisionId));
    this.currentBatch = new LinkedList<>();

    LOG.info("Will pull revisions {} to {} for page {}.", this.reader.lastPulledRevision, this.lastRevisionId,
        this.requestedTitle);

    this.maxRevisionsPulled = workUnitState.getPropAsInt(MAX_REVISION_PER_PAGE, DEFAULT_MAX_REVISIONS_PER_PAGE);
  }

  private long parseRevision(JsonElement element) {
    return element.getAsJsonObject().get("revid").getAsLong();
  }

  /**
   * Computes a bootstrap low watermark: the id of the oldest revision no older than the
   * configured lookback period (an ISO-8601 period, e.g. the default "P2D" for two days).
   */
  private long createLowWatermarkForBootstrap(WorkUnitState state) throws IOException {
    String bootstrapPeriodString = state.getProp(BOOTSTRAP_PERIOD, DEFAULT_BOOTSTRAP_PERIOD);
    Period period = Period.parse(bootstrapPeriodString);
    DateTime startTime = DateTime.now().minus(period);

    try {
      Queue<JsonElement> firstRevision = retrievePageRevisions(ImmutableMap.<String, String>builder()
          .putAll(this.baseQuery)
          .put("rvprop", "ids")
          .put("titles", this.requestedTitle)
          .put("rvlimit", "1")
          .put("rvstart", WIKIPEDIA_TIMESTAMP_FORMAT.print(startTime))
          .put("rvdir", "newer")
          .build());
      if (firstRevision.isEmpty()) {
        throw new IOException("Could not retrieve oldest revision, returned empty revisions list.");
      }
      return parseRevision(firstRevision.poll());
    } catch (URISyntaxException use) {
      throw new IOException(use);
    }
  }

  /** Reads a property, falling back from the work unit to the work unit state to the job state. */
  private String readProp(String key, WorkUnitState workUnitState) {
    String value = workUnitState.getWorkunit().getProp(key);
    if (StringUtils.isBlank(value)) {
      value = workUnitState.getProp(key);
    }
    if (StringUtils.isBlank(value)) {
      value = workUnitState.getJobState().getProp(key);
    }
    return value;
  }

  private JsonElement performHttpQuery(String rootUrl, Map<String, String> query)
      throws URISyntaxException, IOException {
    if (null == this.httpClient) {
      this.httpClient = createHttpClient();
    }
    HttpUriRequest req = createHttpRequest(rootUrl, query);

    Closer closer = Closer.create();
    StringBuilder sb = new StringBuilder();
    try {
      HttpResponse response = sendHttpRequest(req, this.httpClient);
      if (response instanceof CloseableHttpResponse) {
        closer.register((CloseableHttpResponse) response);
      }
      BufferedReader br = closer.register(new BufferedReader(
          new InputStreamReader(response.getEntity().getContent(), ConfigurationKeys.DEFAULT_CHARSET_ENCODING)));
      String line;
      while ((line = br.readLine()) != null) {
        sb.append(line).append('\n');
      }
    } catch (Throwable t) {
      throw closer.rethrow(t);
    } finally {
      try {
        closer.close();
      } catch (IOException e) {
        LOG.error("IOException in Closer.close() while performing query " + req + ": " + e, e);
      }
    }

    String jsonString = sb.toString();
    if (Strings.isNullOrEmpty(jsonString)) {
      LOG.warn("Received empty response for query: " + req);
      return new JsonObject();
    }
    return GSON.fromJson(jsonString, JsonElement.class);
  }

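  /**
   * Builds the request URI by URL-encoding the query parameters and appending them to the given
   * root URL. A minimal usage sketch (the URL and parameters below are illustrative assumptions,
   * not values taken from this codebase):
   *
   * <pre>{@code
   * URI uri = createRequestURI("https://en.wikipedia.org/w/api.php",
   *     ImmutableMap.of("format", "json", "action", "query"));
   * // uri -> https://en.wikipedia.org/w/api.php?format=json&action=query
   * }</pre>
   */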
  public static URI createRequestURI(String rootUrl, Map<String, String> query)
      throws MalformedURLException, URISyntaxException {
    List<NameValuePair> queryTokens = Lists.newArrayList();
    for (Map.Entry<String, String> entry : query.entrySet()) {
      queryTokens.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
    }
    String encodedQuery = URLEncodedUtils.format(queryTokens, Charsets.UTF_8);
    return new URIBuilder(rootUrl).setQuery(encodedQuery).build();
  }

  HttpUriRequest createHttpRequest(String rootUrl, Map<String, String> query)
      throws MalformedURLException, URISyntaxException {
    URI requestUri = createRequestURI(rootUrl, query);
    return new HttpGet(requestUri);
  }

  HttpResponse sendHttpRequest(HttpUriRequest req, HttpClient httpClient) throws ClientProtocolException, IOException {
    LOG.debug("Sending request {}", req);
    HttpResponse response = httpClient.execute(req);
    if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK || null == response.getEntity()) {
      if (response instanceof CloseableHttpResponse) {
        ((CloseableHttpResponse) response).close();
      }
      throw new IOException("HTTP Request " + req + " returned unexpected response " + response);
    }
    return response;
  }

  private Queue<JsonElement> retrievePageRevisions(Map<String, String> query) throws IOException, URISyntaxException {
    Queue<JsonElement> retrievedRevisions = new LinkedList<>();

    JsonElement jsonElement = performHttpQuery(this.rootUrl, query);
    if (jsonElement == null || !jsonElement.isJsonObject()) {
      return retrievedRevisions;
    }

    JsonObject jsonObj = jsonElement.getAsJsonObject();
    if (jsonObj == null || !jsonObj.has(JSON_MEMBER_QUERY)) {
      return retrievedRevisions;
    }

    JsonObject queryObj = jsonObj.getAsJsonObject(JSON_MEMBER_QUERY);
    if (!queryObj.has(JSON_MEMBER_PAGES)) {
      return retrievedRevisions;
    }

    JsonObject pagesObj = queryObj.getAsJsonObject(JSON_MEMBER_PAGES);
    if (pagesObj.entrySet().isEmpty()) {
      return retrievedRevisions;
    }

    JsonObject pageIdObj = pagesObj.getAsJsonObject(pagesObj.entrySet().iterator().next().getKey());
    if (!pageIdObj.has(JSON_MEMBER_REVISIONS)) {
      return retrievedRevisions;
    }

    // Retrieve the revisions of the current page title.
    JsonArray jsonArr = pageIdObj.getAsJsonArray(JSON_MEMBER_REVISIONS);
    for (JsonElement revElement : jsonArr) {
      JsonObject revObj = revElement.getAsJsonObject();

      /*
       * 'pageid' and 'title' are associated with the parent object of all revisions.
       * Add them to each individual revision.
       */
      if (pageIdObj.has(JSON_MEMBER_PAGEID)) {
        revObj.add(JSON_MEMBER_PAGEID, pageIdObj.get(JSON_MEMBER_PAGEID));
      }
      if (pageIdObj.has(JSON_MEMBER_TITLE)) {
        revObj.add(JSON_MEMBER_TITLE, pageIdObj.get(JSON_MEMBER_TITLE));
      }
      retrievedRevisions.add(revObj);
    }

    LOG.info("{} record(s) retrieved for title {}.", retrievedRevisions.size(), this.requestedTitle);
    return retrievedRevisions;
  }

  protected HttpClient createHttpClient() {
    return this.httpClientConfigurator.createClient();
  }

  @Override
  public void close() throws IOException {
    if (this.httpClient instanceof Closeable) {
      ((Closeable) this.httpClient).close();
    }
  }

  @Override
  public String getSchema() {
    return this.schema;
  }

  @Override
  public JsonElement readRecord(@Deprecated JsonElement reuse) throws DataRecordException, IOException {
    if (this.reader == null) {
      return null;
    }
    if (this.reader.hasNext()) {
      return this.reader.next();
    }
    return null;
  }

  @Override
  public long getExpectedRecordCount() {
    return 0;
  }

  @Override
  public long getHighWatermark() {
    return this.lastRevisionId;
  }
}