/**
* Copyright 2014 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.spring.hbase.example.service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Key;
import org.kitesdk.data.RandomAccessDataset;
import org.kitesdk.spring.hbase.example.model.WebPageRedirectModel;
import org.kitesdk.spring.hbase.example.model.WebPageSnapshotModel;
import org.kitesdk.spring.hbase.example.model.frontend.WebPageSnapshotContent;
import org.kitesdk.spring.hbase.example.model.frontend.WebPageSnapshotMeta;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.convert.ConversionService;
import org.springframework.stereotype.Component;
/**
* Service for WebPageSnapshot operations
*/
@Component
public class WebPageSnapshotService {
@Autowired
private RandomAccessDataset<WebPageSnapshotModel> webPageSnapshotModels;
@Autowired
private RandomAccessDataset<WebPageRedirectModel> webPageRedirectModels;
@Autowired
private ConversionService conversionService;
/**
* Take a snapshot of an URL. This WebPageSnapshot is stored in HBase. Returns
* the WebPageSnapshotMeta
*
* If the URL is a redirect, the snapshot is stored under the final URL
* destination. A WebPageRedirectModel is stored in the redirect table so when
* fetching snapshots, we can follow the proper redirect path.
*
* @param url
* The URL to take a snapshot of
* @return The WebPageSnapshotMeta for the page that we snapshotted.
* @throws IOException
*/
public WebPageSnapshotMeta takeSnapshot(String url) throws IOException {
WebPageSnapshotModel webPageSnapshotModel = fetchWebPage(url);
if (!webPageSnapshotModel.getUrl().equals(url)) {
// Url is different, so must have redirected. Store the redirect model
WebPageRedirectModel redirectModel = WebPageRedirectModel.newBuilder()
.setUrl(url).setDestinationUrl(webPageSnapshotModel.getUrl()).build();
webPageRedirectModels.put(redirectModel);
} else {
// If redirect exists, remove it since this URL no longer redirects
Key key = new Key.Builder(webPageRedirectModels).add("url", url).build();
WebPageRedirectModel redirectModel = webPageRedirectModels.get(key);
if (redirectModel != null) {
webPageRedirectModels.delete(key);
}
}
webPageSnapshotModels.put(webPageSnapshotModel);
return conversionService.convert(webPageSnapshotModel,
WebPageSnapshotMeta.class);
}
/**
* Get the most recent WebPageSnapshotMeta from HBase
*
* @param url
* The URL of the WebPageSnapshotMeta to get from HBase.
* @return The WebPageSnapshotMeta, or null if one doesn't exist for this URL.
*/
public WebPageSnapshotMeta getWebPageSnapshotMeta(String url) {
WebPageSnapshotModel model = getMostRecentWebPageSnapshot(url);
if (model != null) {
return conversionService.convert(model, WebPageSnapshotMeta.class);
} else {
return null;
}
}
/**
* Get the WebPageSnapshotMeta that was fetched at a particular timestamp from
* HBase
*
* @param url
* The URL of the WebPageSnapshotMeta to get from HBase.
* @param ts
* The snapshot timestamp of the WebPageSnapshotMeta to get from
* HBase.
* @return The WebPageSnapshotMeta, or null if one doesn't exist for this URL
* at this timestamp.
*/
public WebPageSnapshotMeta getWebPageSnapshotMeta(String url, long ts) {
WebPageSnapshotModel model = this.getWebPageSnapshot(url, ts);
if (model != null) {
return conversionService.convert(model, WebPageSnapshotMeta.class);
} else {
return null;
}
}
/**
* Get all WebPageSnapshotMeta from an URL that have been snapshotted since
* the "since" param.
*
* @param url
* The URL to get WebPageSnapshotMeta instances from
* @param since
* The epoch timestamp
* @return The list of WebPageSnapshotMeta instances.
*/
public List<WebPageSnapshotMeta> getWebPageSnapshotMetaSince(String url,
long since) {
return convertList(getWebPageSnapshotsSince(url, since),
WebPageSnapshotMeta.class);
}
/**
* Get the most recent WebPageSnapshotContent from HBase
*
* @param url
* The URL to fetch the most recent WebPageSnapshotContent from
* @return The WebPageSnapshotContent, or null if one doesn't exists for this
* URL.
*/
public WebPageSnapshotContent getWebPageSnapshotContent(String url) {
WebPageSnapshotModel model = getMostRecentWebPageSnapshot(url);
if (model != null) {
return conversionService.convert(model, WebPageSnapshotContent.class);
} else {
return null;
}
}
/**
* Get the WebPageSnapshotContent that was fetched at a particular timestamp
* from HBase
*
* @param url
* The URL of the WebPageSnapshotContent to get from HBase.
* @param ts
* The snapshot timestamp of the WebPageSnapshotContent to get from
* HBase.
* @return The WebPageSnapshotContent, or null if one doesn't exist for this
* URL at this timestamp.
*/
public WebPageSnapshotContent getWebPageSnapshotContent(String url, long ts) {
WebPageSnapshotModel model = getWebPageSnapshot(url, ts);
if (model != null) {
return conversionService.convert(model, WebPageSnapshotContent.class);
} else {
return null;
}
}
/**
* Get all WebPageSnapshotContent from an URL that have been snapshotted since
* the "since" param.
*
* @param url
* The URL to get WebPageSnapshotContent instances from
* @param since
* The epoch timestamp
* @return The list of WebPageSnapshotContent instances.
*/
public List<WebPageSnapshotContent> getWebPageSnapshotContentSince(
String url, long since) {
return convertList(getWebPageSnapshotsSince(url, since),
WebPageSnapshotContent.class);
}
/**
* Get the epoch timestamps for every snapshot time of an URL in HBase.
*
* @param url
* The URL of the page to get snapshot timestamps for
* @return The list of timestamps
*/
public List<Long> getSnapshotTimestamps(String url) {
url = normalizeUrl(url);
List<Long> snapshotTimestamps = new ArrayList<Long>();
DatasetReader<WebPageSnapshotModel> reader = null;
try {
reader = webPageSnapshotModels.from("url", url)
.from("fetchedAtRevTs", 0L).to("url", url)
.to("fetchedAtRevTs", Long.MAX_VALUE).newReader();
while (reader.hasNext()) {
snapshotTimestamps.add(reader.next().getFetchedAt());
}
} finally {
if (reader != null) {
reader.close();
}
}
return snapshotTimestamps;
}
/**
* Get the most recent WebPageSnapshotModel from HBase
*
* @param url
* The URL to get the snapshotted page from HBase
* @return The WebPageSnapshotModel, or null if there are no fetches for this
* URL
*/
private WebPageSnapshotModel getMostRecentWebPageSnapshot(String url) {
url = normalizeUrl(url);
DatasetReader<WebPageSnapshotModel> reader = null;
try {
// we don't know the exact timestamp in the key, but we know since keys
// are in timestamp descending order that the first row for an URL will be
// the most recent.
reader = webPageSnapshotModels.from("url", url)
.from("fetchedAtRevTs", 0L).to("url", url)
.to("fetchedAtRevTs", Long.MAX_VALUE).newReader();
if (reader.hasNext()) {
return reader.next();
} else {
return null;
}
} finally {
if (reader != null) {
reader.close();
}
}
}
/**
* Get the WebPageSnapshotModel from HBase
*
* @param url
* The URL of the WebPageSnapshotModel
* @param ts
* The snapshot timestamp of the WebPageSnapshotModel
* @return The WebPageSnapshotModel, or null if there is no snapshot for the
* URL at this timestamp.
*/
private WebPageSnapshotModel getWebPageSnapshot(String url, long ts) {
url = normalizeUrl(url);
Key key = new Key.Builder(webPageSnapshotModels).add("url", url)
.add("fetchedAtRevTs", Long.MAX_VALUE - ts).build();
return webPageSnapshotModels.get(key);
}
/**
* Get WebPageSnapshotModels for an URL from HBase since the since param.
*
* @param url
* The URL of the page to fetch
* @param since
* The models to fetch since
* @return The list of models that have been fetched for an URL since the
* since param.
*/
private List<WebPageSnapshotModel> getWebPageSnapshotsSince(String url,
long since) {
url = normalizeUrl(url);
List<WebPageSnapshotModel> models = new ArrayList<WebPageSnapshotModel>();
DatasetReader<WebPageSnapshotModel> reader = null;
try {
reader = webPageSnapshotModels.from("url", url)
.from("fetchedAtRevTs", 0L).to("url", url)
.to("fetchedAtRevTs", since).newReader();
while (reader.hasNext()) {
models.add(reader.next());
}
} finally {
if (reader != null) {
reader.close();
}
}
return models;
}
/**
* Normalize an URL, which currently only consists of returning a redirect
* destination if an URL is a redirect, or otherwise the passed in url.
*
* @param url
* The url to normalize
* @return The normalized URL;
*/
private String normalizeUrl(String url) {
// If this url is a redirect, get it's destination URL to fetch from our
// HBase store since we store all snapshots under the final destination the
// page lives at.
WebPageRedirectModel redirectModel = getRedirect(url);
if (redirectModel != null) {
return redirectModel.getDestinationUrl();
} else {
return url;
}
}
/**
* Return a WebPageRedirectModel if an URL is one that redirects to a
* different source. Otherwise, returns null.
*
* @return The WebPageRedirectModel
*/
private WebPageRedirectModel getRedirect(String url) {
Key key = new Key.Builder(webPageRedirectModels).add("url", url).build();
return webPageRedirectModels.get(key);
}
/**
* Fetch the web page from the URL, parse the HTML to populate the metadata
* required by WebPageSnapshotModel, and return the constructed
* WebPageSnapshotModel.
*
* @param url
* The URL to fetch the web page from
* @return The WebPageSnapshotModel
* @throws IOException
* Thrown if there's an issue fetching the web page.
*/
private WebPageSnapshotModel fetchWebPage(String url) throws IOException {
long fetchTime = System.currentTimeMillis();
Connection connection = Jsoup.connect(url);
Response response = connection.execute();
long postFetchTime = System.currentTimeMillis();
int timeToFetch = (int) (postFetchTime - fetchTime);
Document doc = response.parse();
String destinationUrl = response.url().toString();
String title = doc.title();
String description = getDescriptionFromDocument(doc);
List<String> keywords = getKeywordsFromDocument(doc);
List<String> outlinks = getOutlinksFromDocument(doc);
return WebPageSnapshotModel.newBuilder().setUrl(destinationUrl)
.setFetchedAtRevTs(Long.MAX_VALUE - fetchTime)
.setSize(doc.html().length()).setFetchedAt(fetchTime)
.setFetchTimeMs(timeToFetch).setTitle(title)
.setDescription(description).setKeywords(keywords)
.setOutlinks(outlinks).setContent(doc.html()).build();
}
/**
* Parse the description out of the meta tag if one exists. Otherwise, return
* null
*
* @param doc
* The Document to parse
* @return The description if it exists in the HTML, otherwise null.
*/
private String getDescriptionFromDocument(Document doc) {
Elements metaDescriptionElements = doc.select("meta[name=description]");
return metaDescriptionElements.size() > 0 ? metaDescriptionElements
.attr("content") : "";
}
/**
* Parse the keywords out of the meta tag if one exists. Otherwise, return an
* empty list.
*
* @param doc
* The Document ot parse
* @return The list of keywords.
*/
private List<String> getKeywordsFromDocument(Document doc) {
List<String> keywords = new ArrayList<String>();
Elements keywordsElements = doc.select("meta[name=keywords]");
for (Element keywordsElement : keywordsElements) {
for (String keyword : keywordsElement.attr("content").split(",")) {
keywords.add(keyword.trim());
}
}
return keywords;
}
/**
* Parse the outlinks from a href tags in the document, and return them as a
* list
*
* @param doc
* The document to parse
* @return The list of outlinks as URL strings.
*/
private List<String> getOutlinksFromDocument(Document doc) {
List<String> outlinks = new ArrayList<String>();
Elements linkElements = doc.select("a[href]");
for (Element linkElement : linkElements) {
outlinks.add(linkElement.attr("href").trim());
}
return outlinks;
}
/**
* Use the conversionService to convert a list of objects to clazz
*
* @param list
* The list of objects to convert
* @param clazz
* The class to convert those objects to
* @return The list of converted objects.
*/
private <T> List<T> convertList(List<?> list, Class<T> clazz) {
List<T> returnList = new ArrayList<T>();
for (Object o : list) {
returnList.add(conversionService.convert(o, clazz));
}
return returnList;
}
}