package org.apache.nutch.parse.googleplay;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class GoogleplayParser implements Parser {
public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.googleplay");
static Pattern appUrlPattern = Pattern.compile("https://play.google.com/store/apps/details\\?id=[a-zA-Z0-9\\._]+");
static Pattern titlePattern = Pattern.compile("<title.*?>(.*?)</title>");
static Pattern appNamePattern= Pattern.compile("<div class=\"document-title\" itemprop=\"name\"> <div.*?>(.*?)</div");
static Pattern linkPattern = Pattern.compile("href=\"/store/apps/details\\?id=([a-zA-Z0-9\\._]+)");
static Pattern publisherPattern = Pattern.compile("<meta content=\"/store/apps/developer\\?id=(.*?)\"");
static Pattern updateTimePattern = Pattern.compile("<div class=\"document-subtitle\">- (.*?)</div>");
static Pattern categoryPattern = Pattern.compile("<span itemprop=\"genre\">(.*?)</span>");
static Pattern pricePattern = Pattern.compile("<span class=\"price buy\"> <span>(.*?)<span>(.*?)</span>");
static Pattern reviewPattern = Pattern.compile("<div class=\"score-container\"(.*?)<meta content=\"(.*?)\" itemprop=\"ratingValue\">(.*?)<meta content=\"(.*)?\" itemprop=\"ratingCount\">");
static Pattern installPattern = Pattern.compile("<div class=\"content\" itemprop=\"numDownloads\">(.*?)</div>");
static Pattern versionPattern = Pattern.compile("<div class=\"content\" itemprop=\"softwareVersion\">(.*?)</div>");
static Pattern ratingPattern = Pattern.compile("<div class=\"content\" itemprop=\"contentRating\">(.*?)</div>");
static Pattern developerSitePattern = Pattern.compile("<a class=\"dev-link\" href=\"https://www.google.com/url\\?q=(.*?)&");
static Pattern developerEmailPattern = Pattern.compile("<a class=\"dev-link\" href=\"mailto:(.*?)\"");
static Pattern descriptionPattern = Pattern.compile("<div class=\"show-more-content text-body\" itemprop=\"description\"> <div class=\"id-app-orig-desc\">(.*?)</div>");
private Configuration conf;
@Override
public Configuration getConf() {
return conf;
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public ParseResult getParse(Content content) {
String thisId = content.getBaseUrl().substring(content.getBaseUrl().indexOf("=")+1);
byte[] contentInOctets = content.getContent();
String htmlText = new String(contentInOctets);
Metadata meta = content.getMetadata();
String title = null;
String appName = null;
Set<String> ids = new HashSet<String>();
String publisher = null;
String updateTime = null;
String category = null;
String price = null;
String reviewScore = null;
String reviewCount = null;
String install = null;
String version = null;
String rating = null;
String developerSite = null;
String developerEmail = null;
String description = null;
Matcher m = titlePattern.matcher(htmlText);
if (m.find()) {
title = m.group(1);
}
m = linkPattern.matcher(htmlText);
while (m.find()) {
if (!m.group(1).equals(thisId)) {
ids.add(m.group(1));
}
}
List<Outlink> outlinks = new ArrayList<Outlink>();
for (String id : ids) {
try {
outlinks.add(new Outlink("https://play.google.com/store/apps/details?id=" + id, ""));
} catch (MalformedURLException mue) {
LOG.warn("Invalid url: '" + id + "', skipping.");
}
}
m = appUrlPattern.matcher(content.getBaseUrl());
if (m.matches()) { // App page
m = appNamePattern.matcher(htmlText);
if (m.find()) {
appName = m.group(1);
}
meta.set("name", appName);
m = publisherPattern.matcher(htmlText);
if (m.find()) {
publisher = m.group(1);
}
meta.set("publisher", publisher!=null?publisher:"");
m = updateTimePattern.matcher(htmlText);
if (m.find()) {
updateTime = m.group(1);
}
meta.set("updateTime", updateTime!=null?updateTime:"");
m = categoryPattern.matcher(htmlText);
if (m.find()) {
category = m.group(1);
category = category.replace("&", "and");
}
meta.set("category", category!=null?category:"");
m = pricePattern.matcher(htmlText);
if (m.find()) {
price = m.group(2);
}
meta.set("price", price!=null?price:"");
m = reviewPattern.matcher(htmlText);
if (m.find()) {
reviewScore = m.group(2);
reviewCount = m.group(4);
}
meta.set("reviewScore", reviewScore!=null?reviewScore:"");
meta.set("reviewCount", reviewCount!=null?reviewCount:"");
m = installPattern.matcher(htmlText);
if (m.find()) {
install = m.group(1)!=null?m.group(1):"";
install = install.trim();
}
meta.set("install", install);
m = versionPattern.matcher(htmlText);
if (m.find()) {
version = m.group(1)!=null?m.group(1):"";
version = version.trim();
}
meta.set("version", version);
m = ratingPattern.matcher(htmlText);
if (m.find()) {
rating = m.group(1)!=null?m.group(1):"";
rating = rating.trim();
}
meta.set("rating", rating);
m = developerSitePattern.matcher(htmlText);
if (m.find()) {
developerSite = m.group(1)!=null?m.group(1):"";
developerSite = developerSite.trim();
}
meta.set("developerSite", developerSite);
m = developerEmailPattern.matcher(htmlText);
if (m.find()) {
developerEmail = m.group(1)!=null?m.group(1):"";
developerEmail = developerEmail.trim();
}
meta.set("developerEmail", developerEmail);
m = descriptionPattern.matcher(htmlText);
if (m.find()) {
description = m.group(1);
}
meta.set("description", description!=null?description:"");
}
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks.toArray(new Outlink[0]), meta);
ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
new ParseImpl("", parseData));
try {
Thread.sleep(200);
} catch (InterruptedException e) {
}
return parseResult;
}
}