package com.sun.bingo.util.UrlParse;
import android.os.AsyncTask;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
/**
 * Builds a link preview (title, description, images, host) for the first URL found in a
 * piece of text.
 *
 * <p>The work is done by {@link GetCode}, an {@link AsyncTask} that reports to a
 * {@link LinkViewCallback}: {@code onBeforeLoading()} before the crawl starts and
 * {@code onAfterLoading(content, isNull)} once it has finished.
 */
public class TextCrawler {

    /** Image quantity asking for every image found on the page. */
    public static final int ALL = -1;

    /** Image quantity asking for no images to be collected from the page. */
    public static final int NONE = -2;

    private static final String HTTP_PROTOCOL = "http://";
    private static final String HTTPS_PROTOCOL = "https://";

    /** Meta tag keys looked up by {@link #getMetaTags(String)}, in matching priority order. */
    private static final String[] META_KEYS = {"url", "title", "description", "image"};

    /** Minimum length for a tag's text to be preferred as a description candidate. */
    private static final int MIN_RELEVANT_TEXT_LENGTH = 120;

    /** Upper bound on resolve rounds in {@link #unshortenUrl(String)}; prevents endless loops. */
    private static final int MAX_UNSHORTEN_ROUNDS = 5;

    /** Receiver of the loading events; replaced on every {@code makePreview} call. */
    private LinkViewCallback callback;

    public TextCrawler() {
    }

    /**
     * Starts a preview crawl that collects all images of the page.
     *
     * @param callback receiver of the before/after loading events, may be {@code null}
     * @param url text containing the URL to preview
     */
    public void makePreview(LinkViewCallback callback, String url) {
        makePreview(callback, url, ALL);
    }

    /**
     * Starts a preview crawl.
     *
     * @param callback receiver of the before/after loading events, may be {@code null}
     * @param url text containing the URL to preview
     * @param imageQuantity maximum number of page images to collect, or {@link #ALL} /
     *     {@link #NONE}
     */
    public void makePreview(LinkViewCallback callback, String url, int imageQuantity) {
        this.callback = callback;
        new GetCode(imageQuantity).execute(url);
    }

    /** Get html code */
    public class GetCode extends AsyncTask<String, Void, Void> {

        private final LinkSourceContent linkSourceContent = new LinkSourceContent();
        private final int imageQuantity;

        /**
         * @param imageQuantity maximum number of page images to collect, or
         *     {@link TextCrawler#ALL} / {@link TextCrawler#NONE}
         */
        public GetCode(int imageQuantity) {
            this.imageQuantity = imageQuantity;
        }

        @Override
        protected void onPreExecute() {
            if (callback != null) {
                callback.onBeforeLoading();
            }
            super.onPreExecute();
        }

        @Override
        protected void onPostExecute(Void result) {
            if (callback != null) {
                callback.onAfterLoading(linkSourceContent, isNull());
            }
            super.onPostExecute(result);
        }

        /**
         * Resolves the first URL found in {@code params[0]} and fills
         * {@code linkSourceContent} from it.
         *
         * @param params the text to scan; only the first element is used
         * @return always {@code null}; the result is delivered through the callback
         */
        @Override
        protected Void doInBackground(String... params) {
            // Don't forget the http:// or https://
            linkSourceContent.setFinalUrl(findFinalUrl(params));
            String finalUrl = linkSourceContent.getFinalUrl();

            if (!finalUrl.equals("")) {
                // Dropbox links are never treated as direct images; they are crawled as pages.
                if (isImage(finalUrl) && !finalUrl.contains("dropbox")) {
                    fillFromImageUrl(finalUrl);
                } else {
                    fillFromPage(finalUrl);
                }
            }

            // The reported url is the final url cut at the first '&'.
            linkSourceContent.setUrl(finalUrl.split("&")[0]);
            linkSourceContent.setCannonicalUrl(cannonicalPage(finalUrl));

            // The description may never have been set (no url, or the fetch failed early);
            // its default is defined by LinkSourceContent, so guard against null here.
            String description = linkSourceContent.getDescription();
            if (description != null) {
                linkSourceContent.setDescription(stripTags(description));
            }
            return null;
        }

        /** Extracts the first url from the task input and unshortens it; "" when there is none. */
        private String findFinalUrl(String[] params) {
            if (params == null || params.length == 0 || params[0] == null) {
                return "";
            }
            List<String> urls = Utils.matches(params[0]);
            if (urls == null || urls.isEmpty()) {
                return "";
            }
            return unshortenUrl(extendedTrim(urls.get(0)));
        }

        /** Fills the content for a url that points directly at an image. */
        private void fillFromImageUrl(String imageUrl) {
            linkSourceContent.setSuccess(true);
            linkSourceContent.getImages().add(imageUrl);
            linkSourceContent.setTitle("");
            linkSourceContent.setDescription("");
        }

        /** Downloads the page and fills html code, meta tags, title, description and images. */
        private void fillFromPage(String pageUrl) {
            try {
                Document doc = Jsoup.connect(pageUrl).userAgent("Mozilla").get();

                linkSourceContent.setHtmlCode(extendedTrim(doc.toString()));
                String htmlCode = linkSourceContent.getHtmlCode();

                HashMap<String, String> metaTags = getMetaTags(htmlCode);
                linkSourceContent.setMetaTags(metaTags);
                linkSourceContent.setTitle(metaTags.get("title"));
                linkSourceContent.setDescription(metaTags.get("description"));

                // No title meta tag: fall back to the title pattern match.
                if (linkSourceContent.getTitle().equals("")) {
                    String matchTitle = Utils.pregMatch(htmlCode, Constants.TITLE_PATTERN, 2);
                    if (!matchTitle.equals("")) {
                        linkSourceContent.setTitle(htmlDecode(matchTitle));
                    }
                }

                // No description meta tag: fall back to text crawled from the markup.
                if (linkSourceContent.getDescription().equals("")) {
                    linkSourceContent.setDescription(crawlCode(htmlCode));
                }
                linkSourceContent.setDescription(
                        linkSourceContent.getDescription()
                                .replaceAll(Constants.SCRIPT_PATTERN, ""));

                if (imageQuantity != NONE) {
                    String metaImage = metaTags.get("image");
                    if (!metaImage.equals("")) {
                        linkSourceContent.getImages().add(metaImage);
                    } else {
                        linkSourceContent.setImages(getImages(doc, imageQuantity));
                    }
                }
                linkSourceContent.setSuccess(true);
            } catch (Exception e) {
                // Task boundary: any network or parsing failure is reported to the caller as
                // an unsuccessful preview instead of crashing the background thread.
                linkSourceContent.setSuccess(false);
            }
        }

        /** Verifies if the content could not be retrieved */
        public boolean isNull() {
            return !linkSourceContent.isSuccess()
                    && extendedTrim(linkSourceContent.getHtmlCode()).equals("")
                    && !isImage(linkSourceContent.getFinalUrl());
        }
    }

    /**
     * Gets content from a html tag.
     *
     * <p>Prefers the first occurrence of the tag whose stripped text is at least
     * {@link #MIN_RELEVANT_TEXT_LENGTH} characters long; otherwise falls back to the single
     * match returned by {@code Utils.pregMatch}.
     *
     * @param tag tag name, e.g. {@code "p"}
     * @param content html to search
     * @return the decoded text of the chosen tag
     */
    private String getTagContent(String tag, String content) {
        String pattern = "<" + tag + "(.*?)>(.*?)</" + tag + ">";
        String result = "";

        for (String match : Utils.pregMatchAll(content, pattern, 2)) {
            String text = stripTags(match);
            if (text.length() >= MIN_RELEVANT_TEXT_LENGTH) {
                result = extendedTrim(text);
                break;
            }
        }

        if (result.equals("")) {
            result = extendedTrim(Utils.pregMatch(content, pattern, 2));
        }

        // Drop non-breaking spaces only (entity and character form). Ordinary spaces must
        // stay, otherwise the words of the text are glued together.
        result = result.replace("&nbsp;", "").replace("\u00A0", "");
        return htmlDecode(result);
    }

    /**
     * Gets images from the html code.
     *
     * @param document parsed page
     * @param imageQuantity maximum number of images to return; {@link #ALL} returns every
     *     image, any other value below one returns an empty list
     * @return absolute {@code src} urls of the page's {@code img} elements, in document
     *     order, never more than the page actually contains
     */
    public List<String> getImages(Document document, int imageQuantity) {
        List<String> images = new ArrayList<String>();
        Elements media = document.select("[src]");
        for (Element srcElement : media) {
            if (srcElement.tagName().equals("img")) {
                images.add(srcElement.attr("abs:src"));
            }
        }

        if (imageQuantity == ALL) {
            return images;
        }
        if (imageQuantity <= 0) {
            return new ArrayList<String>();
        }
        if (imageQuantity >= images.size()) {
            // Fewer images than requested: return what exists instead of failing in subList.
            return images;
        }
        return new ArrayList<String>(images.subList(0, imageQuantity));
    }

    /** Transforms from html to normal string */
    private String htmlDecode(String content) {
        return Jsoup.parse(content).text();
    }

    /**
     * Crawls the code looking for relevant information.
     *
     * @param content html to search
     * @return decoded text taken from the page's {@code p} or {@code div} content
     */
    private String crawlCode(String content) {
        String resultSpan = getTagContent("span", content);
        String resultParagraph = getTagContent("p", content);
        String resultDiv = getTagContent("div", content);

        // NOTE(review): the span text only takes part in the comparison and is never chosen;
        // the paragraph wins unless it is longer than the span yet shorter than the div.
        // This mirrors the previous selection exactly - confirm whether the span was meant
        // to be a possible result.
        String result = resultParagraph;
        if (resultParagraph.length() > resultSpan.length()
                && resultParagraph.length() < resultDiv.length()) {
            result = resultDiv;
        }
        return htmlDecode(result);
    }

    /**
     * Returns the cannoncial url: the given url without its http/https prefix, cut at the
     * first {@code '/'}.
     */
    private String cannonicalPage(String url) {
        String withoutProtocol = url;
        if (withoutProtocol.startsWith(HTTP_PROTOCOL)) {
            withoutProtocol = withoutProtocol.substring(HTTP_PROTOCOL.length());
        } else if (withoutProtocol.startsWith(HTTPS_PROTOCOL)) {
            withoutProtocol = withoutProtocol.substring(HTTPS_PROTOCOL.length());
        }

        int firstSlash = withoutProtocol.indexOf('/');
        return firstSlash < 0 ? withoutProtocol : withoutProtocol.substring(0, firstSlash);
    }

    /** Strips the tags from an element */
    private String stripTags(String content) {
        return Jsoup.parse(content).text();
    }

    /** Verifies if the url is an image */
    private boolean isImage(String url) {
        return url.matches(Constants.IMAGE_PATTERN);
    }

    /**
     * Returns meta tags from html code.
     *
     * <p>The returned map always contains the keys {@code url}, {@code title},
     * {@code description} and {@code image}; a key with no matching meta tag maps to
     * {@code ""}. When several tags declare the same key, the last one wins.
     */
    private HashMap<String, String> getMetaTags(String content) {
        HashMap<String, String> metaTags = new HashMap<String, String>();
        for (String key : META_KEYS) {
            metaTags.put(key, "");
        }

        List<String> matches = Utils.pregMatchAll(content, Constants.METATAG_PATTERN, 1);
        for (String match : matches) {
            // Locale.ROOT keeps the comparison independent of the device language
            // (the default locale may lower-case 'I' to a dotless i, e.g. in Turkish).
            String lowerCaseTag = match.toLowerCase(Locale.ROOT);
            for (String key : META_KEYS) {
                if (declaresMeta(lowerCaseTag, key)) {
                    metaTags.put(key, separeMetaTagsContent(match));
                    break;
                }
            }
        }
        return metaTags;
    }

    /** Tells whether a lower-cased meta tag declares {@code key} via og:property or name. */
    private static boolean declaresMeta(String lowerCaseTag, String key) {
        return lowerCaseTag.contains("property=\"og:" + key + "\"")
                || lowerCaseTag.contains("property='og:" + key + "'")
                || lowerCaseTag.contains("name=\"" + key + "\"")
                || lowerCaseTag.contains("name='" + key + "'");
    }

    /** Gets content from metatag */
    private String separeMetaTagsContent(String content) {
        String result = Utils.pregMatch(content, Constants.METATAG_CONTENT_PATTERN, 1);
        return htmlDecode(result);
    }

    /**
     * Unshortens a short url.
     *
     * <p>Resolves the url repeatedly until the resolved address stops changing, giving up
     * after {@link #MAX_UNSHORTEN_ROUNDS} rounds.
     *
     * @param shortURL url to resolve
     * @return the resolved url; {@code ""} when {@code shortURL} does not start with
     *     http:// or https://; the last known url when a connection cannot be created
     */
    private String unshortenUrl(String shortURL) {
        if (!shortURL.startsWith(HTTP_PROTOCOL) && !shortURL.startsWith(HTTPS_PROTOCOL)) {
            return "";
        }

        String current = shortURL;
        for (int round = 0; round < MAX_UNSHORTEN_ROUNDS; round++) {
            String resolved = resolveUrl(current);
            if (resolved == null || resolved.equals(current)) {
                return current;
            }
            current = resolved;
        }
        return current;
    }

    /**
     * Performs one request and returns the url the connection ended up at, or {@code null}
     * when no connection could be created.
     */
    private String resolveUrl(String url) {
        URLConnection connection = connectURL(url);
        if (connection == null) {
            return null;
        }
        try {
            // Reading the headers forces the request to be made; the connection's url
            // afterwards is what is treated as the resolved address.
            connection.getHeaderFields();
            return connection.getURL().toString();
        } finally {
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).disconnect();
            }
        }
    }

    /**
     * Takes a url string and returns a connection object for it.
     *
     * @return the (not yet connected) connection, or {@code null} when the url is malformed
     *     or the connection cannot be created
     */
    private URLConnection connectURL(String strURL) {
        URLConnection conn = null;
        try {
            URL inputURL = new URL(strURL);
            conn = inputURL.openConnection();
        } catch (MalformedURLException e) {
            System.out.println("Please input a valid URL");
        } catch (IOException ioe) {
            System.out.println("Can not connect to the URL");
        }
        return conn;
    }

    /**
     * Removes extra spaces and trim the string.
     *
     * @param content text to normalise, may be {@code null}
     * @return {@code content} with every whitespace run (including line breaks) collapsed to
     *     a single space and the ends trimmed; {@code ""} for {@code null}
     */
    public static String extendedTrim(String content) {
        if (content == null) {
            return "";
        }
        // "\\s+" already covers '\n' and '\r', so no separate line-break replacement is needed.
        return content.replaceAll("\\s+", " ").trim();
    }
}