package com.manuelmaly.hn.parser;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.w3c.dom.Node;
import java.net.URI;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
/**
 * Base class for parsers that turn an HTML string into a value of type {@code T}.
 *
 * <p>Subclasses implement {@link #parseDocument(Element)}; {@link #parse(String)} runs the
 * raw input through Jsoup first. The static methods are small string/DOM helpers.
 *
 * @param <T> the type produced by the parser
 */
public abstract class BaseHTMLParser<T> {

    /** Returned by {@link #getIntValueFollowedBySuffix} when no number could be extracted. */
    public static final int UNDEFINED = -1;

    /**
     * Parses the given HTML with Jsoup and passes the resulting document to
     * {@link #parseDocument(Element)}.
     *
     * @param input the HTML source
     * @return whatever {@link #parseDocument(Element)} returns for the parsed document
     * @throws Exception if parsing or {@link #parseDocument(Element)} fails
     */
    public T parse(String input) throws Exception {
        return parseDocument(Jsoup.parse(input));
    }

    /**
     * Extracts the parser's result from an already parsed document.
     *
     * @param doc the parsed document (or element) to read from
     * @return the extracted value
     * @throws Exception if the implementation cannot process the document
     */
    public abstract T parseDocument(Element doc) throws Exception;

    /**
     * Returns the host part of a URL with a leading {@code "www."} removed.
     *
     * <p>If the URL is {@code null}, cannot be parsed as a {@link URI}, or has no host
     * component, the argument is returned unchanged.
     *
     * @param url the URL to inspect, may be {@code null}
     * @return the host without a leading {@code "www."}, or {@code url} itself as a fallback
     */
    public static String getDomainName(String url) {
        if (url == null) {
            return url;
        }
        final String host;
        try {
            host = new URI(url).getHost();
        } catch (java.net.URISyntaxException e) {
            // Not a well-formed URI: fall back to the raw input.
            return url;
        }
        if (host == null) {
            // URI.getHost() yields null when the URI has no host (e.g. a relative reference).
            return url;
        }
        return host.startsWith("www.") ? host.substring(4) : host;
    }

    /**
     * Returns the text of the first direct child of {@code element} that is a text node.
     *
     * @param element the element whose children are searched, may be {@code null}
     * @return the text of the first {@link TextNode} child, or {@code ""} if {@code element}
     *         is {@code null} or has no text-node child
     */
    public static String getFirstTextValueInElementChildren(Element element) {
        if (element == null) {
            return "";
        }
        for (org.jsoup.nodes.Node child : element.childNodes()) {
            if (child instanceof TextNode) {
                return ((TextNode) child).text();
            }
        }
        return "";
    }

    /**
     * Evaluates an XPath query against {@code source} and returns the node value of the
     * matched node.
     *
     * <p>This is a best-effort lookup: any failure (null arguments, invalid expression,
     * no matching node) yields {@code ""} rather than an exception. Note that the result is
     * whatever {@link Node#getNodeValue()} returns, which may be {@code null} for node types
     * that carry no value.
     *
     * @param query  the XPath expression
     * @param source the context node to evaluate against
     * @param xpath  the XPath evaluator to use
     * @return the matched node's value, or {@code ""} if nothing could be evaluated or matched
     */
    public static String getStringValue(String query, Node source, XPath xpath) {
        if (query == null || xpath == null) {
            return "";
        }
        try {
            Node match = (Node) xpath.evaluate(query, source, XPathConstants.NODE);
            if (match == null) {
                // The expression selected nothing.
                return "";
            }
            return match.getNodeValue();
        } catch (Exception ignored) {
            // Deliberately swallowed: callers expect "" when the lookup fails.
            //TODO insert Google Analytics tracking here?
        }
        return "";
    }

    /**
     * Parses the integer that precedes the first occurrence of {@code suffix} in
     * {@code value}, e.g. {@code ("42 points", "points")} gives {@code 42}.
     *
     * <p>Non-breaking spaces (U+00A0) are removed and surrounding whitespace is trimmed
     * before parsing.
     *
     * @param value  the text to read from, may be {@code null}
     * @param suffix the marker that follows the number, may be {@code null}
     * @return the parsed number; {@code 0} if either argument is {@code null} (kept for
     *         backward compatibility); {@link #UNDEFINED} if the suffix is absent or the
     *         text before it is not a valid integer
     */
    public static Integer getIntValueFollowedBySuffix(String value, String suffix) {
        if (value == null || suffix == null) {
            return 0;
        }
        int suffixIdx = value.indexOf(suffix);
        if (suffixIdx < 0) {
            return UNDEFINED;
        }
        // trim() does not strip U+00A0, so remove non-breaking spaces explicitly.
        String numberText = value.substring(0, suffixIdx).replace("\u00A0", "").trim();
        try {
            return Integer.parseInt(numberText);
        } catch (NumberFormatException e) {
            return UNDEFINED;
        }
    }

    /**
     * Returns everything in {@code value} that follows the first occurrence of
     * {@code prefix}.
     *
     * @param value  the text to read from, may be {@code null}
     * @param prefix the marker that precedes the wanted text, may be {@code null}
     * @return the remainder after the prefix, or {@code null} if either argument is
     *         {@code null} or the prefix does not occur in {@code value}
     */
    public static String getStringValuePrefixedByPrefix(String value, String prefix) {
        if (value == null || prefix == null) {
            return null;
        }
        int prefixIdx = value.indexOf(prefix);
        if (prefixIdx < 0) {
            return null;
        }
        return value.substring(prefixIdx + prefix.length());
    }
}