package org.Webgatherer.ExperimentalLabs.HtmlProcessing;
import com.google.inject.Inject;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import java.util.HashMap;
import java.util.Map;
/**
* @author Rick Dane
*/
public class HtmlParserImpl implements HtmlParser {
private HtmlCleaner htmlCleaner;
private CleanerProperties htmlCleanerProperties;
@Inject
public HtmlParserImpl(HtmlCleaner htmlCleaner) {
this.htmlCleaner = htmlCleaner;
htmlCleanerProperties = htmlCleaner.getProperties();
}
public Map<String, String> extractLinks(String baseUrl, String htmlPage) {
TagNode node = htmlCleaner.clean(htmlPage);
TagNode[] nodesHref = node.getElementsByName("a", true);
Map<String, String> urlList = new HashMap<String, String>();
for (TagNode curNode : nodesHref) {
Map<String, String> attributes = curNode.getAttributes();
if (attributes.containsKey("href")) {
String url = curNode.getAttributeByName("href").trim();
url = getRelativeLink(url, baseUrl);
urlList.put(curNode.getText().toString().toLowerCase().trim(), url);
}
}
return urlList;
}
private String getRelativeLink(String url, String baseUrl) {
String origUrl = url;
if (!url.contains("http") && !url.contains("www")) {
int urlLength = baseUrl.length();
try {
String checkForSlash = baseUrl.substring(urlLength - 1, urlLength);
if (!checkForSlash.equals("/") && url.indexOf("/") != 0) {
url = "/" + url;
}
if (checkForSlash.equals("/") && url.indexOf("/") == 0) {
url = url.substring(1, url.length());
}
url = baseUrl + url;
} catch (Exception e) {
return origUrl;
}
}
return url;
}
public String getText(String htmlPage) {
TagNode node = htmlCleaner.clean(htmlPage);
StringBuffer stringBuffer = node.getText();
return stringBuffer.toString();
}
private void defaultConfigHtmlCleaner() {
htmlCleanerProperties.setOmitComments(true);
htmlCleanerProperties.setTreatUnknownTagsAsContent(true);
}
}