package me.ccrama.redditslide.Views;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts anchor ({@code <a ...>...</a>}) targets and their link text from an HTML string
 * using regular expressions.
 *
 * <p>This is a lightweight scanner, not an HTML parser: nested anchors, comments and
 * {@code >} characters inside attribute values are not handled.
 *
 * Created by ccrama on 5/17/2015.
 */
class HTMLLinkExtractor {

    /**
     * Matches one anchor element. Group 1 is the attribute text, group 2 the inner content.
     * <ul>
     *   <li>{@code (?i)} - tag name is case-insensitive.</li>
     *   <li>{@code (?s)} - link text may span several lines.</li>
     *   <li>{@code \s} after {@code <a} - keeps {@code <abbr>}, {@code <area>}, ... from being
     *       mistaken for anchors.</li>
     *   <li>{@code (.*?)} - an empty anchor body is allowed, so it cannot swallow the
     *       following anchor.</li>
     * </ul>
     */
    private static final String HTML_A_TAG_PATTERN = "(?is)<a\\s([^>]*)>(.*?)</a>";

    /**
     * Matches an {@code href} attribute inside the attribute text of an anchor. Exactly one of
     * groups 1-3 is set: the double-quoted, single-quoted or unquoted value (without the
     * delimiters). The leading {@code (?:^|\s)} prevents attributes such as {@code data-href}
     * from matching.
     */
    private static final String HTML_A_HREF_TAG_PATTERN =
            "(?i)(?:^|\\s)href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^'\">\\s]+))";

    // Pattern instances are immutable and thread-safe, so they are compiled once and shared.
    private static final Pattern PATTERN_TAG = Pattern.compile(HTML_A_TAG_PATTERN);
    private static final Pattern PATTERN_LINK = Pattern.compile(HTML_A_HREF_TAG_PATTERN);

    public HTMLLinkExtractor() {
    }

    /**
     * Collects every {@code href} found on an anchor element in the given HTML.
     *
     * @param html html content to scan; {@code null} is treated as empty
     * @return the links in document order, each paired with the inner content of its anchor;
     *         never {@code null}, empty when no anchor with an {@code href} is found
     */
    public ArrayList<HtmlLink> grabHTMLLinks(final String html) {
        ArrayList<HtmlLink> result = new ArrayList<>();
        if (html == null) {
            return result;
        }

        Matcher matcherTag = PATTERN_TAG.matcher(html);
        while (matcherTag.find()) {
            String attributes = matcherTag.group(1);
            String linkText = matcherTag.group(2);

            Matcher matcherLink = PATTERN_LINK.matcher(attributes);
            while (matcherLink.find()) {
                HtmlLink obj = new HtmlLink();
                obj.setLink(firstNonNullGroup(matcherLink));
                obj.setLinkText(linkText);
                result.add(obj);
            }
        }
        return result;
    }

    /** Returns the value of whichever href alternative (groups 1-3) took part in the match. */
    private static String firstNonNullGroup(Matcher matcher) {
        for (int group = 1; group <= matcher.groupCount(); group++) {
            String value = matcher.group(group);
            if (value != null) {
                return value;
            }
        }
        return "";
    }

    /** A link target together with the content of the anchor it was found on. */
    public class HtmlLink {

        String link;
        String linkText;

        HtmlLink() {
        }

        @Override
        public String toString() {
            return "Link : " + this.link + " Link Text : " + this.linkText;
        }

        /** @return the link target, without enclosing quotes */
        public String getLink() {
            return link;
        }

        /**
         * Stores the link target. One pair of matching enclosing quotes is removed; quote
         * characters that are part of the target itself are preserved.
         *
         * @param link the link target, optionally still wrapped in {@code "} or {@code '}
         */
        public void setLink(String link) {
            this.link = replaceInvalidChar(link);
        }

        /** @return the content found between the opening and closing anchor tags */
        public String getLinkText() {
            return linkText;
        }

        public void setLinkText(String linkText) {
            this.linkText = linkText;
        }

        /**
         * Strips a single pair of matching enclosing quotes, if present. Only the delimiters
         * are dropped: removing every quote character would corrupt targets that legitimately
         * contain an apostrophe.
         */
        private String replaceInvalidChar(String link) {
            if (link == null || link.length() < 2) {
                return link;
            }
            char first = link.charAt(0);
            char last = link.charAt(link.length() - 1);
            boolean quoted = (first == '"' || first == '\'') && first == last;
            return quoted ? link.substring(1, link.length() - 1) : link;
        }
    }
}