/** ========================================================================
* handytrowel: src/main/java/extraction/LinkExtractor.java
* Get 'a' link href's from links that are children of the article.
* ========================================================================
* Copyright (c) 2014, Asim Ihsan, All rights reserved.
* <http://www.asimihsan.com>
* https://github.com/asimihsan/handytrowel/blob/master/LICENSE
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ========================================================================
*/
package com.asimihsan.handytrowel.extraction;
import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;
import de.l3s.boilerpipe.sax.HTMLDocument;
import de.l3s.boilerpipe.sax.HTMLFetcher;
public final class LinkExtractor {
public static final LinkExtractor INSTANCE = new LinkExtractor();
/**
* Black list of regular expression strings for matching links never to return.
*/
private static final List<String> blackListRegularExpressions = Lists.newArrayList(
"https://www.facebook.com/sharer/sharer.php",
"http://www.facebook.com/share.php",
"https://twitter.com/intent/tweet",
"http://pinterest.com/pin/create/bookmarklet",
"http://www.reddit.com/submit",
"https://plus.google.com/share",
"http://www.reddit.com/submit",
"http://del.icio.us/post",
"http://tapiture.com/bookmarklet/image",
"http://www.stumbleupon.com/submit",
"http://www.linkedin.com/shareArticle",
"http://slashdot.org/bookmark.pl",
"http://technorati.com/faves",
"http://posterous.com/share",
"http://www.tumblr.com/share",
"http://www.newsvine.com/_tools/seed",
"http://ping.fm/ref",
"http://www.friendfeed.com/share"
);
/**
* Regular expression of black-listed links to never return.
*/
private static final Pattern blackList = Pattern.compile(
Joiner.on("|").join(blackListRegularExpressions));
/**
* Returns the singleton instance of {@link ImageExtractor}.
*
* @return the singleton instance of {@link ImageExtractor}.
*/
public static LinkExtractor getInstance() {
return INSTANCE;
}
private LinkExtractor() {
}
/**
* Processes the given {@link TextDocument} and the original HTML text (as a
* String).
*
* @param doc
* The processed {@link TextDocument}.
* @param origHTML
* The original HTML document.
* @return A List of enclosed {@link Image}s
* @throws BoilerpipeProcessingException
*/
public List<String> process(final TextDocument doc,
final String origHTML) throws BoilerpipeProcessingException {
return process(doc, new InputSource(
new StringReader(origHTML)));
}
/**
* Processes the given {@link TextDocument} and the original HTML text (as an
* {@link InputSource}).
*
* @param doc
* The processed {@link TextDocument}.
* The original HTML document.
* @return A List of enclosed links
* @throws BoilerpipeProcessingException
*/
public List<String> process(final TextDocument doc,
final InputSource is) throws BoilerpipeProcessingException {
final Implementation implementation = new Implementation();
implementation.process(doc, is);
return implementation.linksHighlight;
}
/**
* Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
* retrieved HTML using the specified {@link BoilerpipeExtractor}.
*
* The processed {@link TextDocument}.
* The original HTML document.
* @return A List of enclosed links
* @throws BoilerpipeProcessingException
*/
public List<String> process(final URL url, final BoilerpipeExtractor extractor)
throws IOException, BoilerpipeProcessingException, SAXException {
final HTMLDocument htmlDoc = HTMLFetcher.fetch(url);
final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource())
.getTextDocument();
extractor.process(doc);
final InputSource is = htmlDoc.toInputSource();
return process(doc, is);
}
private static final class Implementation extends AbstractSAXParser implements
ContentHandler {
List<String> linksHighlight = new ArrayList<>();
private List<String> linksBuffer = new ArrayList<>();
private int inIgnorableElement = 0;
private int characterElementIdx = 0;
private final BitSet contentBitSet = new BitSet();
private boolean inHighlight = false;
Implementation() {
super(new HTMLConfiguration());
setContentHandler(this);
}
void process(final TextDocument doc, final InputSource is)
throws BoilerpipeProcessingException {
for (TextBlock block : doc.getTextBlocks()) {
if (block.isContent()) {
final BitSet bs = block.getContainedTextElements();
if (bs != null) {
contentBitSet.or(bs);
}
}
}
try {
parse(is);
} catch (SAXException e) {
throw new BoilerpipeProcessingException(e);
} catch (IOException e) {
throw new BoilerpipeProcessingException(e);
}
}
public void endDocument() throws SAXException {
}
public void endPrefixMapping(String prefix) throws SAXException {
}
public void ignorableWhitespace(char[] ch, int start, int length)
throws SAXException {
}
public void processingInstruction(String target, String data)
throws SAXException {
}
public void setDocumentLocator(Locator locator) {
}
public void skippedEntity(String name) throws SAXException {
}
public void startDocument() throws SAXException {
}
public void startElement(String uri, String localName, String qName,
Attributes atts) throws SAXException {
TagAction ta = TAG_ACTIONS.get(localName);
if (ta != null) {
ta.beforeStart(this, localName);
}
try {
if ((inIgnorableElement == 0) && (inHighlight && "A".equalsIgnoreCase(localName))) {
String href = atts.getValue("href");
if((href != null) &&
(href.length() > 0) &&
!(blackList.matcher(href).lookingAt())) {
linksBuffer.add(href);
}
}
} finally {
if (ta != null) {
ta.afterStart(this, localName);
}
}
}
public void endElement(String uri, String localName, String qName)
throws SAXException {
TagAction ta = TAG_ACTIONS.get(localName);
if (ta != null) {
ta.beforeEnd(this, localName);
}
if (ta != null) {
ta.afterEnd(this, localName);
}
}
public void characters(char[] ch, int start, int length)
throws SAXException {
characterElementIdx++;
if (inIgnorableElement == 0) {
boolean highlight = contentBitSet.get(characterElementIdx);
if(!highlight) {
if(length == 0) {
return;
}
boolean justWhitespace = true;
for(int i=start; i<start+length; i++) {
if(!Character.isWhitespace(ch[i])) {
justWhitespace = false;
break;
}
}
if(justWhitespace) {
return;
}
}
inHighlight = highlight;
if(inHighlight) {
linksHighlight.addAll(linksBuffer);
linksBuffer.clear();
}
}
}
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
}
}
private static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {
void beforeStart(final Implementation instance, final String localName) {
instance.inIgnorableElement++;
}
void afterEnd(final Implementation instance, final String localName) {
instance.inIgnorableElement--;
}
};
private static Map<String, TagAction> TAG_ACTIONS = new HashMap<String, TagAction>();
static {
TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT);
TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT);
TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT);
TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT);
TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT);
TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT);
TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT);
TAG_ACTIONS.put("HEAD", TA_IGNORABLE_ELEMENT);
}
private abstract static class TagAction {
void beforeStart(final Implementation instance, final String localName) {
}
void afterStart(final Implementation instance, final String localName) {
}
void beforeEnd(final Implementation instance, final String localName) {
}
void afterEnd(final Implementation instance, final String localName) {
}
}
}