/**
* Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe.sax;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.labels.LabelAction;
/**
* Provides commonly used {@link TagAction} implementations, each defining an action that is to be performed whenever a particular tag occurs during HTML parsing.
*
* @author Christian Kohlschütter
*/
public abstract class CommonTagActions {
private CommonTagActions() {
}
/**
 * Combines two {@link TagAction}s into a single one.
 *
 * For {@code start} and {@code end}, both delegates are always invoked (first the
 * one passed as {@code t1}, then the one passed as {@code t2}); the combined
 * result is {@code true} if at least one delegate returned {@code true}.
 */
public static final class Chained implements TagAction {
    private final TagAction first;
    private final TagAction second;

    /**
     * @param t1 the action that is invoked first
     * @param t2 the action that is invoked second
     */
    public Chained(final TagAction t1, final TagAction t2) {
        this.first = t1;
        this.second = t2;
    }

    public boolean start(BoilerpipeHTMLContentHandler instance,
            String localName, String qName, Attributes atts)
            throws SAXException {
        // Evaluate both delegates unconditionally before combining the results,
        // so that the second action is never skipped.
        final boolean firstResult = first.start(instance, localName, qName, atts);
        final boolean secondResult = second.start(instance, localName, qName, atts);
        return firstResult || secondResult;
    }

    public boolean end(BoilerpipeHTMLContentHandler instance,
            String localName, String qName) throws SAXException {
        // Same as in start(): both delegates must see the event.
        final boolean firstResult = first.end(instance, localName, qName);
        final boolean secondResult = second.end(instance, localName, qName);
        return firstResult || secondResult;
    }

    public boolean changesTagLevel() {
        if (first.changesTagLevel()) {
            return true;
        }
        return second.changesTagLevel();
    }
}
/**
 * Marks this tag as "ignorable", i.e. all its inner content is silently skipped.
 *
 * The action maintains the handler's {@code inIgnorableElement} counter: it is
 * incremented when the tag starts and decremented when the tag ends.
 */
public static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {

    public boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) {
        // one more level of ignorable content has been entered
        instance.inIgnorableElement += 1;
        return true;
    }

    public boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        // leaving one level of ignorable content
        instance.inIgnorableElement -= 1;
        return true;
    }

    public boolean changesTagLevel() {
        return true;
    }
};
/**
 * Marks this tag as "anchor" (this should usually only be set for the {@code <A>} tag).
 * Anchor tags may not be nested.
 *
 * There is a bug in certain versions of NekoHTML which still allows nested tags.
 * If boilerpipe encounters such nestings, a warning is printed to {@code System.err}
 * and the action tries to recover by implicitly ending the enclosing anchor;
 * this implementation does not raise an exception in that case.
 *
 * While the handler is not inside an ignorable element, the start of an anchor
 * appends {@code ANCHOR_TEXT_START} and the end of the outermost anchor appends
 * {@code ANCHOR_TEXT_END} to the handler's token buffer, each followed by a
 * single space.
 */
public static final TagAction TA_ANCHOR_TEXT = new TagAction() {

    public boolean start(BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) throws SAXException {
        if (instance.inAnchor++ > 0) {
            // as nested A elements are not allowed per specification, we
            // are probably reaching this branch due to a bug in the XML
            // parser
            System.err.println("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...");
            // implicitly end the enclosing anchor, which takes the counter
            // back down by one
            end(instance, localName, qName);
        }
        if (instance.inIgnorableElement == 0) {
            instance.addWhitespaceIfNecessary();
            instance.tokenBuffer
                    .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_START);
            instance.tokenBuffer.append(' ');
            instance.sbLastWasWhitespace = true;
        }
        return false;
    }

    public boolean end(BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        if (instance.inAnchor <= 0) {
            // Surplus end event, e.g. the second closing tag after the
            // nested-anchor recovery in start() already ended one level.
            // Decrementing here would drive the counter negative, after which
            // "--inAnchor == 0" could never become true again and every
            // following anchor would lose its ANCHOR_TEXT_END marker.
            return false;
        }
        if (--instance.inAnchor == 0) {
            if (instance.inIgnorableElement == 0) {
                instance.addWhitespaceIfNecessary();
                instance.tokenBuffer
                        .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_END);
                instance.tokenBuffer.append(' ');
                instance.sbLastWasWhitespace = true;
            }
        }
        return false;
    }

    public boolean changesTagLevel() {
        return true;
    }
};
/**
 * Marks this tag the body element (this should usually only be set for the {@code <BODY>} tag).
 *
 * On both start and end the handler's current block is flushed first; afterwards
 * the handler's {@code inBody} counter is incremented (start) or decremented (end).
 */
public static final TagAction TA_BODY = new TagAction() {

    public boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) {
        // flush whatever was collected before the body was entered
        instance.flushBlock();
        instance.inBody += 1;
        return false;
    }

    public boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        // flush the last block while the counter still says "inside body"
        instance.flushBlock();
        instance.inBody -= 1;
        return false;
    }

    public boolean changesTagLevel() {
        return true;
    }
};
/**
 * Marks this tag a simple "inline" element, which generates whitespace, but no new block.
 *
 * Both the start and the end of the tag ask the handler to add whitespace if
 * necessary; both return {@code false}.
 */
public static final TagAction TA_INLINE_WHITESPACE = new TagAction() {

    public boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) {
        instance.addWhitespaceIfNecessary();
        return false;
    }

    public boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        instance.addWhitespaceIfNecessary();
        return false;
    }

    public boolean changesTagLevel() {
        return false;
    }
};
/**
 * Deprecated alias that refers to the very same instance as {@link #TA_INLINE_WHITESPACE}.
 *
 * @deprecated Use {@link #TA_INLINE_WHITESPACE} instead
 */
@Deprecated
public static final TagAction TA_INLINE = TA_INLINE_WHITESPACE;
/**
 * Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block.
 *
 * This action does not touch the handler at all: start and end simply return
 * {@code false}.
 */
public static final TagAction TA_INLINE_NO_WHITESPACE = new TagAction() {

    public boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) {
        // intentionally no side effects
        return false;
    }

    public boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        // intentionally no side effects
        return false;
    }

    public boolean changesTagLevel() {
        return false;
    }
};
/**
 * Pattern for the value of a "size" attribute: group 1 is an optional sign
 * ({@code +} or {@code -}, empty if absent), group 2 is exactly one decimal digit.
 * It is applied with {@link Matcher#matches()}, so the whole value has to fit.
 */
private static final Pattern PAT_FONT_SIZE = Pattern
.compile("([\\+\\-]?)([0-9])");
/**
 * Explicitly marks this tag a simple "block-level" element, which always generates whitespace.
 *
 * The action itself has no side effects on the handler: start, end and
 * {@code changesTagLevel} all return {@code true}.
 */
public static final TagAction TA_BLOCK_LEVEL = new TagAction() {

    public boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) {
        return true;
    }

    public boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        return true;
    }

    public boolean changesTagLevel() {
        return true;
    }
};
/**
 * Special TagAction for the {@code <FONT>} tag, which keeps track of the
 * absolute and relative font size.
 *
 * Every start of the tag pushes exactly one entry to the front of the handler's
 * {@code fontSizeStack}, and every end removes the front entry again. The entry
 * is {@code null} if the tag has no "size" attribute or its value does not match
 * the expected format (an optional sign followed by a single digit).
 */
public static final TagAction TA_FONT = new TagAction() {

    public boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) {
        // stays null unless a well-formed size attribute is found
        Integer newSize = null;

        final String sizeAttr = atts.getValue("size");
        if (sizeAttr != null) {
            final Matcher matcher = PAT_FONT_SIZE.matcher(sizeAttr);
            if (matcher.matches()) {
                final String sign = matcher.group(1);
                final int amount = Integer.parseInt(matcher.group(2));

                if (sign.length() > 0) {
                    // relative size: based on the nearest enclosing entry
                    // that carries a size, or on 3 if there is none
                    int base = 3;
                    for (Integer enclosing : instance.fontSizeStack) {
                        if (enclosing != null) {
                            base = enclosing;
                            break;
                        }
                    }
                    newSize = (sign.charAt(0) == '+') ? base + amount : base - amount;
                } else {
                    // absolute size
                    newSize = amount;
                }
            }
        }

        instance.fontSizeStack.add(0, newSize);
        return false;
    }

    public boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        // drop the entry pushed by the matching start()
        instance.fontSizeStack.removeFirst();
        return false;
    }

    public boolean changesTagLevel() {
        return false;
    }
};
/**
 * {@link TagAction} for inline elements, which triggers some {@link LabelAction} on the generated
 * {@link TextBlock}.
 *
 * At the start of the tag whitespace is added if necessary and the configured
 * label action is registered with the handler; at the end of the tag only
 * whitespace is added if necessary.
 */
public static final class InlineTagLabelAction implements TagAction {
    private final LabelAction labelAction;

    /**
     * @param action the label action to register with the handler whenever the tag starts
     */
    public InlineTagLabelAction(final LabelAction action) {
        this.labelAction = action;
    }

    public boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) {
        instance.addWhitespaceIfNecessary();
        instance.addLabelAction(labelAction);
        return false;
    }

    public boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        instance.addWhitespaceIfNecessary();
        return false;
    }

    public boolean changesTagLevel() {
        return false;
    }
}
/**
 * {@link TagAction} for block-level elements, which triggers some {@link LabelAction} on the generated
 * {@link TextBlock}.
 *
 * At the start of the tag the configured label action is registered with the
 * handler; the end of the tag has no side effects. Start, end and
 * {@code changesTagLevel} all return {@code true}.
 */
public static final class BlockTagLabelAction implements TagAction {
    private final LabelAction labelAction;

    /**
     * @param action the label action to register with the handler whenever the tag starts
     */
    public BlockTagLabelAction(final LabelAction action) {
        this.labelAction = action;
    }

    public boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName,
            final Attributes atts) {
        instance.addLabelAction(labelAction);
        return true;
    }

    public boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) {
        return true;
    }

    public boolean changesTagLevel() {
        return true;
    }
}
}