/**
* Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe.sax;
import de.l3s.boilerpipe.labels.DefaultLabels;
import de.l3s.boilerpipe.labels.LabelAction;
/**
* The default mapping from tag names to {@link de.l3s.boilerpipe.sax.TagAction}s.
* These defaults seem to work well in practice.
*
* @see de.l3s.boilerpipe.sax.TagActionMap
*/
public class DefaultTagActionMap extends TagActionMap {
    /** Serialization version identifier. */
    private static final long serialVersionUID = 1L;

    /** Shared instance, created once when this class is initialized. */
    public static final TagActionMap INSTANCE = new DefaultTagActionMap();

    /**
     * Registers the default action for each supported tag name via
     * {@code setTagAction}. Tags that share an action are grouped into arrays;
     * the registration order is the same as the historical one-call-per-tag
     * listing. The constructor is {@code protected}, so outside this package
     * only subclasses can invoke it; other callers use {@link #INSTANCE}.
     */
    protected DefaultTagActionMap() {
        // Tags registered with TA_IGNORABLE_ELEMENT.
        final String[] ignorableTags = {
            "STYLE", "SCRIPT", "OPTION", "OBJECT", "EMBED", "APPLET", "LINK"
        };
        for (final String tag : ignorableTags) {
            setTagAction(tag, CommonTagActions.TA_IGNORABLE_ELEMENT);
        }

        // Tags with their own dedicated actions.
        setTagAction("A", CommonTagActions.TA_ANCHOR_TEXT);
        setTagAction("BODY", CommonTagActions.TA_BODY);

        // Tags registered with TA_INLINE_NO_WHITESPACE.
        final String[] inlineNoWhitespaceTags = {
            "STRIKE", "U", "B", "I", "EM", "STRONG", "SPAN",
            // New in 1.1 (especially to improve extraction quality from Wikipedia etc.)
            "SUP",
            // New in 1.2
            "CODE", "TT", "SUB", "VAR"
        };
        for (final String tag : inlineNoWhitespaceTags) {
            setTagAction(tag, CommonTagActions.TA_INLINE_NO_WHITESPACE);
        }

        // Still within the 1.2 additions: tags registered with TA_INLINE_WHITESPACE.
        final String[] inlineWhitespaceTags = {"ABBR", "ACRONYM"};
        for (final String tag : inlineWhitespaceTags) {
            setTagAction(tag, CommonTagActions.TA_INLINE_WHITESPACE);
        }

        // Last of the 1.2 additions; could also use TA_FONT
        setTagAction("FONT", CommonTagActions.TA_INLINE_NO_WHITESPACE);

        // added in 1.1.1
        setTagAction("NOSCRIPT", CommonTagActions.TA_IGNORABLE_ELEMENT);

        // New in 1.3: list items and headings get block-level label actions.
        // Each LabelAction is built immediately before its registration.
        final LabelAction listItemLabels = new LabelAction(DefaultLabels.LI);
        setTagAction("LI", new CommonTagActions.BlockTagLabelAction(listItemLabels));

        // Every heading level carries its own label plus the generic HEADING label.
        final LabelAction h1Labels = new LabelAction(DefaultLabels.H1, DefaultLabels.HEADING);
        setTagAction("H1", new CommonTagActions.BlockTagLabelAction(h1Labels));

        final LabelAction h2Labels = new LabelAction(DefaultLabels.H2, DefaultLabels.HEADING);
        setTagAction("H2", new CommonTagActions.BlockTagLabelAction(h2Labels));

        final LabelAction h3Labels = new LabelAction(DefaultLabels.H3, DefaultLabels.HEADING);
        setTagAction("H3", new CommonTagActions.BlockTagLabelAction(h3Labels));
    }
}