/** * Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.sax; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.labels.DefaultLabels; import de.l3s.boilerpipe.labels.LabelAction; /** * Assigns labels for element CSS classes and ids to the corresponding * {@link TextBlock}. CSS classes are prefixed by * <code>{@link DefaultLabels#MARKUP_PREFIX}.</code>, and IDs are prefixed by * <code>{@link DefaultLabels#MARKUP_PREFIX}#</code> * * @author Christian Kohlschütter */ public final class MarkupTagAction implements TagAction { private final boolean isBlockLevel; private LinkedList<List<String>> labelStack = new LinkedList<List<String>>(); public MarkupTagAction(final boolean isBlockLevel) { this.isBlockLevel = isBlockLevel; } private static final Pattern PAT_NUM = Pattern.compile("[0-9]+"); public boolean start(BoilerpipeHTMLContentHandler instance, String localName, String qName, Attributes atts) throws SAXException { List<String> labels = new ArrayList<String>(5); labels.add(DefaultLabels.MARKUP_PREFIX + localName); String classVal = atts.getValue("class"); if (classVal != null && classVal.length() > 0) { classVal = PAT_NUM.matcher(classVal).replaceAll("#"); classVal = classVal.trim(); String[] vals = classVal.split("[ ]+"); labels.add(DefaultLabels.MARKUP_PREFIX + "." + classVal.replace(' ', '.')); if (vals.length > 1) { for (String s : vals) { labels.add(DefaultLabels.MARKUP_PREFIX + "." + s); } } } String id = atts.getValue("id"); if (id != null && id.length() > 0) { id = PAT_NUM.matcher(id).replaceAll("#"); labels.add(DefaultLabels.MARKUP_PREFIX + "#" + id); } Set<String> ancestors = getAncestorLabels(); List<String> labelsWithAncestors = new ArrayList<String>( (ancestors.size() + 1) * labels.size()); for (String l : labels) { for (String an : ancestors) { labelsWithAncestors.add(an); labelsWithAncestors.add(an + " " + l); } labelsWithAncestors.add(l); } instance.addLabelAction(new LabelAction(labelsWithAncestors .toArray(new String[labelsWithAncestors.size()]))); labelStack.add(labels); return isBlockLevel; } public boolean end(BoilerpipeHTMLContentHandler instance, String localName, String qName) throws SAXException { labelStack.removeLast(); return isBlockLevel; } public boolean changesTagLevel() { return isBlockLevel; } private Set<String> getAncestorLabels() { Set<String> set = new HashSet<String>(); for (List<String> labels : labelStack) { if (labels == null) { continue; } set.addAll(labels); } return set; } }