/**
* Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe.filters.heuristics;
import java.util.List;
import java.util.ListIterator;
import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.DefaultLabels;
/**
 * Keeps the largest {@link TextBlock} only (by the number of words). In case of
 * more than one block with the same number of words, the first block is chosen.
 * All discarded blocks are marked "not content" and flagged as
 * {@link de.l3s.boilerpipe.labels.DefaultLabels#MIGHT_BE_CONTENT}; the kept block
 * is flagged as {@link de.l3s.boilerpipe.labels.DefaultLabels#VERY_LIKELY_CONTENT}.
 *
 * <p>
 * Only TextBlocks that are marked as "content" when the filter runs are
 * candidates for the largest block.
 *
 * <p>
 * When the filter is created with <code>expandToSameLevelText</code>, the
 * neighbours of the largest block are scanned in both directions. Every
 * neighbour whose tag level equals that of the largest block and which has at
 * least <code>minWords</code> words is marked as content again. Neighbours with
 * a greater tag level are skipped; each scan stops at the first block whose tag
 * level is smaller than that of the largest block.
 *
 * @author Christian Kohlschütter
 */
public final class KeepLargestBlockFilter implements BoilerpipeFilter {
    /** Keeps the largest block only; no expansion to neighbouring blocks. */
    public static final KeepLargestBlockFilter INSTANCE = new KeepLargestBlockFilter(
            false, 0);

    /** Additionally keeps neighbouring blocks on the same tag level (no word minimum). */
    public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL = new KeepLargestBlockFilter(
            true, 0);

    /** Additionally keeps neighbouring blocks on the same tag level that have at least 150 words. */
    public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS = new KeepLargestBlockFilter(
            true, 150);

    /** Whether neighbours on the largest block's tag level are kept as content, too. */
    private final boolean expandToSameLevelText;

    /** Minimum number of words a same-level neighbour needs in order to be kept. */
    private final int minWords;

    /**
     * Creates a new filter.
     *
     * @param expandToSameLevelText
     *            if <code>true</code>, neighbouring blocks on the same tag level
     *            as the largest block are marked as content as well
     * @param minWords
     *            minimum number of words such a neighbouring block must have;
     *            only consulted when <code>expandToSameLevelText</code> is
     *            <code>true</code>
     */
    public KeepLargestBlockFilter(boolean expandToSameLevelText, final int minWords) {
        this.expandToSameLevelText = expandToSameLevelText;
        this.minWords = minWords;
    }

    /**
     * Marks the largest content block of the document as the only content
     * block, optionally expanding to same-level neighbours.
     *
     * <p>
     * If no block is marked as content on entry, every block ends up marked
     * "not content" and the method still returns <code>true</code>.
     *
     * @param doc
     *            the document whose text blocks are relabelled in place
     * @return <code>false</code> if the document has fewer than two text blocks
     *         (in which case nothing is touched), <code>true</code> otherwise
     * @throws BoilerpipeProcessingException
     *             declared in the signature; nothing in this method throws it
     *             explicitly
     */
    public boolean process(final TextDocument doc)
            throws BoilerpipeProcessingException {
        final List<TextBlock> textBlocks = doc.getTextBlocks();
        if (textBlocks.size() < 2) {
            return false;
        }

        // Pass 1: locate the first content block with the highest word count.
        TextBlock largestBlock = null;
        int largestIndex = -1;
        int maxNumWords = -1;
        int index = 0;
        for (TextBlock tb : textBlocks) {
            if (tb.isContent()) {
                final int numWords = tb.getNumWords();
                // Strictly greater, so that the first block wins on ties.
                if (numWords > maxNumWords) {
                    largestBlock = tb;
                    maxNumWords = numWords;
                    largestIndex = index;
                }
            }
            index++;
        }

        // Read the reference tag level before any block is modified below.
        final boolean expand = expandToSameLevelText && largestIndex != -1;
        final int level = expand ? largestBlock.getTagLevel() : -1;

        // Pass 2: keep the winner, discard everything else.
        for (TextBlock tb : textBlocks) {
            if (tb == largestBlock) {
                tb.setIsContent(true);
                tb.addLabel(DefaultLabels.VERY_LIKELY_CONTENT);
            } else {
                tb.setIsContent(false);
                tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT);
            }
        }

        // Pass 3 (optional): re-admit neighbours on the winner's tag level.
        if (expand) {
            // Walk backwards from the block just before the winner.
            for (ListIterator<TextBlock> it = textBlocks
                    .listIterator(largestIndex); it.hasPrevious();) {
                if (!promoteIfSameLevel(it.previous(), level)) {
                    break;
                }
            }
            // Walk forwards from the block just after the winner; the winner
            // itself is already content, so it does not need to be revisited.
            for (ListIterator<TextBlock> it = textBlocks
                    .listIterator(largestIndex + 1); it.hasNext();) {
                if (!promoteIfSameLevel(it.next(), level)) {
                    break;
                }
            }
        }

        return true;
    }

    /**
     * Marks the given block as content if it sits on the given tag level and has
     * at least {@link #minWords} words.
     *
     * @param tb
     *            the neighbouring block to examine
     * @param level
     *            tag level of the largest block
     * @return <code>false</code> if the block's tag level is smaller than
     *         <code>level</code>, i.e. the scan in this direction must stop;
     *         <code>true</code> if the scan may continue
     */
    private boolean promoteIfSameLevel(final TextBlock tb, final int level) {
        final int tagLevel = tb.getTagLevel();
        if (tagLevel < level) {
            return false;
        }
        if (tagLevel == level && tb.getNumWords() >= minWords) {
            tb.setIsContent(true);
        }
        return true;
    }
}