/**
* Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe.filters.english;
import java.util.Iterator;
import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.DefaultLabels;
/**
* Marks all blocks as "non-content" that occur after blocks that have been
* marked {@link de.l3s.boilerpipe.labels.DefaultLabels#INDICATES_END_OF_TEXT}. These marks are ignored
* unless a minimum number of words in content blocks occur before this mark (default: 60).
* This can be used in conjunction with an upstream {@link de.l3s.boilerpipe.filters.english.TerminatingBlocksFinder}.
*
* @author Christian Kohlschütter
* @see de.l3s.boilerpipe.filters.english.TerminatingBlocksFinder
*/
public final class IgnoreBlocksAfterContentFilter extends HeuristicFilterBase implements BoilerpipeFilter {
public static final IgnoreBlocksAfterContentFilter DEFAULT_INSTANCE = new IgnoreBlocksAfterContentFilter(
60);
public static final IgnoreBlocksAfterContentFilter INSTANCE_200 = new IgnoreBlocksAfterContentFilter(
200);
private final int minNumWords;
/**
* Returns the singleton instance for DeleteBlocksAfterContentFilter.
*/
public static IgnoreBlocksAfterContentFilter getDefaultInstance() {
return DEFAULT_INSTANCE;
}
public IgnoreBlocksAfterContentFilter(final int minNumWords) {
this.minNumWords = minNumWords;
}
public boolean process(TextDocument doc)
throws BoilerpipeProcessingException {
boolean changes = false;
int numWords = 0;
boolean foundEndOfText = false;
for (Iterator<TextBlock> it = doc.getTextBlocks().iterator(); it.hasNext();) {
TextBlock block = it.next();
final boolean endOfText = block
.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
if (block.isContent()) {
numWords += getNumFullTextWords(block);
}
if (endOfText && numWords >= minNumWords) {
foundEndOfText = true;
}
if (foundEndOfText) {
changes = true;
block.setIsContent(false);
}
}
return changes;
}
}