/**
* Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe.filters.english;
import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.DefaultLabels;
/**
* Finds blocks which are potentially indicating the end of an article text and
* marks them with {@link de.l3s.boilerpipe.labels.DefaultLabels#INDICATES_END_OF_TEXT}. This can be used
* in conjunction with a downstream {@link IgnoreBlocksAfterContentFilter}.
*
* @author Christian Kohlsch��tter
* @see IgnoreBlocksAfterContentFilter
*/
public class TerminatingBlocksFinder implements BoilerpipeFilter {
public static final TerminatingBlocksFinder INSTANCE = new TerminatingBlocksFinder();
/**
* Returns the singleton instance for TerminatingBlocksFinder.
*/
public static TerminatingBlocksFinder getInstance() {
return INSTANCE;
}
// public static long timeSpent = 0;
public boolean process(TextDocument doc)
throws BoilerpipeProcessingException {
boolean changes = false;
// long t = System.currentTimeMillis();
for (TextBlock tb : doc.getTextBlocks()) {
final int numWords = tb.getNumWords();
if (numWords < 15) {
final String text = tb.getText().trim();
final int len = text.length();
if (len >= 8) {
final String textLC = text.toLowerCase();
if (textLC.startsWith("comments")
|| startsWithNumber(textLC, len, " comments",
" users responded in")
|| textLC.startsWith("�� reuters")
|| textLC.startsWith("please rate this")
|| textLC.startsWith("post a comment")
|| textLC.contains("what you think...")
|| textLC.contains("add your comment")
|| textLC.contains("add comment")
|| textLC.contains("reader views")
|| textLC.contains("have your say")
|| textLC.contains("reader comments")
|| textLC.contains("r��tta artikeln")
|| textLC.contains("Réagir")
|| textLC.contains("Vos réactions ")
|| textLC
.equals("thanks for your comments - this feedback is now closed")) {
tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT);
changes = true;
}
} else if(tb.getLinkDensity() == 1.0) {
if(text.equals("Comment")) {
tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT);
}
}
}
}
// timeSpent += System.currentTimeMillis() - t;
return changes;
}
/**
* Checks whether the given text t starts with a sequence of digits,
* followed by one of the given strings.
*
* @param t
* The text to examine
* @param len
* The length of the text to examine
* @param str
* Any strings that may follow the digits.
* @return true if at least one combination matches
*/
private static boolean startsWithNumber(final String t, final int len,
final String... str) {
int j = 0;
while (j < len && isDigit(t.charAt(j))) {
j++;
}
if (j != 0) {
for (String s : str) {
if (t.startsWith(s, j)) {
return true;
}
}
}
return false;
}
private static boolean isDigit(final char c) {
return c >= '0' && c <= '9';
}
}