/**
* Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe.filters.heuristics;
import java.util.regex.Pattern;
import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.DefaultLabels;
/**
 * Filter that tags short text blocks resembling article metadata (a date/time
 * line or a byline) with {@link DefaultLabels#ARTICLE_METADATA} and flags them
 * as content.
 */
public class ArticleMetadataFilter implements BoilerpipeFilter {
  /**
   * Patterns tried against the text of short blocks.
   * <ol>
   * <li>A date/time-like line: leading digits and separators, an optional
   * English month name or three-letter abbreviation, further digits and time
   * characters, and an optional two- or three-letter uppercase suffix
   * (presumably a time-zone abbreviation such as "EST" -- not confirmed by
   * this file).</li>
   * <li>A line that starts with "By " or "by ".</li>
   * </ol>
   */
  private static final Pattern[] PATTERNS_SHORT = new Pattern[] {
      Pattern
          .compile("^[0-9 \\,\\./]*\\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\\b[0-9 \\,\\:apm\\./]*([CPSDMGET]{2,3})?$"),
      Pattern.compile("^[Bb]y ")
  };

  /** Shared instance; the constructor is private, so use this field to obtain the filter. */
  public static final ArticleMetadataFilter INSTANCE = new ArticleMetadataFilter();

  private ArticleMetadataFilter() {
  }

  /**
   * Scans every text block of the document. A block of at most ten words whose
   * text is matched by one of the metadata patterns is flagged as content and
   * given the {@link DefaultLabels#ARTICLE_METADATA} label.
   *
   * @param doc the document whose text blocks are inspected
   * @return {@code true} if at least one block matched a pattern
   * @throws BoilerpipeProcessingException declared by the filter contract; not
   *         thrown directly by this implementation
   */
  public boolean process(TextDocument doc)
      throws BoilerpipeProcessingException {
    boolean anyTagged = false;
    for (TextBlock block : doc.getTextBlocks()) {
      // Blocks longer than ten words are skipped without reading their text.
      if (block.getNumWords() <= 10 && looksLikeMetadata(block.getText())) {
        block.setIsContent(true);
        block.addLabel(DefaultLabels.ARTICLE_METADATA);
        anyTagged = true;
      }
    }
    return anyTagged;
  }

  /**
   * Tells whether any of the short-block patterns is found in the given text.
   * The two patterns cannot both match the same string (the date pattern is
   * anchored at the start and cannot consume a leading "B"/"b"), so stopping
   * at the first hit is equivalent to trying all of them.
   */
  private static boolean looksLikeMetadata(String text) {
    for (Pattern candidate : PATTERNS_SHORT) {
      if (candidate.matcher(text).find()) {
        return true;
      }
    }
    return false;
  }
}