/** * Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.heuristics; import java.util.HashSet; import java.util.Set; import java.util.regex.Pattern; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Marks {@link TextBlock}s which contain parts of the HTML * <code><TITLE></code> tag, using some heuristics which are quite * specific to the news domain. * * @author Christian Kohlschütter */ public final class DocumentTitleMatchClassifier implements BoilerpipeFilter { private final Set<String> potentialTitles; public DocumentTitleMatchClassifier(String title) { if (title == null) { this.potentialTitles = null; } else { title = title.replace('\u00a0', ' '); title = title.replace("'", ""); title = title.trim().toLowerCase(); if (title.length() == 0) { this.potentialTitles = null; } else { this.potentialTitles = new HashSet<String>(); potentialTitles.add(title); String p; p = getLongestPart(title, "[ ]*[\\|»|-][ ]*"); if (p != null) { potentialTitles.add(p); } p = getLongestPart(title, "[ ]*[\\|»|:][ ]*"); if (p != null) { potentialTitles.add(p); } p = getLongestPart(title, "[ ]*[\\|»|:\\(\\)][ ]*"); if (p != null) { potentialTitles.add(p); } p = getLongestPart(title, "[ ]*[\\|»|:\\(\\)\\-][ ]*"); if (p != null) { potentialTitles.add(p); } p = getLongestPart(title, "[ ]*[\\|»|,|:\\(\\)\\-][ ]*"); if (p != null) { potentialTitles.add(p); } p = getLongestPart(title, "[ ]*[\\|»|,|:\\(\\)\\-\u00a0][ ]*"); if (p != null) { potentialTitles.add(p); } addPotentialTitles(potentialTitles, title, "[ ]+[\\|][ ]+", 4); addPotentialTitles(potentialTitles, title, "[ ]+[\\-][ ]+", 4); potentialTitles.add(title.replaceFirst(" - [^\\-]+$", "")); potentialTitles.add(title.replaceFirst("^[^\\-]+ - ", "")); } } } public Set<String> getPotentialTitles() { return potentialTitles; } private void addPotentialTitles(final Set<String> potentialTitles, final String title, final String pattern, final int minWords) { String[] parts = title.split(pattern); if (parts.length == 1) { return; } for (int i = 0; i < parts.length; i++) { String p = parts[i]; if (p.contains(".com")) { continue; } final int numWords = p.split("[\b ]+").length; if (numWords >=minWords) { potentialTitles.add(p); } } } private String getLongestPart(final String title, final String pattern) { String[] parts = title.split(pattern); if (parts.length == 1) { return null; } int longestNumWords = 0; String longestPart = ""; for (int i = 0; i < parts.length; i++) { String p = parts[i]; if (p.contains(".com")) { continue; } final int numWords = p.split("[\b ]+").length; if (numWords > longestNumWords || p.length() > longestPart.length()) { longestNumWords = numWords; longestPart = p; } } if (longestPart.length() == 0) { return null; } else { return longestPart.trim(); } } private static final Pattern PAT_REMOVE_CHARACTERS = Pattern.compile("[\\?\\!\\.\\-\\:]+"); public boolean process(TextDocument doc) throws BoilerpipeProcessingException { if (potentialTitles == null) { return false; } boolean changes = false; for (final TextBlock tb : doc.getTextBlocks()) { String text = tb.getText(); text = text.replace('\u00a0', ' '); text = text.replace("'", ""); text = text.trim().toLowerCase(); if (potentialTitles.contains(text)) { tb.addLabel(DefaultLabels.TITLE); changes = true; break; } text = PAT_REMOVE_CHARACTERS.matcher(text).replaceAll("").trim(); if (potentialTitles.contains(text)) { tb.addLabel(DefaultLabels.TITLE); changes = true; break; } } return changes; } }