DocumentTitleMatchClassifier.java example

/**
 * Copyright (C) 2013 Christian Kohlschütter (ckkohl79@gmail.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.filters.heuristics;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.DefaultLabels;

/**
 * Marks {@link TextBlock}s which contain parts of the HTML
 * <code><TITLE></code> tag, using some heuristics which are quite
 * specific to the news domain.
 * 
 * @author Christian Kohlschütter
 */
public final class DocumentTitleMatchClassifier implements BoilerpipeFilter {

	private final Set<String> potentialTitles;

	public DocumentTitleMatchClassifier(String title) {
		if (title == null) {
			this.potentialTitles = null;
		} else {
			
			title = title.replace('\u00a0', ' ');
			title = title.replace("'", "");
			
			title = title.trim().toLowerCase();
			
			if (title.length() == 0) {
				this.potentialTitles = null;
			} else {
				this.potentialTitles = new HashSet<String>();

				potentialTitles.add(title);

				String p;

				p = getLongestPart(title, "[ ]*[\\|»|-][ ]*");
				if (p != null) {
					potentialTitles.add(p);
				}
				p = getLongestPart(title, "[ ]*[\\|»|:][ ]*");
				if (p != null) {
					potentialTitles.add(p);
				}
				p = getLongestPart(title, "[ ]*[\\|»|:\\(\\)][ ]*");
				if (p != null) {
					potentialTitles.add(p);
				}
				p = getLongestPart(title, "[ ]*[\\|»|:\\(\\)\\-][ ]*");
				if (p != null) {
					potentialTitles.add(p);
				}
				p = getLongestPart(title, "[ ]*[\\|»|,|:\\(\\)\\-][ ]*");
				if (p != null) {
					potentialTitles.add(p);
				}
				p = getLongestPart(title, "[ ]*[\\|»|,|:\\(\\)\\-\u00a0][ ]*");
				if (p != null) {
					potentialTitles.add(p);
				}
				
				addPotentialTitles(potentialTitles, title, "[ ]+[\\|][ ]+", 4);
				addPotentialTitles(potentialTitles, title, "[ ]+[\\-][ ]+", 4);
				
				potentialTitles.add(title.replaceFirst(" - [^\\-]+$", ""));
				potentialTitles.add(title.replaceFirst("^[^\\-]+ - ", ""));
			}
		}
	}

	public Set<String> getPotentialTitles() {
		return potentialTitles;
	}
	
	private void addPotentialTitles(final Set<String> potentialTitles, final String title, final String pattern, final int minWords) {
		String[] parts = title.split(pattern);
		if (parts.length == 1) {
			return;
		}
		for (int i = 0; i < parts.length; i++) {
			String p = parts[i];
			if (p.contains(".com")) {
				continue;
			}
			final int numWords = p.split("[\b ]+").length;
			if (numWords >=minWords) {
				potentialTitles.add(p);
			}
		}
	}

	private String getLongestPart(final String title, final String pattern) {
		String[] parts = title.split(pattern);
		if (parts.length == 1) {
			return null;
		}
		int longestNumWords = 0;
		String longestPart = "";
		for (int i = 0; i < parts.length; i++) {
			String p = parts[i];
			if (p.contains(".com")) {
				continue;
			}
			final int numWords = p.split("[\b ]+").length;
			if (numWords > longestNumWords || p.length() > longestPart.length()) {
				longestNumWords = numWords;
				longestPart = p;
			}
		}
		if (longestPart.length() == 0) {
			return null;
		} else {
			return longestPart.trim();
		}
	}
	
	private static final Pattern PAT_REMOVE_CHARACTERS = Pattern.compile("[\\?\\!\\.\\-\\:]+");

	public boolean process(TextDocument doc)
			throws BoilerpipeProcessingException {
		if (potentialTitles == null) {
			return false;
		}
		boolean changes = false;
		
		for (final TextBlock tb : doc.getTextBlocks()) {
			String text = tb.getText();
			
			text = text.replace('\u00a0', ' ');
			text = text.replace("'", "");

			text = text.trim().toLowerCase();

			if (potentialTitles.contains(text)) {
				tb.addLabel(DefaultLabels.TITLE);
				changes = true;
				break;
			}
			
			text = PAT_REMOVE_CHARACTERS.matcher(text).replaceAll("").trim();
			if (potentialTitles.contains(text)) {
				tb.addLabel(DefaultLabels.TITLE);
				changes = true;
				break;
			}
		}
		return changes;
	}

}