/* * InitialArticleWord.java * * Version: $Revision: 3738 $ * * Date: $Date: 2009-04-24 04:32:12 +0000 (Fri, 24 Apr 2009) $ * * Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the DSpace Foundation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ package org.dspace.text.filter; /** * Abstract class for implementing initial article word filters * Allows you to create new classes with their own rules for mapping * languages to article word lists. * * @author Graham Triggs */ public abstract class InitialArticleWord implements TextFilter { /** * When no language is passed, use null and let implementation decide what to do */ public String filter(String str) { return filter(str, null); } /** * Do an initial definite/indefinite article filter on the passed string. * On matching an initial word, can strip or move to the end, depending on the * configuration of the implementing class. * * @param str The string to parse * @param lang The language of the passed string * @return String The filtered string */ public String filter(String str, String lang) { // Get the list of article words for this language String[] articleWordArr = getArticleWords(lang); // If we have an article word array, process the string if (articleWordArr != null && articleWordArr.length > 0) { String initialArticleWord = null; int curPos = 0; int initialStart = -1; int initialEnd = -1; // Iterate through the characters until we find something significant, or hit the end while (initialEnd < 0 && curPos < str.length()) { // Have we found a significant character if (Character.isLetterOrDigit(str.charAt(curPos))) { // Mark this as the cut point for the initial word initialStart = curPos; // Loop through the article words looking for a match for (int idx = 0; initialEnd < 0 && idx < articleWordArr.length; idx++) { // Extract a fragment from the string to test // Must be same length as the article word if (idx > 1 && initialArticleWord != null) { // Only need to do so if we haven't already got one // of the right length if (initialArticleWord.length() != articleWordArr[idx].length()) initialArticleWord = extractText(str, curPos, articleWordArr[idx].length()); } else { initialArticleWord = extractText(str, curPos, articleWordArr[idx].length()); } // Does the fragment match an article word? if (initialArticleWord!= null && initialArticleWord.equalsIgnoreCase(articleWordArr[idx])) { // Check to see if the next character in the source // is a whitespace boolean isNextWhitespace = Character.isWhitespace( str.charAt(curPos + articleWordArr[idx].length()) ); // Check to see if the last character of the article word is a letter or digit boolean endsLetterOrDigit = Character.isLetterOrDigit(initialArticleWord.charAt(initialArticleWord.length() - 1)); // If the last character of the article word is a letter or digit, // then it must be followed by whitespace, if not, it can be anything // Setting endPos signifies that we have found an article word if (endsLetterOrDigit && isNextWhitespace) initialEnd = curPos + initialArticleWord.length(); else if (!endsLetterOrDigit) initialEnd = curPos + initialArticleWord.length(); } } // Quit the loop, as we have a significant character break; } // Keep going curPos++; } // If endPos is positive, then we've found an article word if (initialEnd > 0) { // Find a cut point in the source string, removing any whitespace after the article word int cutPos = initialEnd; while (cutPos < str.length() && Character.isWhitespace(str.charAt(cutPos))) cutPos++; // Are we stripping the article word? if (stripInitialArticle) { // Yes, simply return everything after the cut return str.substring(cutPos); } else { // No - move the initial article word to the end return new StringBuffer(str.substring(cutPos)) .append(wordSeperator) .append(str.substring(initialStart, initialEnd)) .toString(); } } } // Didn't do any processing, or didn't find an initial article word // Return the original string return str; } protected InitialArticleWord(boolean stripWord) { stripInitialArticle = stripWord; } protected InitialArticleWord() { stripInitialArticle = false; } /** * Abstract method to get the list of words to use in the initial word filter * * @param lang The language to retrieve article words for * @return An array of definite/indefinite article words */ protected abstract String[] getArticleWords(String lang); // Seperator to use when appending article to end private String wordSeperator = ", "; // Flag to signify initial article word should be removed // If false, then the initial article word is appended to the end private boolean stripInitialArticle = false; /** * Helper method to extract text from a string. * Ensures that there is significant data (ie. non-whitespace) * after the segment requested. * * @param str * @param pos * @param len * @return */ private String extractText(String str, int pos, int len) { int testPos = pos + len; while (testPos < str.length() && Character.isWhitespace(str.charAt(testPos))) testPos++; if (testPos < str.length()) return str.substring(pos, pos + len); return null; } }