InitialArticleWord.java example

Explorer
DSpace-master
/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.text.filter;

/**
 * Abstract class for implementing initial article word filters.
 * Allows you to create new classes with their own rules for mapping
 * languages to article word lists.
 * 
 * @author Graham Triggs
 */
public abstract class InitialArticleWord implements TextFilter
{
    /** Separator placed between the remaining text and an article word moved to the end. */
    private static final String WORD_SEPARATOR = ", ";

    /**
     * Flag to signify the initial article word should be removed.
     * If false, the initial article word is appended to the end instead.
     */
    private final boolean stripInitialArticle;

    /**
     * Create a filter, choosing what happens to a matched initial article word.
     *
     * @param stripWord true to remove the article word, false to move it to the end
     */
    protected InitialArticleWord(boolean stripWord)
    {
        stripInitialArticle = stripWord;
    }

    /**
     * Create a filter that moves a matched initial article word to the end.
     */
    protected InitialArticleWord()
    {
        this(false);
    }

    /**
     * Abstract method to get the list of words to use in the initial word filter
     *
     * @param lang The language to retrieve article words for
     * @return An array of definite/indefinite article words
     */
    protected abstract String[] getArticleWords(String lang);

    /**
     * When no language is passed, use null and let implementation decide what to do
     *
     * @param str The string to parse
     * @return String The filtered string
     */
    @Override
    public String filter(String str)
    {
        return filter(str, null);
    }

    /**
     * Do an initial definite/indefinite article filter on the passed string.
     * On matching an initial word, can strip or move to the end, depending on the
     * configuration of the implementing class.
     * <p>
     * Only the first letter-or-digit character of the string is considered as the
     * start of a candidate article word; any characters before it are discarded
     * from the result when a match is found. Article words are tried in array
     * order and the first one that matches wins. If nothing matches, or if the
     * string is null, the string is returned unchanged.
     *
     * @param str  The string to parse (may be null)
     * @param lang The language of the passed string
     * @return String The filtered string
     */
    @Override
    public String filter(String str, String lang)
    {
        // Get the list of article words for this language
        String[] articleWords = getArticleWords(lang);

        // Without a string or any article words there is nothing to process
        if (str == null || articleWords == null || articleWords.length == 0)
        {
            return str;
        }

        // Locate the first significant character - a candidate word can only start there
        int initialStart = indexOfFirstSignificant(str);
        if (initialStart < 0)
        {
            return str;
        }

        // A non-negative end position signifies that we have found an article word
        int initialEnd = findArticleEnd(str, initialStart, articleWords);
        if (initialEnd < 0)
        {
            return str;
        }

        // Find a cut point in the source string, removing any whitespace after the article word
        int cutPos = initialEnd;
        while (cutPos < str.length() && Character.isWhitespace(str.charAt(cutPos)))
        {
            cutPos++;
        }

        String remainder = str.substring(cutPos);
        if (stripInitialArticle)
        {
            // Stripping - simply return everything after the cut
            return remainder;
        }

        // Not stripping - move the initial article word to the end
        return remainder + WORD_SEPARATOR + str.substring(initialStart, initialEnd);
    }

    /**
     * Find the position of the first letter or digit in the string.
     *
     * @param str The string to scan
     * @return The index of the first letter-or-digit character, or -1 if there is none
     */
    private static int indexOfFirstSignificant(String str)
    {
        for (int pos = 0; pos < str.length(); pos++)
        {
            if (Character.isLetterOrDigit(str.charAt(pos)))
            {
                return pos;
            }
        }

        return -1;
    }

    /**
     * Test each article word, in order, against the text beginning at the given position.
     * A word matches when it equals the text there (ignoring case), there is
     * non-whitespace text somewhere after it, and - if the word ends in a letter
     * or digit - it is immediately followed by whitespace. Words ending in any
     * other character (e.g. an apostrophe) may be followed by anything.
     *
     * @param str          The string being filtered
     * @param start        The index in str where the candidate word begins
     * @param articleWords The article words to test; null or empty entries are skipped
     * @return The index just past the matched article word, or -1 if no word matched
     */
    private static int findArticleEnd(String str, int start, String[] articleWords)
    {
        for (String articleWord : articleWords)
        {
            // A null or empty entry can never be an article word - ignore it rather than fail
            if (articleWord == null || articleWord.isEmpty())
            {
                continue;
            }

            int end = start + articleWord.length();

            // regionMatches returns false (rather than throwing) if the word overruns the string
            if (!str.regionMatches(true, start, articleWord, 0, articleWord.length()))
            {
                continue;
            }

            // The article must not be all that is left of the string.
            // Passing this check also guarantees that 'end' is a valid index.
            if (!hasSignificantTextFrom(str, end))
            {
                continue;
            }

            // If the last character of the matched word is a letter or digit,
            // then it must be followed by whitespace; if not, it can be anything
            boolean endsLetterOrDigit = Character.isLetterOrDigit(str.charAt(end - 1));
            if (!endsLetterOrDigit || Character.isWhitespace(str.charAt(end)))
            {
                return end;
            }
        }

        return -1;
    }

    /**
     * Check that there is significant data (ie. non-whitespace) at or after a position.
     *
     * @param str The string to examine
     * @param pos The index to start looking from
     * @return true if a non-whitespace character exists at or after pos
     */
    private static boolean hasSignificantTextFrom(String str, int pos)
    {
        int testPos = pos;
        while (testPos < str.length() && Character.isWhitespace(str.charAt(testPos)))
        {
            testPos++;
        }

        return testPos < str.length();
    }
}