/* * MARC21InitialArticleWord.java * * Version: $Revision: 3738 $ * * Date: $Date: 2009-04-24 04:32:12 +0000 (Fri, 24 Apr 2009) $ * * Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the DSpace Foundation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
*/
package org.dspace.text.filter;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.dspace.core.ConfigurationManager;

/**
 * Implements MARC 21 standards to disregard initial
 * definite or indefinite article in sorting.
 *
 * Note: This only works for languages defined with IANA code entries.
 *
 * The word lists are built once, when the class is initialised, from a
 * static word -> languages table and are then only read.
 *
 * @author Graham Triggs
 */
public class MARC21InitialArticleWord extends InitialArticleWord
{
    /**
     * Create a filter with the default behaviour.
     */
    public MARC21InitialArticleWord()
    {
        // Default behaviour is to strip the initial word completely
        super(true);
    }

    /**
     * Create a filter, passing the strip flag through to the superclass.
     *
     * @param stripWord value handed to the superclass constructor
     */
    public MARC21InitialArticleWord(boolean stripWord)
    {
        super(stripWord);
    }

    /**
     * Return the list of definite and indefinite article words
     * for this language.
     *
     * The returned array is the shared, cached instance (ordered longest
     * word first); callers must treat it as read-only.
     *
     * @param lang language identifier, resolved through
     *             <code>Language.getLanguage</code>; may be null or empty
     * @return the configured default words (possibly null) when no language
     *         is given; the article words for the language when it is known;
     *         null otherwise
     */
    protected String[] getArticleWords(String lang)
    {
        // No language - fall back to the configured defaults (may be null)
        if (StringUtils.isEmpty(lang))
        {
            return defaultWords;
        }

        Language l = Language.getLanguage(lang);
        if (l == null)
        {
            return null;
        }

        // Is the language in our map? (the map never holds null values,
        // so a single lookup is sufficient)
        ArticlesForLang articles = (ArticlesForLang) ianaArticleMap.get(l.IANA);
        if (articles == null)
        {
            return null;
        }

        return articles.words;
    }

    // Mapping of IANA codes to article word lists (values are ArticlesForLang)
    private static final Map ianaArticleMap;

    // Words used when no language is supplied; null if none are configured
    private static final String[] defaultWords;

    // Static initialisation - convert word -> languages map
    // into language -> words map
    static
    {
        /* Define a mapping for article words to the languages that have them.
         * Taken from: http://www.loc.gov/marc/bibliographic/bdapp-e.html
         *
         * Each row is: the article word, followed by every language using it.
         */
        Object[][] articleWordArray = {
            { "a",        Language.ENGLISH, Language.GALICIAN, Language.HUNGARIAN, Language.PORTUGUESE, Language.ROMANIAN, Language.SCOTS, Language.YIDDISH },
            { "a'",       Language.SCOTTISH_GAELIC },
            { "al",       Language.ROMANIAN },
            { "al-",      Language.ARABIC, Language.BALUCHI, Language.BRAHUI, Language.PANJABI, Language.PERSIAN, Language.TURKISH, Language.URDU },
            { "am",       Language.SCOTTISH_GAELIC },
            { "an",       Language.ENGLISH, Language.IRISH, Language.SCOTS, Language.SCOTTISH_GAELIC, Language.YIDDISH },
            { "an t-",    Language.IRISH, Language.SCOTTISH_GAELIC },
            { "ane",      Language.SCOTS },
            { "ang",      Language.TAGALOG },
            { "ang mga",  Language.TAGALOG },
            { "as",       Language.GALICIAN, Language.PORTUGUESE },
            { "az",       Language.HUNGARIAN },
            { "bat",      Language.BASQUE },
            { "bir",      Language.TURKISH },
            { "d'",       Language.ENGLISH },
            { "da",       Language.SHETLAND_ENGLISH },
            { "das",      Language.GERMAN },
            { "de",       Language.DANISH, Language.DUTCH, Language.ENGLISH, Language.FRISIAN, Language.NORWEGIAN, Language.SWEDISH },
            { "dei",      Language.NORWEGIAN },
            { "dem",      Language.GERMAN },
            { "den",      Language.DANISH, Language.GERMAN, Language.NORWEGIAN, Language.SWEDISH },
            { "der",      Language.GERMAN, Language.YIDDISH },
            { "des",      Language.GERMAN, Language.WALLOON },
            { "det",      Language.DANISH, Language.NORWEGIAN, Language.SWEDISH },
            { "di",       Language.YIDDISH },
            { "die",      Language.AFRIKAANS, Language.GERMAN, Language.YIDDISH },
            { "dos",      Language.YIDDISH },
            { "e",        Language.NORWEGIAN },
            { "e",        Language.FRISIAN },              // should be 'e - leading apostrophes are ignored
            { "een",      Language.DUTCH },
            { "eene",     Language.DUTCH },
            { "egy",      Language.HUNGARIAN },
            { "ei",       Language.NORWEGIAN },
            { "ein",      Language.GERMAN, Language.NORWEGIAN, Language.WALLOON },
            { "eine",     Language.GERMAN },
            { "einem",    Language.GERMAN },
            { "einen",    Language.GERMAN },
            { "einer",    Language.GERMAN },
            { "eines",    Language.GERMAN },
            { "eit",      Language.NORWEGIAN },
            { "el",       Language.CATALAN, Language.SPANISH },
            { "el-",      Language.ARABIC },
            { "els",      Language.CATALAN },
            { "en",       Language.CATALAN, Language.DANISH, Language.NORWEGIAN, Language.SWEDISH },
            { "enne",     Language.WALLOON },
            { "et",       Language.DANISH, Language.NORWEGIAN },
            { "ett",      Language.SWEDISH },
            { "eyn",      Language.YIDDISH },
            { "eyne",     Language.YIDDISH },
            { "gl'",      Language.ITALIAN },
            { "gli",      Language.PROVENCAL },
            { "ha-",      Language.HEBREW },
            { "hai",      Language.CLASSICAL_GREEK, Language.GREEK },
            { "he",       Language.HAWAIIAN },
            { "h\u0113",  Language.CLASSICAL_GREEK, Language.GREEK },  // e macron
            { "he-",      Language.HEBREW },
            { "heis",     Language.GREEK },
            { "hen",      Language.GREEK },
            { "hena",     Language.GREEK },
            { "henas",    Language.GREEK },
            { "het",      Language.DUTCH },
            { "hin",      Language.ICELANDIC },
            { "hina",     Language.ICELANDIC },
            { "hinar",    Language.ICELANDIC },
            { "hinir",    Language.ICELANDIC },
            { "hinn",     Language.ICELANDIC },
            { "hinna",    Language.ICELANDIC },
            { "hinnar",   Language.ICELANDIC },
            { "hinni",    Language.ICELANDIC },
            { "hins",     Language.ICELANDIC },
            { "hinu",     Language.ICELANDIC },
            { "hinum",    Language.ICELANDIC },
            { "hi\u00f0", Language.ICELANDIC },            // eth (U+00F0); was U+01D2, which is not an Icelandic letter
            { "ho",       Language.CLASSICAL_GREEK, Language.GREEK },
            { "hoi",      Language.CLASSICAL_GREEK, Language.GREEK },
            { "i",        Language.ITALIAN },
            { "ih'",      Language.PROVENCAL },
            { "il",       Language.ITALIAN, Language.PROVENCAL_OCCITAN },
            { "il-",      Language.MALTESE },
            { "in",       Language.FRISIAN },
            { "it",       Language.FRISIAN },
            { "ka",       Language.HAWAIIAN },
            { "ke",       Language.HAWAIIAN },
            { "l'",       Language.CATALAN, Language.FRENCH, Language.ITALIAN, Language.PROVENCAL_OCCITAN, Language.WALLOON },
            { "l-",       Language.MALTESE },
            { "la",       Language.CATALAN, Language.ESPERANTO, Language.FRENCH, Language.ITALIAN, Language.PROVENCAL_OCCITAN, Language.SPANISH },
            { "las",      Language.PROVENCAL_OCCITAN, Language.SPANISH },
            { "le",       Language.FRENCH, Language.ITALIAN, Language.PROVENCAL_OCCITAN },
            { "les",      Language.CATALAN, Language.FRENCH, Language.PROVENCAL_OCCITAN, Language.WALLOON },
            { "lh",       Language.PROVENCAL_OCCITAN },
            { "lhi",      Language.PROVENCAL_OCCITAN },
            { "li",       Language.PROVENCAL_OCCITAN },
            { "lis",      Language.PROVENCAL_OCCITAN },
            { "lo",       Language.ITALIAN, Language.PROVENCAL_OCCITAN, Language.SPANISH },
            { "los",      Language.PROVENCAL_OCCITAN, Language.SPANISH },
            { "lou",      Language.PROVENCAL_OCCITAN },
            { "lu",       Language.PROVENCAL_OCCITAN },
            { "mga",      Language.TAGALOG },
            { "m\u0303ga", Language.TAGALOG },
            { "mia",      Language.GREEK },
            { "n",        Language.AFRIKAANS, Language.DUTCH, Language.FRISIAN },  // should be 'n - leading apostrophes are ignored
            { "na",       Language.HAWAIIAN, Language.IRISH, Language.SCOTTISH_GAELIC },
            { "na h-",    Language.IRISH, Language.SCOTTISH_GAELIC },
            { "nje",      Language.ALBANIAN },
            { "ny",       Language.MALAGASY },
            { "o",        Language.NEAPOLITAN_ITALIAN },   // should be 'o - leading apostrophes are ignored
            { "o",        Language.GALICIAN, Language.HAWAIIAN, Language.PORTUGUESE, Language.ROMANIAN },
            { "os",       Language.PORTUGUESE },
            { "r",        Language.ICELANDIC },            // should be 'r - leading apostrophes are ignored
            { "s",        Language.GERMAN },               // should be 's - leading apostrophes are ignored
            { "sa",       Language.TAGALOG },
            { "sa mga",   Language.TAGALOG },
            { "si",       Language.TAGALOG },
            { "sin\u00e1", Language.TAGALOG },
            { "t",        Language.DUTCH, Language.FRISIAN },  // should be 't - leading apostrophes are ignored
            { "ta",       Language.CLASSICAL_GREEK, Language.GREEK },
            { "tais",     Language.CLASSICAL_GREEK },
            { "tas",      Language.CLASSICAL_GREEK },
            { "t\u0113",  Language.CLASSICAL_GREEK },                   // e macron
            { "t\u0113n", Language.CLASSICAL_GREEK, Language.GREEK },   // e macron
            { "t\u0113s", Language.CLASSICAL_GREEK, Language.GREEK },   // e macron
            { "the",      Language.ENGLISH },
            { "t\u014d",  Language.CLASSICAL_GREEK, Language.GREEK },   // o macron
            { "tois",     Language.CLASSICAL_GREEK },
            { "t\u014dn", Language.CLASSICAL_GREEK, Language.GREEK },   // o macron
            { "tou",      Language.CLASSICAL_GREEK, Language.GREEK },
            { "um",       Language.PORTUGUESE },
            { "uma",      Language.PORTUGUESE },
            { "un",       Language.CATALAN, Language.FRENCH, Language.ITALIAN, Language.PROVENCAL_OCCITAN, Language.ROMANIAN, Language.SPANISH },
            { "un'",      Language.ITALIAN },
            { "una",      Language.CATALAN, Language.ITALIAN, Language.PROVENCAL_OCCITAN, Language.SPANISH },
            { "une",      Language.FRENCH },
            { "unei",     Language.ROMANIAN },
            { "unha",     Language.GALICIAN },
            { "uno",      Language.ITALIAN, Language.PROVENCAL_OCCITAN },
            { "uns",      Language.PROVENCAL_OCCITAN },
            { "unui",     Language.ROMANIAN },
            { "us",       Language.PROVENCAL_OCCITAN },
            { "y",        Language.WELSH },
            { "ye",       Language.ENGLISH },
            { "yr",       Language.WELSH }
        };

        // Initialize the lang -> article map
        ianaArticleMap = buildArticleMap(articleWordArray);

        // Setup default stop words for null languages
        defaultWords = buildDefaultWords(
                ConfigurationManager.getProperty("marc21wordfilter.defaultlang"),
                ianaArticleMap);
    }

    /**
     * Invert the word -> languages table into an IANA code -> words map.
     *
     * Words are grouped by IANA code rather than by Language instance,
     * because the code is what the resulting map is keyed (and queried) by;
     * should two Language constants share a code, their words end up in one
     * list instead of one list replacing the other.
     *
     * @param table rows of { word, language, language, ... }
     * @return map of IANA code to ArticlesForLang, each word array ordered
     *         longest to shortest
     */
    private static Map buildArticleMap(Object[][] table)
    {
        Map wordsByCode = new HashMap();   // IANA code -> List of words
        Map langByCode = new HashMap();    // IANA code -> first Language seen

        for (int row = 0; row < table.length; row++)
        {
            Object word = table[row][0];

            // Column 0 is the word itself; languages start at column 1
            for (int col = 1; col < table[row].length; col++)
            {
                Language lang = (Language) table[row][col];
                if (lang == null || lang.IANA == null || lang.IANA.length() == 0)
                {
                    // Languages without an IANA code cannot be looked up
                    continue;
                }

                List words = (List) wordsByCode.get(lang.IANA);
                if (words == null)
                {
                    words = new ArrayList();
                    wordsByCode.put(lang.IANA, words);
                    langByCode.put(lang.IANA, lang);
                }

                // Add word to list if we haven't done so already
                // (the table repeats some words, e.g. "e" and "o")
                if (!words.contains(word))
                {
                    words.add(word);
                }
            }
        }

        Map articleMap = new HashMap();
        Iterator entries = wordsByCode.entrySet().iterator();
        while (entries.hasNext())
        {
            Map.Entry entry = (Map.Entry) entries.next();
            Object code = entry.getKey();
            List wordList = (List) entry.getValue();

            articleMap.put(code,
                    new ArticlesForLang((Language) langByCode.get(code), toSortedArray(wordList)));
        }

        return articleMap;
    }

    /**
     * Build the word list used when no language is supplied.
     *
     * The words of every configured language are merged into a single list
     * without duplicates and ordered longest to shortest as a whole, so the
     * default list follows the same ordering rule as the per-language lists.
     *
     * @param defaultLangs language identifiers separated by commas and/or
     *                     spaces; may be null or empty
     * @param articleMap   IANA code -> ArticlesForLang map to draw words from
     * @return the merged words, or null if nothing is configured or none of
     *         the configured languages has article words
     */
    private static String[] buildDefaultWords(String defaultLangs, Map articleMap)
    {
        if (StringUtils.isEmpty(defaultLangs))
        {
            return null;
        }

        String[] langArr = defaultLangs.split("[, ]+");
        List merged = new ArrayList();

        for (int idx = 0; idx < langArr.length; idx++)
        {
            Language l = Language.getLanguage(langArr[idx]);
            if (l == null)
            {
                continue;
            }

            ArticlesForLang articles = (ArticlesForLang) articleMap.get(l.IANA);
            if (articles == null)
            {
                continue;
            }

            for (int w = 0; w < articles.words.length; w++)
            {
                if (!merged.contains(articles.words[w]))
                {
                    merged.add(articles.words[w]);
                }
            }
        }

        if (merged.isEmpty())
        {
            return null;
        }

        return toSortedArray(merged);
    }

    /**
     * Convert a list of words into an array ordered longest to shortest.
     * This ensures maximal matching on the article words.
     *
     * @param wordList list of String
     * @return new array holding the words in inverse length order
     */
    private static String[] toSortedArray(List wordList)
    {
        String[] words = (String[]) wordList.toArray(new String[wordList.size()]);
        Arrays.sort(words, new InverseLengthComparator());
        return words;
    }

    // Wrapper class for inserting word arrays into a map
    private static class ArticlesForLang
    {
        final Language lang;
        final String[] words;

        ArticlesForLang(Language lang, String[] words)
        {
            this.lang = lang;
            this.words = words;
        }
    }

    // Compare strings according to their length - longest to shortest
    private static class InverseLengthComparator implements Comparator
    {
        public int compare(Object arg0, Object arg1)
        {
            // String lengths are never negative, so the difference cannot overflow
            return ((String) arg1).length() - ((String) arg0).length();
        }
    }
}