package edu.northwestern.at.utils.corpuslinguistics.tokenizer;

/*  Please see the license information at the end of this file. */

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;

import edu.northwestern.at.utils.*;

/** Abbreviation lists and pattern matchers.
 *
 *  <p>
 *  Holds a list of common abbreviations along with information about
 *  whether each abbreviation can normally end a sentence or not.
 *  Also provides patterns and methods for determining if a string is
 *  a possible abbreviation.
 *  </p>
 *
 *  <p>
 *  The pattern-based checks create a fresh matcher on every call, so
 *  they do not share mutable matcher state between callers.
 *  </p>
 */

public class Abbreviations
{
    /** Compiled regular expression to match an abbreviation.
     *
     *  <p>
     *  The whole string must be one of:
     *  a letter and a period followed by one or more
     *  letter-or-digit-and-period groups (e.g. "U.S."),
     *  a single capital letter and a period,
     *  or a capital letter followed by one or more lowercase letters
     *  from the class <code>[bcdfghj-np-tvxz]</code> and a period
     *  (e.g. "Mr.").
     *  </p>
     */

    protected static Pattern abbreviationPattern =
        Pattern.compile
        (
            "^([A-Za-z]\\.([A-Za-z0-9]\\.)+|[A-Z]\\.|[A-Z][bcdfghj-np-tvxz]+\\.)$"
        );

    /** Abbreviation pattern matcher.
     *
     *  <p>
     *  Retained for source compatibility with subclasses.  The methods
     *  of this class no longer use it because a Matcher is not safe
     *  for use by multiple concurrent threads.
     *  </p>
     */

    protected static Matcher abbreviationMatcher =
        abbreviationPattern.matcher( "" );

    /** Compiled regular expression to match an initial. */

    protected static Pattern initialPattern =
        Pattern.compile( "[A-Z][.]" );

    /** Initial pattern matcher.
     *
     *  <p>
     *  Retained for source compatibility with subclasses; not used by
     *  the methods of this class.
     *  </p>
     */

    protected static Matcher initialMatcher =
        initialPattern.matcher( "" );

    /** Compiled regular expression to match a possessive initial.
     *
     *  <p>
     *  The final character class is <code>[sS]</code>.  A vertical bar
     *  inside a character class is a literal character, not an
     *  alternation, so it must not appear there.
     *  </p>
     */

    protected static Pattern possessiveInitialPattern =
        Pattern.compile( "[A-Z][.]'[sS]" );

    /** Possessive initial pattern matcher.
     *
     *  <p>
     *  Retained for source compatibility with subclasses; not used by
     *  the methods of this class.
     *  </p>
     */

    protected static Matcher possessiveInitialMatcher =
        possessiveInitialPattern.matcher( "" );

    /** Defined abbreviations. */

    protected static UTF8Properties abbreviations = null;

    /** Path to abbreviations list resource. */

    protected final static String defaultAbbreviationsFileName =
        "resources/abbreviations.txt";

    /** Load abbreviations list from resource properties file.
     *
     *  <p>
     *  Each line in the UTF8 abbreviations property file takes
     *  the form:
     *  </p>
     *
     *  <p>
     *  <code>abbrev.=n</code>
     *  </p>
     *
     *  <p>
     *  where a value of 1 for n indicates the abbreviation can normally
     *  end a sentence and a value of 0 for n indicates the abbreviation
     *  normally cannot end a sentence.
     *  </p>
     *
     *  <p>
     *  Loading is best-effort:  if the resource cannot be found or
     *  cannot be read, the abbreviations list is left with whatever
     *  entries were loaded before the failure (possibly none).
     *  </p>
     */

    protected static void loadDefaultAbbreviations()
    {
                                //  If abbreviations already loaded,
                                //  don't load them again.

        if ( abbreviations != null )
        {
            return;
        }
                                //  Create properties object to
                                //  hold abbreviations.

        abbreviations   = new UTF8Properties();

                                //  getResourceAsStream returns null
                                //  when the resource is not found.
                                //  Guard here so the load call never
                                //  receives a null stream while the
                                //  class is being initialized.

        InputStream resourceStream  =
            Abbreviations.class.getResourceAsStream(
                defaultAbbreviationsFileName );

        if ( resourceStream == null )
        {
            return;
        }
                                //  Load abbreviations from resource file.
        try
        {
            abbreviations.load( resourceStream );
        }
        catch ( IOException ioe )
        {
                                //  Deliberately ignored:  default
                                //  abbreviations are optional.
        }
        finally
        {
                                //  Always release the resource stream.
            try
            {
                resourceStream.close();
            }
            catch ( IOException ignored )
            {
                                //  Nothing useful can be done if
                                //  closing fails.
            }
        }
    }

    /** Load abbreviations list from a properties file.
     *
     *  <p>
     *  Each line in the UTF8 abbreviations property file takes
     *  the form:
     *  </p>
     *
     *  <p>
     *  <code>abbrev.=n</code>
     *  </p>
     *
     *  <p>
     *  where a value of 1 for n indicates the abbreviation can normally
     *  end a sentence and a value of 0 for n indicates the abbreviation
     *  normally cannot end a sentence.
     *  </p>
     *
     *  <p>
     *  The current abbreviations list is passed to
     *  <code>UTF8PropertyUtils.loadUTF8Properties</code> and replaced
     *  by the object that call returns.
     *  </p>
     *
     *  @param  abbreviationsURL    Abbreviations URL.
     *
     *  @return                     true if abbreviations loaded OK,
     *                              false if error occurred (including
     *                              a malformed URL).
     */

    public static boolean loadAbbreviations( String abbreviationsURL )
    {
        boolean result  = false;

                                //  Create properties object to
                                //  hold abbreviations if not
                                //  already created.

        if ( abbreviations == null )
        {
            abbreviations   = new UTF8Properties();
        }
                                //  Load abbreviations from file.
        try
        {
            abbreviations   =
                UTF8PropertyUtils.loadUTF8Properties
                (
                    new URL( abbreviationsURL ) ,
                    abbreviations
                );

            result  = true;
        }
        catch ( IOException ioe )
        {
                                //  Failure is reported to the caller
                                //  through the false return value.
        }

        return result;
    }

    /** Checks if string is a known abbreviation.
     *
     *  @param  str     The string to check.
     *
     *  @return         true if "str" is on the known abbreviations
     *                  list; false otherwise, including when "str"
     *                  is null or no list is available.
     */

    public static boolean isKnownAbbreviation( String str )
    {
        return
            ( str != null ) &&
            ( abbreviations != null ) &&
            abbreviations.containsKey( str );
    }

    /** Checks if string is a probable abbreviation.
     *
     *  <p>
     *  A string is declared to be a probable abbreviation if it
     *  appears in the abbreviation list or matches the abbreviation
     *  pattern.
     *  </p>
     *
     *  @param  str     The string to check.
     *
     *  @return         true if "str" is probably an abbreviation;
     *                  false otherwise, including when "str" is null.
     */

    public static boolean isAbbreviation( String str )
    {
        if ( str == null )
        {
            return false;
        }

        return
            isKnownAbbreviation( str ) ||
            abbreviationPattern.matcher( str ).matches();
    }

    /** Checks if string is an abbreviation on which a sentence can end.
     *
     *  <p>
     *  A string is declared to be a probable sentence-ending abbreviation
     *  if it appears in the abbreviation list and it has a
     *  sentence-ending value of 1.
     *  </p>
     *
     *  @param  str     The string to check.
     *
     *  @return         true if "str" is a possible sentence-ending
     *                  abbreviation; false otherwise, including when
     *                  "str" is null or no list is available.
     */

    public static boolean isEOSAbbreviation( String str )
    {
        if ( ( str == null ) || ( abbreviations == null ) )
        {
            return false;
        }

        return "1".equals( abbreviations.getProperty( str , "0" ) );
    }

    /** Checks if string is an initial.
     *
     *  <p>
     *  A string is an initial when it takes the form "L." where
     *  L is a capital letter.
     *  </p>
     *
     *  @param  str     The string to check.
     *
     *  @return         true if "str" is an initial; false otherwise,
     *                  including when "str" is null.
     */

    public static boolean isInitial( String str )
    {
        return
            ( str != null ) &&
            initialPattern.matcher( str ).matches();
    }

    /** Checks if string is a possible possessive initial.
     *
     *  <p>
     *  A string is a possible possessive initial when it takes the form
     *  "L.'s" or "L.'S" where L is a capital letter.
     *  </p>
     *
     *  @param  str     The string to check.
     *
     *  @return         true if "str" is a possible possessive initial;
     *                  false otherwise, including when "str" is null.
     */

    public static boolean isPossessiveInitial( String str )
    {
        return
            ( str != null ) &&
            possessiveInitialPattern.matcher( str ).matches();
    }

    /** Get count of known abbreviations.
     *
     *  @return     Count of known abbreviations, or 0 when no list
     *              is available.
     */

    public static int getAbbreviationsCount()
    {
        return ( abbreviations == null ) ? 0 : abbreviations.size();
    }

    /** Allow overrides but not instantiation.
     */

    protected Abbreviations()
    {
    }

    /** Static initializer loads default abbreviations.
     *
     *  Must stay after the static field declarations above:  static
     *  initializers run in textual order, and the "abbreviations"
     *  field initializer would otherwise reset the loaded list to null.
     */

    static
    {
                                //  Load default abbreviations.

        loadDefaultAbbreviations();
    }
}

/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.

Developed by:
   Academic and Research Technologies
   Northwestern University
   http://www.it.northwestern.edu/about/departments/at/

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

    * Redistributions of source code must retain the above
      copyright notice, this list of conditions and the
      following disclaimers.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the
      following disclaimers in the documentation and/or other
      materials provided with the distribution.

    * Neither the names of Academic and Research Technologies,
      Northwestern University, nor the names of its contributors
      may be used to endorse or promote products derived from
      this Software without specific prior written permission.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT.  IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/