package edu.northwestern.at.morphadorner.tools.mergespellingdata; /* Please see the license information at the end of this file. */ import java.io.*; import java.util.*; import edu.northwestern.at.utils.*; import edu.northwestern.at.utils.corpuslinguistics.outputter.*; /** Merges multiple files of altenate spellings into one big file. * * <p> * Usage: * </p> * * <p> * java edu.northwestern.at.morphadorner.tools.mergespellingdata.MergeSpellingData output.tab input.tab input2.tab ...<br /> * <br /> * output.tab -- output merged word spelling data file.<br /> * input*.tab -- input tab-delimited files containing spelling maps to be merged.<br /> * </p> * * <p> * Each input spelling map is a utf-8 file containing two fields * separated by a tab character. The first field is a variant * spelling. The second field is the standardized spelling * for the variant. * </p> * * <p> * The output file is a utf-8 text file containing the merged spelling * maps from the input files. When a given variant appears more * than once with different standardized spellings in the input * files, the last mapping encountered is the one written to the * output file. * </p> */ public class MergeSpellingData { /** Main program for merge spelling data. */ public static void main( String[] args ) { try { mergeSpellingData( args ); } catch ( Exception e ) { e.printStackTrace(); } } /** Check if a string represents a database null value. * * @param s String to check for null value. * * @return true if string is null. */ protected static boolean isDBNull( String s ) { return ( s == null ) || s.equals( "\\N" ) || s.equals( "NULL" ); } /** Merge the spelling data. */ protected static void mergeSpellingData( String[] args ) throws IOException { // Get the file to check for non-standard // spellings. if ( args.length == 0 ) { System.out.println( "Usage: MergeSpellingData " + "combinedoutput spellinginput1 spellinginput2 ..." ); System.out.println( "" ); System.out.println( " -- combinedoutput is name of " + "file to received combined " + "alternate/standard spellings" ); System.out.println( " -- spellinginput1 ... are names of " + " files containing alternative spellings " + "mapped to standard spellings." ); System.exit( 1 ); } // Get output file name. String spellingDataOutputFileName = args[ 0 ]; // Create combined map of alternate // spellings to standard spellings from // each input file. Map<String, String> alternateSpellings = new TreeMap<String, String>(); Set<String> standardSpellings = SetFactory.createNewSet(); for ( int i = 1 ; i < args.length ; i++ ) { String altSpellingsFileName = args[ i ]; try { getAlternateSpellings ( new BufferedReader ( new UnicodeReader ( new FileInputStream( altSpellingsFileName ) , "utf-8" ) ) , alternateSpellings , standardSpellings ); System.out.println( "Merged alternate spellings from " + altSpellingsFileName ); } catch ( Exception e ) { e.printStackTrace(); System.out.println( "Unable to load alternate spellings from " + altSpellingsFileName + "." ); System.exit( 1 ); } } System.out.println( "There are " + alternateSpellings.size() + " alternate spellings." ); System.out.println( "There are " + standardSpellings.size() + " standard spellings." ); // Output the map of alternate to // standard spellings. AdornedWordOutputter outputter = null; try { outputter = new PrintStreamAdornedWordOutputter(); outputter.createOutputFile ( spellingDataOutputFileName , "utf-8" , '\t' ); } catch ( Exception e ) { e.printStackTrace(); System.out.println( "Unable to open output file " + spellingDataOutputFileName + " ." ); System.exit( 1 ); } for ( String alternateSpelling : alternateSpellings.keySet() ) { String standardSpelling = alternateSpellings.get( alternateSpelling ); outputter.outputWordAndAdornments ( new String[]{ alternateSpelling , standardSpelling } ); } // Close output file. outputter.close(); } /** Get map of alternative : canonical spelling pairs from a reader. * * @param reader The reader. */ public static void getAlternateSpellings ( Reader reader , Map<String, String> map , Set<String> set ) throws IOException { String[] tokens; BufferedReader bufferedReader = new BufferedReader( reader ); String inputLine = bufferedReader.readLine(); String alternateSpelling; String standardSpelling; while ( inputLine != null ) { tokens = inputLine.split( "\t" ); if ( tokens.length > 1 ) { alternateSpelling = tokens[ 0 ]; standardSpelling = tokens[ 1 ]; if ( !isDBNull( standardSpelling ) ) { alternateSpelling = alternateSpelling.trim(); standardSpelling = standardSpelling.trim(); if ( map.get( alternateSpelling ) == null ) { if ( !alternateSpelling.endsWith( "-" ) ) { map.put( alternateSpelling , standardSpelling ); set.add( standardSpelling ); } } } } else { System.out.println( "Skipping line = <" + inputLine + ">" ); } inputLine = bufferedReader.readLine(); } bufferedReader.close(); } /** Allow overrides but not instantiation. */ protected MergeSpellingData() { } } /* Copyright (c) 2008, 2009 by Northwestern University. All rights reserved. Developed by: Academic and Research Technologies Northwestern University http://www.it.northwestern.edu/about/departments/at/ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. * Neither the names of Academic and Research Technologies, Northwestern University, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. */