TagDiff.java example

Explorer
morphadorner-opensource-master
- src
package edu.northwestern.at.morphadorner.tools.tagdiff;

/*	Please see the license information at the end of this file. */

import java.net.*;
import java.io.*;
import java.text.*;
import java.util.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.math.*;

/**	Compares two tagged files and reports discrepancies.
 */

public class TagDiff
{
	/**	Map of incorrect tags and counts.
	 *
	 *	<p>
	 *	Keyed by a confusion description of the form
	 *	"<i>generated tag</i> instead of <i>expected tag</i>"; each value
	 *	accumulates how many times that particular confusion occurred.
	 *	Static, so entries accumulate across calls within one JVM.
	 *	</p>
	 */

	protected static Map<String,TagCount> incorrectTags	=
		MapFactory.createNewMap();

	/**	Map of tags along with correct and incorrect counts.
	 *
	 *	<p>
	 *	Keyed by the tag taken from the first ("correct") file; each value
	 *	accumulates how often that tag was matched or mismatched by the
	 *	second file.  Static, so entries accumulate across calls within
	 *	one JVM.
	 *	</p>
	 */

	protected static Map<String,TagCount> tagCounts		=
		MapFactory.createNewMap();

	/**	Main program.
	 *
	 *	<p>
	 *	Compares two tagged files and writes a report of tagging
	 *	discrepancies to standard output.
	 *	</p>
	 *
	 *	@param	args	Command line arguments, in order:
	 *					<ol>
	 *					<li>name of the first ("correctly") tagged file</li>
	 *					<li>zero-based tag column index in the first file</li>
	 *					<li>name of the second tagged file</li>
	 *					<li>zero-based tag column index in the second file</li>
	 *					</ol>
	 */

	public static void main( String[] args )
	{
								//	Report missing arguments with a
								//	usage line rather than an
								//	array index stack trace.

		if ( ( args == null ) || ( args.length < 4 ) )
		{
			System.err.println(
				"Usage: TagDiff taggedFile1 tagColumn1 " +
				"taggedFile2 tagColumn2" );

			return;
		}

		try
		{
			compareTaggedTexts
			(
				args[ 0 ] ,
				Integer.parseInt( args[ 1 ] ) ,
				args[ 2 ] ,
				Integer.parseInt( args[ 3 ] )
			);
		}
		catch ( Exception e )
		{
								//	Top-level boundary: report any
								//	failure (bad column number,
								//	unreadable file, ...) and return.

			e.printStackTrace();
		}
	}

	/**	Add correct/incorrect counts to the entry for a tag.
	 *
	 *	<p>
	 *	If the map has no entry for the tag yet, a new one is created,
	 *	given the requested comparison type, and stored in the map before
	 *	the counts are applied.  The comparison type of an existing entry
	 *	is left untouched.
	 *	</p>
	 *
	 *	@param	map				The map to update.
	 *	@param	tag				The tag whose counts should be updated.
	 *	@param	correct			# of correct tags to add.
	 *	@param	incorrect		# of incorrect tags to add.
	 *	@param	comparisonType	Comparison type assigned to a newly
	 *							created entry.
	 */

	protected static void updateTagCount
	(
		Map<String,TagCount> map ,
		String tag ,
		int correct ,
		int incorrect ,
		int comparisonType
	)
	{
		TagCount existing	= map.get( tag );

								//	Known tag: just bump its counts.

		if ( existing != null )
		{
			existing.update( correct , incorrect );
			return;
		}
								//	First sighting of this tag:
								//	register a fresh entry, then
								//	apply the counts to it.

		TagCount created	= new TagCount( tag );

		created.setComparisonType( comparisonType );
		map.put( tag , created );

		created.update( correct , incorrect );
	}

	/**	Compare expected and generated tags.
	 *
	 *	<p>
	 *	Both files are read in parallel, one word per non-empty line, with
	 *	tab-separated columns.  The first column is the word spelling; the
	 *	tag is taken from the requested column.  Empty lines are skipped
	 *	independently in each file.  Tags are compared ignoring case, and
	 *	a word that is punctuation always counts as a match.  A report of
	 *	the confused tags and overall totals is written to standard output
	 *	in UTF-8.
	 *	</p>
	 *
	 *	<p>
	 *	If the words on corresponding lines differ, a message is written to
	 *	standard error and the JVM exits with status 1.  Reading stops at
	 *	the end of the shorter file.
	 *	</p>
	 *
	 *	@param	taggedFile1		The "correctly" tagged file.
	 *	@param	tagColFile1		Zero-based column index of the tag in the
	 *							first file.
	 *	@param	taggedFile2		The "incorrectly" tagged file.
	 *	@param	tagColFile2		Zero-based column index of the tag in the
	 *							second file.
	 *
	 *	@throws	MalformedURLException
	 *							Retained in the signature for existing
	 *							callers.
	 *	@throws	IOException		If a file cannot be read, or a line does
	 *							not contain the requested tag column.
	 */

	protected static void compareTaggedTexts
	(
		String taggedFile1 ,
		int tagColFile1 ,
		String taggedFile2 ,
		int tagColFile2
	)
		throws MalformedURLException, IOException
	{
								//	Total number of matches and
								//	mismatches.
		int matches		= 0;
		int mismatches	= 0;

		BufferedReader tagged1Reader	= null;
		BufferedReader tagged2Reader	= null;

		try
		{
								//	Open the tagged files.

			tagged1Reader	=
				new BufferedReader(
					new UnicodeReader(
						new FileInputStream( taggedFile1 ) , "utf-8" ) );

			tagged2Reader	=
				new BufferedReader(
					new UnicodeReader(
						new FileInputStream( taggedFile2 ) , "utf-8" ) );

								//	Read first line from each
								//	tagged file.

			String line1	= tagged1Reader.readLine();
			String line2	= tagged2Reader.readLine();

								//	Count of lines read from each
								//	tagged file.
			int lineCount1	= 1;
			int lineCount2	= 1;

								//	Loop over both tagged files.
								//	They must contain the same
								//	number of words in the same order.

			while ( ( line1 != null ) && ( line2 != null ) )
			{
								//	Ignore empty lines.

				line1	= line1.trim();
				line2	= line2.trim();

				boolean emptyline1	= ( line1.length() == 0 );
				boolean emptyline2	= ( line2.length() == 0 );

				if ( !emptyline1 && !emptyline2 )
				{
								//	Make sure the current word
								//	is the same in both files.

					String[] tokens1	= line1.split( "\t" );
					String[] tokens2	= line2.split( "\t" );

					if ( !tokens1[ 0 ].equals( tokens2[ 0 ] ) )
					{
						System.err.println(
							"Mismatched words " +
							tokens1[ 0 ] + " and " +
							tokens2[ 0 ] + " at line " + lineCount1 +
							" in first tagged file and line " +
							lineCount2 + " in second tagged file." );

						System.err.flush();

						System.exit( 1 );
					}
								//	Pick up the tags, failing with
								//	a readable message when a line
								//	is too short.

					String tag1	=
						tagColumn(
							tokens1 , tagColFile1 , taggedFile1 ,
							lineCount1 );

					String tag2	=
						tagColumn(
							tokens2 , tagColFile2 , taggedFile2 ,
							lineCount2 );

								//	See if the two tags match.
								//	If not, generate
								//	a confusion matrix entry.

					if	(	tag1.equalsIgnoreCase( tag2 ) ||
							CharUtils.isPunctuation( tokens1[ 0 ] )
						)
					{
						matches++;

						updateTagCount( tagCounts , tag1 , 1 , 0 , 3 );
					}
					else
					{
						mismatches++;

						updateTagCount( tagCounts , tag1 , 0 , 1 , 3 );

						String badTags	= tag2 + " instead of " + tag1;

						updateTagCount(
							incorrectTags , badTags , 0 , 1 , 2 );
					}
				}
								//	Read the next line from each
								//	tagged file.  A file is held back
								//	only while the other file is
								//	skipping an empty line.  When both
								//	lines are empty, both files must
								//	advance or the loop never ends.

				if ( emptyline1 || !emptyline2 )
				{
					line1	= tagged1Reader.readLine();
					lineCount1++;
				}

				if ( emptyline2 || !emptyline1 )
				{
					line2	= tagged2Reader.readLine();
					lineCount2++;
				}
			}
		}
		finally
		{
								//	Close the input files, even when
								//	reading failed part way through.
			try
			{
				if ( tagged1Reader != null ) tagged1Reader.close();
			}
			finally
			{
				if ( tagged2Reader != null ) tagged2Reader.close();
			}
		}
								//	Sort the confusion matrix entries
								//	into descending order by error count.

		SortedArrayList<TagCount> incorrectTagCounts	=
			new SortedArrayList<TagCount>();

		for ( TagCount confusion : incorrectTags.values() )
		{
			incorrectTagCounts.add( confusion );
		}
								//	Total number of words is the
								//	number of matches plus the number
								//	of mismatches.

		int wordCount	= matches + mismatches;

								//	Open the output to which to write
								//	the results.

		OutputStreamWriter outWriter	=
			new OutputStreamWriter(
				new BufferedOutputStream( System.out ) , "utf-8" );

								//	Display the confusion matrix
								//	linearly.  Each output line
								//	contains the error count,
								//	error %, and the tags confused.

		outWriter.write( Env.LINE_SEPARATOR );
		outWriter.write( Env.LINE_SEPARATOR );
		outWriter.write( "Counts of tagging errors." );
		outWriter.write( Env.LINE_SEPARATOR );
		outWriter.write( Env.LINE_SEPARATOR );
		outWriter.write( "            Pct." );
		outWriter.write( Env.LINE_SEPARATOR );
		outWriter.write( "     Count  Error   Tags confused" );
		outWriter.write( Env.LINE_SEPARATOR );
		outWriter.write( Env.LINE_SEPARATOR );

		for ( int i = 0 ; i < incorrectTagCounts.size() ; i++ )
		{
			TagCount tagCount	= (TagCount)incorrectTagCounts.get( i );

			if ( tagCount.incorrect == 0 ) continue;

			outWriter.write(
				StringUtils.lpad
				(
					Formatters.formatIntegerWithCommas(
						tagCount.incorrect ) ,
					10
				) );

								//	Share of all mismatches caused
								//	by this particular confusion.

			double pctOfError	=
				roundedPercent( tagCount.incorrect , mismatches );

			outWriter.write(
				StringUtils.lpad
				(
					Formatters.formatDouble( pctOfError , 1 ) ,
					5
				) );

			outWriter.write( "%    " );

			outWriter.write( tagCount.tag );

			outWriter.write( Env.LINE_SEPARATOR );
		}

		double pctMatched		= roundedPercent( matches , wordCount );
		double pctNotMatched	= roundedPercent( mismatches , wordCount );

		outWriter.write( Env.LINE_SEPARATOR );

		outWriter.write(
			"Total number of words    : " + wordCount );

		outWriter.write( Env.LINE_SEPARATOR );

		outWriter.write(
			"Correctly tagged words   : " +
			matches + " (" + pctMatched + "%)" );

		outWriter.write( Env.LINE_SEPARATOR );

		outWriter.write(
			"Incorrectly tagged words : " +
			mismatches + " (" + pctNotMatched + "%)" );

		outWriter.write( Env.LINE_SEPARATOR );

								//	Flush only.  Closing the writer
								//	would close System.out for the
								//	rest of the JVM.

		outWriter.flush();
	}

	/**	Get the tag column from a split line.
	 *
	 *	@param	tokens		The tab-separated columns of the line.
	 *	@param	tagCol		Zero-based index of the tag column.
	 *	@param	fileName	Name of the file the line came from
	 *						(used in the error message only).
	 *	@param	lineNumber	Line number within that file
	 *						(used in the error message only).
	 *
	 *	@return				The tag found in the requested column.
	 *
	 *	@throws	IOException	If the line has no such column.
	 */

	private static String tagColumn
	(
		String[] tokens ,
		int tagCol ,
		String fileName ,
		int lineNumber
	)
		throws IOException
	{
		if ( ( tagCol < 0 ) || ( tagCol >= tokens.length ) )
		{
			throw new IOException(
				"Line " + lineNumber + " of " + fileName +
				" has " + tokens.length +
				" tab-separated column(s); tag column " + tagCol +
				" is missing." );
		}

		return tokens[ tagCol ];
	}

	/**	Express a count as a percentage of a total, to one decimal place.
	 *
	 *	@param	part	The count.
	 *	@param	whole	The total.
	 *
	 *	@return			100 * part / whole rounded to one decimal place,
	 *					or 0 when the total is zero.
	 */

	private static double roundedPercent( int part , int whole )
	{
								//	Explicit zero guard rather than
								//	relying on Math.round( NaN ) == 0.

		if ( whole == 0 ) return 0.0D;

		double fraction	= (double)part / whole;

		return Math.round( fraction * 1000.0D ) / 10.0D;
	}

	/**	Allow overrides but not instantiation.
	 *
	 *	<p>
	 *	The constructor is protected and does nothing: subclasses can
	 *	call it, while unrelated code outside this package cannot
	 *	construct a TagDiff directly.
	 *	</p>
	 */

	protected TagDiff()
	{
	}

	/**	Class to hold counts of correct and incorrect tags.
	 *
	 *	<p>
	 *	Instances order themselves according to {@link #comparisonType},
	 *	so that a sorted collection of them can list tags alphabetically
	 *	or by descending correct count, incorrect count, or error rate.
	 *	</p>
	 */

	static class TagCount implements Comparable
	{
		/**	Smallest valid comparison type. */

		private static final int MIN_COMPARISON_TYPE	= 0;

		/**	Largest valid comparison type. */

		private static final int MAX_COMPARISON_TYPE	= 3;

		/**	The tag string.
		 */

		public String tag;

		/**	Correctly tagged count.
		 */

		public int correct;

		/**	Incorrectly tagged count.
		 */

		public int incorrect;

		/**	Comparison type.
		 *
		 * 	= 0: compare tags
		 *	= 1: compare correct counts
		 *	= 2: compare incorrect counts
		 *	= 3: compare incorrect percent
		 */

		public int comparisonType;

		/**	Create tag count object with zero counts.
		 *
		 *	@param	tag		The tag string.
		 */

		public TagCount( String tag )
		{
			this( tag , 0 , 0 );
		}

		/**	Create tag count object with given counts.
		 *
		 *	<p>
		 *	The comparison type starts out as 0 (compare tags).
		 *	</p>
		 *
		 *	@param	tag			The tag string.
		 *	@param	correct		Correct tag count.
		 *	@param	incorrect	Incorrect tag count.
		 */

		public TagCount( String tag , int correct , int incorrect )
		{
			this.tag			= tag;
			this.correct		= correct;
			this.incorrect		= incorrect;
			this.comparisonType	= MIN_COMPARISON_TYPE;
		}

		/**	Set comparison type.
		 *
		 *	@param	comparisonType	The comparison type (0 through 3, see
		 *							{@link #comparisonType}).  Values
		 *							outside that range are clamped to the
		 *							nearest valid value.
		 */

		public void setComparisonType( int comparisonType )
		{
								//	The upper bound must be 3.
								//	Clamping at 2 silently turned
								//	"compare incorrect percent" into
								//	"compare incorrect counts".

			this.comparisonType	=
				Math.min(
					Math.max( comparisonType , MIN_COMPARISON_TYPE ) ,
					MAX_COMPARISON_TYPE );
		}

		/**	Update counts.
		 *
		 *	@param	correct		Number of correct entries to add.
		 *	@param	incorrect	Number of incorrect entries to add.
		 */

		public void update( int correct , int incorrect )
		{
			this.correct	+= correct;
			this.incorrect	+= incorrect;
		}

		/**	Get proportion of incorrect tags.
		 *
		 *	@return		incorrect / ( correct + incorrect ) as a fraction
		 *				between 0 and 1 (not multiplied by 100), or 0
		 *				when both counts are zero.
		 */

		public double percentageIncorrect()
		{
			int total	= correct + incorrect;

								//	Avoid 0/0 = NaN for an entry that
								//	has not been updated yet.

			if ( total == 0 ) return 0.0D;

			return (double)incorrect / total;
		}

		/**	Convert to string.
		 *
		 *	@return		The tag, correct count and incorrect count,
		 *				separated by single blanks.
		 */

		public String toString()
		{
			return tag + " " + correct + " " + incorrect;
		}

		/**	Compare this object to another.
		 *
		 *	@param	object	Other object.
		 *
		 *	@return			< 0 if this object sorts before the other,
		 *					= 0 if the two objects sort equally,
		 *					> 0 if this object sorts after the other.
		 *					Integer.MIN_VALUE if the other object is null
		 *					or not a TagCount.
		 *
		 *	<p>
		 *	The ordering depends on this object's comparison type:
		 *	by tag (type 0), or by descending correct count (type 1),
		 *	descending incorrect count (type 2) or descending incorrect
		 *	proportion (type 3), with ties broken by tag.
		 *	</p>
		 */

		public int compareTo( Object object )
		{
								//	instanceof is false for null, so
								//	this covers both bad cases.

			if ( !( object instanceof TagCount ) )
			{
				return Integer.MIN_VALUE;
			}

			TagCount otherTagCount	= (TagCount)object;

			int result;

								//	Primary key.  Negated so that
								//	larger counts sort first.

			switch ( comparisonType )
			{
				case 1		:
					result	=
						-Compare.compare(
							correct , otherTagCount.correct );
					break;

				case 2		:
					result	=
						-Compare.compare(
							incorrect , otherTagCount.incorrect );
					break;

				case 3		:
					result	=
						-Compare.compare(
							percentageIncorrect() ,
							otherTagCount.percentageIncorrect() );
					break;

				default		:
					result	= 0;
			}
								//	Tie-break (or the sole key for
								//	type 0): the tag itself.

			if ( result == 0 )
			{
				result	= Compare.compare( tag , otherTagCount.tag );
			}

			return result;
		}
	}
}

/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.

Developed by:
   Academic and Research Technologies
   Northwestern University
   http://www.it.northwestern.edu/about/departments/at/

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimers.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimers in the documentation and/or other materials provided
      with the distribution.

    * Neither the names of Academic and Research Technologies,
      Northwestern University, nor the names of its contributors may be
      used to endorse or promote products derived from this Software
      without specific prior written permission.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/