/*
 * AdornedXMLReader.java example
 *
 * Explorer
 * morphadorner-opensource-master
 * - src
 */
package edu.northwestern.at.morphadorner.tools;

/*	Please see the license information at the end of this file. */

import java.io.*;
import java.text.*;
import java.util.*;

import org.xml.sax.*;
import org.xml.sax.helpers.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.sentencemelder.*;
import edu.northwestern.at.utils.csv.*;
import edu.northwestern.at.utils.xml.*;

/**	Reads word tag ({@code <w>}) information from MorphAdorner XML output.
 *
 *	<p>
 *	The XML input is parsed once, in the constructor, through a chain of
 *	SAX filters ending in an {@link ExtendedAdornedWordFilter}.  Almost
 *	every query method of this class simply delegates to that filter.
 *	{@link #generateXML} additionally rebuilds an XML fragment covering
 *	a range of word IDs.
 *	</p>
 */

public class AdornedXMLReader
{
	/**	Column at which completed sentences are line wrapped
	 *	in the output of {@link #generateXML}.
	 */

	private static final int SENTENCE_WRAP_COLUMN	= 65;

	/**	Adorned word information filter.
	 *
	 *	<p>
	 *	Assigned by {@link #readXML} and used by all the query methods.
	 *	</p>
	 */

	protected ExtendedAdornedWordFilter wordInfoFilter;

	/**	Create adorned XML reader.
	 *
	 *	<p>
	 *	The input file is read and parsed immediately by calling
	 *	{@link #readXML}.
	 *	</p>
	 *
	 *	@param	xmlInputFileName	Input XML file name.
	 *
	 *	@throws	org.xml.sax.SAXException
	 *								If the XML input cannot be parsed.
	 *	@throws	java.io.IOException	If the XML input cannot be read.
	 */

	public AdornedXMLReader( String xmlInputFileName )
		throws org.xml.sax.SAXException, java.io.IOException
	{
								//	Read XML and load word information.

		readXML( xmlInputFileName );
	}

	/**	Reads adorned XML.
	 *
	 *	<p>
	 *	Builds the SAX filter chain, stores the resulting word filter in
	 *	{@link #wordInfoFilter}, and parses the named file as UTF-8 text.
	 *	The input file is always closed, whether or not parsing succeeds.
	 *	</p>
	 *
	 *	@param	xmlInputFileName	XML input file name.
	 *
	 *	@throws	org.xml.sax.SAXException
	 *								If the XML input cannot be parsed.
	 *	@throws	java.io.IOException	If the XML input cannot be read.
	 */

	protected void readXML( String xmlInputFileName )
		throws org.xml.sax.SAXException, java.io.IOException
	{
								//	Create XML reader for XML input.

		XMLReader reader		=
			XMLReaderFactory.createXMLReader();

								//	Add filter to XML reader to
								//	add path attributes to XML elements.
								//	The file name, stripped of its
								//	directory and extension, is passed
								//	as the second filter argument.

		AddXMLPathFilter xmlPathFilter	=
			new AddXMLPathFilter
			(
				reader ,
				FileNameUtils.changeFileExtension
				(
					FileNameUtils.stripPathName( xmlInputFileName ) ,
					""
				)
			);
								//	Add filter to XML reader to
								//	pull out "<w>" elements containing
								//	word information.

		wordInfoFilter	=
			new ExtendedAdornedWordFilter( xmlPathFilter , true );

								//	Parse the XML input.
								//
								//	We build the input source ourselves
								//	instead of calling
								//
								//	wordInfoFilter.parse( xmlInputFileName );
								//
								//	because, at least in Java 1.6, the
								//	runtime SAX parser does not open
								//	some file names correctly.

		FileInputStream fileInputStream	=
			new FileInputStream( xmlInputFileName );

		try
		{
			BufferedReader bufferedReader	=
				new BufferedReader
				(
					new UnicodeReader( fileInputStream , "utf-8" )
				);

			wordInfoFilter.parse( new InputSource( bufferedReader ) );

								//	Normal completion:  close the reader
								//	chain and let any close error
								//	propagate to the caller.

			bufferedReader.close();
		}
		finally
		{
								//	Make sure the file handle is released
								//	even when parsing fails.  A close
								//	error here is deliberately ignored so
								//	it cannot mask the exception that
								//	caused the failure.  Closing an
								//	already closed stream has no effect.
			try
			{
				fileInputStream.close();
			}
			catch ( java.io.IOException ignored )
			{
			}
		}
	}

	/**	Return list of adorned word IDs.
	 *
	 *	@return	List of adorned word IDs (strings).
	 */

	public List<String> getAdornedWordIDs()
	{
		return wordInfoFilter.getAdornedWordIDs();
	}

	/**	Get adorned words as a list of sentences.
	 *
	 *	<p>
	 *	This method tries to return sentences in as close to their
	 *	order of appearance in the text as possible.  Sentences from
	 *	intrusive jump tags will generally appear after the text
	 *	section into which they intrude, and so may be dislodged
	 *	an arbitrary distance from their actual position in the text.
	 *	</p>
	 *
	 *	@return		List of sentences. Each sentence is a list of
	 *				ExtendedAdornedWord entries.  Sentences are returned
	 *				in reading context order.  Only one ExtendedAdornedWord
	 *				entry is returned for split words.
	 */

	public List<List<ExtendedAdornedWord>> getSentences()
	{
		return wordInfoFilter.getSentences();
	}

	/**	Get adorned word information for a word ID.
	 *
	 *	@param	id	The String word ID.
	 *
	 *	@return		ExtendedAdornedWord for the specified ID.
	 *				Returns null if ID not found.
	 */

	public ExtendedAdornedWord getExtendedAdornedWord( String id )
	{
		return wordInfoFilter.getExtendedAdornedWord( id );
	}

	/**	Get adorned word information for a word index.
	 *
	 *	@param	index	The word index.
	 *
	 *	@return			ExtendedAdornedWord for the specified word index.
	 *					Returns null if index not found.
	 */

	public ExtendedAdornedWord getExtendedAdornedWord( int index )
	{
		return wordInfoFilter.getExtendedAdornedWord( index );
	}

	/**	Get related adorned word IDs for a word ID of a split word.
	 *
	 *	<p>
	 *	Related word IDs are the word IDs for the other parts of
	 *	a split word.  The returned list includes the
	 *	given wordID.
	 *	</p>
	 *
	 *	<p>
	 *	For unsplit words, the single given wordID is returned
	 *	in the list.
	 *	</p>
	 *
	 *	@param	wordID	Word ID for which related IDs are wanted.
	 *
	 *	@return			List of related word IDs.
	 *					Null is returned when the wordID does not exist.
	 */

	public List<String> getRelatedSplitWordIDs( String wordID )
	{
		return wordInfoFilter.getRelatedSplitWordIDs( wordID );
	}

	/**	Get related adorned words for split words.
	 *
	 *	<p>
	 *	Related words are those corresponding to the parts of
	 *	a split word.  The returned list includes the given word.
	 *	</p>
	 *
	 *	<p>
	 *	For unsplit words, the single given adorned word is returned
	 *	in the list.
	 *	</p>
	 *
	 *	@param	adornedWordInfo		Adorned word for which related words
	 *								are wanted.
	 *
	 *	@return						List of related adorned words.
	 */

	public List<ExtendedAdornedWord> getRelatedSplitWords
	(
		ExtendedAdornedWord adornedWordInfo
	)
	{
		return wordInfoFilter.getRelatedSplitWords( adornedWordInfo );
	}

	/**	Get index for a word ID.
	 *
	 *	@param	id	The String word ID.
	 *
	 *	@return		Index of word in adorned words list.
	 *				Returns -1 if the ID is not found.
	 */

	public int getAdornedWordIndexByID( String id )
	{
		return wordInfoFilter.getAdornedWordIndexByID( id );
	}

	/**	Get sibling words.
	 *
	 *	<p>
	 *	Sibling words have the same parent hard or jump tag.
	 *	</p>
	 *
	 *	@param	wordID	The word ID of the word for which to find siblings.
	 *
	 *	@return			String list of sibling word IDs.
	 *					Empty list if no siblings found.
	 */

	public List<String> getSiblingWordIDs( String wordID )
	{
		return wordInfoFilter.getSiblingWordIDs( wordID );
	}

	/**	Find words whose paths start with a given string.
	 *
	 *	@param	pattern		The pattern to match.
	 *
	 *	@return				String list of words matching the pattern.
	 *						Empty if no words match.
	 */

	public List<String> findWordsByMatchingLeadingPath( String pattern )
	{
		return wordInfoFilter.findWordsByMatchingLeadingPath( pattern );
	}

	/**	Find words matching a specified path regular expression pattern.
	 *
	 *	@param	pattern		The regular expression pattern to match.
	 *
	 *	@return				String list of words matching the pattern.
	 *						Empty if no words match.
	 */

	public List<String> findWordsByMatchingPath( String pattern )
	{
		return wordInfoFilter.findWordsByMatchingPath( pattern );
	}

	/**	Trim tag number from XML tag.
	 *
	 *	@param	tag		XML tag to trim.
	 *
	 *	@return			Trimmed tag.
	 */

	public String trimTag( String tag )
	{
		return wordInfoFilter.trimTag( tag );
	}

	/**	Split word path into separate tags.
	 *
	 *	@param	path	The word path.
	 *
	 *	@return			String array of split XML tags.
	 *					The trailing "w[]" element is removed.
	 */

	public String[] splitPathFull( String path )
	{
		return wordInfoFilter.splitPathFull( path );
	}

	/**	Split word path into separate tags.
	 *
	 *	@param	path	The word path.
	 *
	 *	@return			String array of split XML tags.
	 *					Both the leading document name tag and
	 *					the trailing word (w[]) tag are removed.
	 */

	public String[] splitPath( String path )
	{
		return wordInfoFilter.splitPath( path );
	}

	/**	Get list of selected word IDs from specified ID range.
	 *
	 *	@param	startingWordID		Starting word ID.
	 *	@param	endingWordID		Ending word ID.
	 *
	 *	@return						String list of selected word IDs.
	 */

	public List<String> getSelectedWordIDs
	(
		String startingWordID ,
		String endingWordID
	)
	{
		return wordInfoFilter.getSelectedWordIDs(
			startingWordID , endingWordID );
	}

	/**	Generate XML for selected word IDs.
	 *
	 *	<p>
	 *	The word paths of the selected words are used to decide which
	 *	XML elements to open and close between consecutive words.  Word
	 *	spellings are joined into sentences by a sentence melder.  Each
	 *	sentence that ends inside the selection is line wrapped and
	 *	followed by a line separator.  Elements still open after the
	 *	last word are closed at the end of the output.
	 *	</p>
	 *
	 *	@param	startingWordID		Starting word ID.
	 *	@param	endingWordID		Ending word ID.
	 *
	 *	@return						Reconstituted XML encompassing
	 *								the selected word IDs.  Empty string
	 *								if one or both of the word IDs
	 *								is invalid or out of order.
	 */

	public String generateXML
	(
		String startingWordID ,
		String endingWordID
	)
	{
								//	Get list of words from starting
								//	word ID through ending word ID.

		List<String> selectedWordIDs	=
			getSelectedWordIDs( startingWordID , endingWordID );

								//	If no words found, return an empty
								//	string as the result.

		if ( ( selectedWordIDs == null ) || selectedWordIDs.isEmpty() )
		{
			return "";
		}
								//	Holds generated XML.  Only used by
								//	this call, so no synchronized
								//	buffer is needed.

		StringBuilder result	= new StringBuilder();

								//	Stack to hold XML elements in progress.
								//	The innermost open element is first.

		Deque<String> tagStack	= new ArrayDeque<String>();

								//	Sentence melder to reconstitute
								//	spellings into readable sentences.

		MySentenceMelder melder	= new MySentenceMelder();

		melder.startSentence();

								//	Tag classifier to differentiate
								//	hard, soft, and jump tags.

		XMLTagClassifier tagClassifier	= new TEITagClassifier();

								//	Get the first selected word.

		ExtendedAdornedWord word	=
			getExtendedAdornedWord( selectedWordIDs.get( 0 ) );

		String spelling	= word.getSpelling();

								//	Split word path into constituent
								//	XML tags.  We ignore the leading
								//	document name and word number.

		String[] tags	= splitPath( word.getPath() );

								//	Open every element on the first
								//	word's path and remember it on the
								//	XML tag stack.

		for ( int i = 0 ; i < tags.length ; i++ )
		{
			String tag	= trimTag( tags[ i ] );

			outputTag( tag , true , melder , tagClassifier );

			tagStack.addFirst( tag );
		}
								//	Add the first word to the sentence
								//	output.

		melder.processWord( spelling );

								//	The first word gets the same
								//	end-of-sentence treatment as every
								//	later word.  Otherwise a selection
								//	starting on a sentence-final word
								//	would run that sentence into the
								//	next one.

		boolean firstWordInSentence	=
			flushSentenceIfEnded( word , melder , result );

								//	Process the remaining words in the
								//	range of selected words.

		for ( int i = 1 ; i < selectedWordIDs.size() ; i++ )
		{
			word		= getExtendedAdornedWord( selectedWordIDs.get( i ) );
			spelling	= word.getSpelling();

								//	Output whitespace if we need to.

			if ( melder.shouldOutputBlank( spelling , firstWordInSentence ) )
			{
				melder.outputBlank();
			}
								//	Keep the previous word's XML tags.
								//	We need them to determine which
								//	tags change from word to word in order
								//	to emit proper opening and closing
								//	XML tags.

			String[] prevTags	= tags;

			tags				= splitPath( word.getPath() );

								//	Find where this path and the
								//	previous path diverge.

			int divergeAt		= findDivergence( prevTags , tags );

								//	If the paths differ ..

			if ( divergeAt != -1 )
			{
								//	Close the stacked elements beyond
								//	the point of difference, innermost
								//	first.

				for ( int j = divergeAt ; j < prevTags.length ; j++ )
				{
					outputTag(
						tagStack.removeFirst() , false , melder ,
						tagClassifier );
				}
								//	Open the new elements and add them
								//	to the stack.

				for ( int j = divergeAt ; j < tags.length ; j++ )
				{
					String tag	= trimTag( tags[ j ] );

					outputTag( tag , true , melder , tagClassifier );

					tagStack.addFirst( tag );
				}
			}
								//	Emit current word.

			melder.processWord( spelling );

								//	If this is the last word in the
								//	sentence, end the sentence and
								//	emit an end-of-line.  The next word
								//	is then the first word of a sentence.

			firstWordInSentence	=
				flushSentenceIfEnded( word , melder , result );
		}
								//	Emit whatever sentence text is
								//	still pending.  This final piece is
								//	not line wrapped.

		result.append( melder.endSentence() );
		result.append( Env.LINE_SEPARATOR );

								//	Emit any remaining stacked tags.

		while ( !tagStack.isEmpty() )
		{
			String tag	= tagStack.removeFirst();

			result.append( "</" + tag + ">" );

			if ( !tagClassifier.isSoftTag( tag ) )
			{
				result.append( Env.LINE_SEPARATOR );
			}
		}
								//	Return the generated XML as a string.

		return result.toString();
	}

	/**	Find the first position at which two tag paths differ.
	 *
	 *	@param	prevTags	Path tags of the previous word.
	 *	@param	tags		Path tags of the current word.
	 *
	 *	@return				Index of the first differing tag.  When one
	 *						path is a proper prefix of the other, the
	 *						length of the shorter path.  -1 when both
	 *						paths are identical.
	 */

	private static int findDivergence( String[] prevTags , String[] tags )
	{
		int shared	= Math.min( tags.length , prevTags.length );

		for ( int j = 0 ; j < shared ; j++ )
		{
			if ( !tags[ j ].equals( prevTags[ j ] ) )
			{
				return j;
			}
		}
								//	Paths agree over the shorter one.
								//	They still differ if the lengths
								//	are unequal.

		return ( prevTags.length != tags.length ) ? shared : -1;
	}

	/**	End the current sentence if the given word is marked as
	 *	ending a sentence.
	 *
	 *	<p>
	 *	When the word ends a sentence, the melded sentence text is line
	 *	wrapped, appended to the result followed by a line separator,
	 *	and the melder is reset for the next sentence.
	 *	</p>
	 *
	 *	@param	word	The word just added to the melder.
	 *	@param	melder	Sentence melder holding the pending sentence.
	 *	@param	result	Buffer receiving completed sentences.
	 *
	 *	@return			True if the sentence was ended, meaning the next
	 *					word starts a new sentence; false otherwise.
	 */

	private boolean flushSentenceIfEnded
	(
		ExtendedAdornedWord word ,
		MySentenceMelder melder ,
		StringBuilder result
	)
	{
		if ( !word.getEOS() )
		{
			return false;
		}
								//	Line wrap the sentence text.
		result.append
		(
			StringUtils.wrapLine
			(
				melder.endSentence() ,
				Env.LINE_SEPARATOR ,
				SENTENCE_WRAP_COLUMN
			)
		);
								//	Add end of line.

		result.append( Env.LINE_SEPARATOR );

								//	Reset the sentence melder.

		melder.startSentence();

		return true;
	}

	/**	Output an XML tag.
	 *
	 *	<p>
	 *	The tag text is written through the sentence melder so that it
	 *	stays in sequence with the word text.
	 *	</p>
	 *
	 *	@param	tag				The XML tag to output.
	 *	@param	openingTag		True to generate opening tag, false
	 *							to generate closing tag.
	 *	@param	melder			XML sentence melder.
	 *	@param	tagClassifier	XML tag classifier.
	 */

	protected void outputTag
	(
		String tag ,
		boolean openingTag ,
		MySentenceMelder melder ,
		XMLTagClassifier tagClassifier
	)
	{
								//	Output the tag opening.
		if ( openingTag )
		{
			melder.outputWord( "<" );
		}
		else
		{
			melder.outputWord( "</" );
		}
								//	Output tag text and closing.

		melder.outputWord( tag + ">" );

								//	Output a line separator except for
								//	soft tags.

		if ( !tagClassifier.isSoftTag( tag ) )
		{
			melder.outputWord( Env.LINE_SEPARATOR );
		}
	}

	/**	Custom sentence melder.
	 *
	 *	<p>
	 *	Provides a public {@code outputWord} that forwards to the
	 *	superclass, which {@link #outputTag} uses to write tag text
	 *	directly into the melded output.
	 *	</p>
	 */

	public class MySentenceMelder extends SentenceMelder
	{
		/**	Output text to the melded sentence.
		 *
		 *	@param	word	The text to output.
		 */

		public void outputWord( String word )
		{
			super.outputWord( word );
		}
	}
}

/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.

Developed by:
   Academic and Research Technologies
   Northwestern University
   http://www.it.northwestern.edu/about/departments/at/

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimers.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimers in the documentation and/or other materials provided
      with the distribution.

    * Neither the names of Academic and Research Technologies,
      Northwestern University, nor the names of its contributors may be
      used to endorse or promote products derived from this Software
      without specific prior written permission.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/