XGParser.java example

Explorer
morphadorner-opensource-master
- src
package edu.northwestern.at.morphadorner.xgtagger;

/*	Please see the license information in the header below. */

/** XGTagger
 *
 *	Copyright Ecole Nationale Superieure des Mines de Saint-Etienne
 *
 *	Original authors: Aude Garnier and Xavier Tannier.
 *
 *	Modifications by Philip R. "Pib" Burns at Northwestern University
 *	for integration into MorphAdorner.
 *
 *	Please DO NOT address questions about this modified version to the
 *	original authors.
 *
 *	This software is a computer program whose purpose is to provide
 *	a generic interface to deal with and analyse any XML textual content.
 *
 *	This software is governed by the CeCILL  license under French law and
 *	abiding by the rules of distribution of free software.	You can  use,
 *	modify and/ or redistribute the software under the terms of the CeCILL
 *	license as circulated by CEA, CNRS and INRIA at the following URL
 *	"http://www.cecill.info".
 *
 *	As a counterpart to the access to the source code and  rights to copy,
 *	modify and redistribute granted by the license, users are provided only
 *	with a limited warranty  and the software's author,  the holder of the
 *	economic rights,  and the successive licensors	have only  limited
 *	liability.
 *
 *	In this respect, the user's attention is drawn to the risks associated
 *	with loading,  using,  modifying and/or developing or reproducing the
 *	software by the user in light of its specific status of free software,
 *	that may mean  that it is complicated to manipulate,  and  that  also
 *	therefore means  that it is reserved for developers  and  experienced
 *	professionals having in-depth computer knowledge. Users are therefore
 *	encouraged to load and test the software's suitability as regards their
 *	requirements in conditions enabling the security of their systems and/or
 *	data to be ensured and,  more generally, to use and operate it in the
 *	same conditions as regards security.
 *
 *	The fact that you are presently reading this means that you have had
 *	knowledge of the CeCILL license and that you accept its terms.
 */

import java.util.*;
import java.util.regex.*;
import java.lang.Thread;
import java.io.*;

import javax.print.attribute.standard.NumberOfDocuments;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.*;
import org.xml.sax.SAXException;
import org.xml.sax.InputSource;

import java.util.HashMap;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Set;
import java.util.Map;
import java.util.Enumeration;
import java.util.Vector;

import edu.northwestern.at.morphadorner.*;
import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.inputter.*;
import edu.northwestern.at.utils.corpuslinguistics.outputter.*;

/**	Parse XML document for morphological adornment.
 *
 *	@author Aude Garnier, Xavier Tannier
 */

public class XGParser
{
	// Execution options (the instance handed to the constructor).

	XGOptions options;

	// Text-node map.  Key: ordinal of a text node (value of intCountTags
	// when countNonBlankCharacters processed it).  Value: XGPair holding
	// the begin/end non-blank character positions of that node.
	// Written by countNonBlankCharacters, read back by modifyDOM.

	Map<Integer , XGPair> hMap;

	// Attribute name -> value for the current adorned word.  Cleared and
	// refilled by getNextEntry; copied onto new word elements by
	// createNewNode.

	Map<String , String> hmAttributes;

	// Entities declared in the document's DOCTYPE.  Only assigned when
	// the document has a DOCTYPE; otherwise it keeps its default (null).

	NamedNodeMap nnmEntities;

	// Set to true by extractText after a hard tag has been processed,
	// and cleared when non-blank text or a soft tag is seen.  While true,
	// no further surround marker is appended after a hard tag.

	boolean boolDot;

	// Running count of non-blank characters seen by extractText
	// (surround marker lengths are added to it as well).

	int intCountNonBlanks;

	// Running count of text nodes.  Incremented by
	// countNonBlankCharacters and by modifyDOM; reset to zero by
	// mergeAdornments before the DOM is modified.

	int intCountTags;

	// Number of non-blank characters consumed so far by modifyDOM; it is
	// compared against the begin/end positions stored in hMap.

	int intCpt;

	// Only assigned (to null) in the constructor within this file.

	String strLine;

	// Accumulates the characters of the word currently being rebuilt by
	// modifyDOM; emptied by createNewNode.

	StringBuffer sbWord;

	// Index of the next character of strWord to consume, and the length
	// of strWord.  Both are reset by getNextEntry.

	int intStrWordIndex;
	int intStrWordLength;

	// Text of the current adorned word ("" when there is none).

	String strWord;

	// Not referenced anywhere else in this file.

	int intLongWord;

	// Running word ID; incremented by every call to getNextEntry.

	int intID;

	// Readers over the adorner output; created lazily by read().

	UnicodeReader frCurrent;
	BufferedReader brCurrent;

	// Outputter holding the adorner results; set by mergeAdornments.

	AdornedWordOutputter adornerOutputter;

	/**	Next adorned word to process. */

	int nextAdornedWord;

	/**	List of adorned word data entries. */

	List adornedWordDataList;

	/** Surrounding sentence/phrase marker, as returned by the options,
	 *	and the same marker with leading/trailing whitespace trimmed.
	 */

	String surroundMarker;
	String surroundMarkerTrim;

	/**	Length of the trimmed surround marker string. */

	int surroundMarkerLength;

	/**	Map of multipart word IDs to # of parts.
	 *
	 *	<p>
	 *	Records for each word split by soft or jump tags,
	 *	the ID for that word and the number of parts into
	 *	which it is split.
	 *	</p>
	 */

	Map<Integer, Integer> splitWords	= MapFactory.createNewMap();

	/**	Number of word nodes created (incremented once per successful
	 *	createNewNode call).
	 */

	int wordNodesCreated	= 0;

	/** File separator.  Not referenced elsewhere in this file; the path
	 *	building code uses File.separator directly.
	 */

	static final String FILE_SEPARATOR =
		System.getProperty( "file.separator" );

	/**	Create parser.
	 *
	 *	<p>
	 *	Resets all parsing state, caches the surround marker settings
	 *	taken from <code>options</code>, and remembers the entities
	 *	declared in the document's DOCTYPE, if it has one.
	 *	</p>
	 *
	 *	@param	options		Options for processing.
	 *	@param	document	Document to process.
	 */

	public XGParser( XGOptions options , Document document )
	{
		this.options				= options;
		this.hMap					= MapFactory.createNewMap();
		this.hmAttributes			= MapFactory.createNewMap();

		this.strLine				= null;
		this.boolDot				= false;
		this.intCpt					= 0;
		this.intCountNonBlanks		= 0;
		this.intCountTags			= 0;
		this.strWord				= "";
		this.sbWord					= new StringBuffer();
		this.intStrWordIndex		= 0;
		this.intStrWordLength		= 0;
		this.intID					= 0;
		this.frCurrent				= null;
		this.brCurrent				= null;
		this.surroundMarker			= this.options.getSurroundMarker();
		this.surroundMarkerTrim		= this.surroundMarker.trim();
		this.surroundMarkerLength	= this.surroundMarkerTrim.length();
		this.nextAdornedWord		= 0;
		this.adornedWordDataList	= null;
		this.wordNodesCreated		= 0;

								//	Reset the field itself.  The previous
								//	code declared a local variable of the
								//	same name here, which shadowed the
								//	field and had no effect.

		this.adornerOutputter		= null;

								//	Entities are only available when the
								//	document carries a DOCTYPE.

		DocumentType docType		= document.getDoctype();

		this.nnmEntities			=
			( docType != null ) ? docType.getEntities() : null;
	}

	/**	Overwrite the running word ID counter.
	 *
	 *	@param	runningWordID	New value for the running word ID.
	 */

	public void setRunningWordID( int runningWordID )
	{
		intID = runningWordID;
	}

	/**	Get the running word ID counter.
	 *
	 *	@return		Current value of the running word ID.
	 */

	public int getRunningWordID()
	{
		return this.intID;
	}

	/**	Get the number of adorned words.
	 *
	 *	@return		The count of word nodes created so far.
	 */

	public int getNumberOfAdornedWords()
	{
		return this.wordNodesCreated;
	}

	/**	Reads one character from the adorner output.
	 *
	 *	<p>
	 *	On the first call the readers are created over the bytes held
	 *	by the adorner outputter (decoded as utf-8); the call is then
	 *	re-dispatched so the character is read through the buffered
	 *	reader.
	 *	</p>
	 *
	 *	@return		The next <code>int</code> in the output stream, as
	 *				returned by the buffered reader.
	 *
	 *	@throws	IOException				if reading fails.
	 *	@throws	FileNotFoundException	declared for callers; not thrown
	 *									explicitly here.
	 */

	protected int read()
		throws IOException , FileNotFoundException
	{
								//	Readers already set up: just read.

		if ( frCurrent != null )
		{
			return this.brCurrent.read();
		}
								//	First call: wrap the outputter bytes
								//	in a unicode reader and buffer it.

		ByteStreamAdornedWordOutputter byteOutputter	=
			(ByteStreamAdornedWordOutputter)adornerOutputter;

		byte[] adornerBytes	= byteOutputter.getBytes();

		frCurrent	=
			new UnicodeReader
			(
				new ByteArrayInputStream( adornerBytes ) ,
				"utf-8"
			);

		brCurrent	= new BufferedReader( frCurrent );

								//	Now perform the actual read.

		return this.read();
	}

	/**	Advance to the next adorned word entry.
	 *
	 *	<p>
	 *	Clears the attribute map and the current word, then, if another
	 *	entry is available in the adorned word list, loads its fields:
	 *	the field whose 1-based position equals the configured word
	 *	field becomes the current word, and every field that has a
	 *	non-empty XML attribute name is stored in the attribute map.
	 *	The running word ID is incremented and the word index reset
	 *	whether or not an entry was available.
	 *	</p>
	 *
	 *	@throws	IOException				declared for callers.
	 *	@throws	FileNotFoundException	declared for callers.
	 */

	protected void getNextEntry()
		throws IOException , FileNotFoundException
	{
		this.hmAttributes.clear();

		this.strWord	= "";

		boolean haveEntry	=
			this.nextAdornedWord < adornedWordDataList.size();

		if ( haveEntry )
		{
			List fields	=
				(List)adornedWordDataList.get( this.nextAdornedWord++ );

			int fieldIndex	= 0;

			while ( fieldIndex < fields.size() )
			{
				String fieldValue	= (String)fields.get( fieldIndex );

								//	Field numbers in the options are
								//	1-based.

				if ( this.options.getWordField() == ( fieldIndex + 1 ) )
				{
					this.strWord	= fieldValue;
				}
								//	Every named field becomes an attribute.

				String attributeName	=
					MorphAdornerSettings.getXMLWordAttribute( fieldIndex );

				if ( attributeName.length() != 0 )
				{
					this.hmAttributes.put( attributeName , fieldValue );
				}

				fieldIndex++;
			}
		}
								//	Next word ID; restart at the first
								//	character of the (possibly empty) word.

		this.intID				= this.intID + 1;
		this.intStrWordIndex	= 0;
		this.intStrWordLength	= this.strWord.length();
	}

	/**	Extract text from <code>node</code>.
	 *
	 *	<p>
	 *	Walks the children of <code>node</code> and builds the reading
	 *	context text.  Text nodes are appended with all whitespace mapped
	 *	to blanks and are registered through
	 *	{@link #countNonBlankCharacters}.  Non-jump elements are processed
	 *	recursively in place, with surround markers inserted around hard
	 *	tags.  Jump tag elements are skipped during the main pass and
	 *	processed afterwards, each preceded by a surround marker.
	 *	</p>
	 *
	 *	<p>
	 *	Side effects: updates <code>intCountNonBlanks</code>,
	 *	<code>intCountTags</code>, <code>boolDot</code> and
	 *	<code>hMap</code>.  Calls <code>System.exit( -1 )</code> when an
	 *	external file entity reference is found and neither a directory
	 *	output nor entity merging has been configured.
	 *	</p>
	 *
	 *	@param	node	the <code>Node</code> to parse.
	 *
	 *	@return			A <code>StringBuffer</code> containing the
	 *					element text, taking reading context into account.
	 *
	 *	@throws	IOException		declared by the helpers called here.
	 *
	 *	<p>
	 *	The algorithm used to parse children (soft, jump, hard tags)
	 *	is the same as that in {@link #modifyDOM}.
	 *	</p>
	 */

	public StringBuffer extractText( Node node )
		throws IOException
	{
		StringBuffer sbResult = new StringBuffer();
		StringBuffer sbBuffer = new StringBuffer();

								//	True when a surround marker should be
								//	emitted before the next hard tag (set
								//	after blank-only text or a soft tag).

		boolean boolInternDot = false;

								//	Get list of child nodes.

		NodeList nlChildren	= node.getChildNodes();

								//	Number of child nodes.

		int intChildNumber	= nlChildren.getLength();

		String strText;
		String strChildName;

								//	Indices (in nlChildren) of the jump
								//	tags skipped during the main pass.

		Vector<Integer> vectorTempJumpTags = new Vector<Integer>();
		Node nodeChild;
		int i;
								//	Parse all the children.

		for ( i = 0 ; i < intChildNumber ; ++i )
		{
								//	Next child.

			nodeChild		= nlChildren.item( i );
			strChildName	= nodeChild.getNodeName();

								//	Child is an entity reference,

			if ( nodeChild instanceof EntityReference )
			{
								//	NOTE(review): nnmEntities is only set
								//	when the document has a DOCTYPE, and
								//	getNamedItem may return null; neither
								//	case is guarded here -- confirm it
								//	cannot happen for entity references.

				Entity entity =
					(Entity)this.nnmEntities.getNamedItem( strChildName );

								//	If it is a reference to an external
								//	file.

				if	(	( entity.getSystemId() != null ) &&
						!this.options.getEntityIgnoreFiles()
					)
				{
								//	If the user has not set the
								//	proper options: error.

					if	(	!this.options.isOutputDirectory() &&
							!this.options.getEntityMerging()
						)
					{
						MorphAdornerLogger.logError
						(
							"Error: XML input " +
								" contains external file entity " +
								"references.\n  Specified output should " +
								"be a directory, or options " +
								"xml.entities_not_files or " +
								"xml.entities_merge " +
								"should be set.\n"
						);

						System.exit( -1 );
					}
								//	Else extract text in its children
								//	and stop for itself.

					sbResult.append( this.extractText( nodeChild ) );
				}
								//	Other entity reference (internal, or
								//	external with file entities ignored).
				else
				{
								//	If the user has asked us to treat it.

					if ( this.options.getEntityTreatAll() )
					{
								//	Extract text in its children and stop
								//	for itself.

						sbResult.append( this.extractText( nodeChild ) );
					}
								//	Else add a space character in the
								//	reading context.
					else
					{
						sbResult.append( " " );
					}
				}

				continue;
			}
								//	Child is a Text node.

			if ( nodeChild instanceof Text )
			{
								//	Text with all whitespace mapped
								//	to blanks.
				strText	=
					nodeChild.getNodeValue().replaceAll( "\\s" , " " );

								//	Count number of non-blank
								//	characters.  This also records the
								//	node's begin/end positions in hMap.

				int nbChars	= countNonBlankCharacters( strText );

								//	Append the text to the reading
								//	context.  This is done unconditionally,
								//	so blank-only text is appended too.

				sbResult.append( strText );

				if ( nbChars > 0 )
				{
//					sbResult.append( strText );

					this.boolDot = false;
				}
				else
				{
								//	Blank-only text: request a surround
								//	marker before the next hard tag.

					boolInternDot = true;
				}
			}
								//	Child is not text.
			else
			{
								//	Not a jump tag.

				if ( !this.options.isJumpTag( strChildName ) )
				{
								//	If not a soft tag and if a surround
								//	marker has been previously requested,
								//	add a surround marker.

					boolean boolSoftTag =
						this.options.isSoftTag( strChildName );

					if ( boolInternDot && !boolSoftTag )
					{
						sbResult.append( surroundMarker );

								//	Only the trimmed marker length is
								//	counted.

						this.intCountNonBlanks	+=
							surroundMarkerLength;
					}
								//	Recursively call extractText
								//	on the child node.

					sbBuffer	= this.extractText( nodeChild );

								//	If we got back some text ...
								//
								//	NOTE(review): StringBuffer does not
								//	override equals, so comparing it with
								//	the String "" is never true and this
								//	condition always holds, even for empty
								//	child text.  Changing it would alter
								//	where surround markers are emitted;
								//	confirm the intended behavior before
								//	touching it.

					if ( !sbBuffer.equals( "" ) )
					{
								//	Append child text.

						sbResult.append( sbBuffer );

								//	Check for soft tag.

						if ( this.options.isSoftTag( strChildName ) )
						{
							boolInternDot	= true;
							this.boolDot	= false;
						}
								//	Not soft tag.  Must be hard tag.
						else
						{
								//	Close the hard tag with a surround
								//	marker unless one was just emitted.

							if ( !this.boolDot )
							{
								sbResult.append( surroundMarker );

								this.intCountNonBlanks	+=
									surroundMarkerLength;
							}

							this.boolDot	= true;
							boolInternDot	= false;
						}
					}
				}
								//	Is a jump tag.
				else
				{
								//	Remember we skipped jump tag.

					vectorTempJumpTags.add( new Integer( i ) );
				}
			}
		}
								//	If we encountered some jump tags,
								//	we need to treat them now.

		if ( !vectorTempJumpTags.isEmpty() )
		{
								//	Treat all jump tag numbers.

			for ( int j = 0 ; j < vectorTempJumpTags.size() ; j++ )
			{
				nodeChild =
					nlChildren.item
					(
						vectorTempJumpTags.get( j ).intValue()
					);

								//	Account for the surround marker that
								//	is prepended to the jump tag text
								//	below.

				this.intCountNonBlanks	+= surroundMarkerLength;

								//	Recursively call extractText on the
								//	jump tag node.

				sbBuffer = this.extractText( nodeChild );

								//	Append text and surround marker
								//	to accumulated text.

				sbResult.append( surroundMarker + sbBuffer );
			}
		}

		return sbResult;
	}

	/**	Create new word element(s) from the accumulated word text.
	 *
	 *	<p>
	 *	The text accumulated in <code>sbWord</code> is turned into one
	 *	word element per piece (the text is cut at the special separator
	 *	when one is configured).  Each element is inserted into
	 *	<code>node</code> immediately before <code>nodeChild</code>.
	 *	Nothing is created when <code>sbWord</code> is empty or contains
	 *	the trimmed surround marker; in both cases <code>sbWord</code>
	 *	is emptied.
	 *	</p>
	 *
	 *	<p>
	 *	The special separator is matched literally, in agreement with
	 *	the way {@link #modifyDOM} compares it against the word text.
	 *	</p>
	 *
	 *	<p>
	 *	Side effects: updates <code>splitWords</code> for the current
	 *	word ID, increments <code>wordNodesCreated</code> once per call
	 *	that creates nodes, and empties <code>sbWord</code>.
	 *	</p>
	 *
	 *	@param	doc					The document we're processing.
	 *	@param	node				The current node we're processing.
	 *	@param	nodeChild			The child node we're processing.
	 *	@param	strCurrentPath		Current XML path to this node.
	 *	@param	integerTagNumber	Integer tag number for path; may be
	 *								null when no word element has been
	 *								numbered yet under this path.
	 *
	 *	@return						# of string word elements generated.
	 */

	protected int createNewNode
	(
		Document doc ,
		Node node ,
		Node nodeChild ,
		String strCurrentPath ,
		Integer integerTagNumber
	)
	{
								//	Do nothing if we don't have
								//	node text or the text contains
								//	the surround marker.

		if	(	( this.sbWord.length() == 0 ) ||
				( this.sbWord.indexOf( surroundMarkerTrim ) >= 0 )
			)
		{
			this.sbWord.delete( 0 , this.sbWord.length() );
			return 0;
		}
								//	A special separator cuts the "word"
								//	(or expression).  Only the text part
								//	of the element will change.
								//
								//	String.split takes a regular
								//	expression, so the separator is quoted
								//	to keep regex metacharacters in it
								//	(e.g. "|" or ".") from being
								//	interpreted.

		String[] strArray;
		String specialSeparator	= this.options.getSpecialSeparator();

		if ( specialSeparator != null )
		{
			strArray =
				this.sbWord.toString().split(
					Pattern.quote( specialSeparator ) );
		}
		else
		{
			strArray = new String[]{ this.sbWord.toString() };
		}
								//	If this is a split word, record
								//	its ID and the number of split
								//	parts.  A word ID seen for the first
								//	time is recorded with a count of 1.

		Integer previousCount	= splitWords.get( this.intID );

		int splitCount			=
			( previousCount == null ) ? 1 : ( previousCount + 1 );

		splitWords.put( this.intID , splitCount );

								//	Loop over each element of the array,
								//	split by special separator.

		for ( int i = 0 ; i < strArray.length ; i++ )
		{
								//	Create a new node.

			Element elementNewTag =
				doc.createElement( this.options.getWordTagName() );

			Text newText = doc.createTextNode( strArray[ i ] );

								//	Generate a node ID.

			if ( this.options.getWriteIds() )
			{
				elementNewTag.setAttribute
				(
					this.options.getIdArgumentName() ,
					String.valueOf( this.intID )
				);
			}
								//	Generate path.  An odd write-path
								//	option value requests word paths.

			if ( this.options.getWritePath() % 2 == 1 )
			{
				if ( integerTagNumber == null )
				{
					integerTagNumber = 1;
				}
				else
				{
					++integerTagNumber;
				}

				elementNewTag.setAttribute
				(
					this.options.getWordPathArgumentName() ,
					strCurrentPath + File.separator +
						this.options.getWordTagName() + "[" +
						integerTagNumber.toString() + "]"
				);
			}
								//	Create attributes.  Only the first
								//	piece gets them unless repetition has
								//	been requested.

			if ( ( i == 0 ) || this.options.repeatAttributes() )
			{
				for	(	Map.Entry< String , String > entry :
							hmAttributes.entrySet()
					)
				{
					elementNewTag.setAttribute
					(
						entry.getKey() ,
						entry.getValue()
					);
				}
			}
								//	Insert new tag.

			elementNewTag.appendChild( newText );

			node.insertBefore( elementNewTag , nodeChild );

								//	The accumulated text has been used.

			this.sbWord.delete( 0 , this.sbWord.length() );
		}

		wordNodesCreated++;

		return strArray.length;
	}

	/** Deep-copy a node by cloning it and each of its descendants.
	 *
	 *	<p>
	 *	The node is first cloned without children; clones of its
	 *	children are then appended one by one.  If appending raises a
	 *	<code>DOMException</code>, the result falls back to the DOM's
	 *	own deep clone of the node.
	 *	</p>
	 *
	 *	@param node		The <code>Node</code> to clone
	 *
	 *	@return			The <code>Node</code> cloned.
	 */

	protected static Node cloneNode( Node node )
	{
		Node copy	= node.cloneNode( false );

		try
		{
			for	(	Node child = node.getFirstChild() ;
					child != null ;
					child = child.getNextSibling()
				)
			{
				Node childCopy	= cloneNode( child );

				copy.appendChild( childCopy );
			}
		}
		catch ( org.w3c.dom.DOMException e )
		{
								//	Child-by-child cloning is not
								//	possible: let the DOM clone the
								//	whole subtree instead.

			copy	= node.cloneNode( true );
		}

		return copy;
	}

	/**	Copy the content of an EntityReference into a new element.
	 *
	 *	<p>
	 *	A new element named <code>entityReferenceRoot</code> is created
	 *	in <code>doc</code> and receives a clone of every child of
	 *	<code>er</code>.
	 *	</p>
	 *
	 *	@param	er		The <code>EntityReference</code> to clone.
	 *	@param	doc		The parent <code>Document</code>.
	 *
	 *	@return			A <code>Node</code> holding clones of the
	 *					children of <code>er</code>.
	 */

	protected Node cloneEntityReference
	(
		EntityReference er ,
		Document doc
	)
	{
		Node root	= doc.createElement( "entityReferenceRoot" );

		for	(	Node child = er.getFirstChild() ;
				child != null ;
				child = child.getNextSibling()
			)
		{
			Node childCopy	= cloneNode( child );

			root.appendChild( childCopy );
		}

		return root;
	}

	/**	Modify <code>node</code> to add adornments and remove initial text nodes.
	 *
	 *	<p>
	 *	Walks the children of <code>node</code>.  Each text node is
	 *	replaced by word elements built from the adorned word entries
	 *	(see {@link #createNewNode}), using the begin/end character
	 *	positions recorded in <code>hMap</code> by the text extraction
	 *	pass.  A DOCTYPE child is replaced by two comments.  Entity
	 *	references are expanded, merged or skipped depending on the
	 *	options.  Non-jump elements are processed recursively in place;
	 *	jump tag elements are processed after all other children.
	 *	</p>
	 *
	 *	<p>
	 *	The loop index <code>i</code> and the child count
	 *	<code>intChildNumber</code> are adjusted by hand every time
	 *	children are inserted or removed, so statement order matters.
	 *	May call <code>System.exit( -1 )</code> for external file entity
	 *	references when the options do not allow them.
	 *	</p>
	 *
	 *	@param	node			The <code>Node</code> to parse.
	 *	@param	doc				The <code>Document</code> to modify.
	 *	@param strCurrentPath	The XPath or the last <code>Node</code>
	 *							explored.
	 *
	 *	@return					Modified <code>Document</code>.
	 *
	 *	@throws	DOMException	if a DOM operation fails.
	 *	@throws	IOException		declared by the helpers called here.
	 *
	 *	<p>
	 *	The algorithm used to parse children (soft, jump, hard tags)
	 *	is the same as used in {@link #extractText}.
	 *	</p>
	 */

	public Document modifyDOM
	(
		Node node ,
		Document doc ,
		String strCurrentPath
	)
		throws DOMException , IOException
	{
		String strText = null;
		int intBegin;
		int intEnd;
								//	Child list.

		NodeList nlChildren	= node.getChildNodes();

		Node nodeChild;
		String strNodeChildName;

		String strNewPath	= null;

		int intChildNumber	= nlChildren.getLength();
		int i , t;
		Integer integerTagNumber;
		boolean boolConsiderAsAnElement = false;

								//	Path reminder: element name -> number
								//	of siblings with that name seen so far
								//	under this node.

		Map< String , Integer > hmPaths = MapFactory.createNewMap();

								//	Result.
		StringBuilder sbNew;

								//	Jump tag number in the children list.

		Vector<Integer> vectorTempJumpTags	=
			new Vector<Integer>();

		for (  i = 0 ; i < intChildNumber ; ++i )
		{
								//	Child.

			nodeChild			= nlChildren.item( i );
			strNodeChildName	= nodeChild.getNodeName();

								//	DTD description (element DOCTYPE):
								//	to be removed!

			if ( nodeChild instanceof DocumentType )
			{
				Comment comment1 =
					doc.createComment(
						"Document Type Description element (DOCTYPE \"" +
							nodeChild.getNodeName() +
							"\") has been removed. " );

				Comment comment2 =
					doc.createComment(
						"To build a correct DTD for this document, " +
							"change all #PCDATA into '" +
							this.options.getWordTagName() +
							"' element, containing #PCDATA." );

				node.insertBefore( comment1 , nodeChild );
				node.insertBefore( comment2 , nodeChild );
				node.removeChild( nodeChild );

								//	Two children added (comments),
								//	one removed (DOCTYPE) =>
								//	one more child!
				++i;
				++intChildNumber;

				MorphAdornerLogger.logError
				(
					" *** Element DOCTYPE (\"" + nodeChild.getNodeName() +
					"\") removed in the output (out of date) *** "
				);

				continue;
			}

			boolean boolT	= false;

								//	Child is an entity reference.

			if ( nodeChild instanceof EntityReference )
			{
								//	NOTE(review): nnmEntities is only set
								//	when the document has a DOCTYPE, and
								//	getNamedItem may return null; neither
								//	is guarded here -- confirm.

				Entity entity =
					(Entity)this.nnmEntities.getNamedItem(
						strNodeChildName );

								//	If it is not a reference to an external
								//	file =>
								//	If the user has asked to treat it
								//	(--entities_treat_all) =>
								//	add all of its children to the
								//	tree.

				if ( entity.getSystemId() == null )
				{
					if ( this.options.getEntityTreatAll() )
					{
						Node nodeClone				=
							this.cloneEntityReference(
								(EntityReference)nodeChild , doc );

						NodeList nlGrandChildren	=
							nodeClone.getChildNodes();

						int intGrandChildrenNumber	=
							nlGrandChildren.getLength();

								//	NOTE(review): DOM NodeLists are live,
								//	so moving item( k ) out of nodeClone
								//	presumably shrinks nlGrandChildren
								//	while the index keeps growing; some
								//	grandchildren may be skipped or
								//	item() may return null -- verify.
								//
								//	NOTE(review): nodeClone is created by
								//	cloneEntityReference and is never
								//	inserted under node, so
								//	nodeClone.getNextSibling() is
								//	presumably null and insertBefore then
								//	appends at the end rather than at the
								//	position of the entity reference --
								//	confirm this is intended.

						for	(	int intGrandChild = 0 ;
								intGrandChild < intGrandChildrenNumber;
								++intGrandChild
							)
						{
							if ( i != ( intChildNumber - 1 ) )
							{
								node.insertBefore(
									nlGrandChildren.item( intGrandChild ) ,
									nodeClone.getNextSibling() );
							}
							else
							{
								node.appendChild(
									nlGrandChildren.item( intGrandChild ) );
							}

							++intChildNumber;
						}

						node.removeChild( nodeChild );

								//	The entity reference is gone: revisit
								//	index i, which now holds the next
								//	child.

						--intChildNumber;
						--i;
					}

					continue;
				}
								//	If it is a reference to an external
								//	file ...
				else
				{
								//	If the user has not asked to ignore
								//	this kind of file.

					if ( !this.options.getEntityIgnoreFiles() )
					{
								//	If the user has not set the proper
								//	options, an error should have already
								//	been raised by extractText.
								//	But do it again.

						if	(	!this.options.isOutputDirectory() &&
								!this.options.getEntityMerging()
							)
						{
							MorphAdornerLogger.logError
							(
								"Error: XML output file " +
									" contains some external file " +
									"entity references.\n  " +
									"Specified output should be a " +
									"directory."
							);

							System.exit( -1 );
						}

								//	Recursive modifyDOM call on the tag
								//	and update.
						else
						{
								//	As an EntityReference is readonly,
								//	we have to clone the Node.

							Node nodeClone =
								this.cloneEntityReference(
									(EntityReference)nodeChild , doc );

							doc =
								this.modifyDOM( nodeClone , doc ,
								strCurrentPath );

								//	If the entities should be merged.

							if ( this.options.getEntityMerging() )
							{
								NodeList nlNewGrandChildren =
									nodeClone.getChildNodes();

								int intNewGrandChildrenNumber =
									nlNewGrandChildren.getLength();

								//	Comment to begin.

								Node nodeComment =
									doc.createComment
									(
										" ++ " + nodeChild.getNodeName() +
										" ++ Here begins the content of " +
										" entity " +
										nodeChild.getNodeName() +
										" inserted here in place of " +
										"a reference to this entity in " +
										" the original document."
									);

								node.insertBefore
								(
									nodeComment ,
									nodeChild
								);

								++i;
								++intChildNumber;

								//	Copy content.
								//
								//	NOTE(review): same live NodeList
								//	concern as above -- each insertBefore
								//	moves the node out of nodeClone, so
								//	indexing with an increasing
								//	intGrandChild may skip nodes or reach
								//	a null item.  Verify.

								for	(	int intGrandChild = 0 ;
										intGrandChild <
											intNewGrandChildrenNumber ;
											++intGrandChild
									)
								{
									node.insertBefore
									(
										nlNewGrandChildren.item(
											intGrandChild ) ,
										nodeChild
									);

									++i;
									++intChildNumber;
								}

								//	Comment to end (this comment
								//	(+ 1 child) and the child removal
								//	(- 1 child) = nothing.

								node.insertBefore
								(
									doc.createComment
									(
										" -- " + nodeChild.getNodeName() +
										" -- End of entity " +
										nodeChild.getNodeName() ) ,
										nodeChild
									);

								node.removeChild( nodeChild );
							}

								//	if output is a directory
								//	(already checked) =>
								//	write a separate file.
								//
								//	Note:  Should never get here
								//	in MorphAdorner.

							else
							{
/*
								String strFileName = entity.getSystemId();

								strFileName =
									new File( entity.getSystemId() ).getName();

								//	Print result.

								XGMisc.printNodeList
								(
									nodeClone.getChildNodes() ,
									"<!-- File referenced by an XML " +
										"well-formed document -->\n" ,
									options.getOutputFileName() +
										File.separator + strFileName
								);

								MorphAdornerLogger.logInfo
								(
									"\nExternal file referenced written in " +
										options.getOutputFileName() +
										File.separator + strFileName + "\n"
								);
*/
								MorphAdornerLogger.logError
								(
									"Internal error:  attempted to write " +
									"secondary XML output file."
								);
							}
						}
					}

					continue;
				}
			}
								//	Text node.

			if ( nodeChild instanceof Text )
			{
								//	Text with normalized blanks.

				strText =
					nodeChild.getNodeValue().replaceAll( "\\s" , " " );

								//	Number of the text tag.

				++( this.intCountTags );

								//	Find entry for tag in hash map.
								//
								//	NOTE(review): the lookup result is
								//	dereferenced without a null check; it
								//	relies on the extraction pass having
								//	numbered the text nodes in the same
								//	order -- confirm.

				XGPair pairResult =
					this.hMap.get( new Integer( this.intCountTags ) );

								//	Find where tag's text starts
								//	and ends.

				intBegin	= pairResult.begin;
				intEnd		= pairResult.end;

								//	Skip surround markers.  The loop stops
								//	at the first adorned word that is not
								//	the (trimmed) surround marker.

				while ( this.intCpt < intBegin )
				{
					if ( !strWord.equals( surroundMarkerTrim ) )
					{
						break;
					}

					this.getNextEntry();
					this.intCpt++;
				}
								//	Loop on nonblank characters.

				while ( this.intCpt < intEnd )
				{
								//	Append word text if any.

					if ( !this.strWord.equals( "" ) )
					{
						this.sbWord.append
						(
							this.strWord.charAt( this.intStrWordIndex )
						);
					}
								//	If we are at the end of a word ...

					if	(	this.intStrWordIndex >=
							( this.intStrWordLength - 1 )
						)
					{
								//	Create a new node.

								//	If paths should be added ...

						if ( this.options.getWritePath() % 2 == 1 )
						{
							integerTagNumber =
								hmPaths.get
								(
									this.options.getWordTagName()
								);

							t =
								this.createNewNode
								(
									doc ,
									node ,
									nodeChild ,
									strCurrentPath ,
									integerTagNumber
								);

								//	Remember how many word elements have
								//	been numbered under this node.

							if ( integerTagNumber != null )
							{
								hmPaths.put
								(
									this.options.getWordTagName() ,
									integerTagNumber + t
								);
							}
							else
							{
								hmPaths.put
								(
									this.options.getWordTagName() ,
									new Integer( t )
								);
							}
						}
								//	If no path has been requested,
								//	create node with no path.

						else
						{
							t =
								this.createNewNode
								(
									doc ,
									node ,
									nodeChild ,
									null ,
									0
								);
						}

								//	t new elements were inserted before
								//	the text node: shift index and count.

						intChildNumber	+= t;
						i				+= t;

								//	Get next adornment entry.

						this.getNextEntry();
					}
								//	If not the end of the word,
								//	increment the character index
								//	in the word.
					else
					{
						++( this.intStrWordIndex );
					}
								//	If next characters correspond to the
								//	special separator, intCpt should not
								//	follow!  The separator is compared
								//	literally with the word text here.

					if ( this.options.getSpecialSeparator() != null )
					{
						if	(	this.strWord.length() >=
								(	this.intStrWordIndex +
									this.options.getSpecialSeparator().length()
								)
							)
						{
							if	(	this.strWord.substring
									(
										this.intStrWordIndex ,
										this.intStrWordIndex +
										this.options.getSpecialSeparator().
											length() ).equals(
											this.options.getSpecialSeparator()
									)
								)
							{
								this.sbWord.append(
									this.options.getSpecialSeparator() );

								this.intStrWordIndex +=
									this.options.getSpecialSeparator().length();
							}
						}
					}

					++( this.intCpt );
				}

								//	If a word has been found (usual case).
								//	This is a word that continues past the
								//	end of this text node.

				if ( this.sbWord.length() > 0 )
				{
								//	Create a new node.

								//	Should paths should be added?

					if ( this.options.getWritePath() % 2 == 1 )
					{
						integerTagNumber =
							hmPaths.get
							(
								this.options.getWordTagName()
							);

						t =
							this.createNewNode
							(
								doc ,
								node ,
								nodeChild ,
								strCurrentPath ,
								integerTagNumber
							);

						if ( integerTagNumber != null )
						{
							hmPaths.put
							(	this.options.getWordTagName() ,
								integerTagNumber + t
							);
						}
						else
						{
							hmPaths.put
							(
								this.options.getWordTagName() ,
								new Integer( t )
							);
						}
					}
								//	If no path has been requested.
					else
					{
						t =
							this.createNewNode( doc , node ,
								nodeChild , null , 0 );
					}

					intChildNumber	+= t;
					i				+= t;
				}
								//	If we have seen all text contained
								//	in the tag.

				if ( this.intCpt >= intEnd )
				{
								//	Delete old child text node.

					node.removeChild( nodeChild );

					--intChildNumber;
					--i;
				}
			}
							//	Child is not text.
			else
			{
								//	Not a jump tag.

				if ( !this.options.isJumpTag( strNodeChildName ) )
				{
								//	Path.

					if ( this.options.getWritePath() > 0 )
					{
						integerTagNumber =
							hmPaths.get( strNodeChildName );

						if ( integerTagNumber == null )
						{
							integerTagNumber = 1;
						}
						else
						{
							++integerTagNumber;
						}

								//	NOTE(review): the path is built with
								//	File.separator, which is platform
								//	dependent -- confirm that is wanted
								//	for an XML path.

						strNewPath =
							strCurrentPath + File.separator +
							strNodeChildName +
							"[" + integerTagNumber.toString() + "]";

						if ( this.options.getWritePath() >= 2 )
						{
							( (Element)nodeChild ).setAttribute
							(
								this.options.getTagsPathArgumentName() ,
								strNewPath
							);
						}

						hmPaths.put
						(
							strNodeChildName ,
							integerTagNumber
						);
					}
								//	If hard tag or soft tag,
								//	pursue treatment.

					doc	=
						this.modifyDOM( nodeChild , doc , strNewPath );
				}
								//	Jump tag.
				else
				{
								//	Skip jump tag, but remember we did.

					vectorTempJumpTags.add( new Integer( i ) );
				}
			}
		}
								//	All the children have been passed
								//	and there was a jump tag.

		if ( !vectorTempJumpTags.isEmpty() )
		{
								//	Treat all jump tag numbers.
								//
								//	NOTE(review): the stored indices were
								//	taken while children were still being
								//	inserted and removed; presumably they
								//	still designate the jump tags in the
								//	final child list -- verify.

			for ( int j = 0 ; j < vectorTempJumpTags.size() ; j++ )
			{
				nodeChild =
					nlChildren.item
					(
						vectorTempJumpTags.get( j ).intValue()
					);

				strNodeChildName = nodeChild.getNodeName();

								//	Path.
								//
								//	NOTE(review): this tests ">= 0" while
								//	the equivalent test for non-jump tags
								//	above uses "> 0" -- confirm which one
								//	is intended.

				if ( this.options.getWritePath() >= 0 )
				{
					integerTagNumber	= hmPaths.get( strNodeChildName );

					if ( integerTagNumber == null )
					{
						integerTagNumber = 1;
					}
					else
					{
						++integerTagNumber;
					}

					strNewPath	=
						strCurrentPath + File.separator +
						strNodeChildName +
						"[" + integerTagNumber.toString() + "]";

					if ( this.options.getWritePath() >= 2 )
					{
						( (Element)nodeChild ).setAttribute
						(
							this.options.getTagsPathArgumentName() ,
							strNewPath
						);
					}

					hmPaths.put( strNodeChildName , integerTagNumber );
				}

								//	NOTE(review): extractText adds
								//	surroundMarkerLength here, not 1, and
								//	this method never reads
								//	intCountNonBlanks -- confirm this
								//	increment is still needed.

				this.intCountNonBlanks++;

								//	Recursively call modifyDOM on the
								//	jump tag node.

				doc	= this.modifyDOM( nodeChild , doc , strNewPath );
			}
		}

		return doc;
	}

	/**	Count the non-blank characters of a text node and register the
	 *	node in the tag <code>HashMap</code>.
	 *
	 *	<p>
	 *	The text node counter is incremented, and an <code>XGPair</code>
	 *	with the begin and end positions of this text (in the running
	 *	count of non-blank characters) is stored under the new counter
	 *	value.  The begin position is one past the previous running
	 *	count when the text has non-blank characters, and equal to it
	 *	otherwise.
	 *	</p>
	 *
	 *	<p>
	 *	strString should have all whitespace characters mapped to
	 *	blanks before this method is called.
	 *	</p>
	 *
	 *	@param	strString		The text to analyze.
	 *
	 *	@return					Number of non-blank
	 *								characters in strString.
	 *
	 *	@throws	IOException		declared for callers.
	 */

	protected int countNonBlankCharacters( String strString )
		throws IOException
	{
								//	One more text node seen.

		this.intCountTags++;

								//	Anything that is not a blank counts.

		int nonBlanks	= 0;

		for ( char ch : strString.toCharArray() )
		{
			if ( ch != ' ' )
			{
				nonBlanks++;
			}
		}
								//	Position of the first character: the
								//	next position when there is text, the
								//	current one otherwise.

		int intBegin	=
			( nonBlanks > 0 ) ?
				( this.intCountNonBlanks + 1 ) : this.intCountNonBlanks;

								//	Position of the last character.

		this.intCountNonBlanks	= this.intCountNonBlanks + nonBlanks;

								//	Register the positions of this node.

		XGPair positions	=
			new XGPair( intBegin , this.intCountNonBlanks );

		hMap.put( Integer.valueOf( this.intCountTags ) , positions );

		return nonBlanks;
	}

	/**	Extract text from DOM document.
	 *
	 *	<p>
	 *	Creates a new parser for the document and runs the instance
	 *	{@link #extractText(Node)} on it.  The parser instance is
	 *	returned alongside the text so that it can later be handed to
	 *	{@link #mergeAdornments}.
	 *	</p>
	 *
	 *	@param	options		The processing options.
	 *	@param	document	The document to process.
	 *
	 *	@return				Two element object array.
	 *						result[ 0 ]	= reading context text (String).
	 *						result[ 1 ]	= XGParser instance.
	 *
	 *	@throws				IOException
	 */

	public static Object[] extractText
	(
		XGOptions options ,
		Document document
	)
		throws IOException
	{
								// Start document treatment.

		XGParser instance	= new XGParser( options , document );

								// Extract text of reading context.

		String readingContext	=
			instance.extractText( document ).toString();

								//	Text goes first, parser instance
								//	second.

		Object[] result	= new Object[ 2 ];

		result[ 0 ]		= readingContext;
		result[ 1 ]		= instance;

		return result;
	}

	/**	Merge adornments with original XML text.
	 *
	 *	<p>
	 *	Points the parser instance at the adorned word list held by
	 *	<code>outputter</code> (which must be a
	 *	<code>ListAdornedWordOutputter</code>), loads the first entry,
	 *	and rewrites the DOM through {@link #modifyDOM}.  The updated
	 *	document is then printed to a temporary file; when printing
	 *	reports the value 1, the file is registered with
	 *	<code>inputter</code> as the text of the segment.
	 *	</p>
	 *
	 *	@param	options			XGTagger options.
	 *	@param	instance		XGParser instance.
	 *	@param	document		Document being processed.
	 *	@param	segmentName		Name of document segment being processed.
	 *	@param	outputter		Adorned word outputter.
	 *	@param	inputter		Text inputter.
	 *
	 *	@return					Map of (word id, # of word parts)
	 *							for words split by soft or jump tags.
	 *
	 *	@throws	IOException
	 */

	public static Map<Integer, Integer> mergeAdornments
	(
		XGOptions options ,
		XGParser instance ,
		Document document ,
		String segmentName ,
		AdornedWordOutputter outputter ,
		TextInputter inputter
	)
		throws IOException
	{
								//	Reset the parser for the merge pass.

		instance.adornerOutputter	= outputter;
		instance.intCountTags		= 0;
		instance.nextAdornedWord	= 0;

								//	Adorned words come from the list
								//	held by the outputter.

		ListAdornedWordOutputter listOutputter	=
			(ListAdornedWordOutputter)outputter;

		instance.adornedWordDataList	=
			listOutputter.getAdornedWordDataList();

								//	Load the first adorned word.

		instance.getNextEntry();

								//	Update the DOM with the adorner
								//	output.

		Document adornedDocument	=
			instance.modifyDOM( document , document , "" );

								//	Write the updated DOM to a temporary
								//	file, removed when the VM exits.

		File tempFile	= File.createTempFile( "mad" , null );

		tempFile.deleteOnExit();

		int printStatus	=
			XGMisc.printNodeToFile
			(
				adornedDocument ,
				tempFile.getAbsolutePath()
			);

								//	Hand the file over as segment text
								//	only when printing reported 1.

		if ( printStatus == 1 )
		{
			inputter.setSegmentText( segmentName , tempFile );
		}

		return instance.splitWords;
	}

	/**	Parse XML text into a DOM document.
	 *
	 *	<p>
	 *	The parser is configured not to expand entity references.
	 *	Parser configuration and SAX parsing errors are not propagated:
	 *	their message is printed to standard output and
	 *	<code>null</code> is returned.
	 *	</p>
	 *
	 *	@param	options		The processing options (not used here).
	 *	@param	xmlText		The XML text.
	 *
	 *	@return				DOM for document, or <code>null</code> when
	 *						the parser could not be created or the text
	 *						could not be parsed.
	 *
	 *	@throws	IOException	if reading the text fails.
	 */

	public static Document textToDOM
	(
		XGOptions options ,
		String xmlText
	)
		throws IOException
	{
		try
		{
								//	Builder that keeps entity references
								//	as nodes instead of expanding them.

			DocumentBuilderFactory builderFactory	=
				DocumentBuilderFactory.newInstance();

			builderFactory.setExpandEntityReferences( false );

			DocumentBuilder domBuilder	=
				builderFactory.newDocumentBuilder();

								//	Parse straight from the string.

			InputSource xmlSource	=
				new InputSource( new StringReader( xmlText ) );

			return domBuilder.parse( xmlSource );
		}
		catch ( ParserConfigurationException pce )
		{
			System.out.println( pce.getMessage() );
		}
		catch ( SAXException se )
		{
			System.out.println( se.getMessage() );
		}
								//	Reached only after a reported error.

		return null;
	}
}