package edu.northwestern.at.morphadorner.xgtagger; /* Please see the license information in the header below. */ /** XGTagger * * Copyright Ecole Nationale Superieure des Mines de Saint-Etienne * * Original authors: Aude Garnier and Xavier Tannier. * * Modifications by Philip R. "Pib" Burns at Northwestern University * for integration into MorphAdorner. * * Please DO NOT address questions about this modified version to the * original authors. * * This software is a computer program whose purpose is to provide * a generic interface to deal with and analyse any XML textual content. * * This software is governed by the CeCILL license under French law and * abiding by the rules of distribution of free software. You can use, * modify and/ or redistribute the software under the terms of the CeCILL * license as circulated by CEA, CNRS and INRIA at the following URL * "http://www.cecill.info". * * As a counterpart to the access to the source code and rights to copy, * modify and redistribute granted by the license, users are provided only * with a limited warranty and the software's author, the holder of the * economic rights, and the successive licensors have only limited * liability. * * In this respect, the user's attention is drawn to the risks associated * with loading, using, modifying and/or developing or reproducing the * software by the user in light of its specific status of free software, * that may mean that it is complicated to manipulate, and that also * therefore means that it is reserved for developers and experienced * professionals having in-depth computer knowledge. Users are therefore * encouraged to load and test the software's suitability as regards their * requirements in conditions enabling the security of their systems and/or * data to be ensured and, more generally, to use and operate it in the * same conditions as regards security. * * The fact that you are presently reading this means that you have had * knowledge of the CeCILL license and that you accept its terms. */ import java.util.*; import java.util.regex.*; import java.lang.Thread; import java.io.*; import javax.print.attribute.standard.NumberOfDocuments; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.*; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.w3c.dom.*; import org.xml.sax.SAXException; import org.xml.sax.InputSource; import java.util.HashMap; import java.util.Properties; import java.util.StringTokenizer; import java.util.Set; import java.util.Map; import java.util.Enumeration; import java.util.Vector; import edu.northwestern.at.morphadorner.*; import edu.northwestern.at.utils.*; import edu.northwestern.at.utils.corpuslinguistics.inputter.*; import edu.northwestern.at.utils.corpuslinguistics.outputter.*; /** Parse XML document for morphological adornment. * * @author Aude Garnier, Xavier Tannier */ public class XGParser { // Execution options XGOptions options; // tag/text hash map Map<Integer , XGPair> hMap; // attributes creation Map<String , String> hmAttributes; // Document Entities NamedNodeMap nnmEntities; boolean boolDot; int intCountNonBlanks; int intCountTags; // number of last textual node (#text) parsed int intCpt; String strLine; StringBuffer sbWord; int intStrWordIndex; int intStrWordLength; String strWord; int intLongWord; int intID; UnicodeReader frCurrent; BufferedReader brCurrent; AdornedWordOutputter adornerOutputter; /** Next adorned word to process. */ int nextAdornedWord; /** List of adorned word data entries. */ List adornedWordDataList; /** Surrounding sentence/phrase marker. */ String surroundMarker; String surroundMarkerTrim; /** Surround marker string length. */ int surroundMarkerLength; /** Map of multipart word IDs to # of parts. * * <p> * Records for each word split by soft or jump tags, * the ID for that word and the number of parts into * which it is split. * </p> */ Map<Integer, Integer> splitWords = MapFactory.createNewMap(); /** Number of word nodes created. */ int wordNodesCreated = 0; /** File separator. */ static final String FILE_SEPARATOR = System.getProperty( "file.separator" ); /** Create parser. * * @param options Options for processing. * @param document Document to process. */ public XGParser( XGOptions options , Document document ) { this.options = options; this.hMap = MapFactory.createNewMap(); hmAttributes = MapFactory.createNewMap(); this.strLine = null; this.boolDot = false; this.intCpt = 0; this.intCountNonBlanks = 0; this.intCountTags = 0; this.strWord = ""; this.sbWord = new StringBuffer(); this.intStrWordIndex = 0; this.intStrWordLength = 0; this.intID = 0; this.frCurrent = null; this.brCurrent = null; this.surroundMarker = this.options.getSurroundMarker(); this.surroundMarkerTrim = this.surroundMarker.trim(); this.surroundMarkerLength = surroundMarkerTrim.length(); this.nextAdornedWord = 0; this.adornedWordDataList = null; this.wordNodesCreated = 0; AdornedWordOutputter adornerOutputter = null; if ( document.getDoctype() != null ) { this.nnmEntities = document.getDoctype().getEntities(); } } /** Set running word ID. * * @param runningWordID The running word ID. */ public void setRunningWordID( int runningWordID ) { this.intID = runningWordID; } /** Get word ID. * * @return The current running word ID. */ public int getRunningWordID() { return intID; } /** Get number of adorned words. * * @return Number of adorned words. */ public int getNumberOfAdornedWords() { return wordNodesCreated; } /** Reads a integer from the adorner. * * @return The next <code>int</code> in the output stream. * * <p> * If this output is split into several files, handle * multiple buffers. * </p> */ protected int read() throws IOException , FileNotFoundException { // First reading. if ( frCurrent == null ) { // Create FileReader and BufferedReader. byte[] outputBytes = ((ByteStreamAdornedWordOutputter)adornerOutputter).getBytes(); frCurrent = new UnicodeReader ( new ByteArrayInputStream( outputBytes ) , "utf-8" ); brCurrent = new BufferedReader( frCurrent ); // Read. return this.read(); } else { return this.brCurrent.read(); } } /** Reads next entry of adorner and updates appropriate class variables. */ protected void getNextEntry() throws IOException , FileNotFoundException { String strElem; String strAttName; this.hmAttributes.clear(); this.strWord = ""; if ( this.nextAdornedWord < adornedWordDataList.size() ) { List adornedWordData = (List)adornedWordDataList.get( this.nextAdornedWord++ ); for ( int i = 0 ; i < adornedWordData.size() ; i++ ) { strElem = (String)adornedWordData.get( i ); // Initial word. if ( this.options.getWordField() == ( i + 1 ) ) { this.strWord = strElem; } // Other fields. strAttName = MorphAdornerSettings.getXMLWordAttribute( i ); if ( strAttName.length() > 0 ) { this.hmAttributes.put( strAttName , strElem ); } } } // Id. ++( this.intID ); this.intStrWordIndex = 0; this.intStrWordLength = this.strWord.length(); } /** Extract text form <code>node</code>. * * @param node the <code>Node</code> to parse. * * @return A <code>StringBuffer</code> containing the * element text, taking reading context into account. * * <p> * The algorithm used to parse children (soft, jump, hard tags) * is the same as that in {@link #modifyDOM}. * </p> */ public StringBuffer extractText( Node node ) throws IOException { StringBuffer sbResult = new StringBuffer(); StringBuffer sbBuffer = new StringBuffer(); boolean boolInternDot = false; // Get list of child nodes. NodeList nlChildren = node.getChildNodes(); // Number of child nodes. int intChildNumber = nlChildren.getLength(); String strText; String strChildName; Vector<Integer> vectorTempJumpTags = new Vector<Integer>(); Node nodeChild; int i; // Parse all the children. for ( i = 0 ; i < intChildNumber ; ++i ) { // Next child. nodeChild = nlChildren.item( i ); strChildName = nodeChild.getNodeName(); // Child is an entity reference, if ( nodeChild instanceof EntityReference ) { Entity entity = (Entity)this.nnmEntities.getNamedItem( strChildName ); // If it is a reference to an external // file. if ( ( entity.getSystemId() != null ) && !this.options.getEntityIgnoreFiles() ) { // If the user has not set the // proper options: error. if ( !this.options.isOutputDirectory() && !this.options.getEntityMerging() ) { MorphAdornerLogger.logError ( "Error: XML input " + " contains external file entity " + "references.\n Specified output should " + "be a directory, or options " + "xml.entities_not_files or " + "xml.entities_merge " + "should be set.\n" ); System.exit( -1 ); } // Else extract text in its children // and stop for itself. sbResult.append( this.extractText( nodeChild ) ); } // Other entity reference. else { // If the user has asked us to treat it. if ( this.options.getEntityTreatAll() ) { // Extract text in its children and stop // for itself. sbResult.append( this.extractText( nodeChild ) ); } // Else add a space character in the // reading context. else { sbResult.append( " " ); } } continue; } // Child is a Text node. if ( nodeChild instanceof Text ) { // Text with all whitespace mapped // to blanks. strText = nodeChild.getNodeValue().replaceAll( "\\s" , " " ); // Count number of non-blank // characters. int nbChars = countNonBlankCharacters( strText ); // Append any non-blank // characters to reading context. sbResult.append( strText ); if ( nbChars > 0 ) { // sbResult.append( strText ); this.boolDot = false; } else { boolInternDot = true; } } // Child is not text. else { // Not a jump tag. if ( !this.options.isJumpTag( strChildName ) ) { // If not a soft tag and if a surround // marker has been previously requested, // add a surround marker. boolean boolSoftTag = this.options.isSoftTag( strChildName ); if ( boolInternDot && !boolSoftTag ) { sbResult.append( surroundMarker ); this.intCountNonBlanks += surroundMarkerLength; } // Recursively call extractText // on the child node. sbBuffer = this.extractText( nodeChild ); // If we got back some text ... if ( !sbBuffer.equals( "" ) ) { // Append child text. sbResult.append( sbBuffer ); // Check for soft tag. if ( this.options.isSoftTag( strChildName ) ) { boolInternDot = true; this.boolDot = false; } // Not soft tag. Must be hard tag. else { if ( !this.boolDot ) { sbResult.append( surroundMarker ); this.intCountNonBlanks += surroundMarkerLength; } this.boolDot = true; boolInternDot = false; } } } // Is a jump tag. else { // Remember we skipped jump tag. vectorTempJumpTags.add( new Integer( i ) ); } } } // If we encountered some jump tags, // we need to treat them now. if ( !vectorTempJumpTags.isEmpty() ) { // Treat all jump tag numbers. for ( int j = 0 ; j < vectorTempJumpTags.size() ; j++ ) { nodeChild = nlChildren.item ( vectorTempJumpTags.get( j ).intValue() ); this.intCountNonBlanks += surroundMarkerLength; // Recursively call extractText on the // jump tag node. sbBuffer = this.extractText( nodeChild ); // Append text and surround marker // to accumulated text. sbResult.append( surroundMarker + sbBuffer ); } } return sbResult; } /** Create new document node. * * @param doc The document we're processing. * @param node The current node we're processing. * @param nodeChild The child node we're processing. * @param strCurrentPath Current XML path to this node. * @param integerTagNumber Integer tag number for path. * * @return # of string word elements generated. */ protected int createNewNode ( Document doc , Node node , Node nodeChild , String strCurrentPath , Integer integerTagNumber ) { String[] strArray; // Do nothing if we don't have // node text or the text contains // the surround marker. if ( ( this.sbWord.length() == 0 ) || ( this.sbWord.indexOf( surroundMarkerTrim ) >= 0 ) ) { this.sbWord.delete( 0 , this.sbWord.length() ); return 0; } // A special separator cuts the "word" // (or expression). Only the text part // of the element will change. if ( this.options.getSpecialSeparator() != null ) { strArray = ( this.sbWord.toString() ).split( this.options.getSpecialSeparator() ); } else { strArray = new String[ 1 ]; strArray[ 0 ] = this.sbWord.toString(); } // If this is a split word, record // its ID and the number of split // parts. int splitCount = 1; if ( splitWords.containsKey( this.intID ) ) { splitCount = splitWords.get( this.intID ) + 1; } splitWords.put( this.intID , splitCount ); // Loop over each element of the array, // split by special separator. for ( int i = 0 ; i < strArray.length ; i++ ) { // Create a new node. Element elementNewTag = doc.createElement( this.options.getWordTagName() ); Text newText = doc.createTextNode( strArray[ i ] ); // Generate a node ID. if ( this.options.getWriteIds() ) { elementNewTag.setAttribute ( this.options.getIdArgumentName() , String.valueOf( this.intID ) ); } // Generate path. if ( this.options.getWritePath() % 2 == 1 ) { if ( integerTagNumber == null ) { integerTagNumber = 1; } else { ++integerTagNumber; } elementNewTag.setAttribute ( this.options.getWordPathArgumentName() , strCurrentPath + File.separator + this.options.getWordTagName() + "[" + integerTagNumber.toString() + "]" ); } // Create attributes. if ( ( i == 0 ) || this.options.repeatAttributes() ) { Set< Map.Entry< String , String > > setEnum = hmAttributes.entrySet(); for ( Map.Entry< String , String > entry : setEnum ) { elementNewTag.setAttribute ( entry.getKey() , entry.getValue() ); } } // Insert new tag. elementNewTag.appendChild( newText ); node.insertBefore( elementNewTag , nodeChild ); this.sbWord.delete( 0 , this.sbWord.length() ); } wordNodesCreated++; return strArray.length; } /** Clone a node and its sub-elements. * * @param node The <code>Node</code> to clone * * @return The <code>Node</code> cloned. */ protected static Node cloneNode( Node node ) { Node nodeClone = node.cloneNode( false ); NodeList nodeChildList = node.getChildNodes(); int intChildNumber = nodeChildList.getLength(); try { for ( int i = 0 ; i < intChildNumber ; ++i ) { nodeClone.appendChild ( XGParser.cloneNode( nodeChildList.item( i ) ) ); } } // If cloning is not possible, // clone with subelement. catch ( org.w3c.dom.DOMException e ) { nodeClone = node.cloneNode( true ); } return nodeClone; } /** Clone a read-only EntityReference into a writable Node. * * @param er The <code>EntityReference</code> to clone. * @param doc The parent <code>Document</code>. * * @return A <code>Node</code> containing the same * writable sub-elements than <code>er</code> . */ protected Node cloneEntityReference ( EntityReference er , Document doc ) { Node nodeClone = doc.createElement( "entityReferenceRoot" ); NodeList nodeChildList = er.getChildNodes(); int intChildNumber = nodeChildList.getLength(); for ( int i = 0 ; i < intChildNumber ; ++i ) { nodeClone.appendChild ( XGParser.cloneNode( nodeChildList.item( i ) ) ); } return nodeClone; } /** Modify <code>element</code> to add adornments and remove initial text node. * * @param node The <code>Node</code> to parse. * @param doc The <code>Document</code> to modify. * @param strCurrentPath The XPath or the last <code>Node</code> * explored. * * @return Modified <code>Document</code>. * * <p> * The algorithm used to parse children (soft, jump, hard tags) * is the same as used in {@link #extractText}. * </p> */ public Document modifyDOM ( Node node , Document doc , String strCurrentPath ) throws DOMException , IOException { String strText = null; int intBegin; int intEnd; // Child list. NodeList nlChildren = node.getChildNodes(); Node nodeChild; String strNodeChildName; String strNewPath = null; int intChildNumber = nlChildren.getLength(); int i , t; Integer integerTagNumber; boolean boolConsiderAsAnElement = false; // Path reminder. Map< String , Integer > hmPaths = MapFactory.createNewMap(); // Result. StringBuilder sbNew; // Jump tag number in the children list. Vector<Integer> vectorTempJumpTags = new Vector<Integer>(); for ( i = 0 ; i < intChildNumber ; ++i ) { // Child. nodeChild = nlChildren.item( i ); strNodeChildName = nodeChild.getNodeName(); // DTD description (element DOCTYPE): // to be removed! if ( nodeChild instanceof DocumentType ) { Comment comment1 = doc.createComment( "Document Type Description element (DOCTYPE \"" + nodeChild.getNodeName() + "\") has been removed. " ); Comment comment2 = doc.createComment( "To build a correct DTD for this document, " + "change all #PCDATA into '" + this.options.getWordTagName() + "' element, containing #PCDATA." ); node.insertBefore( comment1 , nodeChild ); node.insertBefore( comment2 , nodeChild ); node.removeChild( nodeChild ); // Two children added (comments), // one removed (DOCTYPE) => // one more child! ++i; ++intChildNumber; MorphAdornerLogger.logError ( " *** Element DOCTYPE (\"" + nodeChild.getNodeName() + "\") removed in the output (out of date) *** " ); continue; } boolean boolT = false; // Child is an entity reference. if ( nodeChild instanceof EntityReference ) { Entity entity = (Entity)this.nnmEntities.getNamedItem( strNodeChildName ); // If it is not a reference to an external // file => // If the user has asked to treat it // (--entities_treat_all) => // add all of its children to the // tree. if ( entity.getSystemId() == null ) { if ( this.options.getEntityTreatAll() ) { Node nodeClone = this.cloneEntityReference( (EntityReference)nodeChild , doc ); NodeList nlGrandChildren = nodeClone.getChildNodes(); int intGrandChildrenNumber = nlGrandChildren.getLength(); for ( int intGrandChild = 0 ; intGrandChild < intGrandChildrenNumber; ++intGrandChild ) { if ( i != ( intChildNumber - 1 ) ) { node.insertBefore( nlGrandChildren.item( intGrandChild ) , nodeClone.getNextSibling() ); } else { node.appendChild( nlGrandChildren.item( intGrandChild ) ); } ++intChildNumber; } node.removeChild( nodeChild ); --intChildNumber; --i; } continue; } // If it is a reference to an external // file ... else { // If the user has not asked to ignore // this kind of file. if ( !this.options.getEntityIgnoreFiles() ) { // If the user has not set the proper // options, an error should have already // been raised by extractText. // But do it again. if ( !this.options.isOutputDirectory() && !this.options.getEntityMerging() ) { MorphAdornerLogger.logError ( "Error: XML output file " + " contains some external file " + "entity references.\n " + "Specified output should be a " + "directory." ); System.exit( -1 ); } // Recursive modifyDOM call on the tag // and update. else { // As an EntityReference is readonly, // we have to clone the Node. Node nodeClone = this.cloneEntityReference( (EntityReference)nodeChild , doc ); doc = this.modifyDOM( nodeClone , doc , strCurrentPath ); // If the entities should be merged. if ( this.options.getEntityMerging() ) { NodeList nlNewGrandChildren = nodeClone.getChildNodes(); int intNewGrandChildrenNumber = nlNewGrandChildren.getLength(); // Comment to begin. Node nodeComment = doc.createComment ( " ++ " + nodeChild.getNodeName() + " ++ Here begins the content of " + " entity " + nodeChild.getNodeName() + " inserted here in place of " + "a reference to this entity in " + " the original document." ); node.insertBefore ( nodeComment , nodeChild ); ++i; ++intChildNumber; // Copy content. for ( int intGrandChild = 0 ; intGrandChild < intNewGrandChildrenNumber ; ++intGrandChild ) { node.insertBefore ( nlNewGrandChildren.item( intGrandChild ) , nodeChild ); ++i; ++intChildNumber; } // Comment to end (this comment // (+ 1 child) and the child removal // (- 1 child) = nothing. node.insertBefore ( doc.createComment ( " -- " + nodeChild.getNodeName() + " -- End of entity " + nodeChild.getNodeName() ) , nodeChild ); node.removeChild( nodeChild ); } // if output is a directory // (already checked) => // write a separate file. // // Note: Should never get here // in MorphAdorner. else { /* String strFileName = entity.getSystemId(); strFileName = new File( entity.getSystemId() ).getName(); // Print result. XGMisc.printNodeList ( nodeClone.getChildNodes() , "<!-- File referenced by an XML " + "well-formed document -->\n" , options.getOutputFileName() + File.separator + strFileName ); MorphAdornerLogger.logInfo ( "\nExternal file referenced written in " + options.getOutputFileName() + File.separator + strFileName + "\n" ); */ MorphAdornerLogger.logError ( "Internal error: attempted to write " + "secondary XML output file." ); } } } continue; } } // Text node. if ( nodeChild instanceof Text ) { // Text with normalized blanks. strText = nodeChild.getNodeValue().replaceAll( "\\s" , " " ); // Number of the text tag. ++( this.intCountTags ); // Find entry for tag in hash map. XGPair pairResult = this.hMap.get( new Integer( this.intCountTags ) ); // Find where tag's text starts // and ends. intBegin = pairResult.begin; intEnd = pairResult.end; // Skip surround markers. while ( this.intCpt < intBegin ) { if ( !strWord.equals( surroundMarkerTrim ) ) { break; } this.getNextEntry(); this.intCpt++; } // Loop on nonblank characters. while ( this.intCpt < intEnd ) { // Append word text if any. if ( !this.strWord.equals( "" ) ) { this.sbWord.append ( this.strWord.charAt( this.intStrWordIndex ) ); } // If we are at the end of a word ... if ( this.intStrWordIndex >= ( this.intStrWordLength - 1 ) ) { // Create a new node. // If paths should be added ... if ( this.options.getWritePath() % 2 == 1 ) { integerTagNumber = hmPaths.get ( this.options.getWordTagName() ); t = this.createNewNode ( doc , node , nodeChild , strCurrentPath , integerTagNumber ); if ( integerTagNumber != null ) { hmPaths.put ( this.options.getWordTagName() , integerTagNumber + t ); } else { hmPaths.put ( this.options.getWordTagName() , new Integer( t ) ); } } // If no path has been requested, // create node with no path. else { t = this.createNewNode ( doc , node , nodeChild , null , 0 ); } intChildNumber += t; i += t; // Get next adornment entry. this.getNextEntry(); } // If not the end of the word, // increment the character index // in the word. else { ++( this.intStrWordIndex ); } // If next characters correspond to the // special separator, intCpt should not // follow! if ( this.options.getSpecialSeparator() != null ) { if ( this.strWord.length() >= ( this.intStrWordIndex + this.options.getSpecialSeparator().length() ) ) { if ( this.strWord.substring ( this.intStrWordIndex , this.intStrWordIndex + this.options.getSpecialSeparator(). length() ).equals( this.options.getSpecialSeparator() ) ) { this.sbWord.append( this.options.getSpecialSeparator() ); this.intStrWordIndex += this.options.getSpecialSeparator().length(); } } } ++( this.intCpt ); } // If a word has been found (usual case). if ( this.sbWord.length() > 0 ) { // Create a new node. // Should paths should be added? if ( this.options.getWritePath() % 2 == 1 ) { integerTagNumber = hmPaths.get ( this.options.getWordTagName() ); t = this.createNewNode ( doc , node , nodeChild , strCurrentPath , integerTagNumber ); if ( integerTagNumber != null ) { hmPaths.put ( this.options.getWordTagName() , integerTagNumber + t ); } else { hmPaths.put ( this.options.getWordTagName() , new Integer( t ) ); } } // If no path has been requested. else { t = this.createNewNode( doc , node , nodeChild , null , 0 ); } intChildNumber += t; i += t; } // If we have seen all text contained // in the tag. if ( this.intCpt >= intEnd ) { // Delete old child text node. node.removeChild( nodeChild ); --intChildNumber; --i; } } // Child is not text. else { // Not a jump tag. if ( !this.options.isJumpTag( strNodeChildName ) ) { // Path. if ( this.options.getWritePath() > 0 ) { integerTagNumber = hmPaths.get( strNodeChildName ); if ( integerTagNumber == null ) { integerTagNumber = 1; } else { ++integerTagNumber; } strNewPath = strCurrentPath + File.separator + strNodeChildName + "[" + integerTagNumber.toString() + "]"; if ( this.options.getWritePath() >= 2 ) { ( (Element)nodeChild ).setAttribute ( this.options.getTagsPathArgumentName() , strNewPath ); } hmPaths.put ( strNodeChildName , integerTagNumber ); } // If hard tag or soft tag, // pursue treatment. doc = this.modifyDOM( nodeChild , doc , strNewPath ); } // Jump tag. else { // Skip jump tag, but remember we did. vectorTempJumpTags.add( new Integer( i ) ); } } } // All the children have been passed // and there was a jump tag. if ( !vectorTempJumpTags.isEmpty() ) { // Treat all jump tag numbers. for ( int j = 0 ; j < vectorTempJumpTags.size() ; j++ ) { nodeChild = nlChildren.item ( vectorTempJumpTags.get( j ).intValue() ); strNodeChildName = nodeChild.getNodeName(); // Path. if ( this.options.getWritePath() >= 0 ) { integerTagNumber = hmPaths.get( strNodeChildName ); if ( integerTagNumber == null ) { integerTagNumber = 1; } else { ++integerTagNumber; } strNewPath = strCurrentPath + File.separator + strNodeChildName + "[" + integerTagNumber.toString() + "]"; if ( this.options.getWritePath() >= 2 ) { ( (Element)nodeChild ).setAttribute ( this.options.getTagsPathArgumentName() , strNewPath ); } hmPaths.put( strNodeChildName , integerTagNumber ); } this.intCountNonBlanks++; // Recursively call modifyDOM on the // jump tag node. doc = this.modifyDOM( nodeChild , doc , strNewPath ); } } return doc; } /** Count non-blank characters in a <code>String</code> and * update the tag <code>HashMap</code>. * * @param strString The text to analyze. * * @return Number of non-blank * characters in strString. * * <p> * strString should have all whitespace characters mapped to * blanks before this method is called. * </p> */ protected int countNonBlankCharacters( String strString ) throws IOException { // Increment tag count. this.intCountTags++; // Length of input string. int intLetters = strString.length(); // Only non-blank characters are // counted. int nonBlanks = 0; for ( int i = 0 ; i < intLetters ; i++ ) { if ( strString.charAt( i ) != ' ' ) { nonBlanks++; } } // First character. int intBegin = this.intCountNonBlanks; if ( nonBlanks > 0 ) { intBegin++; } // Last character. this.intCountNonBlanks += nonBlanks; // HashMap update. hMap.put ( new Integer( this.intCountTags ) , new XGPair( intBegin , this.intCountNonBlanks ) ); return nonBlanks; } /** Extract text from DOM document. * * @param options The processing options. * @param document The document to process. * * @return Two element object array. * result[ 0 ] = XGParser instance. * result[ 1 ] = reading context text. * * @throws IOException */ public static Object[] extractText ( XGOptions options , Document document ) throws IOException { StringBuffer sbText = null; Object[] result = new Object[ 2 ]; // Start document treatment. XGParser instance = new XGParser( options , document ); // Save parser instance. result[ 1 ] = instance; // Extract text of reading context. sbText = instance.extractText( document ); String strText = sbText.toString(); // Return text of reading context. result[ 0 ] = sbText.toString(); return result; } /** Merged adornments with original XML text. * * @param options XGTagger options. * @param instance XGParser instance. * @param document Document being processed. * @param segmentName Name of document segment being processed. * @param outputter Adorned word outputter. * @param inputter Text inputter. * * @return Map of (word id, # of word parts) * for words split by soft or jump tags. * * @throws IOException */ public static Map<Integer, Integer> mergeAdornments ( XGOptions options , XGParser instance , Document document , String segmentName , AdornedWordOutputter outputter , TextInputter inputter ) throws IOException { instance.adornerOutputter = outputter; instance.intCountTags = 0; // Get next document segment. instance.nextAdornedWord = 0; instance.adornedWordDataList = ((ListAdornedWordOutputter)outputter).getAdornedWordDataList(); instance.getNextEntry(); // Pass DOM tree to modifyDOM method // to update DOM with adorner output. document = instance.modifyDOM( document , document , "" ); // Output updated DOM tree segment as // XML text. File file = File.createTempFile( "mad" , null ); file.deleteOnExit(); String fileName = file.getAbsolutePath(); if ( XGMisc.printNodeToFile( document , fileName ) == 1 ) { inputter.setSegmentText( segmentName , file ); } return instance.splitWords; } /** Create DOM from XML text. * * @param options The processing options. * @param xmlText The XML text. * * @return DOM for document. */ public static Document textToDOM ( XGOptions options , String xmlText ) throws IOException { Document result = null; try { // Create a factory of DOM builders. DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setExpandEntityReferences( false ); DocumentBuilder builder = factory.newDocumentBuilder(); result = builder.parse ( new InputSource( new StringReader( xmlText ) ) ); } catch ( ParserConfigurationException pce ) { System.out.println( pce.getMessage() ); } catch ( SAXException se ) { System.out.println( se.getMessage() ); } return result; } }