package edu.northwestern.at.morphadorner.xgtagger;
/* Please see the license information in the header below. */
/** XGTagger
*
* Copyright Ecole Nationale Superieure des Mines de Saint-Etienne
*
* Original authors: Aude Garnier and Xavier Tannier.
*
* Modifications by Philip R. "Pib" Burns at Northwestern University
* for integration into MorphAdorner.
*
* Please DO NOT address questions about this modified version to the
* original authors.
*
* This software is a computer program whose purpose is to provide
* a generic interface to deal with and analyse any XML textual content.
*
* This software is governed by the CeCILL license under French law and
* abiding by the rules of distribution of free software. You can use,
* modify and/ or redistribute the software under the terms of the CeCILL
* license as circulated by CEA, CNRS and INRIA at the following URL
* "http://www.cecill.info".
*
* As a counterpart to the access to the source code and rights to copy,
* modify and redistribute granted by the license, users are provided only
* with a limited warranty and the software's author, the holder of the
* economic rights, and the successive licensors have only limited
* liability.
*
* In this respect, the user's attention is drawn to the risks associated
* with loading, using, modifying and/or developing or reproducing the
* software by the user in light of its specific status of free software,
* that may mean that it is complicated to manipulate, and that also
* therefore means that it is reserved for developers and experienced
* professionals having in-depth computer knowledge. Users are therefore
* encouraged to load and test the software's suitability as regards their
* requirements in conditions enabling the security of their systems and/or
* data to be ensured and, more generally, to use and operate it in the
* same conditions as regards security.
*
* The fact that you are presently reading this means that you have had
* knowledge of the CeCILL license and that you accept its terms.
*/
import java.util.*;
import java.util.regex.*;
import java.lang.Thread;
import java.io.*;
import javax.print.attribute.standard.NumberOfDocuments;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.*;
import org.xml.sax.SAXException;
import org.xml.sax.InputSource;
import java.util.HashMap;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.Set;
import java.util.Map;
import java.util.Enumeration;
import java.util.Vector;
import edu.northwestern.at.morphadorner.*;
import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.inputter.*;
import edu.northwestern.at.utils.corpuslinguistics.outputter.*;
/** Parse XML document for morphological adornment.
*
* @author Aude Garnier, Xavier Tannier
*/
public class XGParser
{
// Execution options, as passed to the constructor.
XGOptions options;
// Maps a text-node sequence number (see intCountTags) to the pair of
// (begin, end) non-blank character positions recorded for that text node
// by countNonBlankCharacters; read back by modifyDOM.
Map<Integer , XGPair> hMap;
// Attribute name -> value for the current adorned word. Cleared and
// refilled by getNextEntry; copied onto new word elements by createNewNode.
Map<String , String> hmAttributes;
// Entities declared in the document's DOCTYPE. Left null by the
// constructor when the document has no DOCTYPE.
NamedNodeMap nnmEntities;
// Set to true by extractText after a hard (non-soft, non-jump) child has
// been handled; reset to false when non-blank text or a soft tag is seen.
// While true, extractText does not append another trailing surround marker.
boolean boolDot;
// Running count of non-blank characters seen by extractText, including
// the length of every surround marker it inserts.
int intCountNonBlanks;
// Sequence number of the current text node. Incremented by
// countNonBlankCharacters during extraction and again by modifyDOM
// (after mergeAdornments resets it to 0).
int intCountTags;
// Character position counter used by modifyDOM; compared against the
// begin/end positions stored in hMap.
int intCpt;
// NOTE(review): only ever set to null in the visible code.
String strLine;
// Characters of the word currently being assembled by modifyDOM;
// emptied by createNewNode.
StringBuffer sbWord;
// Index into strWord of the next character to consume.
int intStrWordIndex;
// Length of strWord, cached by getNextEntry.
int intStrWordLength;
// Text of the current adorned word, as selected by getNextEntry.
String strWord;
// NOTE(review): not referenced anywhere in the visible code.
int intLongWord;
// Running word ID; incremented on every getNextEntry call.
int intID;
// Readers over the adorner output bytes; created lazily by read().
UnicodeReader frCurrent;
BufferedReader brCurrent;
// Outputter holding the adorner results; assigned by mergeAdornments.
AdornedWordOutputter adornerOutputter;
/** Next adorned word to process (index into adornedWordDataList). */
int nextAdornedWord;
/** List of adorned word data entries; each entry is itself a list of
* field strings (see getNextEntry). */
List adornedWordDataList;
/** Surrounding sentence/phrase marker, as supplied by the options.
*/
String surroundMarker;
/** Surround marker with leading and trailing whitespace trimmed. */
String surroundMarkerTrim;
/** Length of the trimmed surround marker string. */
int surroundMarkerLength;
/** Map of multipart word IDs to # of parts.
*
* <p>
* Records for each word split by soft or jump tags,
* the ID for that word and the number of parts into
* which it is split.
* </p>
*/
Map<Integer, Integer> splitWords = MapFactory.createNewMap();
/** Number of word nodes created. */
int wordNodesCreated = 0;
/** File separator.
*
* <p>
* NOTE(review): not referenced elsewhere in the visible code; the path
* building code uses <code>File.separator</code> directly.
* </p>
*/
static final String FILE_SEPARATOR =
System.getProperty( "file.separator" );
/** Create parser.
 *
 *  <p>
 *  Resets all parsing state and caches the surround marker (raw, trimmed,
 *  and trimmed length) from the options.  The entity table is taken from
 *  the document's DOCTYPE when there is one; otherwise it stays
 *  <code>null</code>.
 *  </p>
 *
 *  @param  options     Options for processing.
 *  @param  document    Document to process.
 */
public XGParser( XGOptions options , Document document )
{
    this.options                = options;
    this.hMap                   = MapFactory.createNewMap();
    this.hmAttributes           = MapFactory.createNewMap();

    this.strLine                = null;
    this.boolDot                = false;
    this.intCpt                 = 0;
    this.intCountNonBlanks      = 0;
    this.intCountTags           = 0;
    this.strWord                = "";
    this.sbWord                 = new StringBuffer();
    this.intStrWordIndex        = 0;
    this.intStrWordLength       = 0;
    this.intID                  = 0;
    this.frCurrent              = null;
    this.brCurrent              = null;

    this.surroundMarker         = this.options.getSurroundMarker();
    this.surroundMarkerTrim     = this.surroundMarker.trim();
    this.surroundMarkerLength   = this.surroundMarkerTrim.length();

    this.nextAdornedWord        = 0;
    this.adornedWordDataList    = null;
    this.wordNodesCreated       = 0;

                                // Initialise the field itself.  The previous
                                // code declared a same-named local here,
                                // which shadowed the field and did nothing.
    this.adornerOutputter       = null;

                                // Entities are only available when the
                                // document carries a DOCTYPE.
    DocumentType docType        = document.getDoctype();

    if ( docType != null )
    {
        this.nnmEntities    = docType.getEntities();
    }
}
/** Replace the running word ID counter.
 *
 *  @param  runningWordID   The new value of the running word ID.
 */
public void setRunningWordID( int runningWordID )
{
    intID = runningWordID;
}
/** Get the running word ID.
 *
 *  @return     The current value of the running word ID counter.
 */
public int getRunningWordID()
{
    return this.intID;
}
/** Get number of adorned words.
 *
 *  @return     The value of the word-node counter maintained by
 *              <code>createNewNode</code>.
 */
public int getNumberOfAdornedWords()
{
    return this.wordNodesCreated;
}
/** Reads an integer (one character) from the adorner output.
 *
 *  <p>
 *  On the first call the adorner's output bytes are fetched from the
 *  outputter (which must be a <code>ByteStreamAdornedWordOutputter</code>)
 *  and wrapped in a UTF-8 reader.  Later calls read from that same
 *  buffered reader.
 *  </p>
 *
 *  @return     The next <code>int</code> in the output stream, as returned
 *              by <code>BufferedReader.read()</code>.
 *
 *  @throws IOException             If reading fails.
 *  @throws FileNotFoundException   Declared for compatibility.
 */
protected int read()
    throws IOException , FileNotFoundException
{
                                // Readers already set up: just read.
    if ( frCurrent != null )
    {
        return this.brCurrent.read();
    }
                                // First reading: build the reader chain
                                // over the adorner's output bytes.
    ByteStreamAdornedWordOutputter byteOutputter    =
        (ByteStreamAdornedWordOutputter)adornerOutputter;

    byte[] adornedBytes = byteOutputter.getBytes();

    ByteArrayInputStream byteStream =
        new ByteArrayInputStream( adornedBytes );

    frCurrent   = new UnicodeReader( byteStream , "utf-8" );
    brCurrent   = new BufferedReader( frCurrent );

                                // Now perform the actual read.
    return this.read();
}
/** Loads the next adorned word entry into the parser state.
 *
 *  <p>
 *  Clears the current attribute map and word text.  If another entry is
 *  left in the adorned word list, the field whose 1-based position equals
 *  the configured word field becomes the current word, and every field
 *  with a non-empty XML attribute name is stored in the attribute map.
 *  The running word ID is incremented and the in-word character index is
 *  reset even when the list is exhausted.
 *  </p>
 *
 *  @throws IOException             Declared for compatibility.
 *  @throws FileNotFoundException   Declared for compatibility.
 */
protected void getNextEntry()
    throws IOException , FileNotFoundException
{
    this.hmAttributes.clear();
    this.strWord    = "";

    boolean haveMoreWords   =
        adornedWordDataList.size() > this.nextAdornedWord;

    if ( haveMoreWords )
    {
        List fields = (List)adornedWordDataList.get( this.nextAdornedWord );

        this.nextAdornedWord++;

        int fieldIndex  = 0;

        while ( fieldIndex < fields.size() )
        {
            String fieldValue   = (String)fields.get( fieldIndex );

                                // The word field number is 1-based.
            if ( ( fieldIndex + 1 ) == this.options.getWordField() )
            {
                this.strWord    = fieldValue;
            }
                                // Fields without an attribute name are
                                // not written to the output.
            String attributeName    =
                MorphAdornerSettings.getXMLWordAttribute( fieldIndex );

            if ( attributeName.length() != 0 )
            {
                this.hmAttributes.put( attributeName , fieldValue );
            }

            fieldIndex++;
        }
    }
                                // Advance the ID and restart the
                                // character cursor within the word.
    this.intID              = this.intID + 1;
    this.intStrWordIndex    = 0;
    this.intStrWordLength   = this.strWord.length();
}
/** Extract text from <code>node</code>.
 *
 *  <p>
 *  Walks the children of <code>node</code> in document order and builds
 *  the reading-context text:
 *  </p>
 *
 *  <ul>
 *  <li>Text nodes are appended with each whitespace character mapped to
 *      a blank, and their non-blank character range is recorded through
 *      {@link #countNonBlankCharacters}.</li>
 *  <li>Entity references are expanded, replaced by a single blank, or
 *      rejected, depending on the entity options.</li>
 *  <li>Soft and hard tags are processed recursively in place; a surround
 *      marker is appended after a hard tag unless one was just added.</li>
 *  <li>Jump tags are skipped in the first pass and processed after all
 *      other children, each preceded by a surround marker.</li>
 *  </ul>
 *
 *  <p>
 *  The algorithm used to parse children (soft, jump, hard tags)
 *  is the same as that in {@link #modifyDOM}.
 *  </p>
 *
 *  @param  node    The <code>Node</code> to parse.
 *
 *  @return         A <code>StringBuffer</code> containing the
 *                  element text, taking reading context into account.
 *
 *  @throws IOException     Propagated from recursive calls.
 */
public StringBuffer extractText( Node node )
    throws IOException
{
    StringBuffer sbResult       = new StringBuffer();

                                // True when a surround marker should be
                                // emitted before the next hard tag.
    boolean boolInternDot       = false;

    NodeList nlChildren         = node.getChildNodes();
    int intChildNumber          = nlChildren.getLength();

                                // Child indices of the jump tags met in
                                // the first pass.
    Vector<Integer> vectorTempJumpTags  = new Vector<Integer>();

    for ( int i = 0 ; i < intChildNumber ; ++i )
    {
        Node nodeChild          = nlChildren.item( i );
        String strChildName     = nodeChild.getNodeName();

                                // ----- Entity reference -----

        if ( nodeChild instanceof EntityReference )
        {
            Entity entity   =
                (Entity)this.nnmEntities.getNamedItem( strChildName );

                                // Reference to an external file which
                                // the user has not asked us to ignore.
            if ( ( entity.getSystemId() != null ) &&
                 !this.options.getEntityIgnoreFiles() )
            {
                                // The output must be able to hold the
                                // external content: error if it can't.
                if ( !this.options.isOutputDirectory() &&
                     !this.options.getEntityMerging() )
                {
                    MorphAdornerLogger.logError
                    (
                        "Error: XML input " +
                        " contains external file entity " +
                        "references.\n Specified output should " +
                        "be a directory, or options " +
                        "xml.entities_not_files or " +
                        "xml.entities_merge " +
                        "should be set.\n"
                    );

                    System.exit( -1 );
                }

                sbResult.append( this.extractText( nodeChild ) );
            }
                                // Other entity: expand it on request ...
            else if ( this.options.getEntityTreatAll() )
            {
                sbResult.append( this.extractText( nodeChild ) );
            }
                                // ... else it is just a blank in the
                                // reading context.
            else
            {
                sbResult.append( " " );
            }

            continue;
        }
                                // ----- Text node -----

        if ( nodeChild instanceof Text )
        {
            String strText  =
                nodeChild.getNodeValue().replaceAll( "\\s" , " " );

                                // Records this text node's character
                                // range as a side effect.
            int nbChars     = countNonBlankCharacters( strText );

                                // The text is appended even when it is
                                // all blanks.
            sbResult.append( strText );

            if ( nbChars > 0 )
            {
                this.boolDot    = false;
            }
            else
            {
                boolInternDot   = true;
            }

            continue;
        }
                                // ----- Jump tag: defer -----

        if ( this.options.isJumpTag( strChildName ) )
        {
            vectorTempJumpTags.add( Integer.valueOf( i ) );
            continue;
        }
                                // ----- Soft or hard tag -----

        boolean boolSoftTag = this.options.isSoftTag( strChildName );

                                // A marker was requested earlier and this
                                // is a hard tag: emit the marker first.
        if ( boolInternDot && !boolSoftTag )
        {
            sbResult.append( surroundMarker );
            this.intCountNonBlanks  += surroundMarkerLength;
        }

        StringBuffer sbBuffer   = this.extractText( nodeChild );

                                // NOTE(review): this section used to be
                                // guarded by !sbBuffer.equals( "" ).
                                // StringBuffer does not override equals,
                                // so that test was always true.  The
                                // unconditional behaviour is kept because
                                // the character positions recorded here
                                // must match what was produced before.
                                // Confirm before changing it to a real
                                // emptiness check.
        sbResult.append( sbBuffer );

        if ( boolSoftTag )
        {
            boolInternDot   = true;
            this.boolDot    = false;
        }
                                // Not soft tag. Must be hard tag: close
                                // it with a marker unless one was just
                                // emitted.
        else
        {
            if ( !this.boolDot )
            {
                sbResult.append( surroundMarker );
                this.intCountNonBlanks  += surroundMarkerLength;
            }

            this.boolDot    = true;
            boolInternDot   = false;
        }
    }
                                // Second pass: treat the jump tags we
                                // skipped, each preceded by a marker.

    for ( int j = 0 ; j < vectorTempJumpTags.size() ; j++ )
    {
        Node nodeJump   =
            nlChildren.item( vectorTempJumpTags.get( j ).intValue() );

        this.intCountNonBlanks  += surroundMarkerLength;

        StringBuffer sbJumpText = this.extractText( nodeJump );

        sbResult.append( surroundMarker ).append( sbJumpText );
    }

    return sbResult;
}
/** Create new word element(s) from the pending word buffer.
 *
 *  <p>
 *  Takes the text accumulated in <code>sbWord</code>, optionally splits
 *  it on the special separator, and inserts one word element per piece
 *  before <code>nodeChild</code>.  The buffer is always emptied.  Nothing
 *  is created when the buffer is empty or contains the trimmed surround
 *  marker.
 *  </p>
 *
 *  <p>
 *  The special separator is treated as literal text, not as a regular
 *  expression, matching the literal comparison done in
 *  <code>modifyDOM</code>.
 *  </p>
 *
 *  @param  doc                 The document we're processing.
 *  @param  node                The current node we're processing
 *                              (parent of the new elements).
 *  @param  nodeChild           The child node we're processing; new
 *                              elements are inserted before it.
 *  @param  strCurrentPath      Current XML path to this node.
 *  @param  integerTagNumber    Integer tag number for path (number of
 *                              word elements already numbered under this
 *                              parent), or null if none yet.
 *
 *  @return                     # of string word elements generated.
 */
protected int createNewNode
(
    Document doc ,
    Node node ,
    Node nodeChild ,
    String strCurrentPath ,
    Integer integerTagNumber
)
{
                                // Do nothing if we don't have node text
                                // or the text contains the surround
                                // marker.
    if ( ( this.sbWord.length() == 0 ) ||
         ( this.sbWord.indexOf( surroundMarkerTrim ) >= 0 ) )
    {
        this.sbWord.setLength( 0 );
        return 0;
    }
                                // Take the pending text and empty the
                                // buffer right away.  Previously it was
                                // only cleared inside the loop below, so
                                // text made only of separators (which
                                // splits to zero pieces) was left behind
                                // and leaked into the next word.
    String strPendingWord   = this.sbWord.toString();

    this.sbWord.setLength( 0 );

                                // A special separator cuts the "word"
                                // (or expression).  Quote it so regex
                                // metacharacters such as "|" or "." are
                                // matched literally.
    String strSeparator     = this.options.getSpecialSeparator();
    String[] strArray;

    if ( strSeparator != null )
    {
        strArray    = strPendingWord.split( Pattern.quote( strSeparator ) );
    }
    else
    {
        strArray    = new String[]{ strPendingWord };
    }
                                // Count how many times a node has been
                                // requested for this word ID; more than
                                // one means the word is split.
    int splitCount  = 1;

    if ( splitWords.containsKey( this.intID ) )
    {
        splitCount  = splitWords.get( this.intID ) + 1;
    }

    splitWords.put( this.intID , splitCount );

                                // One element per separator-split piece.

    for ( int i = 0 ; i < strArray.length ; i++ )
    {
        Element elementNewTag   =
            doc.createElement( this.options.getWordTagName() );

        Text newText            = doc.createTextNode( strArray[ i ] );

                                // Word ID attribute.
        if ( this.options.getWriteIds() )
        {
            elementNewTag.setAttribute
            (
                this.options.getIdArgumentName() ,
                String.valueOf( this.intID )
            );
        }
                                // Path attribute (odd write-path values).
                                // NOTE(review): File.separator is platform
                                // dependent, so the path text differs
                                // between operating systems -- confirm
                                // this is intended.
        if ( this.options.getWritePath() % 2 == 1 )
        {
            if ( integerTagNumber == null )
            {
                integerTagNumber    = 1;
            }
            else
            {
                ++integerTagNumber;
            }

            elementNewTag.setAttribute
            (
                this.options.getWordPathArgumentName() ,
                strCurrentPath + File.separator +
                    this.options.getWordTagName() + "[" +
                    integerTagNumber.toString() + "]"
            );
        }
                                // Adornment attributes go on the first
                                // piece only, unless repetition was
                                // requested.
        if ( ( i == 0 ) || this.options.repeatAttributes() )
        {
            for ( Map.Entry< String , String > entry :
                    hmAttributes.entrySet() )
            {
                elementNewTag.setAttribute
                (
                    entry.getKey() ,
                    entry.getValue()
                );
            }
        }
                                // Insert new tag.
        elementNewTag.appendChild( newText );
        node.insertBefore( elementNewTag , nodeChild );
    }
                                // Counted once per word, however many
                                // pieces the separator produced.
    wordNodesCreated++;

    return strArray.length;
}
/** Clone a node and its sub-elements.
 *
 *  <p>
 *  The node is first copied without children, then each child is cloned
 *  recursively and appended to the copy.  If appending raises a
 *  <code>DOMException</code>, the result falls back to a deep clone made
 *  by the DOM implementation.
 *  </p>
 *
 *  @param  node    The <code>Node</code> to clone.
 *
 *  @return         The cloned <code>Node</code>.
 */
protected static Node cloneNode( Node node )
{
    Node copy               = node.cloneNode( false );
    NodeList sourceChildren = node.getChildNodes();
    int childCount          = sourceChildren.getLength();

    try
    {
        int index   = 0;

        while ( index < childCount )
        {
            Node childCopy  = cloneNode( sourceChildren.item( index ) );

            copy.appendChild( childCopy );

            index++;
        }
    }
                                // Manual child-by-child cloning failed:
                                // let the DOM do a deep clone instead.
    catch ( org.w3c.dom.DOMException e )
    {
        copy    = node.cloneNode( true );
    }

    return copy;
}
/** Clone a read-only EntityReference into a writable Node.
 *
 *  <p>
 *  A new element named <code>entityReferenceRoot</code> is created in
 *  <code>doc</code> and receives a clone of each child of the entity
 *  reference.  The new element is not attached to the document tree.
 *  </p>
 *
 *  @param  er      The <code>EntityReference</code> to clone.
 *  @param  doc     The parent <code>Document</code>.
 *
 *  @return         A <code>Node</code> containing writable copies of
 *                  the same sub-elements as <code>er</code>.
 */
protected Node cloneEntityReference
(
    EntityReference er ,
    Document doc
)
{
    Node writableRoot       = doc.createElement( "entityReferenceRoot" );
    NodeList entityChildren = er.getChildNodes();
    int childCount          = entityChildren.getLength();
    int index               = 0;

    while ( index < childCount )
    {
        Node childCopy  = XGParser.cloneNode( entityChildren.item( index ) );

        writableRoot.appendChild( childCopy );

        index++;
    }

    return writableRoot;
}
/** Modify the children of <code>node</code> to add adornments and remove
* the initial text nodes.
*
* <p>
* Each text child is replaced by word elements built from the adorned
* word entries (see <code>getNextEntry</code> and
* <code>createNewNode</code>), using the character ranges recorded in
* <code>hMap</code> during text extraction. Soft and hard tags are
* processed recursively in place; jump tags are processed after all the
* other children. A DOCTYPE child is replaced by two comments.
* </p>
*
* @param node The <code>Node</code> to parse.
* @param doc The <code>Document</code> to modify.
* @param strCurrentPath The XPath of the last <code>Node</code>
* explored.
*
* @return Modified <code>Document</code>.
*
* @throws DOMException If a DOM update fails.
* @throws IOException Propagated from <code>getNextEntry</code> and
* recursive calls.
*
* <p>
* The algorithm used to parse children (soft, jump, hard tags)
* is the same as used in {@link #extractText}.
* </p>
*/
public Document modifyDOM
(
Node node ,
Document doc ,
String strCurrentPath
)
throws DOMException , IOException
{
// NOTE(review): strText is assigned below but never read in this method.
String strText = null;
int intBegin;
int intEnd;
// Child list.
NodeList nlChildren = node.getChildNodes();
Node nodeChild;
String strNodeChildName;
String strNewPath = null;
// The child count and the loop index i are adjusted by hand below
// every time children are inserted or removed.
int intChildNumber = nlChildren.getLength();
int i , t;
Integer integerTagNumber;
// NOTE(review): boolConsiderAsAnElement is never read in this method.
boolean boolConsiderAsAnElement = false;
// Path reminder: per element name, the number of children of this
// node already numbered for path generation.
Map< String , Integer > hmPaths = MapFactory.createNewMap();
// Result.
// NOTE(review): sbNew is never used in this method.
StringBuilder sbNew;
// Jump tag number in the children list.
Vector<Integer> vectorTempJumpTags =
new Vector<Integer>();
for ( i = 0 ; i < intChildNumber ; ++i )
{
// Child.
nodeChild = nlChildren.item( i );
strNodeChildName = nodeChild.getNodeName();
// DTD description (element DOCTYPE):
// to be removed!
if ( nodeChild instanceof DocumentType )
{
Comment comment1 =
doc.createComment(
"Document Type Description element (DOCTYPE \"" +
nodeChild.getNodeName() +
"\") has been removed. " );
Comment comment2 =
doc.createComment(
"To build a correct DTD for this document, " +
"change all #PCDATA into '" +
this.options.getWordTagName() +
"' element, containing #PCDATA." );
node.insertBefore( comment1 , nodeChild );
node.insertBefore( comment2 , nodeChild );
node.removeChild( nodeChild );
// Two children added (comments),
// one removed (DOCTYPE) =>
// one more child!
++i;
++intChildNumber;
MorphAdornerLogger.logError
(
" *** Element DOCTYPE (\"" + nodeChild.getNodeName() +
"\") removed in the output (out of date) *** "
);
continue;
}
// NOTE(review): boolT is not referenced again in this method.
boolean boolT = false;
// Child is an entity reference.
if ( nodeChild instanceof EntityReference )
{
// NOTE(review): nnmEntities is null when the document has no
// DOCTYPE, and getNamedItem may return null; neither case is
// checked before entity is dereferenced below.
Entity entity =
(Entity)this.nnmEntities.getNamedItem(
strNodeChildName );
// If it is not a reference to an external
// file =>
// If the user has asked to treat it
// (--entities_treat_all) =>
// add all of its children to the
// tree.
if ( entity.getSystemId() == null )
{
if ( this.options.getEntityTreatAll() )
{
Node nodeClone =
this.cloneEntityReference(
(EntityReference)nodeChild , doc );
NodeList nlGrandChildren =
nodeClone.getChildNodes();
int intGrandChildrenNumber =
nlGrandChildren.getLength();
// NOTE(review): each insert/append below moves a node out of
// nodeClone. If nlGrandChildren is a live list (as DOM NodeLists
// normally are), indexing it with an increasing counter while it
// shrinks may skip nodes or yield null -- verify.
for ( int intGrandChild = 0 ;
intGrandChild < intGrandChildrenNumber;
++intGrandChild
)
{
if ( i != ( intChildNumber - 1 ) )
{
// NOTE(review): nodeClone is created detached by
// cloneEntityReference, so getNextSibling() here looks like it is
// always null (making this an append). Confirm that
// nodeChild.getNextSibling() was not intended.
node.insertBefore(
nlGrandChildren.item( intGrandChild ) ,
nodeClone.getNextSibling() );
}
else
{
node.appendChild(
nlGrandChildren.item( intGrandChild ) );
}
++intChildNumber;
}
// Drop the entity reference itself and revisit this index.
node.removeChild( nodeChild );
--intChildNumber;
--i;
}
continue;
}
// If it is a reference to an external
// file ...
else
{
// If the user has not asked to ignore
// this kind of file.
if ( !this.options.getEntityIgnoreFiles() )
{
// If the user has not set the proper
// options, an error should have already
// been raised by extractText.
// But do it again.
if ( !this.options.isOutputDirectory() &&
!this.options.getEntityMerging()
)
{
MorphAdornerLogger.logError
(
"Error: XML output file " +
" contains some external file " +
"entity references.\n " +
"Specified output should be a " +
"directory."
);
System.exit( -1 );
}
// Recursive modifyDOM call on the tag
// and update.
else
{
// As an EntityReference is readonly,
// we have to clone the Node.
Node nodeClone =
this.cloneEntityReference(
(EntityReference)nodeChild , doc );
doc =
this.modifyDOM( nodeClone , doc ,
strCurrentPath );
// If the entities should be merged.
if ( this.options.getEntityMerging() )
{
NodeList nlNewGrandChildren =
nodeClone.getChildNodes();
int intNewGrandChildrenNumber =
nlNewGrandChildren.getLength();
// Comment to begin.
Node nodeComment =
doc.createComment
(
" ++ " + nodeChild.getNodeName() +
" ++ Here begins the content of " +
" entity " +
nodeChild.getNodeName() +
" inserted here in place of " +
"a reference to this entity in " +
" the original document."
);
node.insertBefore
(
nodeComment ,
nodeChild
);
++i;
++intChildNumber;
// Copy content.
// NOTE(review): same concern as above -- nodes are moved out of
// nodeClone while nlNewGrandChildren is indexed by an increasing
// counter; verify against live NodeList behaviour.
for ( int intGrandChild = 0 ;
intGrandChild <
intNewGrandChildrenNumber ;
++intGrandChild
)
{
node.insertBefore
(
nlNewGrandChildren.item(
intGrandChild ) ,
nodeChild
);
++i;
++intChildNumber;
}
// Comment to end (this comment
// (+ 1 child) and the child removal
// (- 1 child) = nothing.
node.insertBefore
(
doc.createComment
(
" -- " + nodeChild.getNodeName() +
" -- End of entity " +
nodeChild.getNodeName() ) ,
nodeChild
);
node.removeChild( nodeChild );
}
// if output is a directory
// (already checked) =>
// write a separate file.
//
// Note: Should never get here
// in MorphAdorner.
else
{
/*
String strFileName = entity.getSystemId();
strFileName =
new File( entity.getSystemId() ).getName();
// Print result.
XGMisc.printNodeList
(
nodeClone.getChildNodes() ,
"<!-- File referenced by an XML " +
"well-formed document -->\n" ,
options.getOutputFileName() +
File.separator + strFileName
);
MorphAdornerLogger.logInfo
(
"\nExternal file referenced written in " +
options.getOutputFileName() +
File.separator + strFileName + "\n"
);
*/
MorphAdornerLogger.logError
(
"Internal error: attempted to write " +
"secondary XML output file."
);
}
}
}
continue;
}
}
// Text node.
if ( nodeChild instanceof Text )
{
// Text with normalized blanks.
strText =
nodeChild.getNodeValue().replaceAll( "\\s" , " " );
// Number of the text tag.
++( this.intCountTags );
// Find entry for tag in hash map.
// NOTE(review): no null check -- this assumes an entry was recorded
// for every text node number during extraction.
XGPair pairResult =
this.hMap.get( new Integer( this.intCountTags ) );
// Find where tag's text starts
// and ends.
intBegin = pairResult.begin;
intEnd = pairResult.end;
// Skip surround markers: while we are before the start of this
// text node, consume adorned entries equal to the trimmed marker.
while ( this.intCpt < intBegin )
{
if ( !strWord.equals( surroundMarkerTrim ) )
{
break;
}
this.getNextEntry();
this.intCpt++;
}
// Loop on nonblank characters.
while ( this.intCpt < intEnd )
{
// Append word text if any.
if ( !this.strWord.equals( "" ) )
{
this.sbWord.append
(
this.strWord.charAt( this.intStrWordIndex )
);
}
// If we are at the end of a word ...
if ( this.intStrWordIndex >=
( this.intStrWordLength - 1 )
)
{
// Create a new node.
// If paths should be added ...
if ( this.options.getWritePath() % 2 == 1 )
{
integerTagNumber =
hmPaths.get
(
this.options.getWordTagName()
);
t =
this.createNewNode
(
doc ,
node ,
nodeChild ,
strCurrentPath ,
integerTagNumber
);
// Remember how many word elements have been numbered so far.
if ( integerTagNumber != null )
{
hmPaths.put
(
this.options.getWordTagName() ,
integerTagNumber + t
);
}
else
{
hmPaths.put
(
this.options.getWordTagName() ,
new Integer( t )
);
}
}
// If no path has been requested,
// create node with no path.
else
{
t =
this.createNewNode
(
doc ,
node ,
nodeChild ,
null ,
0
);
}
// t new elements were inserted before nodeChild: keep the child
// count and our index (which must stay on nodeChild) in step.
intChildNumber += t;
i += t;
// Get next adornment entry.
this.getNextEntry();
}
// If not the end of the word,
// increment the character index
// in the word.
else
{
++( this.intStrWordIndex );
}
// If next characters correspond to the
// special separator, intCpt should not
// follow!
if ( this.options.getSpecialSeparator() != null )
{
if ( this.strWord.length() >=
( this.intStrWordIndex +
this.options.getSpecialSeparator().length()
)
)
{
if ( this.strWord.substring
(
this.intStrWordIndex ,
this.intStrWordIndex +
this.options.getSpecialSeparator().
length() ).equals(
this.options.getSpecialSeparator()
)
)
{
// The separator is copied into the word buffer (createNewNode
// splits on it later) and skipped in the adorned word.
this.sbWord.append(
this.options.getSpecialSeparator() );
this.intStrWordIndex +=
this.options.getSpecialSeparator().length();
}
}
}
++( this.intCpt );
}
// If a word has been found (usual case).
// The buffer still holds text here when the text node ended before
// the current adorned word was complete.
if ( this.sbWord.length() > 0 )
{
// Create a new node.
// Should paths should be added?
if ( this.options.getWritePath() % 2 == 1 )
{
integerTagNumber =
hmPaths.get
(
this.options.getWordTagName()
);
t =
this.createNewNode
(
doc ,
node ,
nodeChild ,
strCurrentPath ,
integerTagNumber
);
if ( integerTagNumber != null )
{
hmPaths.put
( this.options.getWordTagName() ,
integerTagNumber + t
);
}
else
{
hmPaths.put
(
this.options.getWordTagName() ,
new Integer( t )
);
}
}
// If no path has been requested.
else
{
t =
this.createNewNode( doc , node ,
nodeChild , null , 0 );
}
intChildNumber += t;
i += t;
}
// If we have seen all text contained
// in the tag.
if ( this.intCpt >= intEnd )
{
// Delete old child text node.
node.removeChild( nodeChild );
--intChildNumber;
--i;
}
}
// Child is not text.
else
{
// Not a jump tag.
if ( !this.options.isJumpTag( strNodeChildName ) )
{
// Path.
if ( this.options.getWritePath() > 0 )
{
integerTagNumber =
hmPaths.get( strNodeChildName );
if ( integerTagNumber == null )
{
integerTagNumber = 1;
}
else
{
++integerTagNumber;
}
// NOTE(review): File.separator is platform dependent, so the
// generated path text differs between operating systems -- confirm
// this is intended.
strNewPath =
strCurrentPath + File.separator +
strNodeChildName +
"[" + integerTagNumber.toString() + "]";
if ( this.options.getWritePath() >= 2 )
{
// NOTE(review): unchecked cast -- assumes every non-text,
// non-entity child reaching this point is an Element.
( (Element)nodeChild ).setAttribute
(
this.options.getTagsPathArgumentName() ,
strNewPath
);
}
hmPaths.put
(
strNodeChildName ,
integerTagNumber
);
}
// If hard tag or soft tag,
// pursue treatment.
doc =
this.modifyDOM( nodeChild , doc , strNewPath );
}
// Jump tag.
else
{
// Skip jump tag, but remember we did.
vectorTempJumpTags.add( new Integer( i ) );
}
}
}
// All the children have been passed
// and there was a jump tag.
if ( !vectorTempJumpTags.isEmpty() )
{
// Treat all jump tag numbers.
// NOTE(review): the stored indices were taken during the first pass;
// children inserted or removed after a jump tag was recorded are not
// reflected in them -- verify they still address the jump tags.
for ( int j = 0 ; j < vectorTempJumpTags.size() ; j++ )
{
nodeChild =
nlChildren.item
(
vectorTempJumpTags.get( j ).intValue()
);
strNodeChildName = nodeChild.getNodeName();
// Path.
// NOTE(review): this tests ">= 0" whereas the corresponding test
// for non-jump tags above is "> 0" -- confirm which is intended.
if ( this.options.getWritePath() >= 0 )
{
integerTagNumber = hmPaths.get( strNodeChildName );
if ( integerTagNumber == null )
{
integerTagNumber = 1;
}
else
{
++integerTagNumber;
}
strNewPath =
strCurrentPath + File.separator +
strNodeChildName +
"[" + integerTagNumber.toString() + "]";
if ( this.options.getWritePath() >= 2 )
{
( (Element)nodeChild ).setAttribute
(
this.options.getTagsPathArgumentName() ,
strNewPath
);
}
hmPaths.put( strNodeChildName , integerTagNumber );
}
// NOTE(review): extractText adds surroundMarkerLength at the
// matching point, while this adds 1 -- confirm.
this.intCountNonBlanks++;
// Recursively call modifyDOM on the
// jump tag node.
doc = this.modifyDOM( nodeChild , doc , strNewPath );
}
}
return doc;
}
/** Count non-blank characters in a <code>String</code> and
 *  update the tag <code>HashMap</code>.
 *
 *  <p>
 *  Each call advances the text-node counter and stores, under that
 *  counter, the pair (first position, last position) of the string's
 *  non-blank characters within the running non-blank character count.
 *  For an all-blank string both positions equal the current running
 *  count.
 *  </p>
 *
 *  <p>
 *  strString should have all whitespace characters mapped to
 *  blanks before this method is called.
 *  </p>
 *
 *  @param  strString   The text to analyze.
 *
 *  @return             Number of non-blank characters in strString.
 *
 *  @throws IOException Declared for compatibility.
 */
protected int countNonBlankCharacters( String strString )
    throws IOException
{
                                // This is a new text node.
    this.intCountTags   = this.intCountTags + 1;

                                // Tally everything that is not a blank.
    int nonBlanks       = 0;

    for ( char ch : strString.toCharArray() )
    {
        if ( ch != ' ' )
        {
            nonBlanks++;
        }
    }
                                // Position of the first non-blank
                                // character, if there is one.
    int firstPosition   =
        ( nonBlanks > 0 ) ?
            ( this.intCountNonBlanks + 1 ) : this.intCountNonBlanks;

                                // Position of the last character.
    int lastPosition    = this.intCountNonBlanks + nonBlanks;

    this.intCountNonBlanks  = lastPosition;

                                // Record the range for this text node.
    hMap.put
    (
        Integer.valueOf( this.intCountTags ) ,
        new XGPair( firstPosition , lastPosition )
    );

    return nonBlanks;
}
/** Extract text from DOM document.
 *
 *  <p>
 *  Creates a new parser for the document and runs text extraction over
 *  the whole document node.
 *  </p>
 *
 *  @param  options     The processing options.
 *  @param  document    The document to process.
 *
 *  @return             Two element object array.
 *                      result[ 0 ] = reading context text
 *                      (a <code>String</code>).
 *                      result[ 1 ] = XGParser instance.
 *
 *  @throws IOException Propagated from text extraction.
 */
public static Object[] extractText
(
    XGOptions options ,
    Document document
)
    throws IOException
{
                                // Start document treatment.
    XGParser instance   = new XGParser( options , document );

                                // Extract text of reading context.
    StringBuffer sbText = instance.extractText( document );

                                // Slot 0 holds the text and slot 1 the
                                // parser.  The earlier Javadoc listed
                                // them the other way round.
    Object[] result     = new Object[ 2 ];

    result[ 0 ]         = sbText.toString();
    result[ 1 ]         = instance;

    return result;
}
/** Merge adornments with original XML text.
 *
 *  <p>
 *  Rewinds the parser to the first adorned word, rewrites the DOM so
 *  that each word carries its adornments, writes the result to a
 *  temporary file, and hands that file to the inputter as the new text
 *  of the segment when the write reports success.
 *  </p>
 *
 *  @param  options         XGTagger options.
 *  @param  instance        XGParser instance.
 *  @param  document        Document being processed.
 *  @param  segmentName     Name of document segment being processed.
 *  @param  outputter       Adorned word outputter.  Must be a
 *                          <code>ListAdornedWordOutputter</code>.
 *  @param  inputter        Text inputter.
 *
 *  @return                 Map of (word id, # of word parts)
 *                          for words split by soft or jump tags.
 *
 *  @throws IOException     If the temporary file cannot be created or
 *                          the merge fails.
 */
public static Map<Integer, Integer> mergeAdornments
(
    XGOptions options ,
    XGParser instance ,
    Document document ,
    String segmentName ,
    AdornedWordOutputter outputter ,
    TextInputter inputter
)
    throws IOException
{
                                // Point the parser at the adorner results
                                // and rewind its counters.
    instance.adornerOutputter   = outputter;
    instance.intCountTags       = 0;
    instance.nextAdornedWord    = 0;

    ListAdornedWordOutputter listOutputter  =
        (ListAdornedWordOutputter)outputter;

    instance.adornedWordDataList    =
        listOutputter.getAdornedWordDataList();

                                // Prime the parser with the first word.
    instance.getNextEntry();

                                // Update the DOM with the adorner output.
    Document adornedDocument    =
        instance.modifyDOM( document , document , "" );

                                // Write the updated DOM as XML text to a
                                // temporary file.
    File tempFile       = File.createTempFile( "mad" , null );

    tempFile.deleteOnExit();

    int printStatus     =
        XGMisc.printNodeToFile
        (
            adornedDocument ,
            tempFile.getAbsolutePath()
        );

                                // A status of 1 means the file may be
                                // used as the segment's new text.
    if ( printStatus == 1 )
    {
        inputter.setSegmentText( segmentName , tempFile );
    }

    return instance.splitWords;
}
/** Create DOM from XML text.
 *
 *  <p>
 *  The text is parsed with entity reference expansion turned off.  Parser
 *  configuration and SAX parsing errors are not thrown: their message is
 *  printed to standard output and <code>null</code> is returned.
 *  </p>
 *
 *  @param  options     The processing options.
 *  @param  xmlText     The XML text.
 *
 *  @return             DOM for document, or <code>null</code> if the
 *                      text could not be parsed.
 *
 *  @throws IOException If reading the text fails.
 */
public static Document textToDOM
(
    XGOptions options ,
    String xmlText
)
    throws IOException
{
    try
    {
                                // Create a factory of DOM builders which
                                // keeps entity references unexpanded.
        DocumentBuilderFactory builderFactory   =
            DocumentBuilderFactory.newInstance();

        builderFactory.setExpandEntityReferences( false );

        DocumentBuilder domBuilder  = builderFactory.newDocumentBuilder();

        StringReader textReader     = new StringReader( xmlText );
        InputSource xmlSource       = new InputSource( textReader );

        return domBuilder.parse( xmlSource );
    }
    catch ( ParserConfigurationException pce )
    {
        System.out.println( pce.getMessage() );
    }
    catch ( SAXException se )
    {
        System.out.println( se.getMessage() );
    }
                                // Only reached after a reported error.
    return null;
}
}