package edu.northwestern.at.morphadorner.tools.namedentities; /* Please see the license information at the end of this file. */ import java.io.*; import java.net.*; import java.text.*; import java.util.*; import java.util.regex.*; import org.w3c.dom.*; import org.w3c.dom.traversal.*; import edu.northwestern.at.utils.*; import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*; import edu.northwestern.at.utils.gate.*; import edu.northwestern.at.utils.xml.*; /** Adorn XML files with named entities. * * <p> * AdornWithNamedEntities adorns texts with named entities such as person, * location, time, date, and organization. * </p> * <p> * Usage: * </p> * <blockquote> * <pre> * java edu.northwestern.at.morphadorner.tools.namedentities.AdornWithNamedEntities outputdirectory input1.xml input2.xml ... * </pre> * </blockquote> * <p> * outputdirectory -- output directory to receive xml files adorned with named entities.<br /> * input*.xml -- input TEI XML files.<br /> * </p> * <p> * Note: The named entity adorner does not always recognize entities which cross soft tags. * Thus "<hi>Emma</hi> Woodhouse" may be recognized as two separate entities. * AdornedWithNamedEntities should be run on the input files before their * submission to MorphAdorner. * </p> */ public class AdornWithNamedEntities { /** DOM document. */ protected static Document document; /** # params before input file specs. */ protected static final int INITPARAMS = 1; /** Number of documents to process. */ protected static int docsToProcess = 0; /** Current document. */ protected static int currentDocNumber = 0; /** Output directory. */ protected static String outputDirectory; /** Annie annotator. */ protected static Annie annie; /** Fixups list resource URL. */ protected static String fixupsURL = "resources/fixups.txt"; /** Fixups list. */ protected static List<PatternReplacer> fixupsList = ListFactory.createNewList(); /** TEI header element pattern. */ protected static final String teiHeaderPattern = "tei|tei\\.2|TEI|TEI\\.2"; /** Main program. * * @param args Program parameters. */ public static void main( String[] args ) { // Initialize. if ( !initialize( args ) ) { System.exit( 1 ); } // Process all files. long startTime = System.currentTimeMillis(); int filesProcessed = processFiles( args ); long processingTime = ( System.currentTimeMillis() - startTime + 999 ) / 1000; // Terminate. terminate( filesProcessed , processingTime ); } /** Initialize. */ protected static boolean initialize( String[] args ) { boolean result = false; // See if we have enough parameters. if ( args.length < 2 ) { System.out.println( "Not enough parameters." ); return result; } // Get the output directory. outputDirectory = args[ 0 ]; // Load fixups. result = loadFixups(); // Initialize Annie. if ( result ) { try { annie = new Annie(); result = true; } catch ( Exception e ) { e.printStackTrace(); } } return result; } /** Load fixup definitions. */ protected static boolean loadFixups() { TextFile fixupsFile = new TextFile ( AdornWithNamedEntities.class.getResourceAsStream( fixupsURL ) , "utf-8" ); String[] fixups = fixupsFile.toArray(); for ( int i = 0 ; i < fixups.length ; i++ ) { String fixupLine = fixups[ i ].trim(); if ( ( fixupLine.length() > 0 ) && ( fixupLine.charAt( 0 ) != '#' ) ) { String[] fixup = fixupLine.split( "\t" ); if ( fixup.length == 2 ) { fixupsList.add ( new PatternReplacer( fixup[ 0 ] , fixup[ 1 ] ) ); } } } return true; } /** Process one file. * * @param xmlFileName XML input file name. */ protected static void processOneFile( String xmlFileName ) { // Extract words from input text. currentDocNumber++; System.out.println( "Processing " + xmlFileName + " (" + currentDocNumber + "/" + docsToProcess + ")" ); // Load the XML document. try { long startTime = System.currentTimeMillis(); // Parse the XML text to a DOM tree. document = DOMUtils.parse( xmlFileName ); long processingTime = ( System.currentTimeMillis() - startTime + 999 ) / 1000; System.out.println ( " Document loaded and parsed in " + Formatters.formatLongWithCommas ( processingTime ) + StringUtils.pluralize ( processingTime , " second." , " seconds." ) ); // Find parent node for text node(s). Node textRoot = findTextNodesParent( document ); // Get text children. List<Node> textRootChildren = DOMUtils.findChildren( textRoot , "text|TEXT" ); startTime = System.currentTimeMillis(); // Traverse each text child and // adorn each with named entities. for ( int i = 0 ; i < textRootChildren.size() ; i++ ) { traverse( textRootChildren.get( i ) ); } // Convert adorned DOM document // to text string. String docText = DOMUtils.saveToString( document ); // Split document text string into // header and body. String[] docParts = splitDocumentText ( docText , "</teiHeader>|</temphead>|</TEMPHEAD>|</tempHead>" ); // Clean up entity references. docParts[ 1 ] = docParts[ 1 ].replaceAll( "<" , "<" ); docParts[ 1 ] = docParts[ 1 ].replaceAll( ">" , ">" ); // Apply fixups to text body only. docParts[ 1 ] = applyFixups( docParts[ 1 ] ); // Put document back together. docText = docParts[ 0 ] + docParts[ 1 ]; // Report processing time. processingTime = ( System.currentTimeMillis() - startTime + 999 ) / 1000; System.out.println ( " Named entities added in " + Formatters.formatLongWithCommas ( processingTime ) + StringUtils.pluralize ( processingTime , " second." , " seconds." ) ); // Save updated text to file. String outputFileName = new File( outputDirectory , FileNameUtils.stripPathName( xmlFileName ) ).getCanonicalPath(); FileUtils.createPathForFile( outputFileName ); FileUtils.writeTextFile( outputFileName , false , docText , "utf-8" ); } catch ( Exception e ) { e.printStackTrace(); System.out.println( " *** Failed" ); } } /** Process files. */ protected static int processFiles( String[] args ) { int result = 0; // Get file name/file wildcard specs. String[] wildCards = new String[ args.length - INITPARAMS ]; for ( int i = INITPARAMS ; i < args.length ; i++ ) { wildCards[ i - INITPARAMS ] = args[ i ]; } // Expand wildcards to list of // file names, String[] fileNames = FileNameUtils. expandFileNameWildcards( wildCards ); docsToProcess = fileNames.length; // Process each file. for ( int i = 0 ; i < fileNames.length ; i++ ) { processOneFile( fileNames[ i ] ); } return fileNames.length; } /** Terminate. * * @param filesProcessed Number of files processed. * @param processingTime Processing time in seconds. */ protected static void terminate ( int filesProcessed , long processingTime ) { System.out.println ( "Processed " + Formatters.formatIntegerWithCommas ( filesProcessed ) + StringUtils.pluralize ( processingTime , " file in " , " files in " ) + Formatters.formatLongWithCommas ( processingTime ) + StringUtils.pluralize ( processingTime , " second." , " seconds." ) ); } /** Traverse DOM tree and fix quotes. * * @param node Root node of tree. */ protected static void traverse( Node node ) { // Process child nodes. NodeList children = node.getChildNodes(); if ( children != null ) { for ( int i = 0 ; i < children.getLength() ; i++ ) { traverse( children.item( i ) ); } } // Get this node's type. int type = node.getNodeType(); // If we have a text node, // extract its text and annotate the // named entities. if ( type == Node.TEXT_NODE ) { Text textNode = (Text)node; // Get node text. String text = textNode.getData(); // If we have at least one character // of text ... if ( ( text != null ) && ( text.length() > 0 ) ) { // Update node text with revised // quotes. text = addNamedEntities( text ); if ( text != null ) { textNode.setData( text ); } } } } /** Adorn text with named entities. * * @param text The text. * * @return The adorned text. * Null if annotation could not be done. */ protected static String addNamedEntities( String text ) { return annie.adornText( text ); } /** Apply fixups. * * @param text The text to which to apply fixups. * * @return The text after applying fixups. */ protected static String applyFixups( String text ) { String result = text; // Apply basic character entity fixups. result = result.replaceAll( "&(\\w+);" , "&$1;" ); result = result.replaceAll( "'" , "'" ); result = result.replaceAll( "<" , "<" ); result = result.replaceAll( ">" , ">" ); result = result.replaceAll( """ , "\"" ); // Apply other fixups. for ( int i = 0 ; i < fixupsList.size() ; i++ ) { PatternReplacer fixup = fixupsList.get( i ); result = fixup.replace( result ); } return result; } /** Split document text. * * @param docText The document text. * * @param splitString The regular expression string at which to * split the document. * If this appears more than once, the * document is split at the first appearance. * * @return Two element string array. * [0] = document text up to * first appearance of split string. * Empty if split string not found. * [1] = document text right after start of split * string through end of document. */ protected static String[] splitDocumentText ( String docText , String splitString ) { String[] result = new String[ 2 ]; Matcher matcher = Pattern.compile( splitString ).matcher( docText ); if ( matcher.find() ) { int splitIndex = matcher.start(); result[ 0 ] = docText.substring( 0 , splitIndex ); result[ 1 ] = docText.substring( splitIndex ); } else { result[ 0 ] = ""; result[ 1 ] = docText; } return result; } /** Find parent of text nodes in a DOM document. * * @param document The document. * * @return Node which is parent of the text nodes. */ protected static Node findTextNodesParent( Document document ) { // Get root element of document. Element rootNode = document.getDocumentElement(); // Look for TEI node of some kind. Element teiNode; if ( rootNode.getTagName().matches( teiHeaderPattern ) ) { teiNode = rootNode; } else { teiNode = DOMUtils.findChild( rootNode , teiHeaderPattern ); } // Look for EEBO node. Element eeboNode = DOMUtils.findChild( rootNode , "eebo|EEBO" ); Element groupTextRoot = null; // See if we have EEBO GROUP child node. if ( eeboNode != null ) { groupTextRoot = DOMUtils.findChild( eeboNode, "group|GROUP" ); } // If TEI, text node parent is // TEI node. // If EEBO, text node parent is either // EEBO or GROUP. Element textParent = null; if ( groupTextRoot != null ) { textParent = groupTextRoot; } else { textParent = eeboNode; if ( textParent == null ) { textParent = teiNode; } } // Return parent for text nodes. return textParent; } /** Allow overrides but not instantiation. */ protected AdornWithNamedEntities() { } } /* Copyright (c) 2008, 2009 by Northwestern University. All rights reserved. Developed by: Academic and Research Technologies Northwestern University http://www.it.northwestern.edu/about/departments/at/ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. * Neither the names of Academic and Research Technologies, Northwestern University, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. */