package edu.northwestern.at.morphadorner.tools.fixquotes; /* Please see the license information at the end of this file. */ import java.io.*; import java.net.*; import java.text.*; import java.util.*; import java.util.regex.*; import edu.northwestern.at.utils.*; import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*; import edu.northwestern.at.utils.xml.*; import org.w3c.dom.*; import org.w3c.dom.traversal.*; /** Fix quotes in TEI XML files. * * <p> * Usage: * </p> * <blockquote> * <pre> * java edu.northwestern.at.morphadorner.tools.fixquotes.FixXMLQuotes softtags.txt jumptags.txt outputdirectory input1.xml input2.xml ... * </pre> * </blockquote> * <p> * softtags.txt -- text file containing list of soft XML tags, one per line.<br /> * jumptags.txt -- text file containing list of jump XML tags, one per line.<br /> * outputdirectory -- output directory to receive xml files with quotes fixed.<br /> * input*.xml -- input TEI XML files.<br /> * </p> * * <p> * Since the "quotification" relies on heuristics, not all quotes will be * converted correctly. * </p> */ public class FixXMLQuotes { /** DOM document. */ protected static Document document; /** # params before input file specs. */ protected static final int INITPARAMS = 3; /** Number of documents to process. */ protected static int docsToProcess = 0; /** Current document. */ protected static int currentDocNumber = 0; /** Output directory. */ protected static String outputDirectory; /** Contractions. */ protected static TaggedStrings contractions; /** Pattern matcher for matching contractions. */ protected static Matcher contractionsMatcher; /** Left single quote replacement text. */ protected static final String lsquo = "‘"; /** Left double quote replacement text. */ protected static final String ldquo = "“"; /** Right single quote replacement text. */ protected static final String rsquo = "’"; /** Right double quote replacement text. */ protected static final String rdquo = "”"; /** Apostrophereplacement text. */ protected static final String apos = "'"; /** Temporary single quote marker. */ protected static final String sq = "\uE060"; /** Temporary double quote marker. */ protected static final String dq = "\uE061"; /** Temporary apostrophe marker. */ protected static final String ap = "\uE062"; /** Previous character of last text segment. */ protected static String prevChar = " "; /** Soft tags. */ protected static Set<String> softTags; /** Jump tags. */ protected static Set<String> jumpTags; /** True for debugging output. */ protected static boolean debug = false; /** Main program. * * @param args Program parameters. */ public static void main( String[] args ) { // Initialize. if ( !initialize( args ) ) { System.exit( 1 ); } // Process all files. long startTime = System.currentTimeMillis(); int filesProcessed = processFiles( args ); long processingTime = ( System.currentTimeMillis() - startTime + 999 ) / 1000; // Terminate. terminate( filesProcessed , processingTime ); } /** Initialize. */ protected static boolean initialize( String[] args ) { // See if we have enough parameters. if ( args.length < 2 ) { System.out.println( "Not enough parameters." ); return false; } // Load soft tags. try { softTags = SetUtils.loadSet( args[ 0 ] , "utf-8" ); } catch ( IOException e ) { return false; } // Load jump tags. try { jumpTags = SetUtils.loadSet( args[ 1 ] , "utf-8" ); } catch ( IOException e ) { return false; } // Get the output directory. outputDirectory = args[ 2 ]; // Load contractions. contractions = FixQuotes.loadContractions( "resources/contractions.txt" ); // Build contractions pattern. Pattern contractionsPattern = FixQuotes.buildContractionsPattern( contractions ); // Get a contractions matcher. contractionsMatcher = contractionsPattern.matcher( "" ); return true; } /** Process one file. * * @param xmlFileName XML input file name. */ protected static void processOneFile( String xmlFileName ) { // Extract words from input text. currentDocNumber++; System.out.println( "Processing " + xmlFileName + " (" + currentDocNumber + "/" + docsToProcess + ")" ); // Load and parse XML document // to DOM tree. try { long startTime = System.currentTimeMillis(); // Load document to a string. String docText = FileUtils.readTextFile( xmlFileName , "utf-8" ); // Convert existing ' to // special marker. We will convert // the apostrophes back later. docText = docText.replaceAll( "'" , ap ); // Parse the XML text to a DOM tree. document = DOMUtils.parseText( docText ); // Remember the DTD name. DocumentType docType = document.getDoctype(); String dtdName = docType.getSystemId(); // Report document load and parse time. long processingTime = ( System.currentTimeMillis() - startTime + 999 ) / 1000; System.out.println ( " Document loaded and parsed in " + Formatters.formatLongWithCommas ( processingTime ) + StringUtils.pluralize ( processingTime , " second." , " seconds." ) ); // Get text root node. Node textRoot = DOMUtils.getChild( document , "TEI" ); // Get text children. List<Node> textRootChildren = DOMUtils.getChildren( textRoot , "text" ); startTime = System.currentTimeMillis(); // Traverse each child of the // text node. for ( int i = 0 ; i < textRootChildren.size() ; i++ ) { traverse( textRootChildren.get( i ) ); } // Clean up entity references. docText = DOMUtils.saveToString( document , dtdName ); docText = docText.replaceAll( "&ldquo" , "&ldquo" ); docText = docText.replaceAll( "&rdquo" , "&rdquo" ); docText = docText.replaceAll( "&lsquo" , "&lsquo" ); docText = docText.replaceAll( "&rsquo" , "&rsquo" ); docText = docText.replaceAll( "&apos" , "&apos" ); docText = docText.replaceAll( ap , "'" ); // Report processing time. processingTime = ( System.currentTimeMillis() - startTime + 999 ) / 1000; System.out.println ( " Quotes fixed in " + Formatters.formatLongWithCommas ( processingTime ) + StringUtils.pluralize ( processingTime , " second." , " seconds." ) ); // Save updated xml to output file. String outputFileName = new File( outputDirectory , FileNameUtils.stripPathName( xmlFileName ) ).getCanonicalPath(); FileUtils.writeTextFile( outputFileName , false , docText , "utf-8" ); } catch ( Exception e ) { e.printStackTrace(); System.out.println( " *** Failed" ); } } /** Traverse DOM tree and fix quotes. * * @param node Root node of tree. */ protected static void traverse( Node node ) { // Get the node tag name. String nodeTag = node.getNodeName(); // Remember the previous character. String savePrevChar = prevChar; // If it is a hard tag, set the previous // character to a blank, since text // does not carry over from one hard // tag to another. if ( isHardTag( nodeTag ) ) { savePrevChar = " "; prevChar = " "; } // If it is a jump tag, set the // previous character to a blank, since // it does not carry over into a jump tag. // However, we will restore the // previous character string after the // jump tag is processed. else if ( isJumpTag( nodeTag ) ) { prevChar = " "; } // Process child nodes. NodeList children = node.getChildNodes(); if ( children != null ) { for ( int i = 0 ; i < children.getLength() ; i++ ) { traverse( children.item( i ) ); } } // Get this node's type. int type = node.getNodeType(); // If we have a text node, // extract its text and fix the // quotes. if ( type == Node.TEXT_NODE ) { Text textNode = (Text)node; // Get node text. String text = textNode.getData(); // If we have at least one character // of text ... if ( ( text != null ) && ( text.length() > 0 ) ) { // Remember last character of this // text section before processing. String lastChar = text.substring( text.length() - 1 ); // Prefix text with previous character // and add a blank to the end for // context. Appending a blank isn't // really correct for soft tabs -- // we actually want the next character // following the end of the soft tag // sequence -- but it's easier to just // use a blank which works most of the // time anyway. // // Fix the quotes. text = FixQuotes.repairQuotes ( prevChar + text + " " , contractionsMatcher , contractions ); // Remove the first and last // characters we added for context. text = text.substring( 1 , text.length() - 1 ); // Update node text with revised // quotes. textNode.setData( text ); // Set previous character for next // text section to last character of // this text section. prevChar = lastChar; } } else { // Restore previous character // if this was a jump tag. prevChar = savePrevChar; } } /** Process files. */ protected static int processFiles( String[] args ) { int result = 0; // Get file name/file wildcard specs. String[] wildCards = new String[ args.length - INITPARAMS ]; for ( int i = INITPARAMS ; i < args.length ; i++ ) { wildCards[ i - INITPARAMS ] = args[ i ]; } // Expand wildcards to list of // file names, String[] fileNames = FileNameUtils.expandFileNameWildcards( wildCards ); docsToProcess = fileNames.length; // Process each file. for ( int i = 0 ; i < fileNames.length ; i++ ) { processOneFile( fileNames[ i ] ); } return fileNames.length; } /** Terminate. * * @param filesProcessed Number of files processed. * @param processingTime Processing time in seconds. */ protected static void terminate ( int filesProcessed , long processingTime ) { System.out.println ( "Processed " + Formatters.formatIntegerWithCommas ( filesProcessed ) + " files in " + Formatters.formatLongWithCommas ( processingTime ) + StringUtils.pluralize ( processingTime , " second." , " seconds." ) ); } /** Is tag a soft tag? * * @param tag The XML tag. * * @return true if tag is a soft tag. */ protected static boolean isSoftTag( String tag ) { return softTags.contains( tag ) || softTags.contains( tag.toLowerCase() ); } /** Is tag a jump tag? * * @param tag The XML tag. * * @return true if tag is a jump tag. */ protected static boolean isJumpTag( String tag ) { return jumpTags.contains( tag ) || jumpTags.contains( tag.toLowerCase() ); } /** Is tag a hard tag? * * @param tag The XML tag. * * @return true if tag is a hard tag. */ protected static boolean isHardTag( String tag ) { return !( isSoftTag( tag ) || isJumpTag( tag ) ); } /** Allow overrides but not instantiation. */ protected FixXMLQuotes() { } } /* Copyright (c) 2008, 2009 by Northwestern University. All rights reserved. Developed by: Academic and Research Technologies Northwestern University http://www.it.northwestern.edu/about/departments/at/ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. * Neither the names of Academic and Research Technologies, Northwestern University, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. */