package edu.northwestern.at.morphadorner; /* Please see the license information at the end of this file. */ import java.io.*; import java.text.*; import java.util.*; import org.xml.sax.*; import org.xml.sax.helpers.*; import com.megginson.sax.*; import edu.northwestern.at.utils.*; import edu.northwestern.at.utils.math.*; import edu.northwestern.at.utils.corpuslinguistics.partsofspeech.*; import edu.northwestern.at.utils.corpuslinguistics.sentencemelder.*; import edu.northwestern.at.utils.xml.*; /** XML filter which updates <w> tag IDs and attributes. */ public class IDFixerFilter extends ExtendedXMLFilterImpl { /** Word ID formatters. */ protected static final NumberFormat ID_FORMATTER = NumberFormat.getInstance(); /** Page number formatter. */ protected static final NumberFormat PAGE_FORMATTER = NumberFormat.getInstance(); /** Word within page formatter. */ protected static final NumberFormat WORD_FORMATTER = NumberFormat.getInstance(); static { PAGE_FORMATTER.setMinimumIntegerDigits( 4 ); WORD_FORMATTER.setMinimumIntegerDigits( 3 ); ID_FORMATTER.setMinimumIntegerDigits( 8 ); PAGE_FORMATTER.setGroupingUsed( false ); WORD_FORMATTER.setGroupingUsed( false ); ID_FORMATTER.setGroupingUsed( false ); } /** Word ordinal. */ protected int wordOrdinal = 0; /** Previous ID value. */ protected String lastID = ""; /** Current ID value as string. */ protected String id = ""; /** ID attribute name. */ protected String idAttrName = WordAttributeNames.id; /** Base XML file name for generating ID values. */ protected String baseFileName; /** Part of speech tags used in XML file. */ protected PartOfSpeechTags posTags; /** URI for elements without one. */ protected String elementURI = null; /** True to output whitespace elements. */ protected boolean outputWhitespace = true; /** True to output non-redundant attributes only. */ protected boolean outputNonredundantAttributesOnly = false; /** True to output non-redundant token attributes only. */ protected boolean outputNonredundantTokenAttribute = false; /** True to output sentence boundary milestones. */ protected boolean outputSentenceBoundaryMilestones = false; /** True to output page boundary milestones. */ protected boolean outputPseudoPageBoundaryMilestones = false; /** Page size in number of tokens. */ protected int pseudoPageSize = 500; /** Current pseudo page count. */ protected int pseudoPageCount = 0; /** Current pseudo page word count. */ protected int pseudoPageWordCount = 0; /** True if pseudo page started. */ protected boolean pseudoPageStarted = false; /** Current number of words emitted. */ protected int emittedWordCount = 0; /** XML sentence melder. */ protected XMLSentenceMelder sentenceMelder; /** True if we're processing first word in a sentence. */ protected boolean isFirstWord = false; /** Pending word element. */ protected PendingElement pendingWordElement = null; /** Split words map of word ID to # of word parts. */ protected Map<Integer, Integer> splitWords; /** Copy of split words map. */ protected Map<Integer, Integer> splitWordsCopy; /** Foreign atttribute stack. */ protected QueueStack<String> foreignStack = new QueueStack<String>(); /** Jump tag stack. Saves state across jump tags. */ protected QueueStack<XMLWriterState> jumpStack = new QueueStack<XMLWriterState>(); /** Div tag stack. */ protected QueueStack<String> divStack = new QueueStack<String>(); /** Pseudo-page ending div types. */ protected Set<String> pseudoPageContainerDivTypes = SetFactory.createNewSet(); /** Sorted sentence and word number information. */ protected SortedArrayList<SentenceAndWordNumber> sortedWords; /** XML writer. */ protected XMLWriter writer; /** Total number of words to emit. */ protected int totalWordsToEmit = 0; /** Running page number. Starts at 0. */ protected int pageNumber = 0; /** Word within page number. Starts at 1. */ protected int wordNumberWithinPage = 0; /** ID Spacing. */ protected int idSpacing = 10; /** ID Type. */ protected MorphAdornerSettings.XMLIDType idType = MorphAdornerSettings.XMLIDType.READING_CONTEXT_ORDER; /** Output word ordinal. */ protected boolean outputWordOrdinal = true; /** Map of XML language tags to language name. */ protected static Map<String, String> languageTags = new TreeMap<String, String>(); /** Create ID filter. * * @param reader The XML reader to filter. * @param posTags The part of spech tags. * @param outFile The output file name. * @param maxID The maximum integer word ID. * @param sortedWords Sentence and word numbers sorted by word ID. * @param splitWords Split words. * @param totalWords Total words. * @param totalPageBreaks Total page breaks. */ public IDFixerFilter ( XMLReader reader , PartOfSpeechTags posTags , String outFile , int maxID , SortedArrayList<SentenceAndWordNumber> sortedWords , Map<Integer, Integer> splitWords , int totalWords , int totalPageBreaks ) { super( reader ); // Save ID attribute name. this.idAttrName = MorphAdornerSettings.xgOptions.getIdArgumentName(); // Output non-redundant attributes // only. this.outputNonredundantAttributesOnly = MorphAdornerSettings.outputNonredundantAttributesOnly; // Output non-redundant token attribute // only. this.outputNonredundantTokenAttribute = MorphAdornerSettings.outputNonredundantTokenAttribute; // Output sentence boundary milestones. this.outputSentenceBoundaryMilestones = MorphAdornerSettings.outputSentenceBoundaryMilestones; // Output word ordinal attribute. this.outputWordOrdinal = MorphAdornerSettings.outputWordOrdinal; // Output pseudo page boundary milestones. this.outputPseudoPageBoundaryMilestones = MorphAdornerSettings.outputPseudoPageBoundaryMilestones; // Pseudo page length in tokens. this.pseudoPageSize = MorphAdornerSettings.pseudoPageSize; // Pseudo page ending div types. String[] divTypes = StringUtils.makeTokenArray ( MorphAdornerSettings.pseudoPageContainerDivTypes ); for ( int i = 0 ; i < divTypes.length ; i++ ) { this.pseudoPageContainerDivTypes.add( divTypes[ i ].toLowerCase() ); } // Save output whitespace option. this.outputWhitespace = MorphAdornerSettings.outputWhitespaceElements; // Set sorted words. Will be set here. */ this.sortedWords = sortedWords; // Set split words. setSplitWords( splitWords ); // Save part of speech tags. setPosTags( posTags ); // Set word ID format from output // file name, maximum integer word ID, // and maximum number of page breaks. setIDFormat( outFile , maxID , totalPageBreaks ); // Total words. this.totalWordsToEmit = totalWords; elementURI = null; // Create list to hold sentence and // word number information. sortedWords = new SortedArrayList<SentenceAndWordNumber>(); } /** Handle start of an XML element. */ public void startElement ( String uri , String localName , String qName , Attributes atts ) throws SAXException { // Copy existing attributes for // this XML element. AttributesImpl newAtts = new AttributesImpl( atts ); // Assume we will pass on this // element for immediate output. boolean outputNow = true; // See if we have a path (p=) attribute. String p = atts.getValue( WordAttributeNames.p ); // Get the language attribute and // push it onto the foreign language // stack. foreignStack.push( getForeignLanguageTag( qName , atts ) ); // Add document name to front of path. if ( ( p != null ) && ( p.length() > 0 ) ) { setAttributeValue( newAtts, WordAttributeNames.p , "\\" + baseFileName + p ); } // Eject "TEIform=" attribute. String teiform = atts.getValue( "TEIform" ); if ( ( teiform != null ) && ( teiform.length() > 0 ) ) { removeAttribute( newAtts, "TEIform" ); } // If we have a page break, // increment the page count. if ( qName.equals( "pb" ) ) { pageNumber++; wordNumberWithinPage = 0; } // If we have a word tag element ... if ( qName.equals( "w" ) ) { // Don't output <w> element now, // but wait until we are able to // pick up its text. outputNow = false; // Get existing word attribute values. id = atts.getValue( idAttrName ); String tok = atts.getValue( WordAttributeNames.tok ); String spe = atts.getValue( WordAttributeNames.spe ); String pos = atts.getValue( WordAttributeNames.pos ); String eos = atts.getValue( WordAttributeNames.eos ); String lem = atts.getValue( WordAttributeNames.lem ); String reg = atts.getValue( WordAttributeNames.reg ); String part = atts.getValue( WordAttributeNames.part ); // Clean tok attribute value of // any special milestone characters. tok = StringUtils.replaceAll ( tok , CharUtils.CHAR_FAKE_SOFT_HYPHEN_STRING , "-" ); // Convert nonbreaking blanks to // regular blanks. /* tok = StringUtils.replaceAll ( tok , CharUtils.NONBREAKING_BLANK_STRING , " " ); */ // Clean tok, spe, and lem of // sup text marker string. if ( tok.indexOf( CharUtils.CHAR_SUP_TEXT_MARKER_STRING ) >= 0 ) { tok = StringUtils.replaceAll ( tok , CharUtils.CHAR_SUP_TEXT_MARKER_STRING , "" ); spe = StringUtils.replaceAll ( spe , CharUtils.CHAR_SUP_TEXT_MARKER_STRING , "" ); lem = StringUtils.replaceAll ( lem , CharUtils.CHAR_SUP_TEXT_MARKER_STRING , "" ); } // Integer version of word ID. int thisID = Integer.parseInt( id ); // Remember if word ID changed from // previous word. boolean idChanged = !id.equals( lastID ); // If we have a pending word element, // and the word ID changed, // emit the pending // word element now. if ( ( pendingWordElement != null ) && idChanged ) { emitWordElement ( pendingWordElement.getURI() , pendingWordElement.getLocalName() , pendingWordElement.getQName() , pendingWordElement.getAttributes() , pendingWordElement.getText() , true , false ); // Clear pending word. pendingWordElement = null; } // Is this a split word? if ( splitWords.containsKey( thisID ) ) { // Yes. Emit part="I" for first // part, part="F" for last part, // and part="M" for any middle parts. int nParts = splitWordsCopy.get( thisID ); if ( nParts == splitWords.get( thisID ) ) { part = "I"; } else if ( nParts <= 1 ) { part = "F"; } else { part = "M"; } nParts--; splitWordsCopy.put( thisID , nParts ); } // Not a split word -- part is "N". else { part = "N"; } // Increment word count within page. wordNumberWithinPage++; // Generate word ID. String idString = baseFileName + "-"; switch ( idType ) { case READING_CONTEXT_ORDER: idString += ID_FORMATTER.format( thisID * idSpacing ); break; case WORD_WITHIN_PAGE_BLOCK: idString += PAGE_FORMATTER.format( pageNumber ) + "-" + WORD_FORMATTER.format( wordNumberWithinPage * idSpacing ); } // Split words get a ".partnumber" // added to the end of the ID. if ( !part.equals( "N" ) ) { int partNumber = splitWords.get( thisID ) - splitWordsCopy.get( thisID ); idString = idString + "." + partNumber; } // Set ID attribute value. setAttributeValue( newAtts , idAttrName , idString ); // Generate word ordinal. if ( idChanged ) ++wordOrdinal; // Generate word ordinal attribute value. if ( outputWordOrdinal ) { setAttributeValue( newAtts, WordAttributeNames.ord , wordOrdinal + "" ); } // Remember this word's ID. lastID = id; // If we're in an element with a foreign // language attribute, // reset the part of speech to reflect // this word is foreign. Also reset the // lemma to the spelling. if ( !foreignStack.isEmpty() && ( foreignStack.peek().length() > 0 ) ) { if ( posTags.isNumberTag( pos ) || posTags.isSymbolTag( pos ) || posTags.isPunctuationTag( pos ) ) { } else { pos = foreignStack.peek(); lem = spe; } } // Handle missing attributes. if ( spe == null ) { spe = tok; } if ( pos == null ) { pos = spe; } if ( lem == null ) { lem = spe; } if ( eos == null ) { eos = "0"; } if ( reg == null ) { reg = spe; } // Save updated attributes. setAttributeValue( newAtts , WordAttributeNames.eos , eos ); setAttributeValue( newAtts , WordAttributeNames.lem , lem ); setAttributeValue( newAtts , WordAttributeNames.pos , pos ); setAttributeValue( newAtts , WordAttributeNames.reg , reg ); setAttributeValue( newAtts , WordAttributeNames.spe , spe ); setAttributeValue( newAtts , WordAttributeNames.tok , tok ); setAttributeValue( newAtts , WordAttributeNames.part , part ); // Remove redundant attributes // if requested. if ( outputNonredundantAttributesOnly ) { if ( eos.equals( "0" ) ) { removeAttribute ( newAtts , WordAttributeNames.eos ); } if ( spe.equals( tok ) ) { removeAttribute ( newAtts , WordAttributeNames.spe ); } if ( lem.equals( spe ) ) { removeAttribute ( newAtts , WordAttributeNames.lem ); } if ( pos.equals( spe ) ) { removeAttribute ( newAtts , WordAttributeNames.pos ); } if ( reg.equals( spe ) ) { removeAttribute ( newAtts , WordAttributeNames.reg ); } } // Output whitespace milestone // if needed before this word. // We only do this before the // first part of a multipart word. if ( idChanged ) { if ( outputWhitespace ) { if ( sentenceMelder.shouldOutputBlank( spe , isFirstWord ) ) { sentenceMelder.outputBlank(); } sentenceMelder.processWord( spe ); } isFirstWord = eos.equals( "1" ); } // Save this word element as // pending so we can examine its // text before emitting it. pendingWordElement = new PendingElement( uri , localName , qName , newAtts ); } else { // Emit any pending word element. if ( pendingWordElement != null ) { emitWordElement ( pendingWordElement.getURI() , pendingWordElement.getLocalName() , pendingWordElement.getQName() , pendingWordElement.getAttributes() , pendingWordElement.getText() , true , false ); // Clear pending word. pendingWordElement = null; } // Note if we have a div tag. Save // the div type if given, otherwise // save "*div". if ( qName.equalsIgnoreCase( "div" ) ) { String divType = atts.getValue( "type" ); if ( ( divType == null ) || ( divType.length() == 0 ) ) { divType = "*div"; } divStack.push( divType.toLowerCase() ); } // If we have a foreign tag element ... else if ( qName.equalsIgnoreCase( "foreign" ) ) { } // Reset the sentence melder at the // start of a jump or hard tag. else if ( !MorphAdornerSettings.xgOptions.isSoftTag( qName ) ) { // If we have a jump tag, save the state // of the sentence melder so we can // restore it when the jump tag ends. if ( MorphAdornerSettings.xgOptions.isJumpTag( qName ) ) { jumpStack.push ( new XMLWriterState ( isFirstWord , sentenceMelder ) ); } // Reset the sentence melder. sentenceMelder.reset(); isFirstWord = true; } } if ( elementURI == null ) { elementURI = uri; if ( outputWhitespace ) { sentenceMelder.setURI( elementURI ); } } // Hold <w> element and its text // but let others through. if ( outputNow ) { if ( !qName.startsWith( "zzzz" ) ) { super.startElement( uri , localName , qName , newAtts ); } } } /** Handle character data. * * @param ch Array of characters. * @param start The starting position in the array. * @param length The number of characters. * * @throws org.xml.sax.SAXException If there is an error. */ public void characters( char ch[] , int start , int length ) throws SAXException { // If there is a pending word element, // this is text for that word. if ( pendingWordElement != null ) { pendingWordElement.appendText( ch , start , length ); } // Otherwise just pass on the text // for immediate output. else { super.characters( ch , start , length ); } } /** Emit a word element. * * @param uri The word element's URI. * @param localName The word element's local name. * @param qName The word element's qname. * @param atts The word element's attributes. * @param wordText The word element's text. * @param allowOutputWhitespace True to allow outputting * whitespace element for word. * @param forceEOS True to force end of sentence * for this word. */ public void emitWordElement ( String uri , String localName , String qName , AttributesImpl atts , String wordText , boolean allowOutputWhitespace , boolean forceEOS ) throws SAXException { // See if we have a path (p=) attribute. String p = atts.getValue( WordAttributeNames.p ); // Get "part=" attribute value. String part = atts.getValue( WordAttributeNames.part ); // Is this first or only word part // for this word? boolean isFirstWordPart = ( part == null ) || part.equals( "N" ) || part.equals( "I" ); // Is this last or only word part // for this word? boolean isLastWordPart = ( part == null ) || part.equals( "N" ) || part.equals( "F" ); // Create start pseudopage element if // this is the first word in a // pseudopage. if ( isLastWordPart && outputPseudoPageBoundaryMilestones && ( pseudoPageWordCount == 0 ) && ( !pseudoPageStarted ) ) { if ( ( p != null ) && ( p.length() > 0 ) ) { int bsPos = p.lastIndexOf( "\\" ); if ( bsPos > 0 ) { p = p.substring( 0 , bsPos ); } p = p + "\\milestone[" + ( pseudoPageCount + 1 ) + "]"; } emitPseudoPageElement ( createPseudoPageElement ( uri , false , true , p ) ); } // Increment count of words in current // pseudopage. pseudoPageWordCount++; // Assume we don't have a word // with an eos set to "1" indicating // and end of sentence boundary. boolean emitSentenceBoundary = false; // Force end of sentence flag true // if requested. if ( forceEOS ) { setAttributeValue( atts , WordAttributeNames.eos , "1" ); } // See if this word is the end if // the sentence. String eos = atts.getValue( WordAttributeNames.eos ); emitSentenceBoundary = ( eos != null ) && eos.equals( "1" ); // If the word token is the same as // the word text, and we are // outputting abbreviated attributes, // remove the redundant token // attribute. if ( outputNonredundantAttributesOnly || outputNonredundantTokenAttribute ) { // Get token attribute value. String tok = atts.getValue( WordAttributeNames.tok ); // If word text same as token text, // remove tok attribute. if ( tok.equals( wordText ) ) { removeAttribute( atts , WordAttributeNames.tok ); } // If part="N", remove part // attribute. if ( ( part != null ) && part.equals( "N" ) ) { removeAttribute( atts , WordAttributeNames.part ); } } // Remove sentence number and // word number attributes. These // are added back later if needed. removeAttribute( atts , WordAttributeNames.sn ); removeAttribute( atts , WordAttributeNames.wn ); // Output start <w> element. super.startElement ( uri , localName , qName , atts ); // Output word text. wordText = StringUtils.replaceAll ( wordText , CharUtils.CHAR_FAKE_SOFT_HYPHEN_STRING , "-" ); wordText = StringUtils.replaceAll ( wordText , CharUtils.CHAR_SUP_TEXT_MARKER_STRING , "" ); super.characters( wordText.toCharArray() , 0 , wordText.length() ); // Output end element. super.endElement( uri , localName , qName ); // Save word information for // generating word and sentence // numbers later. String id = atts.getValue( idAttrName ); String ord = atts.getValue( WordAttributeNames.ord ); if ( ord == null ) { ord = "0"; } sortedWords.add ( new SentenceAndWordNumber ( id , Integer.parseInt( ord ) , part , emitSentenceBoundary ) ); // Increment count of words emitted. emittedWordCount++; // Output a blank if necessary // following a word. if ( outputWhitespace && allowOutputWhitespace ) { if ( isFirstWord && isLastWordPart ) { sentenceMelder.outputBlank(); } } // Create end pseudopage element if // this is the last word in a // pseudopage. if ( isLastWordPart && outputPseudoPageBoundaryMilestones && ( ( pseudoPageWordCount >= pseudoPageSize ) || ( emittedWordCount >= totalWordsToEmit ) ) ) { if ( ( p != null ) && ( p.length() > 0 ) ) { int bsPos = p.lastIndexOf( "\\" ); if ( bsPos > 0 ) { p = p.substring( 0 , bsPos ); } p = p + "\\milestone[" + ( pseudoPageCount + 1 ) + "]"; } emitPseudoPageElement ( createPseudoPageElement ( uri , false , false , p ) ); } } /** Handle end of an element. * * @param uri The XML element's URI. * @param localName The XML element's local name. * @param qName The XML element's qname. */ public void endElement ( String uri , String localName , String qName ) throws SAXException { // Remember if we pop the div stack. boolean removedDiv = false; String removedDivType = ""; // Pop the foreign language // attribute stack. if ( !foreignStack.isEmpty() ) { foreignStack.pop(); } // If this the end of a div tag, // pop the div stack. else if ( qName.equals( "div" ) ) { if ( !divStack.isEmpty() ) { removedDivType = divStack.pop(); removedDiv = true; } } // Figure out what type of tag this is. boolean isJumpTag = MorphAdornerSettings.xgOptions.isJumpTag( qName ); boolean isSoftTag = MorphAdornerSettings.xgOptions.isSoftTag( qName ); boolean isHardTag = !( isJumpTag || isSoftTag ); boolean isWordTag = ( qName.equals( "w" ) ); // Output pending word element // and its text if this is not // a word element. if ( ( pendingWordElement != null ) && !isWordTag ) { // See if we have to force this word // to be the end of a sentence. boolean forceEOS = ( isHardTag && MorphAdornerSettings.closeSentenceAtEndOfHardTag ) || ( isJumpTag && MorphAdornerSettings.closeSentenceAtEndOfJumpTag ); emitWordElement ( pendingWordElement.getURI() , pendingWordElement.getLocalName() , pendingWordElement.getQName() , pendingWordElement.getAttributes() , pendingWordElement.getText() , isWordTag || isSoftTag , forceEOS ); // Clear pending word. pendingWordElement = null; } // Output end element except for word tag. // We already output the end element for // a word tag in emitWordElement. if ( !isWordTag ) { // If tag to eject, eject it, // otherwise let it through. if ( !qName.startsWith( "zzzz" ) ) { super.endElement( uri , localName , qName ); } } // If we are returning from a jump tag, // restore the previous sentence melder // state prior to the jump tag's // appearance. if ( isJumpTag ) { if ( !jumpStack.isEmpty() ) { XMLWriterState state = jumpStack.pop(); isFirstWord = state.getIsFirstWord(); sentenceMelder.setState ( state.getSentenceMelderState() ); } } // Do nothing for other soft tags. else if ( isSoftTag ) { } // If we have a hard tag, reset sentence // melder. else { sentenceMelder.reset(); isFirstWord = true; } // If this is the end of a div, // and it is a pseudopage ending // div type, make sure we emit // the end pseudopage after the // pending word is emitted by // setting the count of words in // the current pseudopage larger // than the size of a pseudo page. String p = null; if ( outputPseudoPageBoundaryMilestones && removedDiv && pseudoPageContainerDivTypes.contains( removedDivType ) ) { if ( ( p != null ) && ( p.length() > 0 ) ) { int bsPos = p.lastIndexOf( "\\" ); if ( bsPos > 0 ) { p = p.substring( 0 , bsPos ); } p = p + "\\milestone[" + ( pseudoPageCount + 1 ) + "]"; } if ( emittedWordCount < totalWordsToEmit ) { emitPseudoPageElement ( createPseudoPageElement ( uri , false , false , p ) ); emitPseudoPageElement ( createPseudoPageElement ( uri , false , true , p ) ); } } } /** Create a pseudo page milestone. * * @param uri Element URI. * @param forcedEmit Emit pseudo page milestone even if * not enough words accumulated, as long as * at least one word in current block. * @param start true if starting milestone, false if ending. * @param path Path attribute. May be null. * * @return The pseudo page element. */ public PendingElement createPseudoPageElement ( String uri , boolean forcedEmit , boolean start , String path ) { // Increment pseudo page count // if starting new pseudo page. if ( start ) { pseudoPageCount++; pseudoPageStarted = true; } else { pseudoPageStarted = false; } // Clear pseudo page word count. pseudoPageWordCount = 0; // Create attributes holder for // milestone element. AttributesImpl pageAttributes = new AttributesImpl(); // Create "unit=pseudopage" attribute for // pseudo page count. setAttributeValue ( pageAttributes , "unit" , "pseudopage" ); // Create "n=" attribute for // pseudo page count. setAttributeValue ( pageAttributes , "n" , pseudoPageCount + "" ); // Create "position=" attribute for // pseudo page count. setAttributeValue ( pageAttributes , "position" , ( start ? "start" : "end" ) ); // Add path attribute if not null. if ( ( path != null ) && ( path.length() > 0 ) ) { setAttributeValue ( pageAttributes , WordAttributeNames.p , path ); } // Create the pseudo page element. return new PendingElement ( uri , "milestone" , "milestone" , pageAttributes ); } /** Emit a pseudo page milestone. * * @param pseudoPageElement The pseudo page element to emit. */ public void emitPseudoPageElement( PendingElement pseudoPageElement ) { if ( pseudoPageElement != null ) { try { super.startElement ( pseudoPageElement.getURI() , pseudoPageElement.getLocalName() , pseudoPageElement.getQName() , pseudoPageElement.getAttributes() ); super.endElement ( pseudoPageElement.getURI() , pseudoPageElement.getLocalName() , pseudoPageElement.getQName() ); } catch ( Exception e ) { } } } /** Set the part of speech tags. * * @param posTags The part of speech tags. */ public void setPosTags( PartOfSpeechTags posTags ) { this.posTags = posTags; } /** Set split words. * * @param splitWords Map of split words. */ protected void setSplitWords( Map<Integer, Integer> splitWords ) { // Save split words map. this.splitWords = splitWords; // Get a modifyable copy of the // split words map. this.splitWordsCopy = MapFactory.createNewMap(); this.splitWordsCopy.putAll( splitWords ); } /** Set word ID format. * * @param outFile Output file name used to derive word IDs. * @param maxID Maximum integer word ID value. * @param maxPageBreaks Maximum number of page breaks. */ protected void setIDFormat ( String outFile , int maxID , int maxPageBreaks ) { // Get base for long word ID values, // (1) Remove path from file name. baseFileName = FileNameUtils.stripPathName( outFile ); // (2) Remove extension from file name. baseFileName = FileNameUtils.changeFileExtension( baseFileName , "" ); // (3) Convert remaining periods to // underlines. baseFileName = StringUtils.replaceAll( baseFileName , "." , "_" ); // Set ID type. idType = MorphAdornerSettings.xmlIDType; // Set ID spacing. idSpacing = MorphAdornerSettings.xmlIDSpacing; // Get number of digits for reading // context ID. This is based upon // the number of digits in the // largest word ID. int numIDDigits = (int)(ArithUtils.log10( (double)( maxID * idSpacing ) ) ) + 1; ID_FORMATTER.setMinimumIntegerDigits( numIDDigits ); // Get number of digits for // page number in word ID. int numPageDigits = 1; if ( maxPageBreaks > 0 ) { numPageDigits = (int)(ArithUtils.log10( (double)maxPageBreaks )) + 1; } PAGE_FORMATTER.setMinimumIntegerDigits( numPageDigits ); // Get number of digits for // word within page in word ID. int numWordDigits = (int)(ArithUtils.log10( (double)( 999 * idSpacing ) ) ) + 1; if ( maxPageBreaks <= 0 ) { numWordDigits = numIDDigits; } WORD_FORMATTER.setMinimumIntegerDigits( numWordDigits ); } /** Set associated XML writer. * * @param writer XML writer. */ public void setWriter( XMLWriter writer ) { // Save writer. this.writer = writer; // Create XML sentence melder. sentenceMelder = new XMLSentenceMelder( writer ); } /** Get the foreign language tag for XML element. * * @param qName XML element name. * @param atts XML element attributes. * * @return The language part of speech tag * for this element. */ public String getForeignLanguageTag ( String qName , Attributes atts ) { String languageTag = ""; // Get language tag from // "xml:lang" attribute. String language = atts.getValue( "xml:lang" ); // Get the language from the // "lang" attribute if it is not specified // by the "xml:lang" attribute. if ( language == null ) { language = atts.getValue( "lang" ); } // If no language attribute is given, // inherit the language tag from the // parent XML element. If there is // no parent element, set the // language tag to the empty string // except for a <foreign> tag. if ( language == null ) { if ( !foreignStack.isEmpty() ) { languageTag = foreignStack.peek(); } else { if ( qName.equals( "foreign" ) ) { languageTag = posTags.getForeignWordTag( "unknown" ); } else { languageTag = "" ; } } } // Language was specified in // attribute. Get the corrresponding // part of speech tag for this // language. else { // Strip eveything after the first // hyphen, if any. We only care // about the language, not the script // or other modifiers. int iPos = language.indexOf( "-" ); if ( iPos >= 0 ) { language = language.substring( 0 , iPos ); } // See if the language tag is one // we know. If so, return the // corresponding language name, // otherwise return "other". if ( languageTags.containsKey( language ) ) { language = languageTags.get( language ); } else { language = "other"; } // Add language tag to // foreign language tag stack. languageTag = posTags.getForeignWordTag( language ); } return languageTag; } // Initialize language map. static { languageTags.put( "deu" , "german" ); languageTags.put( "de" , "german" ); languageTags.put( "fra" , "french" ); languageTags.put( "fre" , "french" ); languageTags.put( "fr" , "french" ); languageTags.put( "grc" , "greek" ); languageTags.put( "gre" , "greek" ); languageTags.put( "ell" , "greek" ); languageTags.put( "el" , "greek" ); languageTags.put( "heb" , "hebrew" ); languageTags.put( "he" , "hebrew" ); languageTags.put( "ita" , "italian" ); languageTags.put( "it" , "italian" ); languageTags.put( "lat" , "latin" ); languageTags.put( "la" , "latin" ); } } /* Copyright (c) 2008, 2009 by Northwestern University. All rights reserved. Developed by: Academic and Research Technologies Northwestern University http://www.it.northwestern.edu/about/departments/at/ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. * Neither the names of Academic and Research Technologies, Northwestern University, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. */