package org.juxtasoftware.service; import java.io.CharArrayWriter; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Stack; import org.juxtasoftware.model.JuxtaXslt; import org.juxtasoftware.model.Note; import org.juxtasoftware.model.PageMark; import org.juxtasoftware.model.RevisionInfo; import org.juxtasoftware.service.importer.jxt.Util; import org.juxtasoftware.service.importer.ps.WitnessParser.PsWitnessInfo; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import com.google.common.collect.Maps; import eu.interedition.text.Range; /** * JuxtaExtractor is a SAX xml parser that will collect the position information * for tags that require special handling by Juxta: notes, pagebreaks and revisions. * If requested, it will also extract witness content from a TEI parallel segmented source * * @author loufoster */ public class JuxtaTagExtractor extends DefaultHandler { private Note currNote = null; private StringBuilder currNoteContent; private List<Note> notes = new ArrayList<Note>(); private List<PageMark> marks = new ArrayList<PageMark>(); private Map<String, Range> identifiedRanges = Maps.newHashMap(); private Map<String,Integer> tagOccurences = Maps.newHashMap(); private JuxtaXslt xslt; private long currPos = 0; private boolean isExcluding = false; private Stack<String> exclusionContext = new Stack<String>(); private Stack<String> xmlIdStack = new Stack<String>(); private Stack<ExtractRevision> revisionExtractStack = new Stack<ExtractRevision>(); private List<RevisionInfo> revisions = new ArrayList<RevisionInfo>(); private PsWitnessInfo psWitnessInfo; private StringBuilder psWitnessContent; private CharArrayWriter contentBuffer = new CharArrayWriter(); private Stack<String> choiceStack = new Stack<String>(); private boolean choiceIncluded = false; /** * For parallel segmentated sources, set the witness information that will * be used to extract content. * @param info */ public void setPsTargetWitness( final PsWitnessInfo info ) { this.psWitnessInfo = info; this.psWitnessContent = new StringBuilder(); } public void extract(final Reader sourceReader, final JuxtaXslt xslt) throws SAXException, IOException { this.xslt = xslt; Util.saxParser().parse( new InputSource(sourceReader), this); } public List<RevisionInfo> getRevisions() { return this.revisions; } public List<Note> getNotes() { return this.notes; } public List<PageMark> getPageMarks() { return this.marks; } public String getPsWitnessContent() { return this.psWitnessContent.toString(); } private boolean isChoice( final String qName ) { final String localName = stripNamespace(qName); return ( localName.equals("choice") ); } private boolean isRevision(final String qName ) { final String localName = stripNamespace(qName); return ( localName.equals("add") || localName.equals("addSpan") || localName.equals("del") || localName.equals("delSpan")); } private boolean isNote( final String qName ) { final String localName = stripNamespace(qName); return ( localName.equals("note") ); } private boolean isPageBreak( final String qName ) { final String localName = stripNamespace(qName); return ( localName.equals("pb") ); } private boolean isLineNumber( final String qName ) { final String localName = stripNamespace(qName); return ( localName.equals("l") ); } private boolean isParagraph( final String qName ) { final String localName = stripNamespace(qName); return ( localName.equals("p") ); } private boolean isPsWitnessContent( final String qName ) { final String localName = stripNamespace(qName); return ( localName.equals("rdg") || localName.equals("lem")); } private String stripNamespace( final String qName ) { if ( qName.indexOf(":") > 0 ) { return qName.substring(qName.indexOf(":")+1); } return qName; } @Override public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException { if (systemId.endsWith(".dtd") || systemId.endsWith(".ent")) { StringReader stringInput = new StringReader(" "); return new InputSource(stringInput); } else { return super.resolveEntity(publicId, systemId); } } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { // each time a new element starts, be sure to process the content of the prior handleContent(); // If there is a default namespace, the qName will not have it prepended here. // Do it now because all of the exclusion/linefeed data in the XSLT must have it. if ( this.xslt.getDefaultNamespace() != null && this.xslt.getDefaultNamespace().length() > 0) { qName = this.xslt.getDefaultNamespace() + ":" + qName; } // always count up the number of occurrences for this tag countOccurrences(qName); // if an exclusion is currently in process, just push this qName // onto the context stack and bail if ( this.isExcluding ) { this.exclusionContext.push(qName); return; } // cache the exclusion state of this tag. Kinda expensive and used multiple times boolean isExcluded = this.xslt.isExcluded(qName, this.tagOccurences.get(qName)); // special choice tag handling for ps witnesses: only take content from the // first child for each choice element if ( this.psWitnessContent != null ) { if ( isExcluded == false && this.choiceStack.size() >= 1 && this.choiceIncluded == true) { // once the first nested tag of a choice is grabbed, bag the rest isExcluded = true; } } // Handle all tags with special extraction behavior first if ( this.psWitnessInfo != null && isPsWitnessContent(qName) ) { // exclude content from witnesses we are not interested in if ( matchesTargetWitness( attributes ) == false ) { this.isExcluding = true; this.exclusionContext.push(qName); } } else if (isChoice(qName) ) { this.choiceIncluded = false; this.choiceStack.push(qName); }else if ( isRevision(qName) ) { this.revisionExtractStack.push( new ExtractRevision(isExcluded, this.currPos) ); } else if ( isNote(qName) ) { handleNote(attributes); } else if ( isPageBreak(qName) ) { handlePageBreak(attributes); } else if ( isLineNumber(qName) ) { handleLineNumber(attributes); }else { // once a choice tag is found, keep pushing tags onto the stwck if ( this.choiceStack.size() >=1 ) { this.choiceStack.push(qName); } // default handling for all other tags if ( isExcluded ) { this.isExcluding = true; this.exclusionContext.push(qName); } else { if ( isParagraph(qName)) { extractParagraphNumber(attributes); } final String idVal = getAttributeValue("id", attributes); if ( idVal != null ) { this.identifiedRanges.put(idVal, new Range(this.currPos, this.currPos)); this.xmlIdStack.push(idVal); } else { this.xmlIdStack.push("NA"); } } } } private boolean matchesTargetWitness(Attributes attributes) { // get the value of the wit or lem attribute (only 1 should be present) String idAttr = getAttributeValue("wit", attributes); if ( idAttr == null ) { idAttr = getAttributeValue("lem", attributes); if ( idAttr == null ) { return false; } } // wit/lem ids are prefixed with # and separated by space. // Strip the # and break up into tokens. See if one of the // IDs matches the target ID for this parser pass. idAttr = idAttr.replaceAll("#", ""); String[] ids = idAttr.split(" "); for ( int i=0; i<ids.length; i++) { String id = ids[i].trim(); if ( id.equals(this.psWitnessInfo.getId()) || id.equals(this.psWitnessInfo.getGroupId()) ) { return true; } } return false; } private void countOccurrences(String qName) { Integer cnt = this.tagOccurences.get(qName); if ( cnt == null ) { this.tagOccurences.put(qName, 1); } else { this.tagOccurences.put(qName, cnt+1); } } private void handleNote(Attributes attributes) { this.currNote = new Note(); this.currNote.setAnchorRange(new Range(this.currPos, this.currPos)); this.currNoteContent = new StringBuilder(); //System.err.println("======> NOTE "+this.currPos); // search note tag attributes for type and target and add them to the note. for (int idx = 0; idx<attributes.getLength(); idx++) { String name = attributes.getQName(idx); if ( name.contains(":")) { name = name.split(":")[1]; } if ("type".equals(name)) { this.currNote.setType(attributes.getValue(idx)); } else if ("target".equals(name)) { this.currNote.setTargetID(attributes.getValue(idx)); } } this.notes.add(this.currNote); } private void handlePageBreak(Attributes attributes) { PageMark pb = new PageMark(); pb.setOffset(this.currPos); pb.setType(PageMark.Type.PAGE_BREAK); //System.err.println("======> PB "+this.currPos); for (int idx = 0; idx<attributes.getLength(); idx++) { String name = attributes.getQName(idx); if ( name.contains(":")) { name = name.split(":")[1]; } if ("n".equals(name)) { pb.setLabel( attributes.getValue(idx) ); } } this.marks.add(pb); } private void handleLineNumber(Attributes attributes) { PageMark mark = new PageMark(); mark.setOffset(this.currPos); mark.setType(PageMark.Type.LINE_NUMBER); //System.err.println("======> LINE NUMBER "+this.currPos); for (int idx = 0; idx<attributes.getLength(); idx++) { String name = attributes.getQName(idx); if ( name.contains(":")) { name = name.split(":")[1]; } if ("n".equals(name)) { mark.setLabel( "L"+attributes.getValue(idx) ); } } if ( mark.getLabel().trim().length() > 0 ) { this.marks.add(mark); } } private void extractParagraphNumber(Attributes attributes) { PageMark mark = new PageMark(); mark.setOffset(this.currPos); mark.setType(PageMark.Type.LINE_NUMBER); for (int idx = 0; idx<attributes.getLength(); idx++) { String name = attributes.getQName(idx); if ( name.contains(":")) { name = name.split(":")[1]; } if ("n".equals(name)) { mark.setLabel( "P"+attributes.getValue(idx) ); } } if ( mark.getLabel().length() > 0 ) { this.marks.add(mark); } } private String getAttributeValue( final String name, final Attributes attributes ){ for (int idx = 0; idx<attributes.getLength(); idx++) { String val = attributes.getQName(idx); if ( val.contains(":")) { val = val.split(":")[1]; } if ( val.equals(name)) { return attributes.getValue(idx); } } return null; } @Override public void endElement(String uri, String localName, String qName) throws SAXException { // hande any buffered content from tha tag that has just finished handleContent(); if ( this.isExcluding ) { this.exclusionContext.pop(); this.isExcluding = !this.exclusionContext.empty(); return; } // If there is a default namespace, the qName will not have it prepended here. // Do it now because all of the exclusion/linefeed data in the XSLT must have it. if ( this.xslt.getDefaultNamespace() != null && this.xslt.getDefaultNamespace().length() > 0) { qName = this.xslt.getDefaultNamespace() + ":" + qName; } if ( isRevision(qName) ) { ExtractRevision rev = this.revisionExtractStack.pop(); final Range range = new Range(rev.startPosition, this.currPos); this.revisions.add( new RevisionInfo(qName, range, rev.content.toString(), !rev.isExcluded) ); } else if ( isNote(qName) ) { this.currNote.setContent(this.currNoteContent.toString().replaceAll("\\s+", " ").trim()); if ( this.currNote.getContent().length() == 0 ) { this.notes.remove(this.currNote); } this.currNote = null; this.currNoteContent = null; } else if ( isPageBreak(qName) ) { // pagebreaks always include a linebreak. add 1 to // current position to account for this if ( this.currNote == null ) { this.currPos++; if ( this.psWitnessContent != null ) { this.psWitnessContent.append("\n"); } } } else { if ( this.choiceStack.empty() == false ) { this.choiceStack.pop(); // back down to one tag in stack (choice). this means the first // content nested under the choice tag has been processed. all the // rest will be excluded until choice is empty if ( this.choiceStack.size() == 1) { this.choiceIncluded = true; } } // if the tag has an identifier, save it off for crossreference with targeted notes if ( this.xmlIdStack.empty() == false ) { final String xmlId = this.xmlIdStack.pop(); if (xmlId.equals("NA") == false ) { this.identifiedRanges.put(xmlId, new Range(this.identifiedRanges.get(xmlId).getStart(), this.currPos)); } } // if this tag is in the midst of a note, check it for // linebreaks and add a hard break now. Also, do NOT // increment position count if we are collecting a note. if ( this.currNote != null ) { if ( this.xslt.hasLineBreak(qName, this.tagOccurences.get(qName)) ){ this.currNoteContent.append("<br/>"); } } else if ( this.xslt.hasLineBreak(qName, this.tagOccurences.get(qName)) ){ // Only add 1 for the linebreak if we are non-revision or included revision if ( this.revisionExtractStack.empty() || this.revisionExtractStack.peek().isExcluded == false) { this.currPos++; if ( this.psWitnessContent != null ) { this.psWitnessContent.append("\n"); } } } } } private void handleContent() { String txt = this.contentBuffer.toString(); if ( txt.length() == 0 ) { return; } txt = txt.replaceAll("[\\n]\\s*$", " "); txt = txt.replaceAll("^[\\n]\\s*$", " "); txt = txt.replaceAll("\\n+", " "); txt = txt.replaceAll("\\s+", " "); // if ( txt.length() > 0 ) { // System.err.println("["+txt+"]"); // } if ( this.currNote != null ) { this.currNoteContent.append(txt); } else { if ( this.choiceStack.size() >= 1 && this.choiceIncluded == false ) { this.currPos += txt.length(); if ( this.psWitnessContent != null ) { this.psWitnessContent.append(txt); } } else { if ( this.revisionExtractStack.empty() || this.revisionExtractStack.peek().isExcluded == false) { this.currPos += txt.length(); if ( this.psWitnessContent != null ) { this.psWitnessContent.append(txt); } } if ( this.revisionExtractStack.empty() == false ) { this.revisionExtractStack.peek().content.append(txt); } } } this.contentBuffer.reset(); } @Override public void characters(char[] ch, int start, int length) throws SAXException { if ( this.isExcluding == false ) { // just buffer data here.. it may not be complete this.contentBuffer.write( ch, start, length ); } } @Override public void endDocument() throws SAXException { // at the end of parsing, find all notes that have a target // specified. Look up that id and set the associated range // as the note anchor point for ( Note note : this.notes ) { String noteTargetId = note.getTargetID(); if ( noteTargetId != null && noteTargetId.length() > 0){ Range tgtRange = this.identifiedRanges.get(noteTargetId); if ( tgtRange != null ) { note.setAnchorRange( tgtRange ); } } } } /** * Track extraction of revision info during parse pass */ static class ExtractRevision { final boolean isExcluded; final long startPosition; StringBuilder content = new StringBuilder(); ExtractRevision( boolean exclude, long start) { this.isExcluded = exclude; this.startPosition = start; } } }