package org.juxtasoftware.service;
import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import org.juxtasoftware.model.JuxtaXslt;
import org.juxtasoftware.model.Note;
import org.juxtasoftware.model.PageMark;
import org.juxtasoftware.model.RevisionInfo;
import org.juxtasoftware.service.importer.jxt.Util;
import org.juxtasoftware.service.importer.ps.WitnessParser.PsWitnessInfo;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.google.common.collect.Maps;
import eu.interedition.text.Range;
/**
* JuxtaTagExtractor is a SAX XML handler that collects the position information
* for tags that require special handling by Juxta: notes, page breaks, line
* numbers and revisions. If a target witness has been set, it also extracts
* that witness's content from a TEI parallel segmented source.
*
* @author loufoster
*/
public class JuxtaTagExtractor extends DefaultHandler {
// Note currently being collected; set by handleNote and cleared when the note tag ends.
private Note currNote = null;
// Text accumulated for currNote while it is open.
private StringBuilder currNoteContent;
// All notes found so far (notes that end up with empty content are removed again).
private List<Note> notes = new ArrayList<Note>();
// Page break, line number and paragraph number markers found so far.
private List<PageMark> marks = new ArrayList<PageMark>();
// Text range covered by each element carrying an "id" attribute, keyed by the id value.
// Used at end of document to anchor notes that name a target id.
private Map<String, Range> identifiedRanges = Maps.newHashMap();
// Number of start tags seen so far, keyed by (namespace-prefixed) qName.
private Map<String,Integer> tagOccurences = Maps.newHashMap();
// Source of default namespace, exclusion and line break settings; set by extract().
private JuxtaXslt xslt;
// Running offset into the extracted text.
private long currPos = 0;
// True while the parser is inside an excluded element.
private boolean isExcluding = false;
// qNames of the elements currently open inside the active exclusion.
private Stack<String> exclusionContext = new Stack<String>();
// Id ("NA" when there is none) of each open element tracked for identifiedRanges.
private Stack<String> xmlIdStack = new Stack<String>();
// Revisions (add/addSpan/del/delSpan) that are currently open.
private Stack<ExtractRevision> revisionExtractStack = new Stack<ExtractRevision>();
// Completed revisions.
private List<RevisionInfo> revisions = new ArrayList<RevisionInfo>();
// Target witness for parallel segmented sources; null unless setPsTargetWitness was called.
private PsWitnessInfo psWitnessInfo;
// Extracted text of the target witness; null unless setPsTargetWitness was called.
private StringBuilder psWitnessContent;
// Buffers character data until the next tag boundary, where handleContent consumes it.
private CharArrayWriter contentBuffer = new CharArrayWriter();
// The open choice tag plus the tags opened beneath it.
private Stack<String> choiceStack = new Stack<String>();
// True once the first child of the current choice has been fully processed.
private boolean choiceIncluded = false;
/**
 * For parallel segmented sources, set the witness whose content should be
 * extracted during the parse. Calling this also starts a fresh, empty
 * content buffer for that witness.
 *
 * @param info identifies the target witness
 */
public void setPsTargetWitness( final PsWitnessInfo info ) {
    psWitnessContent = new StringBuilder();
    psWitnessInfo = info;
}
/**
 * Parse the source with this object as the SAX handler, collecting notes,
 * page marks and revisions along the way.
 *
 * @param sourceReader reader positioned at the start of the XML source
 * @param xslt supplies the default namespace, exclusion and line break settings
 * @throws SAXException if the parser reports an error
 * @throws IOException if the source cannot be read
 */
public void extract(final Reader sourceReader, final JuxtaXslt xslt) throws SAXException, IOException {
    this.xslt = xslt;
    final InputSource source = new InputSource(sourceReader);
    Util.saxParser().parse(source, this);
}
/**
 * @return the revisions collected so far; this is the internal list, not a copy
 */
public List<RevisionInfo> getRevisions() {
    return revisions;
}
/**
 * @return the notes collected so far; this is the internal list, not a copy
 */
public List<Note> getNotes() {
    return notes;
}
/**
 * @return the page break, line number and paragraph number marks collected
 *         so far; this is the internal list, not a copy
 */
public List<PageMark> getPageMarks() {
    return marks;
}
/**
 * Get the text extracted for the target witness. The content buffer only
 * exists after {@link #setPsTargetWitness(PsWitnessInfo)} has been called;
 * calling this before then fails with a NullPointerException.
 *
 * @return the extracted witness text
 */
public String getPsWitnessContent() {
    final StringBuilder extracted = this.psWitnessContent;
    return extracted.toString();
}
/** @return true if the tag, ignoring any namespace prefix, is <code>choice</code> */
private boolean isChoice( final String qName ) {
    return "choice".equals( stripNamespace(qName) );
}
/**
 * @return true if the tag, ignoring any namespace prefix, is one of the
 *         revision tags: <code>add</code>, <code>addSpan</code>,
 *         <code>del</code> or <code>delSpan</code>
 */
private boolean isRevision(final String qName ) {
    final String tag = stripNamespace(qName);
    if ( "add".equals(tag) || "addSpan".equals(tag) ) {
        return true;
    }
    return "del".equals(tag) || "delSpan".equals(tag);
}
/** @return true if the tag, ignoring any namespace prefix, is <code>note</code> */
private boolean isNote( final String qName ) {
    return "note".equals( stripNamespace(qName) );
}
/** @return true if the tag, ignoring any namespace prefix, is <code>pb</code> */
private boolean isPageBreak( final String qName ) {
    return "pb".equals( stripNamespace(qName) );
}
/** @return true if the tag, ignoring any namespace prefix, is <code>l</code> */
private boolean isLineNumber( final String qName ) {
    return "l".equals( stripNamespace(qName) );
}
/** @return true if the tag, ignoring any namespace prefix, is <code>p</code> */
private boolean isParagraph( final String qName ) {
    return "p".equals( stripNamespace(qName) );
}
/**
 * @return true if the tag, ignoring any namespace prefix, is <code>rdg</code>
 *         or <code>lem</code>
 */
private boolean isPsWitnessContent( final String qName ) {
    final String tag = stripNamespace(qName);
    return "rdg".equals(tag) || "lem".equals(tag);
}
/**
 * Drop everything up to and including the first colon of a qualified name.
 * Names without a colon, or with the colon as their first character, are
 * returned unchanged.
 *
 * @param qName qualified tag name
 * @return the name with its prefix removed
 */
private String stripNamespace( final String qName ) {
    final int colonPos = qName.indexOf(":");
    return ( colonPos > 0 ) ? qName.substring(colonPos + 1) : qName;
}
/**
 * Short-circuit resolution of external DTDs and entity files: any system id
 * ending in <code>.dtd</code> or <code>.ent</code> is replaced with a source
 * containing a single space, so the parser never fetches the real resource.
 * Everything else, including a missing system id, is delegated to the
 * default handler.
 *
 * @param publicId public identifier of the entity; passed through to the default handler
 * @param systemId system identifier of the entity; may be null
 * @return a stub source for DTD/entity files, otherwise whatever the default handler returns
 */
@Override
public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException {
    // guard against a null system id rather than failing with a NullPointerException
    if ( systemId != null && (systemId.endsWith(".dtd") || systemId.endsWith(".ent")) ) {
        return new InputSource( new StringReader(" ") );
    }
    return super.resolveEntity(publicId, systemId);
}
/**
 * SAX callback for an opening tag. Flushes any buffered text, counts the tag
 * occurrence, and then either continues or starts an exclusion, or records
 * the tag-specific information (choice, revision, note, page break, line
 * number, paragraph number, element id).
 *
 * Every tag that endElement sends through its default branch must have its
 * bookkeeping pushed here, because that branch pops choiceStack and
 * xmlIdStack whenever they are non-empty. Tags that end inside an exclusion
 * never reach that branch, so nothing may be pushed for them.
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
    // each time a new element starts, be sure to process the content of the prior
    handleContent();

    // If there is a default namespace, the qName will not have it prepended here.
    // Do it now because all of the exclusion/linefeed data in the XSLT must have it.
    if ( this.xslt.getDefaultNamespace() != null && this.xslt.getDefaultNamespace().length() > 0) {
        qName = this.xslt.getDefaultNamespace() + ":" + qName;
    }

    // always count up the number of occurrences for this tag
    countOccurrences(qName);

    // if an exclusion is currently in process, just push this qName
    // onto the context stack and bail
    if ( this.isExcluding ) {
        this.exclusionContext.push(qName);
        return;
    }

    // cache the exclusion state of this tag. Kinda expensive and used multiple times
    boolean isExcluded = this.xslt.isExcluded(qName, this.tagOccurences.get(qName));

    // special choice tag handling for ps witnesses: only take content from the
    // first child for each choice element
    if ( this.psWitnessContent != null ) {
        if ( isExcluded == false && this.choiceStack.size() >= 1 && this.choiceIncluded == true) {
            // once the first nested tag of a choice is grabbed, bag the rest
            isExcluded = true;
        }
    }

    // Handle all tags with special extraction behavior first
    if ( this.psWitnessInfo != null && isPsWitnessContent(qName) ) {
        if ( matchesTargetWitness( attributes ) == false ) {
            // exclude content from witnesses we are not interested in
            this.isExcluding = true;
            this.exclusionContext.push(qName);
        } else {
            // the end tag goes through the default branch of endElement
            pushEndElementPlaceholders(qName);
        }
    } else if ( isChoice(qName) ) {
        this.choiceIncluded = false;
        this.choiceStack.push(qName);
        // the end tag also pops xmlIdStack; give it an entry of its own
        this.xmlIdStack.push("NA");
    } else if ( isRevision(qName) ) {
        this.revisionExtractStack.push( new ExtractRevision(isExcluded, this.currPos) );
    } else if ( isNote(qName) ) {
        handleNote(attributes);
    } else if ( isPageBreak(qName) ) {
        handlePageBreak(attributes);
    } else if ( isLineNumber(qName) ) {
        handleLineNumber(attributes);
        // the end tag goes through the default branch of endElement
        pushEndElementPlaceholders(qName);
    } else if ( isExcluded ) {
        // Excluded tags end in the early-return path of endElement, which pops
        // neither choiceStack nor xmlIdStack, so push nothing on them here.
        this.isExcluding = true;
        this.exclusionContext.push(qName);
    } else {
        // once a choice tag is found, keep pushing tags onto the stack
        if ( this.choiceStack.size() >= 1 ) {
            this.choiceStack.push(qName);
        }
        if ( isParagraph(qName) ) {
            extractParagraphNumber(attributes);
        }
        final String idVal = getAttributeValue("id", attributes);
        if ( idVal != null ) {
            this.identifiedRanges.put(idVal, new Range(this.currPos, this.currPos));
            this.xmlIdStack.push(idVal);
        } else {
            this.xmlIdStack.push("NA");
        }
    }
}

/**
 * Push the entries that the default branch of endElement will pop for a tag
 * that carries no tracked id: an entry on choiceStack when a choice is open,
 * and an "NA" entry on xmlIdStack.
 */
private void pushEndElementPlaceholders( final String qName ) {
    if ( this.choiceStack.size() >= 1 ) {
        this.choiceStack.push(qName);
    }
    this.xmlIdStack.push("NA");
}
/**
 * Check whether a tag's witness list names the target witness.
 * The list is read from the <code>wit</code> attribute, falling back to
 * <code>lem</code>. Ids are prefixed with # and separated by spaces; a match
 * against either the target witness id or its group id counts.
 *
 * @param attributes attributes of the tag being tested
 * @return true if one of the listed ids is the target id or group id;
 *         false if there is no list or nothing matches
 */
private boolean matchesTargetWitness(Attributes attributes) {
    String witnessList = getAttributeValue("wit", attributes);
    if ( witnessList == null ) {
        witnessList = getAttributeValue("lem", attributes);
    }
    if ( witnessList == null ) {
        return false;
    }

    // strip the # markers, then test each space separated id
    for ( final String token : witnessList.replaceAll("#", "").split(" ") ) {
        final String id = token.trim();
        if ( id.equals(this.psWitnessInfo.getId()) || id.equals(this.psWitnessInfo.getGroupId()) ) {
            return true;
        }
    }
    return false;
}
/**
 * Bump the running count of start tags seen for this qName, starting at 1
 * the first time the tag is encountered.
 */
private void countOccurrences(String qName) {
    final Integer seen = this.tagOccurences.get(qName);
    final int updated = ( seen == null ) ? 1 : seen + 1;
    this.tagOccurences.put(qName, updated);
}
/**
 * Begin collecting a note. The new note is anchored at the current position,
 * becomes the current note, and is added to the note list. Its type and
 * target id are taken from the <code>type</code> and <code>target</code>
 * attributes (any attribute name prefix is ignored).
 *
 * @param attributes attributes of the note tag
 */
private void handleNote(Attributes attributes) {
    final Note note = new Note();
    note.setAnchorRange(new Range(this.currPos, this.currPos));
    this.currNote = note;
    this.currNoteContent = new StringBuilder();

    final int attrCount = attributes.getLength();
    for (int i = 0; i < attrCount; i++) {
        final String rawName = attributes.getQName(i);
        final String attrName = rawName.contains(":") ? rawName.split(":")[1] : rawName;
        if ( "type".equals(attrName) ) {
            note.setType( attributes.getValue(i) );
        } else if ( "target".equals(attrName) ) {
            note.setTargetID( attributes.getValue(i) );
        }
    }

    this.notes.add(note);
}
/**
 * Record a page break mark at the current position. The mark is labelled
 * with the value of the <code>n</code> attribute when one is present (any
 * attribute name prefix is ignored), and is always added to the mark list.
 *
 * @param attributes attributes of the pb tag
 */
private void handlePageBreak(Attributes attributes) {
    final PageMark pageBreak = new PageMark();
    pageBreak.setOffset(this.currPos);
    pageBreak.setType(PageMark.Type.PAGE_BREAK);

    final int attrCount = attributes.getLength();
    for (int i = 0; i < attrCount; i++) {
        final String rawName = attributes.getQName(i);
        final String attrName = rawName.contains(":") ? rawName.split(":")[1] : rawName;
        if ( "n".equals(attrName) ) {
            pageBreak.setLabel( attributes.getValue(i) );
        }
    }

    this.marks.add(pageBreak);
}
/**
 * Record a line number mark at the current position. The label is "L"
 * followed by the value of the <code>n</code> attribute (any attribute name
 * prefix is ignored). The mark is only kept when it ends up with a
 * non-blank label.
 *
 * @param attributes attributes of the l tag
 */
private void handleLineNumber(Attributes attributes) {
    PageMark mark = new PageMark();
    mark.setOffset(this.currPos);
    mark.setType(PageMark.Type.LINE_NUMBER);
    for (int idx = 0; idx<attributes.getLength(); idx++) {
        String name = attributes.getQName(idx);
        if ( name.contains(":")) {
            name = name.split(":")[1];
        }
        if ("n".equals(name)) {
            mark.setLabel( "L"+attributes.getValue(idx) );
        }
    }

    // When there is no n attribute the label is whatever a new PageMark starts
    // with; do not assume that is non-null.
    final String label = mark.getLabel();
    if ( label != null && label.trim().length() > 0 ) {
        this.marks.add(mark);
    }
}
/**
 * Record a paragraph number mark at the current position. The mark uses the
 * LINE_NUMBER type and is labelled "P" followed by the value of the
 * <code>n</code> attribute (any attribute name prefix is ignored). The mark
 * is only kept when it ends up with a non-empty label.
 *
 * @param attributes attributes of the p tag
 */
private void extractParagraphNumber(Attributes attributes) {
    PageMark mark = new PageMark();
    mark.setOffset(this.currPos);
    mark.setType(PageMark.Type.LINE_NUMBER);
    for (int idx = 0; idx<attributes.getLength(); idx++) {
        String name = attributes.getQName(idx);
        if ( name.contains(":")) {
            name = name.split(":")[1];
        }
        if ("n".equals(name)) {
            mark.setLabel( "P"+attributes.getValue(idx) );
        }
    }

    // When there is no n attribute the label is whatever a new PageMark starts
    // with; do not assume that is non-null.
    final String label = mark.getLabel();
    if ( label != null && label.length() > 0 ) {
        this.marks.add(mark);
    }
}
/**
 * Find the value of the first attribute whose name, ignoring any prefix,
 * equals the requested name.
 *
 * @param name unprefixed attribute name to look for
 * @param attributes attributes to search
 * @return the attribute value, or null if no attribute matches
 */
private String getAttributeValue( final String name, final Attributes attributes ){
    final int attrCount = attributes.getLength();
    for (int i = 0; i < attrCount; i++) {
        final String rawName = attributes.getQName(i);
        final String attrName = rawName.contains(":") ? rawName.split(":")[1] : rawName;
        if ( attrName.equals(name) ) {
            return attributes.getValue(i);
        }
    }
    return null;
}
/**
 * SAX callback for a closing tag. Flushes any buffered text, then either
 * unwinds one level of the active exclusion or finishes the tag-specific
 * bookkeeping: completes a revision or note, accounts for the line break of
 * a page break, or (for all other tags) pops the choice and id stacks and
 * applies any line break configured for the tag.
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
// handle any buffered content from the tag that has just finished
handleContent();
// While excluding, a closing tag only unwinds the exclusion context. The
// exclusion is over once the tag that started it has been popped.
if ( this.isExcluding ) {
this.exclusionContext.pop();
this.isExcluding = !this.exclusionContext.empty();
return;
}
// If there is a default namespace, the qName will not have it prepended here.
// Do it now because all of the exclusion/linefeed data in the XSLT must have it.
if ( this.xslt.getDefaultNamespace() != null && this.xslt.getDefaultNamespace().length() > 0) {
qName = this.xslt.getDefaultNamespace() + ":" + qName;
}
if ( isRevision(qName) ) {
// the revision spans from where its start tag was seen to the current position;
// it is flagged as included when it was not excluded
ExtractRevision rev = this.revisionExtractStack.pop();
final Range range = new Range(rev.startPosition, this.currPos);
this.revisions.add( new RevisionInfo(qName, range, rev.content.toString(), !rev.isExcluded) );
} else if ( isNote(qName) ) {
// NOTE(review): handleNote overwrites currNote and this branch clears it, so a
// note nested inside another note would leave currNote null when the outer
// note ends and the next line would fail. Confirm whether nested notes can occur.
// Collapse whitespace in the note text and drop notes that end up empty.
this.currNote.setContent(this.currNoteContent.toString().replaceAll("\\s+", " ").trim());
if ( this.currNote.getContent().length() == 0 ) {
this.notes.remove(this.currNote);
}
this.currNote = null;
this.currNoteContent = null;
} else if ( isPageBreak(qName) ) {
// pagebreaks always include a linebreak. add 1 to
// current position to account for this, unless a note is being
// collected (note content does not advance the position)
if ( this.currNote == null ) {
this.currPos++;
if ( this.psWitnessContent != null ) {
this.psWitnessContent.append("\n");
}
}
} else {
if ( this.choiceStack.empty() == false ) {
this.choiceStack.pop();
// back down to one tag in stack (choice). this means the first
// content nested under the choice tag has been processed. all the
// rest will be excluded until choice is empty
if ( this.choiceStack.size() == 1) {
this.choiceIncluded = true;
}
}
// if the tag has an identifier, save it off for cross-reference with targeted notes:
// keep the recorded start and extend the range to the current position
if ( this.xmlIdStack.empty() == false ) {
final String xmlId = this.xmlIdStack.pop();
if (xmlId.equals("NA") == false ) {
this.identifiedRanges.put(xmlId, new Range(this.identifiedRanges.get(xmlId).getStart(), this.currPos));
}
}
// if this tag is in the midst of a note, check it for
// linebreaks and add a hard break now. Also, do NOT
// increment position count if we are collecting a note.
if ( this.currNote != null ) {
if ( this.xslt.hasLineBreak(qName, this.tagOccurences.get(qName)) ){
this.currNoteContent.append("<br/>");
}
} else if ( this.xslt.hasLineBreak(qName, this.tagOccurences.get(qName)) ){
// Only add 1 for the linebreak if we are non-revision or included revision
if ( this.revisionExtractStack.empty() || this.revisionExtractStack.peek().isExcluded == false) {
this.currPos++;
if ( this.psWitnessContent != null ) {
this.psWitnessContent.append("\n");
}
}
}
}
}
/**
 * Consume the text buffered since the last tag boundary. Whitespace is
 * normalized, then the text is routed:
 * - while a note is open, it is appended to the note content and does not
 *   advance the position;
 * - otherwise it advances the position (and is appended to the ps witness
 *   content when that is being collected) if it sits inside a choice whose
 *   first child has not finished, or outside any revision, or inside a
 *   revision that is not excluded;
 * - outside such a choice, it is also appended to the innermost open
 *   revision, if there is one.
 * Nothing happens when the buffer is empty.
 */
private void handleContent() {
    final String buffered = this.contentBuffer.toString();
    if ( buffered.length() == 0 ) {
        return;
    }

    final String txt = buffered
        .replaceAll("[\\n]\\s*$", " ")
        .replaceAll("^[\\n]\\s*$", " ")
        .replaceAll("\\n+", " ")
        .replaceAll("\\s+", " ");

    if ( this.currNote != null ) {
        this.currNoteContent.append(txt);
        this.contentBuffer.reset();
        return;
    }

    final boolean inOpenChoice = this.choiceStack.size() >= 1 && !this.choiceIncluded;
    final boolean inRevision = !this.revisionExtractStack.empty();

    // short-circuit order matters: only peek when a revision is actually open
    final boolean countsAsText =
        inOpenChoice || !inRevision || !this.revisionExtractStack.peek().isExcluded;
    if ( countsAsText ) {
        this.currPos += txt.length();
        if ( this.psWitnessContent != null ) {
            this.psWitnessContent.append(txt);
        }
    }

    if ( !inOpenChoice && inRevision ) {
        this.revisionExtractStack.peek().content.append(txt);
    }

    this.contentBuffer.reset();
}
/**
 * SAX callback for character data. The data is only buffered here because
 * it may arrive in several pieces; it is consumed at the next tag boundary.
 * Characters seen during an exclusion are dropped.
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
    if ( this.isExcluding ) {
        return;
    }
    this.contentBuffer.write( ch, start, length );
}
/**
 * SAX callback for the end of the document. Every note that names a target
 * id is re-anchored to the range recorded for that id, when one was
 * recorded. Notes without a target, or with an unknown target, keep the
 * anchor they were given when they were found.
 */
@Override
public void endDocument() throws SAXException {
    for ( final Note note : this.notes ) {
        final String targetId = note.getTargetID();
        if ( targetId == null || targetId.length() == 0 ) {
            continue;
        }
        final Range targetRange = this.identifiedRanges.get(targetId);
        if ( targetRange != null ) {
            note.setAnchorRange( targetRange );
        }
    }
}
/**
 * State kept for one open revision tag during the parse pass: whether the
 * revision is excluded, the text position at which it started, and the text
 * collected inside it.
 */
static class ExtractRevision {
    /** True when the revision tag was flagged as excluded. */
    final boolean isExcluded;
    /** Text position at which the revision started. */
    final long startPosition;
    /** Text collected inside the revision. */
    StringBuilder content = new StringBuilder();

    ExtractRevision( boolean excluded, long startPos ) {
        this.startPosition = startPos;
        this.isExcluded = excluded;
    }
}
}