package org.myrobotlab.document.xml;

import java.io.IOException;
import java.util.Arrays;
import java.util.Stack;

import org.apache.commons.lang.StringUtils;
import org.myrobotlab.document.Document;
import org.myrobotlab.document.connector.AbstractConnector;
import org.myrobotlab.framework.Service;
import org.myrobotlab.logging.LoggerFactory;
import org.slf4j.Logger;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

/**
 * SAX {@link ContentHandler} that splits one XML stream into many
 * {@link Document}s.
 * <p>
 * The handler tracks the path of open elements (built from their qNames, e.g.
 * {@code /root/page}). Each time an element whose path equals
 * {@link #setDocumentRootPath(String) documentRootPath} is closed, a new
 * {@link Document} is created and fed to the configured
 * {@link AbstractConnector}. The document id is the configured prefix followed
 * by the character data collected while inside the element whose path equals
 * {@link #setDocumentIDPath(String) documentIDPath}. The document's
 * {@code xml} field is whatever the {@link RecordingInputStream} returns for
 * that element.
 * <p>
 * NOTE(review): this assumes the parser reads from the same
 * {@link RecordingInputStream} that is handed to {@link #setRis}, and that
 * {@code clearUpTo}/{@code returnUpTo} operate on the raw bytes recorded so
 * far - confirm against {@code RecordingInputStream}.
 * <p>
 * A connector and a recording stream must be set before parsing starts. The
 * two paths are optional: an unset path simply never matches.
 */
public class MRLChunkingXMLHandler implements ContentHandler {

  public final static Logger log = LoggerFactory.getLogger(MRLChunkingXMLHandler.class);

  /** qNames of the currently open elements, outermost first. */
  Stack<String> currentPath = new Stack<String>();
  private AbstractConnector connector;
  private String documentRootPath;
  private String documentIDPath;
  private String docIDPrefix = "";
  /** True while the parser is inside the element at documentIDPath. */
  private boolean inDocID = false;
  /** Character data collected for the id of the document being read. */
  private StringBuilder docIDBuilder = new StringBuilder();
  private RecordingInputStream ris;

  @Override
  public void setDocumentLocator(Locator locator) {
    // not needed for chunking
  }

  @Override
  public void startDocument() throws SAXException {
    // not needed for chunking
  }

  @Override
  public void endDocument() throws SAXException {
    // not needed for chunking
  }

  @Override
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    // not needed for chunking
  }

  @Override
  public void endPrefixMapping(String prefix) throws SAXException {
    // not needed for chunking
  }

  /**
   * Pushes the element onto the path stack. When the resulting path is the
   * document root, the id buffer is reset and the recording stream is asked
   * to clear everything before this start tag. When the path is the document
   * id path, id collection is switched on.
   */
  @Override
  public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
    currentPath.push(qName);
    String path = buildCurrentPath();

    // The path is the receiver so that an unset (null) configured path simply
    // does not match instead of throwing a NullPointerException.
    if (path.equals(documentRootPath)) {
      // start of a new document: begin with a fresh id
      docIDBuilder = new StringBuilder();
      try {
        // drop whatever was recorded before this element's start tag
        ris.clearUpTo("<" + qName);
      } catch (IOException e) {
        // best effort: keep parsing, but leave a trace in the log
        log.error("Unable to clear the recorded input up to <" + qName, e);
      }
    }

    if (path.equals(documentIDPath)) {
      inDocID = true;
    }
  }

  /**
   * Publishes a document when the element being closed is the document root,
   * switches id collection off when it is the document id element, and then
   * pops the element off the path stack.
   */
  @Override
  public void endElement(String uri, String localName, String qName) throws SAXException {
    // the path must be computed before the pop so it still includes this element
    String path = buildCurrentPath();

    if (path.equals(documentRootPath)) {
      publishCurrentDocument(qName);
    }

    if (path.equals(documentIDPath)) {
      inDocID = false;
    }

    currentPath.pop();
  }

  /** Joins the open element names into a path of the form /a/b/c. */
  private String buildCurrentPath() {
    return "/" + StringUtils.join(currentPath.toArray(), "/");
  }

  /**
   * Builds the document for the element that was just closed and feeds it to
   * the connector. If the recording stream fails, the document is still
   * published with the placeholder text "Malformed" as its xml.
   */
  private void publishCurrentDocument(String qName) {
    String xml = "Malformed";
    try {
      xml = ris.returnUpTo("</" + qName + ">");
    } catch (IOException e) {
      log.error("Unable to read the recorded input up to </" + qName + ">", e);
    }
    Document doc = new Document(docIDPrefix + docIDBuilder.toString());
    doc.setField("xml", xml);
    internalPublishDocument(doc);
  }

  /** Hands the finished document to the connector. */
  private void internalPublishDocument(Document doc) {
    connector.feed(doc);
  }

  /**
   * Appends character data to the document id while inside the id element;
   * all other character data is ignored.
   */
  @Override
  public void characters(char[] ch, int start, int length) throws SAXException {
    if (inDocID) {
      // append the slice directly; no intermediate array copy is needed
      docIDBuilder.append(ch, start, length);
    }
  }

  @Override
  public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
    // not needed for chunking
  }

  @Override
  public void processingInstruction(String target, String data) throws SAXException {
    // not needed for chunking
  }

  @Override
  public void skippedEntity(String name) throws SAXException {
    // not needed for chunking
  }

  /** @return the element path that marks one document, e.g. /root/page */
  public String getDocumentRootPath() {
    return documentRootPath;
  }

  /**
   * @param documentRootPath
   *          slash separated qName path of the element that delimits one
   *          document
   */
  public void setDocumentRootPath(String documentRootPath) {
    this.documentRootPath = documentRootPath;
  }

  /** @return the element path whose text becomes the document id */
  public String getDocumentIDPath() {
    return documentIDPath;
  }

  /**
   * @param documentIDPath
   *          slash separated qName path of the element whose character data
   *          is used as the document id
   */
  public void setDocumentIDPath(String documentIDPath) {
    this.documentIDPath = documentIDPath;
  }

  /** @return the prefix put in front of every document id */
  public String getDocIDPrefix() {
    return docIDPrefix;
  }

  /**
   * @param docIDPrefix
   *          text put in front of every document id
   */
  public void setDocIDPrefix(String docIDPrefix) {
    this.docIDPrefix = docIDPrefix;
  }

  /**
   * @param connector
   *          the connector that receives every published document
   */
  public void setConnector(AbstractConnector connector) {
    this.connector = connector;
  }

  /** @return the recording stream the xml of each document is taken from */
  public RecordingInputStream getRis() {
    return ris;
  }

  /**
   * @param ris
   *          the recording stream the xml of each document is taken from
   */
  public void setRis(RecordingInputStream ris) {
    this.ris = ris;
  }
}