package org.myrobotlab.service;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.myrobotlab.document.connector.AbstractConnector;
import org.myrobotlab.document.connector.ConnectorState;
import org.myrobotlab.document.transformer.ConnectorConfig;
import org.myrobotlab.document.xml.MRLChunkingXMLHandler;
import org.myrobotlab.document.xml.RecordingInputStream;
import org.myrobotlab.framework.ServiceType;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * XMLConnector - This will parse a large xml file into many sub documents based
 * on the XMLRoot path. All of the xml under that path will be created as a
 * document that can be published to the doc pipeline, or other
 * DocumentListener.
 * <p>
 * The file to read, the document root path, the document id path and the
 * document id prefix are plain bean properties; they are handed to a
 * {@link MRLChunkingXMLHandler} each time {@link #startCrawling()} runs.
 */
public class XMLConnector extends AbstractConnector {

  private static final long serialVersionUID = 1L;

  /** Path of the xml file that {@link #startCrawling()} reads. */
  private String filename = "D:\\data\\wikipedia\\enwiki-20160113-pages-articles-multistream.xml";

  /** Passed to the handler as its document root path. */
  private String xmlRootPath = "/page";

  /** Passed to the handler as its document id path. */
  private String xmlIDPath = "/page/id";

  /** Passed to the handler as its document id prefix. */
  private String docIDPrefix = "doc_";

  // TODO: wire in an "interrupted" flag so we can interrupt and stop the
  // crawler.

  /**
   * Creates the connector service.
   *
   * @param name
   *          the service name, forwarded to the superclass constructor
   */
  public XMLConnector(String name) {
    super(name);
  }

  /**
   * Not implemented yet: the supplied config is ignored and only a log message
   * is written.
   *
   * @param config
   *          currently unused
   */
  @Override
  public void setConfig(ConnectorConfig config) {
    log.info("Set Config not yet implemented");
  }

  /**
   * Parses {@link #getFilename()} with a namespace aware SAX parser, using a
   * {@link MRLChunkingXMLHandler} configured from this connector's properties
   * as the content handler.
   * <p>
   * The connector state is RUNNING for the duration of the parse and is always
   * set back to STOPPED when this method exits. Parser configuration, I/O and
   * SAX errors are logged and end the crawl; they are not rethrown. The input
   * file is closed before the method returns.
   */
  @Override
  public void startCrawling() {
    // avoid buffer overruns on the outbox.. connectors shouldn't drop messages.
    // (or run out of memory)
    this.outbox.setBlocking(true);
    state = ConnectorState.RUNNING;
    try {
      SAXParserFactory spf = SAXParserFactory.newInstance();
      // TODO: consider exposing namespace awareness as a setting.
      spf.setNamespaceAware(true);
      // Parser creation lives in the same try as the parse, so a failure here
      // ends the crawl instead of leaving a null parser to be dereferenced.
      SAXParser saxParser = spf.newSAXParser();
      XMLReader xmlReader = saxParser.getXMLReader();

      MRLChunkingXMLHandler xmlHandler = new MRLChunkingXMLHandler();
      xmlHandler.setConnector(this);
      xmlHandler.setDocumentRootPath(xmlRootPath);
      xmlHandler.setDocumentIDPath(xmlIDPath);
      xmlHandler.setDocIDPrefix(docIDPrefix);
      xmlReader.setContentHandler(xmlHandler);

      // try-with-resources releases the file handle on success and on failure.
      try (FileInputStream fis = new FileInputStream(new File(filename))) {
        RecordingInputStream ris = new RecordingInputStream(fis);
        InputSource xmlSource = new InputSource(ris);
        // the handler is given the same stream the parser reads from
        xmlHandler.setRis(ris);
        xmlReader.parse(xmlSource);
      }
    } catch (ParserConfigurationException | IOException | SAXException e) {
      log.warn("SAX Parser Error {}", e);
    } finally {
      // never leave the connector reporting RUNNING once the crawl has ended
      state = ConnectorState.STOPPED;
    }
  }

  /**
   * Currently does nothing; a running crawl is not interrupted.
   */
  @Override
  public void stopCrawling() {
    // Stop crawling! (maybe flush?)
    // TODO: set the interrupted flag once it is wired in.
  }

  /**
   * @return the path of the xml file to crawl
   */
  public String getFilename() {
    return filename;
  }

  /**
   * @param filename
   *          the path of the xml file to crawl
   */
  public void setFilename(String filename) {
    this.filename = filename;
  }

  /**
   * @return the path given to the handler as the document root path
   */
  public String getXmlRootPath() {
    return xmlRootPath;
  }

  /**
   * @param xmlRootPath
   *          the path given to the handler as the document root path
   */
  public void setXmlRootPath(String xmlRootPath) {
    this.xmlRootPath = xmlRootPath;
  }

  /**
   * @return the path given to the handler as the document id path
   */
  public String getXmlIDPath() {
    return xmlIDPath;
  }

  /**
   * @param xmlIDPath
   *          the path given to the handler as the document id path
   */
  public void setXmlIDPath(String xmlIDPath) {
    this.xmlIDPath = xmlIDPath;
  }

  /**
   * @return the prefix given to the handler as the document id prefix
   */
  public String getDocIDPrefix() {
    return docIDPrefix;
  }

  /**
   * @param docIDPrefix
   *          the prefix given to the handler as the document id prefix
   */
  public void setDocIDPrefix(String docIDPrefix) {
    this.docIDPrefix = docIDPrefix;
  }

  /**
   * This static method returns all the details of the class without it having
   * to be constructed. It has description, categories, dependencies, and peer
   * definitions.
   *
   * @return ServiceType - returns all the data
   */
  static public ServiceType getMetaData() {
    ServiceType meta = new ServiceType(XMLConnector.class.getCanonicalName());
    meta.addDescription("This is an XML Connector that will parse a large xml file into many small xml documents");
    meta.addCategory("data");
    return meta;
  }
}