package org.myrobotlab.document.connector; import org.myrobotlab.document.transformer.StageConfiguration; import org.myrobotlab.document.transformer.WorkflowConfiguration; import org.myrobotlab.service.DocumentPipeline; import org.myrobotlab.service.Runtime; import org.myrobotlab.service.XMLConnector; public class WikipediaIndexer { public static void main(String[] args) throws ClassNotFoundException { // wikipedia xml file. String wikipediaFilename = "D:\\data\\wikipedia\\enwiki-20160113-pages-articles-multistream.xml"; XMLConnector wikipediaConnector = (XMLConnector) Runtime.start("wikipediaConnector", "XMLConnector"); wikipediaConnector.setFilename(wikipediaFilename); wikipediaConnector.setXmlRootPath("/mediawiki/page"); wikipediaConnector.setXmlIDPath("/mediawiki/page/id"); wikipediaConnector.setDocIDPrefix("wikipedia_"); // set the table field StageConfiguration staticFieldStageConfig = new StageConfiguration(); staticFieldStageConfig.setStageClass("org.myrobotlab.document.transformer.SetStaticFieldValue"); staticFieldStageConfig.setStageName("SetTableField"); staticFieldStageConfig.setStringParam("fieldName", "table"); staticFieldStageConfig.setStringParam("value", "wikipedia"); // an xpath extractor to parse the xml .. StageConfiguration xpathStageConfig = new StageConfiguration("extractXPaths", "org.myrobotlab.document.transformer.XPathExtractor"); xpathStageConfig.setStringParam("configFile", "test/resources/xpaths.txt"); // TODO: remove the xml field.. ?!?! argh! StageConfiguration deleteXMLFieldConfig = new StageConfiguration("deleteXMLField", "org.myrobotlab.document.transformer.DeleteField"); deleteXMLFieldConfig.setStringParam("fieldName", "xml"); // TODO: followed by a wiki markup parser // followed by a solr output stage. StageConfiguration solrStageConfig = new StageConfiguration("sendToSolr", "org.myrobotlab.document.transformer.SendToSolr"); solrStageConfig.setStringParam("solrUrl", "http://phobos:8983/solr/graph"); DocumentPipeline docproc = new DocumentPipeline("docproc"); // build the pipeline.. assemble the stages. // create our document processing pipeline workflow. WorkflowConfiguration workflowConfig = new WorkflowConfiguration("default"); workflowConfig.setNumWorkerThreads(5); // workflowConfig.setName("default"); workflowConfig.addStage(staticFieldStageConfig); workflowConfig.addStage(xpathStageConfig); // remove the original xml.. it's icky workflowConfig.addStage(deleteXMLFieldConfig); workflowConfig.addStage(solrStageConfig); docproc.setConfig(workflowConfig); docproc.initalize(); docproc.startService(); // attach the doc proc to the connector wikipediaConnector.addDocumentListener(docproc); wikipediaConnector.setBatchSize(20); // start crawling... wikipediaConnector.startCrawling(); } }