package de.dfki.km.leech.util; import java.io.IOException; import java.util.LinkedList; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.tika.exception.TikaException; import org.apache.tika.parser.ParseContext; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import de.dfki.inquisition.collections.MultiValueHashMap; import de.dfki.inquisition.processes.StopWatch; import de.dfki.km.leech.Leech; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser.WikipediaDumpParserConfig; import de.dfki.km.leech.sax.CrawlReportContentHandler; import de.dfki.km.leech.sax.DataSinkContentHandler; import de.dfki.km.leech.sax.DataSinkContentHandlerDecorator; import de.dfki.km.leech.sax.PrintlnContentHandler; import de.dfki.km.leech.sax.PrintlnContentHandler.Verbosity; import de.dfki.km.leech.solr.ToSolrContentHandler; /** * A very simple data sink for a Solr server. * * @author Christian Reuschling, Dipl.Ing.(BA) * */ public class SolrIndexCreator { public long cyclicReportTime = 1000 * 60; public static void main(String[] args) throws Exception { new SolrIndexCreator().createIndex(args); } public void createIndex(List<String> lUrls2Crawl, String strSolrUrl, MultiValueHashMap<String, String> hsStaticAttValuePairs, boolean bPrintErrors, boolean bCloudSolrClient, String defaultCollection) throws IOException, Exception, SAXException, TikaException { createIndex(lUrls2Crawl, strSolrUrl, hsStaticAttValuePairs, bPrintErrors, bCloudSolrClient, defaultCollection, null); } public void createIndex(List<String> lUrls2Crawl, String strSolrUrl, MultiValueHashMap<String, String> hsStaticAttValuePairs, boolean bPrintErrors, boolean bCloudSolrClient, String defaultCollection, ParseContext context) throws IOException, Exception, SAXException, TikaException { if(context == null) context = new ParseContext(); if(hsStaticAttValuePairs == null) hsStaticAttValuePairs = new MultiValueHashMap<>(); Logger.getLogger(SolrIndexCreator.class.getName()).info("Crawling " + lUrls2Crawl); if(hsStaticAttValuePairs.keySize() > 0) Logger.getLogger(SolrIndexCreator.class.getName()).info("Will add static attribute value pairs to each document: " + hsStaticAttValuePairs); Leech leech = new Leech(); long startTime = StopWatch.startAndLogTime(Level.INFO); CrawlReportContentHandler reportContentHandler; ToSolrContentHandler toSolrContentHandler = new ToSolrContentHandler(strSolrUrl, bCloudSolrClient, defaultCollection).setStaticAttributeValuePairs(hsStaticAttValuePairs); if(bPrintErrors) reportContentHandler = new CrawlReportContentHandler(new PrintlnContentHandler(Verbosity.all, toSolrContentHandler).setShowOnlyErrors(true)); else reportContentHandler = new CrawlReportContentHandler(toSolrContentHandler); reportContentHandler.setCyclicReportPrintln(cyclicReportTime); ContentHandler finalContentHandler; DataSinkContentHandlerDecorator postprocessingHandler = getPostprocessingHandler(); if(postprocessingHandler == null) finalContentHandler = reportContentHandler; else { finalContentHandler = postprocessingHandler; DataSinkContentHandlerDecorator lastHandlerInChain = postprocessingHandler; while (lastHandlerInChain.getWrappedDataSinkContentHandler() != null) { if(!(lastHandlerInChain.getWrappedDataSinkContentHandler() instanceof DataSinkContentHandlerDecorator)) throw new IllegalStateException( "Postprocessing handlers must be all of type DataSinkContentHandlerDecorator in order to plug in the Solr data sink handler"); lastHandlerInChain = (DataSinkContentHandlerDecorator) lastHandlerInChain.getWrappedDataSinkContentHandler(); } lastHandlerInChain.setWrappedDataSinkContentHandler(reportContentHandler); } leech.parse(lUrls2Crawl.toArray(new String[0]), finalContentHandler, context); StopWatch.stopAndLogDistance(startTime, Level.INFO); } /** * Returns a {@link DataSinkContentHandler} that will act as a postprocessing chain part. It will be processed directly after getting the data from the parsers, * before delegating it to succeeding report handlers of data sink handler like {@link SolrIndexCreator}. Thus, the data can be modified before writing it into the * data sink. If you overwrite a method from the returned decorator/wrapper, don't forget to call the super method for delegating the call to the wrapped Object * * @return */ public DataSinkContentHandlerDecorator getPostprocessingHandler() { return null; } public void createIndex(String[] args) throws IOException, SAXException, TikaException, Exception { if(args.length == 0 || (args.length != 0 && (args[0].equals("-?") || args[0].equals("-h") || args[0].equals("--help")))) { System.out.println("Usage: SolrIndexCreator [-noPageRedirects] [-noParseGeoCoordinates] [-parseInfoBoxes] [-parseLinksAndCategories]\n" + " [-<staticAttName>=<staticAttValue>] [-printErrors] [-crawlingDepth=<depth>] [-cloudSolrClient] [-defaultCollection=<collectionName>]\n" + " <fileOrDir2CrawlPath1> .. <fileOrDir2CrawlPathN> <solrURL>\n\nComments:\n - you can specify several static attribute value pairs.\n" + " - in the case you use no CloudSolrClient, the default is ConcurrentUpdateSolrClient, which is much faster.\n" + " In this case, you can specify the collection name either in the solrUrl OR as defaultCollection parameter."); System.out.println(); return; } LinkedList<String> llFile2CrawlPath = new LinkedList<>(); String strSolrUrl = null; String defaultCollection = null; int iCrawlingDepth = Integer.MAX_VALUE; boolean bPrintErrors = false; boolean bCloudSolrClient = false; ParseContext context = new ParseContext(); WikipediaDumpParserConfig wikipediaDumpParserConfig = new WikipediaDumpParserConfig().setDeterminePageRedirects(true).setParseGeoCoordinates(true).setParseInfoBoxes(false).setParseLinksAndCategories(false); context.set(WikipediaDumpParserConfig.class, wikipediaDumpParserConfig); MultiValueHashMap<String, String> hsStaticAttValuePairs = new MultiValueHashMap<String, String>(); for (int i = 0; i < args.length; i++) { String strArg = args[i]; if(strArg.equals("-noPageRedirects")) { wikipediaDumpParserConfig.setDeterminePageRedirects(false); } else if(strArg.equals("-noParseGeoCoordinates")) { wikipediaDumpParserConfig.setParseGeoCoordinates(false); } else if(strArg.equals("-parseInfoBoxes")) { wikipediaDumpParserConfig.setParseInfoBoxes(true); } else if(strArg.equals("-parseLinksAndCategories")) { wikipediaDumpParserConfig.setParseInfoBoxes(true); } else if(strArg.startsWith("-crawlingDepth=")) { iCrawlingDepth = Integer.valueOf(strArg.replace("-crawlingDepth=", "")); } else if(strArg.startsWith("-defaultCollection=")) { defaultCollection = strArg.replace("-defaultCollection=", ""); } else if(strArg.startsWith("-printErrors")) { bPrintErrors = true; } else if(strArg.startsWith("-cloudSolrClient")) { bCloudSolrClient = true; } else if(strArg.startsWith("-")) { strArg = strArg.substring(1); if(!strArg.contains("=")) continue; String[] split = strArg.split("="); hsStaticAttValuePairs.add(split[0], split[1]); } else if(llFile2CrawlPath.size() == 0 || i != (args.length - 1)) { llFile2CrawlPath.add(args[i]); } else strSolrUrl = args[i]; } Logger.getLogger(SolrIndexCreator.class.getName()).info("crawling depth is " + iCrawlingDepth); CrawlerContext crawlerContext = new CrawlerContext().setCrawlingDepth(iCrawlingDepth); context.set(CrawlerContext.class, crawlerContext); createIndex(llFile2CrawlPath, strSolrUrl, hsStaticAttValuePairs, bPrintErrors, bCloudSolrClient, defaultCollection, context); } }