/** * */ package uk.bl.wa.indexer; /* * #%L * warc-indexer * $Id:$ * $HeadURL:$ * %% * Copyright (C) 2013 - 2014 The UK Web Archive * %% * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program. If not, see * <http://www.gnu.org/licenses/gpl-2.0.html>. * #L% */ import java.io.File; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import javax.xml.transform.OutputKeys; import javax.xml.transform.Result; import javax.xml.transform.Source; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.TransformerFactoryConfigurationError; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; import org.apache.commons.io.FileUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrInputDocument; import org.archive.io.ArchiveReader; import org.archive.io.ArchiveReaderFactory; import org.archive.io.ArchiveRecord; import org.archive.util.SurtPrefixSet; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import com.typesafe.config.ConfigValueFactory; import uk.bl.wa.annotation.Annotations; import uk.bl.wa.annotation.Annotator; import uk.bl.wa.solr.SolrFields; import uk.bl.wa.solr.SolrRecord; import uk.bl.wa.solr.SolrWebServer; import uk.bl.wa.util.Instrument; /** * @author Andrew Jackson <Andrew.Jackson@bl.uk> * */ public class WARCIndexerCommand { private static Log log = LogFactory.getLog(WARCIndexerCommand.class); private static final String CLI_USAGE = "[-o <output dir>] [-s <Solr instance>] [-t] <include text> [-r] <root/slash pages only> [-b <batch-submissions size>] [WARC File List]"; private static final String CLI_HEADER = "WARCIndexer - Extracts metadata and text from Archive Records"; private static final String CLI_FOOTER = ""; private static boolean debugMode = false; /** * * @param args * @throws NoSuchAlgorithmException * @throws IOException * @throws TransformerException * @throws TransformerFactoryConfigurationError * @throws SolrServerException */ public static void main( String[] args ) throws NoSuchAlgorithmException, IOException, TransformerFactoryConfigurationError, TransformerException { final long allStart = System.nanoTime(); CommandLineParser parser = new PosixParser(); String outputDir = null; String solrUrl = null; String configFile = null; boolean isTextRequired = false; boolean slashPages = false; int batchSize = -1; // No explicit batch size (defaults to 1 if not stated in the conf-file) String annotationsFile = null; boolean disableCommit; Options options = new Options(); options.addOption("o", "output", true, "The directory to contain the output XML files"); options.addOption("s", "solr", true, "The URL of the required Solr Instance"); options.addOption("t", "text", false, "Include text in XML in output files"); options.addOption("r", "slash", false, "Only process slash (root) pages."); options.addOption("a", "annotations", true, "A JSON file containing the annotations to apply during indexing."); options.addOption("b", "batch", true, "Batch size for submissions."); options.addOption("c", "config", true, "Configuration to use."); options.addOption("d", "disable_commit", false, "Disable client side commits (speeds up indexing at the cost of flush guarantee)."); try { // parse the command line arguments CommandLine line = parser.parse( options, args ); String cli_args[] = line.getArgs(); // Check that a mandatory Archive file(s) has been supplied if( !( cli_args.length > 0 ) ) { printUsage( options ); System.exit( 0 ); } // Get the output directory, if set if(line.hasOption("o")){ outputDir = line.getOptionValue("o"); if(outputDir.endsWith("/")||outputDir.endsWith("\\")){ outputDir = outputDir.substring(0, outputDir.length()-1); } outputDir = outputDir + "//"; System.out.println("Output Directory is: " + outputDir); File dir = new File(outputDir); if(!dir.exists()){ FileUtils.forceMkdir(dir); } } // Get the Solr Url, if set if(line.hasOption("s")){ solrUrl = line.getOptionValue("s"); if(solrUrl.contains("\"")){ solrUrl = solrUrl.replaceAll("\"", ""); } } // Check if the text field is required in the XML output if(line.hasOption("t") || line.hasOption("s")){ isTextRequired = true; } if( line.hasOption( "r" ) ) { slashPages = true; } if( line.hasOption( "b" ) ) { batchSize = Integer.parseInt( line.getOptionValue( "b" ) ); } if (line.hasOption("c")) { configFile = line.getOptionValue("c"); } // Check that either an output dir or Solr URL is supplied if(outputDir == null && solrUrl == null){ System.out.println( "A Solr URL or an Output Directory must be supplied" ); printUsage(options); System.exit( 0 ); } // Check that both an output dir and Solr URL are not supplied if(outputDir != null && solrUrl != null){ System.out.println( "A Solr URL and an Output Directory cannot both be specified" ); printUsage(options); System.exit( 0 ); } // Pick up any annotations specified: if (line.hasOption("a")) { annotationsFile = line.getOptionValue("a"); } // Check for commit disabling disableCommit = line.hasOption("d"); parseWarcFiles(configFile, outputDir, solrUrl, cli_args, isTextRequired, slashPages, batchSize, annotationsFile, disableCommit); } catch (org.apache.commons.cli.ParseException e) { log.error("Parse exception when processing command line arguments: "+e); } finally { Instrument.timeRel("WARCIndexerCommand.main#total", allStart); Instrument.log(true); } } /** * @param outputDir * @param args * @throws NoSuchAlgorithmException * @throws IOException * @throws TransformerFactoryConfigurationError * @throws TransformerException */ public static void parseWarcFiles(String configFile, String outputDir, String solrUrl, String[] args, boolean isTextRequired, boolean slashPages, int batchSize, String annotationsFile, boolean disableCommit) throws NoSuchAlgorithmException, TransformerFactoryConfigurationError, TransformerException, IOException { long startTime = System.currentTimeMillis(); final long start = System.nanoTime(); // If the Solr URL is set initiate a connections Config conf = ConfigFactory.load(); if (configFile != null) { log.info("Loading config from log file: " + configFile); conf = ConfigFactory.parseFile(new File(configFile)); // ConfigPrinter.print(conf); // conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise())); log.info("Loaded warc config."); log.info(conf.getString("warc.title")); } if(solrUrl != null) { conf = conf.withValue(SolrWebServer.CONF_HTTP_SERVER, ConfigValueFactory.fromAnyRef(solrUrl) ); } // Use config for default value if (conf.hasPath("warc.solr.disablecommit")) { disableCommit = disableCommit || conf.getBoolean("warc.solr.disablecommit"); } if (batchSize == -1) { // Batch size not set as command line, so resolve it from conf with default 1 batchSize = conf.hasPath("warc.solr.batch_size") ? conf.getInt("warc.solr.batch_size") : 1; } // Set up the server config: SolrWebServer solrWeb = new SolrWebServer(conf); // Also pass config down: WARCIndexer windex = new WARCIndexer(conf); // Add in annotations, if set: if (annotationsFile != null) { Annotations ann = Annotations.fromJsonFile(annotationsFile); SurtPrefixSet oaSurts = Annotator .loadSurtPrefix("openAccessSurts.txt"); windex.setAnnotations(ann, oaSurts); } // To be indexed: ArrayList<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(); int totInputFile = args.length; int curInputFile = 1; Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.parseWarcFiles#startup", start); // Loop through each Warc files for (int arcsIndex = 0; arcsIndex < args.length; arcsIndex++) { String inputFile = args[arcsIndex]; if (!disableCommit) { // Commit to make sure index is up to date: commit(solrWeb); } System.out.println("Parsing Archive File [" + curInputFile + "/" + totInputFile + "]:" + inputFile); File inFile = new File(inputFile); String fileName = inFile.getName(); String outputWarcDir = outputDir + fileName + "//"; File dir = new File(outputWarcDir); if (!dir.exists() && solrUrl == null) { FileUtils.forceMkdir(dir); } ArchiveReader reader = ArchiveReaderFactory.get(inputFile); Iterator<ArchiveRecord> ir = reader.iterator(); int recordCount = 1; // Iterate though each record in the WARC file while (ir.hasNext()) { final long recordStart = System.nanoTime(); ArchiveRecord rec = ir.next(); SolrRecord doc = new SolrRecord(inFile.getName(), rec.getHeader()); try { doc = windex.extract(inFile.getName(), rec, isTextRequired); } catch (Exception e) { log.warn("Exception on record " + rec.getHeader().getUrl() + " from " + inFile.getName(), e); doc.addParseException(e); continue; } catch (OutOfMemoryError e) { log.warn( "OutOfMemoryError on record " + rec.getHeader().getUrl() + " from " + inFile.getName(), e); doc.addParseException(e); } Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.parseWarcFiles#solrdocCreation", recordStart); if (doc != null) { final long updateStart = System.nanoTime(); File fileOutput = new File(outputWarcDir + "//" + "FILE_" + recordCount + ".xml"); if (!slashPages || (doc.getFieldValue(SolrFields.SOLR_URL_TYPE) != null && doc.getFieldValue(SolrFields.SOLR_URL_TYPE).equals(SolrFields.SOLR_URL_TYPE_SLASHPAGE))) { // Write XML to file if not posting straight to the server. if (solrUrl == null) { writeXMLToFile(doc.toXml(), fileOutput); } else { docs.add(doc.getSolrDocument()); checkSubmission(solrWeb, docs, batchSize, false); } recordCount++; } Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.parseWarcFiles#docdelivery", updateStart); } } curInputFile++; Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.parseWarcFiles#fullarcprocess", start); Instrument.log(arcsIndex < args.length-1); // Don't log the last on info to avoid near-duplicate logging } // Submit any remaining docs: checkSubmission(solrWeb, docs, batchSize, true); if (!disableCommit) { // Commit the updates: commit(solrWeb); } long endTime = System.currentTimeMillis(); System.out.println("WARC Indexer Finished in " + ((endTime - startTime) / 1000.0) + " seconds."); } private static void commit( SolrWebServer solrWeb) { // Commit any Solr Updates if( solrWeb != null ) { try { final long start = System.nanoTime(); solrWeb.commit(); Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.commit#success", start); } catch( SolrServerException s ) { log.warn( "SolrServerException when committing.", s ); } catch( IOException i ) { log.warn( "IOException when committing.", i ); } } } /** * Checks whether a List of SolrInputDocuments has grown large enough to * be submitted to a SolrWebServer. * * @param solr * @param docs * @param limit * @throws SolrServerException * @throws IOException */ private static void checkSubmission(SolrWebServer solr, List<SolrInputDocument> docs, int limit, boolean force) { if (docs.size() > 0 && docs.size() >= limit || force) { try { final long start = System.nanoTime(); if (log.isTraceEnabled() || debugMode) { for (SolrInputDocument doc : docs) { try { solr.updateSolrDoc(doc); } catch (Exception e) { log.error( "Failed to post document - got exception: ", e); log.error("Failed document was:\n" + ClientUtils.toXML(doc)); System.exit(1); } } } else { solr.add(docs); } Instrument.timeRel( "WARCIndexerCommand.parseWarcFiles#docdelivery", "WARCIndexerCommanc.checkSubmission#solradd", start); docs.clear(); } catch (SolrServerException s) { log.warn("SolrServerException: ", s); } catch (IOException i) { log.warn("IOException: ", i); } } } public static void prettyPrintXML( String doc ) throws TransformerFactoryConfigurationError, TransformerException { Transformer transformer = TransformerFactory.newInstance().newTransformer(); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); //initialize StreamResult with File object to save to file StreamResult result = new StreamResult(new StringWriter()); StreamSource source = new StreamSource(new StringReader(doc)); transformer.transform(source, result); String xmlString = result.getWriter().toString(); System.out.println(xmlString); } /** * @param xml * @param file * @throws IOException * @throws TransformerFactoryConfigurationError * @throws TransformerException */ public static void writeXMLToFile( String xml, File file ) throws IOException, TransformerFactoryConfigurationError, TransformerException { Result result = new StreamResult(file); Source source = new StreamSource(new StringReader(xml)); Transformer transformer = TransformerFactory.newInstance().newTransformer(); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); //FileUtils.writeStringToFile(file, xml); transformer.transform(source, result); } /** * @param options */ private static void printUsage( Options options ) { HelpFormatter helpFormatter = new HelpFormatter(); helpFormatter.setWidth( 80 ); helpFormatter.printHelp( CLI_USAGE, CLI_HEADER, options, CLI_FOOTER ); } }