/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.bayes;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.text.DecimalFormat;
import java.text.NumberFormat;

import com.google.common.io.Closeables;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * <p>The Bayes example package provides some helper classes for training the Naive Bayes classifier
 * on the Twenty Newsgroups data. See {@link org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups}
 * for details on running the trainer and on formatting the Twenty Newsgroups data for training.</p>
 *
 * <p>The easiest way to prepare the data is to use the ant task in core/build.xml:</p>
 *
 * <p>{@code ant extract-20news-18828}</p>
 *
 * <p>This runs the following argument line:</p>
 *
 * <p>{@code -p ${working.dir}/20news-18828/ -o ${working.dir}/20news-18828-collapse -a ${analyzer} -c UTF-8}</p>
 *
 * <p>To run the Wikipedia examples (assumes you've built the Mahout Job jar):</p>
 *
 * <ol>
 *   <li>Download the Wikipedia Dataset. Use the Ant target: {@code ant enwiki-files}</li>
 *   <li>Chunk the data using the WikipediaXmlSplitter (from the Hadoop home):
 *   {@code bin/hadoop jar $MAHOUT_HOME/target/mahout-examples-0.x
 *   org.apache.mahout.classifier.bayes.WikipediaXmlSplitter
 *   -d $MAHOUT_HOME/examples/temp/enwiki-latest-pages-articles.xml
 *   -o $MAHOUT_HOME/examples/work/wikipedia/chunks/ -c 64}</li>
 * </ol>
 */
public final class WikipediaXmlSplitter {

  private static final Logger log = LoggerFactory.getLogger(WikipediaXmlSplitter.class);

  private WikipediaXmlSplitter() { }

  public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true).withArgument(
      abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()).withDescription(
      "The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d").create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true).withArgument(
      abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()).withDescription(
      "The output directory to place the splits in:\n"
          + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
          + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
          + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
          + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
      .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false).withArgument(
      abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create()).withDescription("Amazon S3 ID key")
      .withShortName("i").create();

    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false).withArgument(
      abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create()).withDescription(
      "Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true).withArgument(
      abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
      "The Size of the chunk, in megabytes").withShortName("c").create();

    Option numChunksOpt = obuilder
      .withLongName("numChunks")
      .withRequired(false)
      .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
      .withDescription(
        "The maximum number of chunks to create. If specified, program will only create a subset of the chunks")
      .withShortName("n").create();
    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption(
      chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
      cmdLine = parser.parse(args);
    } catch (OptionException e) {
      log.error("Error while parsing options", e);
      CommandLineUtil.printHelp(group);
      return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    if (cmdLine.hasOption(s3IdOpt)) {
      String id = (String) cmdLine.getValue(s3IdOpt);
      conf.set("fs.s3n.awsAccessKeyId", id);
      conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
      String secret = (String) cmdLine.getValue(s3SecretOpt);
      conf.set("fs.s3n.awsSecretAccessKey", secret);
      conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
      numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    // MediaWiki export preamble written at the top of every chunk, so that each chunk
    // is a well-formed <mediawiki> document on its own
    String header =
        "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" "
            + "version=\"0.3\" "
            + "xml:lang=\"en\">\n"
            + " <siteinfo>\n"
            + "<sitename>Wikipedia</sitename>\n"
            + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + " <generator>MediaWiki 1.13alpha</generator>\n"
            + " <case>first-letter</case>\n"
            + " <namespaces>\n"
            + " <namespace key=\"-2\">Media</namespace>\n"
            + " <namespace key=\"-1\">Special</namespace>\n"
            + " <namespace key=\"0\" />\n"
            + " <namespace key=\"1\">Talk</namespace>\n"
            + " <namespace key=\"2\">User</namespace>\n"
            + " <namespace key=\"3\">User talk</namespace>\n"
            + " <namespace key=\"4\">Wikipedia</namespace>\n"
            + " <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + " <namespace key=\"6\">Image</namespace>\n"
            + " <namespace key=\"7\">Image talk</namespace>\n"
            + " <namespace key=\"8\">MediaWiki</namespace>\n"
            + " <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + " <namespace key=\"10\">Template</namespace>\n"
            + " <namespace key=\"11\">Template talk</namespace>\n"
            + " <namespace key=\"12\">Help</namespace>\n"
            + " <namespace key=\"13\">Help talk</namespace>\n"
            + " <namespace key=\"14\">Category</namespace>\n"
            + " <namespace key=\"15\">Category talk</namespace>\n"
            + " <namespace key=\"100\">Portal</namespace>\n"
            + " <namespace key=\"101\">Portal talk</namespace>\n"
            + " </namespaces>\n"
            + " </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);
    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
      // default compression format from http://download.wikimedia.org
      CompressionCodec codec = new BZip2Codec();
      it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
      // assume the user has previously de-compressed the dump file
      it = new FileLineIterator(dumpFile);
    }
    int filenumber = 0;
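    // Split loop: copy complete <page>...</page> blocks from the dump into a buffer and,
    // whenever the buffer exceeds the requested chunk size (or the dump is exhausted),
    // write it out as chunk-NNNN.xml prefixed with the header above and terminated by a
    // closing </mediawiki> tag, stopping once numChunks chunks have been written.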
    while (it.hasNext()) {
      String thisLine = it.next();
      if (thisLine.trim().startsWith("<page>")) {
        boolean end = false;
        // accumulate the whole <page> element into the current chunk buffer
        while (!thisLine.trim().startsWith("</page>")) {
          content.append(thisLine).append('\n');
          if (it.hasNext()) {
            thisLine = it.next();
          } else {
            end = true;
            break;
          }
        }
        content.append(thisLine).append('\n');

        if (content.length() > chunkSize || end) {
          // close the current chunk document and flush it to a numbered chunk file
          content.append("</mediawiki>");
          filenumber++;
          String filename = outputDirPath + "/chunk-" + decimalFormatter.format(filenumber) + ".xml";
          BufferedWriter chunkWriter =
              new BufferedWriter(new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
          try {
            chunkWriter.write(content.toString(), 0, content.length());
          } finally {
            Closeables.closeQuietly(chunkWriter);
          }
          if (filenumber >= numChunks) {
            break;
          }
          // start the next chunk with a fresh buffer seeded with the MediaWiki header
          content = new StringBuilder();
          content.append(header);
        }
      }
    }
  }
}