package org.wikipedia.miner.extract;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import org.wikipedia.miner.extract.steps.finalSummary.FinalSummaryStep;
import org.wikipedia.miner.extract.steps.labelOccurrences.LabelOccurrenceStep;
import org.wikipedia.miner.extract.steps.labelSenses.LabelSensesStep;
import org.wikipedia.miner.extract.steps.pageDepth.PageDepthStep;
import org.wikipedia.miner.extract.steps.pageSummary.PageSummaryStep;
import org.wikipedia.miner.extract.steps.primaryLabel.PrimaryLabelStep;
import org.wikipedia.miner.extract.steps.sortedPages.PageSortingStep;
import org.wikipedia.miner.extract.util.Languages;
import org.wikipedia.miner.extract.util.Languages.Language;

/**
 * This class extracts summaries (link graphs, etc.) from Wikipedia xml dumps.
 * It calls a sequence of Hadoop Map/Reduce jobs to do so in a scalable, timely fashion.
 *
 * @author dnk2
 */
public class DumpExtractor {

	private Configuration conf ;
	private String[] args ;

	private Path inputFile ;
	private Path langFile ;
	private String lang ;
	private Path sentenceModel ;
	private Path workingDir ;
	private Path finalDir ;

	//private Logger logger ;

	public static final String KEY_INPUT_FILE = "wm.inputDir" ;
	public static final String KEY_OUTPUT_DIR = "wm.workingDir" ;
	public static final String KEY_LANG_FILE = "wm.langFile" ;
	public static final String KEY_LANG_CODE = "wm.langCode" ;
	public static final String KEY_SENTENCE_MODEL = "wm.sentenceModel" ;

	public static final String LOG_ORPHANED_PAGES = "orphanedPages" ;
	public static final String LOG_WEIRD_LABEL_COUNT = "wierdLabelCounts" ;
	public static final String LOG_MEMORY_USE = "memoryUsage" ;

	public static final String OUTPUT_SITEINFO = "final/siteInfo.xml" ;
	public static final String OUTPUT_PROGRESS = "tempProgress.csv" ;
	public static final String OUTPUT_TEMPSTATS = "tempStats.csv" ;
	public static final String OUTPUT_STATS = "final/stats.csv" ;

	DateFormat timeFormat = new SimpleDateFormat("HH:mm:ss") ;

	public DumpExtractor(String[] args) throws Exception {
		GenericOptionsParser gop = new GenericOptionsParser(args) ;
		conf = gop.getConfiguration() ;
		//outputFileSystem = FileSystem.get(conf);
		this.args = gop.getRemainingArgs() ;
		configure() ;
	}

	public static void main(String[] args) throws Exception {
		//PropertyConfigurator.configure("log4j.properties");
		DumpExtractor de = new DumpExtractor(args) ;
		int result = de.run();
		System.exit(result) ;
	}
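
	/*
	 * Example invocation (the jar name and paths below are illustrative only, not taken
	 * from this project):
	 *
	 *   hadoop jar wikipedia-miner.jar org.wikipedia.miner.extract.DumpExtractor \
	 *       enwiki-pages-articles.xml languages.xml en en-sent.bin \
	 *       hdfs:///user/hadoop/wm_working hdfs:///user/hadoop/wm_final
	 *
	 * The six arguments are, in order: the Wikipedia xml dump, the language configuration
	 * file, the language code, the openNLP sentence detection model, the hdfs-writable
	 * working directory, and the directory where the final summaries will be written.
	 * These are the same arguments validated in configure() below.
	 */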

	public static JobConf configureJob(JobConf conf, String[] args) {
		conf.set(KEY_INPUT_FILE, args[0]) ;
		conf.set(KEY_LANG_FILE, args[1]) ;
		conf.set(KEY_LANG_CODE, args[2]) ;
		conf.set(KEY_SENTENCE_MODEL, args[3]) ;
		conf.set(KEY_OUTPUT_DIR, args[4]) ;

		//set a reasonable number of maps. This is going to be ignored for very large inputs (e.g. the en wiki dump) anyway.
		conf.setNumMapTasks(16) ;

		//force one reducer by default. These don't take very long, and multiple reducers would make the finalise-file functions more complicated.
		conf.setNumReduceTasks(1) ;

		//many of our tasks require pre-loading lots of data, may as well reuse each jvm as much as we can.
		//conf.setNumTasksToExecutePerJvm(-1) ;

		//conf.setInt("mapred.tasktracker.map.tasks.maximum", 2) ;
		//conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1) ;

		//TODO: really don't want this hard coded.
		conf.set("mapred.child.java.opts", "-Xmx500M -Dapple.awt.UIElement=true") ;

		//conf.setBoolean("mapred.used.genericoptionsparser", true) ;

		return conf ;
	}

	private FileSystem getFileSystem(Path path) throws IOException {
		return path.getFileSystem(conf) ;
	}

	private Path getPath(String pathStr) {
		return new Path(pathStr) ;
	}

	private FileStatus getFileStatus(Path path) throws IOException {
		FileSystem fs = path.getFileSystem(conf);
		return fs.getFileStatus(path) ;
	}

	private void configure() throws Exception {

		if (args.length != 6)
			throw new IllegalArgumentException("Please specify an xml dump of wikipedia, a language.xml config file, a language code, an openNLP sentence detection model, an hdfs-writable working directory, and an output directory") ;

		//check input file
		inputFile = getPath(args[0]);
		FileStatus fs = getFileStatus(inputFile) ;
		if (fs.isDir() || !fs.getPermission().getUserAction().implies(FsAction.READ))
			throw new IOException("'" + inputFile + "' is not readable or does not exist") ;

		//check lang file and language
		langFile = getPath(args[1]) ;
		lang = args[2] ;
		//TODO: should read language here, just to check it is valid
		/*
		Language language = Languages.load(new File(langFile.toString())).get(lang) ;
		if (language == null)
			throw new IOException("Could not load language configuration for '" + lang + "' from '" + langFile + "'") ;
		*/

		//check sentence detection model
		sentenceModel = new Path(args[3]) ;
		fs = getFileStatus(sentenceModel) ;
		if (fs.isDir() || !fs.getPermission().getUserAction().implies(FsAction.READ))
			throw new IOException("'" + sentenceModel + "' is not readable or does not exist") ;

		//check working directory
		workingDir = new Path(args[4]) ;

		if (!getFileSystem(workingDir).exists(workingDir))
			getFileSystem(workingDir).mkdirs(workingDir) ;

		fs = getFileStatus(workingDir) ;
		if (!fs.isDir() || !fs.getPermission().getUserAction().implies(FsAction.WRITE))
			throw new IOException("'" + workingDir + "' is not a writable directory") ;

		//set up directory where final data will be placed
		finalDir = new Path(args[5]) ;

		/*
		if (getFileSystem(finalDir).exists(finalDir))
			getFileSystem(finalDir).delete(finalDir, true) ;

		getFileSystem(finalDir).mkdirs(finalDir) ;

		fs = getFileStatus(finalDir) ;
		if (!fs.isDir() || !fs.getPermission().getUserAction().implies(FsAction.WRITE))
			throw new IOException("'" + finalDir + "' is not a writable directory") ;
		*/
	}
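
	/**
	 * Runs the full extraction pipeline. The first six steps below are launched as Hadoop
	 * jobs via ToolRunner, and later steps build on the output that earlier steps leave in
	 * the working directory; the final step is run directly and writes the combined
	 * summaries to the output directory:
	 *
	 * <ol>
	 *   <li>PageSummaryStep: basic page summaries, repeated until no further iterations are required</li>
	 *   <li>PageSortingStep: sorts the summarised pages</li>
	 *   <li>PageDepthStep: page depths, repeated until no further iterations are required</li>
	 *   <li>LabelSensesStep: label senses</li>
	 *   <li>PrimaryLabelStep: primary labels</li>
	 *   <li>LabelOccurrenceStep: label occurrences</li>
	 *   <li>FinalSummaryStep: combines the above into the final summary files</li>
	 * </ol>
	 */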
	private int run() throws Exception {

		Logger.getLogger(DumpExtractor.class).info("Extracting site info") ;
		extractSiteInfo() ;

		//extract basic page summaries
		int summaryIteration = 0 ;
		PageSummaryStep summaryStep ;
		while (true) {
			//long startTime = System.currentTimeMillis() ;

			summaryStep = new PageSummaryStep(workingDir, summaryIteration) ;
			ToolRunner.run(new Configuration(), summaryStep, args);

			//System.out.println("initial step completed in " + timeFormat.format(System.currentTimeMillis()-startTime)) ;

			if (!summaryStep.furtherIterationsRequired())
				break ;
			else
				summaryIteration++ ;
		}

		//sort the summarised pages
		PageSortingStep sortingStep = new PageSortingStep(workingDir, summaryStep) ;
		ToolRunner.run(new Configuration(), sortingStep, args);

		//calculate page depths
		int depthIteration = 0 ;
		PageDepthStep depthStep ;
		while (true) {
			depthStep = new PageDepthStep(workingDir, depthIteration, sortingStep) ;
			ToolRunner.run(new Configuration(), depthStep, args);

			if (!depthStep.furtherIterationsRequired())
				break ;
			else
				depthIteration++ ;
		}

		//gather label senses
		LabelSensesStep sensesStep = new LabelSensesStep(workingDir, sortingStep) ;
		ToolRunner.run(new Configuration(), sensesStep, args);

		//gather primary labels
		PrimaryLabelStep primaryLabelStep = new PrimaryLabelStep(workingDir, sensesStep) ;
		ToolRunner.run(new Configuration(), primaryLabelStep, args);

		//gather label occurrences
		LabelOccurrenceStep occurrencesStep = new LabelOccurrenceStep(workingDir, sensesStep) ;
		ToolRunner.run(new Configuration(), occurrencesStep, args);

		//combine everything into the final summary files
		FinalSummaryStep finalStep = new FinalSummaryStep(finalDir, sortingStep, depthStep, primaryLabelStep, sensesStep, occurrencesStep) ;
		finalStep.run() ;

		return 0 ;
	}

	private void extractSiteInfo() throws IOException {

		BufferedReader reader = new BufferedReader(new InputStreamReader(getFileSystem(inputFile).open(inputFile))) ;
		BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(getFileSystem(workingDir).create(new Path(workingDir + "/" + OUTPUT_SITEINFO)))) ;

		try {
			String line = null;
			boolean startedWriting = false ;

			//copy the <siteinfo> element of the dump verbatim to OUTPUT_SITEINFO within the working directory
			while ((line = reader.readLine()) != null) {

				if (!startedWriting && line.matches("\\s*\\<siteinfo\\>\\s*"))
					startedWriting = true ;

				if (startedWriting) {
					writer.write(line) ;
					writer.newLine() ;

					if (line.matches("\\s*\\<\\/siteinfo\\>\\s*"))
						break ;
				}
			}
		} finally {
			reader.close() ;
			writer.close();
		}
	}
}