WikipediaDumpSplitter.java example

Explorer
marytts-master
/**
 * Copyright 2000-2009 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.dbselection;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Vector;

public class WikipediaDumpSplitter {

	private int maxPages = 25000;
	private String xmlWikipediaDumpFile = null;
	private String dirOuputFiles = null;

	public void setXmlWikipediaDumpFile(String str) {
		xmlWikipediaDumpFile = str;
	}

	public void setDirOuputFiles(String str) {
		dirOuputFiles = str;
	}

	public void setMaxPages(int val) {
		maxPages = val;
	}

	public String getXmlWikipediaDumpFile() {
		return xmlWikipediaDumpFile;
	}

	public String getDirOuputFiles() {
		return dirOuputFiles;
	}

	public int getMaxPages() {
		return maxPages;
	}

	/***
	 * This function splits a big XML wikipedia file (ex. 19GB for enwiki) into small XML chunks according to the specified
	 * maximum number of pages per chunk.
	 * 
	 * @param xmlFile
	 *            name of the XML wikipedia file.
	 * @param dirFiles
	 *            directory where to save the small xml chunks.
	 * @param maxPagesPerChunk
	 *            maximum number of pages per chunk, it can be for example 250000 pages (~30MB).
	 */
	private void splitWikipediaDump(String xmlFile, String dirFiles, int maxPagesPerChunk) {

		int totalPageNumber = 0;
		int currentPageNumber = 0;
		int numFiles = 0;
		String outFileName = "";
		String nextLine;
		boolean checkSiteInfo = true;
		boolean siteInfo = false;
		StringBuilder strInfo = new StringBuilder();
		FileWriter outputStream = null;
		int num = (int) Math.round(maxPagesPerChunk * 0.50);

		// we need to scan line by line a big (for ex. 19GB for enwiki) xml file

		BufferedReader inputStream = null;

		try {

			inputStream = new BufferedReader(new FileReader(xmlFile));
			while ((nextLine = inputStream.readLine()) != null) {

				// get first the siteinfo
				if (checkSiteInfo) {
					if (nextLine.startsWith("  <siteinfo"))
						siteInfo = true;
					else if (nextLine.startsWith("  </siteinfo")) {
						siteInfo = false;
						checkSiteInfo = false;
						strInfo.append(nextLine + "\n");
						System.out.println("Extracted <siteinfo> from header, it will be added to all the xml files.\n");
						// System.out.println("siteInfo:" + strInfo);
					} else if (nextLine.startsWith("  </page")) {
						// if a page appears before the siteInfo maybe there is no siteinfo in the header
						System.out.println("Error: problem with siteInfo in file " + xmlFile);
						return;
					}
					if (siteInfo)
						strInfo.append(nextLine + "\n");

				} else if (!nextLine.startsWith("<mediawiki") && !nextLine.startsWith("</mediawiki>")) {

					if (currentPageNumber == maxPagesPerChunk) {
						outputStream.write("</mediawiki>\n");
						currentPageNumber = 0;
						outputStream.close();
						outputStream = null;
					}
					if (outputStream == null) {
						numFiles++;
						outFileName = dirFiles + "page" + Integer.toString(numFiles) + ".xml";
						System.out.println("outFileName(" + maxPagesPerChunk + "):" + outFileName);
						outputStream = new FileWriter(outFileName);
						outputStream.write("<mediawiki>\n");
						// we need the siteinfo at the begining of each chunk
						outputStream.write(strInfo.toString());
						outputStream.write(nextLine + "\n");
					} else
						outputStream.write(nextLine + "\n");

					if (nextLine.startsWith("  </page")) {
						currentPageNumber++;
						totalPageNumber++;
						if ((totalPageNumber % num) == 0)
							System.out.println("number of wikipages = " + totalPageNumber);
					}
				} // if no mediawiki line
			} // while next line

			// final part if remaining pages
			if (currentPageNumber > 0) {
				System.out.println("number of wikipages = " + totalPageNumber + " last chunk with " + currentPageNumber
						+ " pages.");
				outputStream.write("</mediawiki>\n");
				outputStream.close();
			}
		} catch (Exception e) {
			System.err.println("Exception: " + e.getMessage());
		} finally {
			try {
				if (inputStream != null) {
					inputStream.close();
				}
			} catch (Exception e) {
				System.err.println("Exception: " + e.getMessage());
			}
		}
	}

	/**
	 * Read and parse the command line args
	 * 
	 * @param args
	 *            the args
	 * @return true, if successful, false otherwise
	 */
	private boolean readArgs(String[] args) {

		String help = "\nUsage: java WikipediaDumpSplitter -xmlDump xmlDumpFile -dirOut outputFilesDir -maxPages maxNumberPages \n"
				+ "      -xmlDump xml wikipedia dump file. \n"
				+ "      -outDir directory where the small xml chunks will be saved.\n"
				+ "      -maxPages maximum number of pages of each small xml chunk (if no specified default 25000). \n\n";

		if (args.length >= 4) { // minimum 2 parameters
			for (int i = 0; i < args.length; i++) {
				if (args[i].contentEquals("-xmlDump") && args.length >= (i + 1))
					setXmlWikipediaDumpFile(args[++i]);

				else if (args[i].contentEquals("-outDir") && args.length >= (i + 1))
					setDirOuputFiles(args[++i]);
				// this argument is optional
				else if (args[i].contentEquals("-maxPages") && args.length >= (i + 1))
					setMaxPages(Integer.parseInt(args[++i]));
			}
		} else {
			System.out.println(help);
			return false;
		}

		if (getXmlWikipediaDumpFile() == null || getDirOuputFiles() == null) {
			System.out.println("\nMissing required parameter -xmlDump or -dirOut.");
			System.out.println(help);
			return false;
		}
		if (getMaxPages() == 0) {
			System.out.println("Number of pages per xml file not specified. Using defaul value maxPages = 25000");
			setMaxPages(25000);
		}
		return true;
	}

	private void printParameters() {
		System.out.println("\nWikipediaDumpSplitter parameters:" + "\n  -xmlDump  " + getXmlWikipediaDumpFile()
				+ "\n  -outDir   " + getDirOuputFiles() + "\n  -maxPages " + getMaxPages() + "\n");

	}

	public static void main(String[] args) throws Exception {
		String wFile, cmdLine;
		Vector<String> filesToProcess;
		Vector<String> filesDone;

		WikipediaDumpSplitter wiki = new WikipediaDumpSplitter();

		/* check the arguments */
		if (!wiki.readArgs(args))
			return;
		wiki.printParameters();

		wiki.splitWikipediaDump(wiki.getXmlWikipediaDumpFile(), wiki.getDirOuputFiles(), wiki.getMaxPages());

	}
}