/*
 *  WPCleaner: A tool to help on Wikipedia maintenance tasks.
 *  Copyright (C) 2016  Nicolas Vervelle
 *
 *  See README.txt file for licensing information.
 */

package org.wikipediacleaner.api.dump;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.InputSource;


/**
 * Wiki dumps processor.
 * <p>
 * Opens a bzip2-compressed dump file, decodes it as UTF-8 text and feeds it
 * to a SAX parser whose events are delivered to a {@link DumpHandler}
 * configured with the page processor supplied at construction time.
 */
public class DumpProcessor {

  /** Logger */
  private final Log log = LogFactory.getLog(DumpProcessor.class);

  /** Page processor (assigned once in the constructor, may be whatever the caller supplied). */
  private final PageProcessor pageProcessor;

  /**
   * Create a wiki dumps processor.
   *
   * @param pageProcessor Page processor handed to the {@link DumpHandler}
   *                      for every dump that is processed.
   */
  public DumpProcessor(PageProcessor pageProcessor) {
    this.pageProcessor = pageProcessor;
  }

  /**
   * Process a wiki dump.
   * <p>
   * Any exception raised while opening, decompressing or parsing the file is
   * logged and not propagated to the caller. All resources opened here are
   * closed before returning, whether or not processing succeeded.
   *
   * @param file File containing the wiki dump (bzip2-compressed XML).
   *             Nothing is done if it is <code>null</code>.
   */
  public void processDump(File file) {
    if (file == null) {
      return;
    }

    // Declared outside the try block so that the finally clause can release
    // whatever part of the chain was successfully opened.
    FileInputStream fis = null;
    BufferedInputStream bis = null;
    BZip2CompressorInputStream bzis = null;
    Reader reader = null;
    try {
      SAXParserFactory factory = SAXParserFactory.newInstance();
      SAXParser parser = factory.newSAXParser();

      // file -> buffering -> bzip2 decompression -> UTF-8 decoding
      fis = new FileInputStream(file);
      bis = new BufferedInputStream(fis);
      bzis = new BZip2CompressorInputStream(bis);
      reader = new InputStreamReader(bzis, "UTF-8");

      InputSource is = new InputSource(reader);
      DumpHandler dh = new DumpHandler();
      dh.setPageProcessor(pageProcessor);
      parser.parse(is, dh);
    } catch (Exception e) {
      log.error("Error processing dump file", e);
    } finally {
      // Close from the outermost wrapper to the innermost stream. Each close
      // is attempted independently so that a failure on one wrapper cannot
      // prevent the underlying file handle from being released.
      closeAndLog(reader);
      closeAndLog(bzis);
      closeAndLog(bis);
      closeAndLog(fis);
    }
  }

  /**
   * Close a resource, logging (instead of propagating) any failure.
   *
   * @param resource Resource to close, ignored if <code>null</code>.
   */
  private void closeAndLog(Closeable resource) {
    if (resource == null) {
      return;
    }
    try {
      resource.close();
    } catch (Exception e) {
      log.error("Error closing dump file", e);
    }
  }
}