/**
 * @Author: Jonathan Shuman
 * @Purpose: For indexing an XML file from WikiQuotes
 *
 * Output is a series of text files with TREC formatted quotes.
 * Broken into a series of files with 1000 quote authors in each file.
 */
package edu.uncc.cs.watsonsim.scripts;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.Writer;
import java.util.Scanner;

/**
 * Command-line script that scans a Wikiquote XML dump line by line, collects
 * the title line and the lines of each {@code <text>} element into a
 * {@link WikiquoteQuote}, and writes every quote that is not flagged as trash
 * to a rolling series of UTF-8 output files.
 *
 * @author ShumanLaptop
 */
public class WikiquoteParser {

	/** Input dump read when no path is given on the command line. */
	private static final String DEFAULT_INPUT_FILE = "enwikiquote-20140121-pages-articles.xml";

	/** A new output file is started whenever the document counter reaches a multiple of this. */
	private static final int DOCS_PER_FILE = 1000;

	/**
	 * Runs the conversion.
	 *
	 * <p>Exit codes: 1 if the input cannot be opened, 2 if an output file
	 * cannot be created, 3 if writing a quote fails.
	 *
	 * @param args optional; {@code args[0]} is the path of the XML dump to
	 *             read. Defaults to {@value #DEFAULT_INPUT_FILE} in the
	 *             working directory.
	 */
	public static void main(String[] args) {
		String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_FILE;

		Scanner in = null;
		try {
			in = new Scanner(new InputStreamReader(
					new FileInputStream(new File(inputPath)), "UTF-8"));
		} catch (Exception e) {
			e.printStackTrace();
			System.exit(1);
		}

		int curDocNumber = 1;
		PrintWriter writer = null;
		WikiquoteQuote curQuote = new WikiquoteQuote();
		// True while we are between a line containing "<text" and one containing "</text>".
		boolean keepGrabbing = false;

		try {
			while (in.hasNextLine()) {
				// Start a new output file at document 1 and at every multiple of
				// DOCS_PER_FILE. The counter is bumped right away so this branch is
				// not taken again on the next input line; as a consequence the
				// numbers 1, 1000, 2000, ... are never assigned to a quote.
				if (curDocNumber % DOCS_PER_FILE == 0 || curDocNumber == 1) {
					if (writer != null) {
						writer.close();
					}
					Writer out = getNewFile(curDocNumber);
					if (out == null) {
						// getNewFile has already printed the underlying IOException.
						System.exit(2);
					}
					writer = new PrintWriter(out);
					curDocNumber++;
				}

				String curLine = in.nextLine();
				boolean opensText = curLine.contains("<text");
				boolean closesText = curLine.contains("</text>");
				boolean finishFile = false;

				if (curLine.contains("<title>")) {
					// The raw title line is handed over as-is; WikiquoteQuote decides
					// later (isTrash) whether the entry is kept.
					curQuote.setTitle(curLine);
				} else if (opensText) {
					keepGrabbing = true;
				} else if (closesText) {
					curQuote.setText(curQuote.getText() + "\n" + curLine);
					keepGrabbing = false;
					finishFile = true;
				}

				// Get more text this line. This stops once </text> has been found.
				if (keepGrabbing) {
					curQuote.setText(curQuote.getText()
							+ System.getProperty("line.separator") + curLine);
					if (opensText && closesText) {
						// The whole <text> element sits on this one line. Finish the
						// entry now; otherwise its text would run on into the next
						// article until some later </text> is seen.
						// NOTE(review): a self-closing "<text ... />" line is still
						// treated as an opening line - confirm whether the dump
						// contains such lines.
						keepGrabbing = false;
						finishFile = true;
					}
				}

				if (finishFile) {
					// Write this one to the file only if it is not trash.
					if (!curQuote.isTrash()) {
						curQuote.setDocNumber(curDocNumber);
						try {
							writeNonEmptyLines(writer, curQuote.toString());
							curDocNumber++;
						} catch (IOException e) {
							e.printStackTrace();
							System.exit(3);
						}
					}
					curQuote.reset();
				}
			}
		} finally {
			// Flush and close the last (usually partially filled) output file,
			// then the input. Closing the Scanner also closes the wrapped
			// reader and file stream.
			if (writer != null) {
				writer.close();
			}
			in.close();
		}
	}

	/**
	 * Writes every non-empty line of {@code text} to {@code writer}, each
	 * terminated by a single {@code "\n"}.
	 *
	 * <p>The text is read directly as characters, so it is not re-encoded
	 * through the platform default charset on its way to the output.
	 *
	 * @param writer destination for the lines
	 * @param text   text to split into lines and write
	 * @throws IOException if reading the text fails
	 */
	private static void writeNonEmptyLines(PrintWriter writer, String text) throws IOException {
		BufferedReader reader = new BufferedReader(new StringReader(text));
		String line;
		while ((line = reader.readLine()) != null) {
			if (!line.isEmpty()) {
				writer.print(line + "\n");
			}
		}
	}

	/**
	 * Creates the output file {@code wikiquote-trec-<curDocNumber>.txt} in the
	 * working directory and announces it on standard output.
	 *
	 * @param curDocNumber document number used in the file name
	 * @return a buffered UTF-8 writer for the new file, or {@code null} if the
	 *         file could not be created (the exception is printed to stderr)
	 */
	private static Writer getNewFile(int curDocNumber) {
		String filename = "wikiquote-trec-" + curDocNumber + ".txt";
		// We need to specify UTF-8 for special character encoding.
		try {
			Writer out = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(filename), "UTF-8"));
			System.out.println("created new File" + filename);
			return out;
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
	}
}