package edu.uncc.cs.watsonsim.scripts;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
/**
 * Line-oriented converter that scans {@code Test2.xml} for {@code <page>}
 * elements and writes one {@code <DOC>} record per page to {@code out.txt}.
 *
 * <p>For every page whose {@code <title>} line does not contain the text
 * "Wiktionary", the record consists of the title, every line containing
 * {@code "# "} (presumably wiki-markup definitions), and every line containing
 * {@code "*"} found inside an {@code ===Etymology===} section. Square and
 * curly brackets are removed from the copied lines.
 *
 * <p>The input is matched with plain substring tests, one line at a time; it
 * is not parsed as XML.
 */
public class WiktionaryParser {

    /** Input file, resolved against the working directory. */
    private static final String INPUT_FILE = "Test2.xml";

    /** Output file, resolved against the working directory; overwritten on each run. */
    private static final String OUTPUT_FILE = "out.txt";

    /**
     * Converts {@code Test2.xml} into {@code out.txt} and prints the number of
     * records written.
     *
     * @param args ignored
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        int exported = 0;
        // Both streams are managed by try-with-resources so the output is
        // flushed and closed even when an exception interrupts the conversion.
        // The charset is fixed to UTF-8 instead of the platform default.
        try (BufferedReader in = new BufferedReader(
                     new InputStreamReader(new FileInputStream(INPUT_FILE), StandardCharsets.UTF_8));
             BufferedWriter out = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(OUTPUT_FILE), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.contains("<page>")) {
                    out.newLine();
                    out.newLine();
                    if (convertPage(in, out)) {
                        exported++;
                    }
                }
            }
        }
        // Reported only after the writer has been closed successfully.
        System.out.println(exported + " definitions exported to " + OUTPUT_FILE);
    }

    /**
     * Consumes the lines of one page, starting just after its {@code <page>}
     * line, and writes the corresponding record.
     *
     * <p>Reading stops after the {@code </page>} line, at end of input, or at a
     * {@code <title>} line containing "Wiktionary" (such pages are skipped; the
     * caller then simply scans forward to the next {@code <page>}).
     *
     * @param in  source positioned on the first line inside the page
     * @param out destination for the record
     * @return {@code true} if a {@code <DOC>} record was written for this page
     * @throws IOException on read or write failure
     */
    private static boolean convertPage(BufferedReader in, BufferedWriter out) throws IOException {
        boolean docOpen = false;
        String line = in.readLine();
        // The null check keeps a truncated input from raising NullPointerException.
        while (line != null && !line.contains("</page>")) {
            if (line.contains("<title>")) {
                if (line.contains("Wiktionary")) {
                    // Skipped page: stop here without emitting anything for it.
                    break;
                }
                writeHeader(line, out);
                docOpen = true;
            }
            if (line.contains("# ")) {
                out.write(stripBrackets(line));
                out.newLine();
            }
            if (line.contains("===Etymology===")) {
                // The helper returns the line that ended the section; loop back
                // so that line (possibly </page> or null) is examined as usual.
                line = copyEtymology(in, out);
                continue;
            }
            line = in.readLine();
        }
        // Close the record only if one was opened, so skipped pages do not
        // leave unmatched closing tags in the output.
        if (docOpen) {
            out.write("</TEXT>");
            out.newLine();
            out.write("</DOC>");
        }
        return docOpen;
    }

    /**
     * Writes the separator rule and the opening {@code <DOC>}, {@code <TITLE>}
     * and {@code <TEXT>} tags for a page.
     *
     * @param titleLine the input line containing the {@code <title>} element
     * @param out       destination for the header
     * @throws IOException on write failure
     */
    private static void writeHeader(String titleLine, BufferedWriter out) throws IOException {
        String title = titleLine.replace("<title>", "").replace("</title>", "").trim();
        out.write("____________________________________");
        out.newLine();
        out.newLine();
        out.write("<DOC>");
        out.newLine();
        out.write("<TITLE>");
        out.write(title);
        out.write("</TITLE>");
        out.newLine();
        // No line break after <TEXT>: the first copied line follows it directly.
        out.write("<TEXT>");
    }

    /**
     * Copies the lines containing {@code "*"} from an etymology section,
     * followed by one blank line.
     *
     * <p>The section ends at the next line containing {@code "==="}, at the
     * {@code </page>} line, or at end of input, so a section that is last on
     * its page cannot run on into following pages.
     *
     * @param in  source positioned just after the {@code ===Etymology===} line
     * @param out destination for the copied lines
     * @return the line that ended the section, or {@code null} at end of input
     * @throws IOException on read or write failure
     */
    private static String copyEtymology(BufferedReader in, BufferedWriter out) throws IOException {
        String line = in.readLine();
        while (line != null && !line.contains("===") && !line.contains("</page>")) {
            if (line.contains("*")) {
                out.write(stripBrackets(line));
                out.newLine();
            }
            line = in.readLine();
        }
        out.newLine();
        return line;
    }

    /**
     * Returns {@code text} with every {@code [}, {@code ]}, <code>{</code> and
     * <code>}</code> character removed.
     */
    private static String stripBrackets(String text) {
        return text.replace("[", "").replace("]", "").replace("{", "").replace("}", "");
    }
}