import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Locale;

/**
 * Converts a sentence-delimited, one-token-per-line file into plain text.
 *
 * <p>The input file, whose path is given in {@code args[0]}, is read one line
 * at a time:
 * <ul>
 *   <li>a line starting with {@code <s>} begins a new sentence and discards
 *       anything accumulated so far;</li>
 *   <li>a line starting with {@code </s>} writes the accumulated sentence,
 *       lower-cased and trimmed, as one line of the output file;</li>
 *   <li>every other line is treated as a token line: the text before the
 *       first tab is appended to the current sentence followed by a single
 *       space, and the remaining columns (tags) are ignored.</li>
 * </ul>
 * The result is saved to {@code corpus.txt} in the current working directory.
 *
 * @author Jasmin Suljkic
 * @author Jonatan Asketorp
 */
public class ukWacToTxt {

    /** Prefix of the line that opens a sentence. */
    private static final String SENTENCE_START = "<s>";

    /** Prefix of the line that closes a sentence. */
    private static final String SENTENCE_END = "</s>";

    /** Name of the output file, created in the current working directory. */
    private static final String OUTPUT_FILE = "corpus.txt";

    /**
     * Program entry point.
     *
     * <p>I/O failures are reported on standard error and do not propagate to
     * the caller.
     *
     * @param args command-line arguments; {@code args[0]} is the path of the
     *             input file
     * @throws IllegalArgumentException if no input path is supplied
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException("Usage: java ukWacToTxt <input-file>");
        }

        // NOTE(review): FileReader/FileWriter use the platform default charset,
        // as the original code did. Confirm the corpus encoding before pinning
        // an explicit charset here.
        // try-with-resources closes (and flushes) both streams even when an
        // IOException interrupts the conversion. The reader is opened first so
        // that a missing input file does not truncate an existing corpus.txt.
        try (BufferedReader reader = new BufferedReader(new FileReader(args[0]));
             BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE))) {
            convert(reader, writer);
        } catch (IOException e) {
            System.err.println("Failed to convert '" + args[0] + "' to " + OUTPUT_FILE);
            e.printStackTrace();
        }
    }

    /** Copies every complete sentence from {@code reader} to {@code writer}, one per line. */
    private static void convert(BufferedReader reader, BufferedWriter writer) throws IOException {
        StringBuilder sentence = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith(SENTENCE_START)) {
                // Start afresh; tokens seen outside a sentence are dropped here.
                sentence.setLength(0);
            } else if (line.startsWith(SENTENCE_END)) {
                // Locale.ROOT keeps the lower-casing independent of the JVM's
                // default locale (e.g. Turkish dotless i).
                writer.write(sentence.toString().toLowerCase(Locale.ROOT).trim());
                writer.newLine();
            } else {
                sentence.append(firstColumn(line)).append(' ');
            }
        }
    }

    /**
     * Returns the text before the first tab of {@code line}, or the whole line
     * when it contains no tab. Unlike {@code line.split("\\t")[0]}, this is
     * safe for lines that consist only of tabs.
     */
    private static String firstColumn(String line) {
        int tab = line.indexOf('\t');
        return tab < 0 ? line : line.substring(0, tab);
    }
}