/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package di.uniba.it.tri.extractor;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
/**
* Extract textual content from a file in the Gutenberg format
* @author pierpaolo
*/
public class GutenbergExtractor implements Extractor {

    /** A blank line (two consecutive newlines) marks the end of a boilerplate paragraph. */
    private static final String PARAGRAPH_END = "\n\n";

    /**
     * Markers of intro boilerplate. For each one, the text from the first
     * occurrence of the marker up to the next blank line is removed.
     * They are processed in this order.
     */
    private static final String[] INTRO_MARKERS = {
        "*** START OF THIS PROJECT GUTENBERG",
        "Produced by",
        "This file was produced"
    };

    /**
     * Markers of trailing boilerplate. For each one, everything from the first
     * occurrence of the marker to the end of the text is removed.
     * They are processed in this order.
     */
    private static final String[] END_MARKERS = {
        "End of the Project Gutenberg",
        "*** END OF THIS PROJECT GUTENBERG"
    };

    /**
     * Reads the whole file, strips the Gutenberg boilerplate paragraphs
     * identified by {@link #INTRO_MARKERS} and {@link #END_MARKERS}, and
     * returns the remaining text.
     *
     * <p>Every line of the file is terminated with a single {@code '\n'} in the
     * returned text, whatever line terminator the file used.
     *
     * <p>NOTE(review): the file is decoded by {@link FileReader} with the JVM
     * default charset; confirm that this matches the encoding of the corpus.
     *
     * @param txtfile the Gutenberg plain-text file to read
     * @return a reader over the cleaned text
     * @throws IOException if the file cannot be opened or read
     */
    @Override
    public StringReader extract(File txtfile) throws IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(txtfile))) {
            String line;
            // readLine() returns null at end of stream. ready() is not an
            // end-of-file test: it only tells whether a read would block.
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        //remove intro
        for (String marker : INTRO_MARKERS) {
            removeParagraph(sb, marker);
        }
        //remove end
        for (String marker : END_MARKERS) {
            truncateAt(sb, marker);
        }
        return new StringReader(sb.toString());
    }

    /**
     * Deletes the text from the first occurrence of {@code marker} up to and
     * including the first newline of the blank line that follows it. Does
     * nothing when the marker is absent or no blank line follows it.
     */
    private static void removeParagraph(StringBuilder sb, String marker) {
        int start = sb.indexOf(marker);
        if (start < 0) {
            return;
        }
        int end = sb.indexOf(PARAGRAPH_END, start + 1);
        if (end >= 0) {
            // end + 1 removes only the first '\n', so one newline is kept as separator.
            sb.delete(start, end + 1);
        }
    }

    /**
     * Deletes everything from the first occurrence of {@code marker} to the
     * end of the buffer. Does nothing when the marker is absent.
     */
    private static void truncateAt(StringBuilder sb, String marker) {
        int start = sb.indexOf(marker);
        if (start >= 0) {
            sb.setLength(start);
        }
    }
}