package org.aksw.gerbil.dataset.impl.senseval;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.xml.sax.InputSource;
/**
 * Dataset that reads its documents from a Senseval XML file using a SAX
 * parser and a {@link SensevalSAXHandler}.
 *
 * <p>
 * When the dataset is flagged as Senseval-3, the file content is pre-processed
 * before parsing: ampersands that do not start an entity reference are escaped
 * as {@code &amp;} so that the parser does not reject them.
 * </p>
 */
public class SensevalDataset extends AbstractDataset implements InitializableDataset {

    /** Encoding used for reading the file and announced to the SAX parser. */
    private static final String ENCODING = "UTF-8";

    /**
     * Matches an ampersand that is NOT the start of a named, decimal or
     * hexadecimal entity reference (e.g. {@code &amp;}, {@code &#38;},
     * {@code &#x26;}). Only these bare ampersands need escaping; already
     * escaped ones must be left alone to avoid double escaping.
     */
    private static final Pattern BARE_AMPERSAND = Pattern
            .compile("&(?!(?:[A-Za-z_:][A-Za-z0-9._:-]*|#[0-9]+|#x[0-9A-Fa-f]+);)");

    /** Documents loaded by the last successful {@link #init()} call. */
    protected List<Document> documents;
    private final String wordsFile;
    private final boolean senseval3;

    /**
     * Creates a dataset that is not treated as Senseval-3.
     *
     * @param wordsFile
     *            path of the XML file containing the dataset
     */
    public SensevalDataset(String wordsFile) {
        this(wordsFile, "false");
    }

    /**
     * Creates a dataset for the given file.
     *
     * @param wordsFile
     *            path of the XML file containing the dataset
     * @param senseval3
     *            {@code "true"} (ignoring case) enables the Senseval-3
     *            ampersand pre-processing; any other value, including
     *            {@code null}, disables it
     */
    public SensevalDataset(String wordsFile, String senseval3) {
        this.wordsFile = wordsFile;
        this.senseval3 = Boolean.parseBoolean(senseval3);
        this.documents = new ArrayList<Document>();
    }

    /**
     * @return the number of documents currently held by this dataset
     */
    @Override
    public int size() {
        return documents.size();
    }

    /**
     * @return the list of documents currently held by this dataset
     */
    @Override
    public List<Document> getInstances() {
        return documents;
    }

    /**
     * Parses the configured file and replaces the current document list with
     * the result. If loading fails the previous list is kept.
     *
     * @throws GerbilException
     *             if the file cannot be read or parsed
     */
    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(this.wordsFile));
    }

    /**
     * Parses the given file into a new list of documents.
     *
     * @param file
     *            the XML file to read
     * @return a newly created list that was handed to the
     *         {@link SensevalSAXHandler} during parsing
     * @throws GerbilException
     *             with {@link ErrorTypes#DATASET_LOADING_ERROR} if anything
     *             goes wrong while reading or parsing
     */
    private List<Document> loadDocuments(File file) throws GerbilException {
        // Parse into a fresh list so that repeated init() calls do not
        // accumulate duplicates and a failed parse leaves no partial state.
        List<Document> loaded = new ArrayList<Document>();
        InputStream in = null;
        try {
            in = openStream(file);
            InputSource source = new InputSource(in);
            source.setEncoding(ENCODING);
            SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
            saxParser.parse(source, new SensevalSAXHandler(loaded));
        } catch (Exception e) {
            throw new GerbilException("Exception while reading dataset.", e,
                    ErrorTypes.DATASET_LOADING_ERROR);
        } finally {
            closeQuietly(in);
        }
        return loaded;
    }

    /**
     * Opens the byte stream that is handed to the parser. For Senseval-3 the
     * whole file is read, bare ampersands are escaped and the trimmed result
     * is served from memory; otherwise the file is streamed directly.
     */
    private InputStream openStream(File file) throws IOException {
        if (!senseval3) {
            return new FileInputStream(file);
        }
        String content = org.apache.commons.io.FileUtils.readFileToString(file, ENCODING);
        content = BARE_AMPERSAND.matcher(content).replaceAll("&amp;").trim();
        // Encode explicitly: the InputSource is declared as UTF-8, so the
        // platform default charset must not be used here.
        return new ByteArrayInputStream(content.getBytes(ENCODING));
    }

    /** Closes the stream if it was opened, ignoring failures of close(). */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ignored) {
            // Nothing useful can be done; parsing has already finished or a
            // more relevant exception is being propagated.
        }
    }
}