/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.erd;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.nio.charset.Charset;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.IOUtils;
/**
 * Dataset that is built from two tab separated files: a text file and an
 * annotation file.
 *
 * Every line of the text file is wrapped into an {@link ERDTrec} and, if it
 * consists of exactly two tab separated columns, registered under the value
 * of its first column. Every line of the annotation file that consists of
 * exactly five tab separated columns and whose first column matches a
 * registered text line is turned into a {@link NamedEntity}. All markings are
 * attached to one single {@link Document} that contains the complete content
 * of the text file.
 *
 * Both files are read as UTF-8.
 *
 * The class is marked as deprecated; this file does not name a replacement.
 */
@Deprecated
public class ERDDataset extends AbstractDataset implements InitializableDataset {

    /** Prefix that is put in front of the third annotation column to form the entity URI. */
    private static final String FREEBASE_URI = "https://www.googleapis.com/freebase";

    /** Charset used for decoding the text file as well as the annotation file. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /** Path of the text file. */
    private final String file_text;

    /** Path of the annotation file. */
    private final String file_annotation;

    /** The loaded documents; {@code null} until {@link #init()} has been called. */
    private List<Document> documents;

    /**
     * Creates the dataset. No file is touched before {@link #init()} is called.
     *
     * @param filetext
     *            path of the text file
     * @param fileannotation
     *            path of the annotation file
     */
    public ERDDataset(String filetext, String fileannotation) {
        this.file_text = filetext;
        this.file_annotation = fileannotation;
    }

    /**
     * Returns the number of loaded documents. {@link #init()} has to be called
     * before this method can be used.
     *
     * @return the number of loaded documents
     */
    @Override
    public int size() {
        return documents.size();
    }

    /**
     * Returns the loaded documents.
     *
     * @return the internal list of documents or {@code null} if
     *         {@link #init()} has not been called
     */
    @Override
    public List<Document> getInstances() {
        return documents;
    }

    /**
     * Loads the documents from the two files given to the constructor.
     *
     * @throws GerbilException
     *             if one of the files does not exist or cannot be read
     */
    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(file_text), new File(file_annotation));
    }

    /**
     * Builds the document URI {@code http://<dataset name>/<file name>}, where
     * the file name is the last element of the given path.
     *
     * @param fileName
     *            path of the file the document is loaded from
     * @return the generated document URI
     */
    private String generateDocumentUri(String fileName) {
        StringBuilder builder = new StringBuilder();
        builder.append("http://");
        builder.append(name);
        builder.append('/');
        builder.append(Paths.get(fileName).getFileName().toString());
        return builder.toString();
    }

    /**
     * Reads the text file and the annotation file and combines them into a
     * list containing exactly one document. The text of this document is the
     * complete content of the text file and its markings are the named
     * entities derived from the annotation file.
     *
     * @param textfile
     *            the file containing the texts
     * @param annotationfile
     *            the file containing the annotations
     * @return a list containing the single created document
     * @throws GerbilException
     *             if one of the files does not exist, the text file is too
     *             large to be kept in memory or an IO error occurs
     */
    protected List<Document> loadDocuments(File textfile, File annotationfile) throws GerbilException {
        if (!textfile.exists()) {
            throw new GerbilException("The given text file (" + textfile.getAbsolutePath() + ") does not exist.",
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
        if (!annotationfile.exists()) {
            throw new GerbilException(
                    "The given annotation file (" + annotationfile.getAbsolutePath() + ") does not exist.",
                    ErrorTypes.DATASET_LOADING_ERROR);
        }

        String documentUri = generateDocumentUri(textfile.getAbsolutePath());
        String textData = readTextFile(textfile);
        Map<String, ERDTrec> textMap = indexTextLines(textData);
        List<Marking> markings = readMarkings(annotationfile, textMap);

        List<Document> docs = new ArrayList<>();
        docs.add(new DocumentImpl(textData, documentUri, markings));
        return docs;
    }

    /**
     * Reads the complete text file into a String using UTF-8.
     *
     * @param textfile
     *            the file that should be read
     * @return the content of the file
     * @throws GerbilException
     *             if the file is larger than a single byte array can hold or
     *             an IO error occurs
     */
    private static String readTextFile(File textfile) throws GerbilException {
        long fileLength = textfile.length();
        // An unchecked cast to int would silently produce a negative or truncated buffer size.
        if (fileLength > Integer.MAX_VALUE) {
            throw new GerbilException(
                    "The given text file (" + textfile.getAbsolutePath() + ") is too large to be loaded.",
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
        byte[] filedata = new byte[(int) fileLength];
        // try-with-resources makes sure that the file handle is released even if reading fails
        try (RandomAccessFile raf = new RandomAccessFile(textfile, "r")) {
            raf.readFully(filedata);
        } catch (IOException e) {
            throw new GerbilException("Exception while reading text file of dataset.", e,
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
        // Decode with the same charset that is used for the annotation file instead of the
        // platform default.
        return new String(filedata, UTF8);
    }

    /**
     * Splits the given text at {@code '\n'} and creates one {@link ERDTrec}
     * per line. Each record is created with the record of the preceding line
     * (or {@code null} for the first line).
     *
     * A line with exactly two tab separated columns is registered under its
     * first column. Every other line is registered under the synthetic key
     * {@code "ERROR <n>"}, so it stays part of the record chain
     * (NOTE(review): presumably {@link ERDTrec} needs the complete chain to
     * calculate positions - confirm against ERDTrec).
     *
     * @param textData
     *            the complete content of the text file
     * @return mapping from line key to the record of that line
     */
    private static Map<String, ERDTrec> indexTextLines(String textData) {
        Map<String, ERDTrec> textMap = new HashMap<>();
        ERDTrec previous = null;
        int error = 0;
        for (String line : textData.split("\n")) {
            String[] lineParts = line.split("\t");
            String key;
            if (lineParts.length != 2) {
                error++;
                key = "ERROR " + error;
            } else {
                key = lineParts[0];
            }
            previous = new ERDTrec(line, previous);
            textMap.put(key, previous);
        }
        return textMap;
    }

    /**
     * Reads the annotation file line by line and creates a
     * {@link NamedEntity} for every line that has exactly five tab separated
     * columns and whose first column is a key of the given map. The position
     * is requested from the matching {@link ERDTrec} for the content of the
     * fourth column, the length is the length of the fourth column and the
     * URI is {@link #FREEBASE_URI} followed by the third column. All other
     * lines are skipped.
     *
     * @param annotationfile
     *            the file containing the annotations
     * @param textMap
     *            mapping from line key to the record of that text line
     * @return the markings created from the annotation file
     * @throws GerbilException
     *             if an IO error occurs while reading the file
     */
    private static List<Marking> readMarkings(File annotationfile, Map<String, ERDTrec> textMap)
            throws GerbilException {
        List<Marking> markings = new ArrayList<>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(annotationfile), UTF8));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] columns = line.split("\t");
                if (columns.length != 5) {
                    continue;
                }
                ERDTrec record = textMap.get(columns[0]);
                if (record != null) {
                    int position = record.getTextPosition(columns[3]);
                    int length = columns[3].length();
                    markings.add(new NamedEntity(position, length, FREEBASE_URI + columns[2]));
                }
            }
        } catch (IOException e) {
            throw new GerbilException("Exception while reading annotation file of dataset.", e,
                    ErrorTypes.DATASET_LOADING_ERROR);
        } finally {
            IOUtils.closeQuietly(reader);
        }
        return markings;
    }
}