/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.gerdaq;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Dataset that is loaded from one XML file or from all files of a directory.
 * <p>
 * Every {@value #DOCUMENT_TAG} element of a parsed file becomes one
 * {@link Document}. Every {@value #ANNOTATION_TAG} element that carries a
 * {@code rank_0_title} attribute becomes a {@link NamedEntity} that covers the
 * character data enclosed by the element and that is linked to a DBpedia and
 * an English Wikipedia URI derived from the title.
 * <p>
 * {@link #init()} has to be called before {@link #size()} or
 * {@link #getInstances()} return something useful, because the documents are
 * only loaded there.
 */
public class GERDAQDataset extends AbstractDataset implements InitializableDataset {

    private static final Logger LOGGER = LoggerFactory.getLogger(GERDAQDataset.class);

    /** Prefix of the Wikipedia URI that is generated for every annotation title. */
    private static final String WIKIPEDIA_URI = "http://en.wikipedia.org/wiki/";
    /** Prefix of the DBpedia URI that is generated for every annotation title. */
    private static final String DBPEDIA_URI = "http://dbpedia.org/resource/";
    /** Name of the XML element that marks an entity mention. */
    private static final String ANNOTATION_TAG = "annotation";
    /** Name of the XML element that encloses a single document. */
    private static final String DOCUMENT_TAG = "instance";
    /** Name of the attribute of an annotation element that contains the entity title. */
    private static final String TITLE_ATTRIBUTE = "rank_0_title";

    /** Path of the file or directory the dataset is loaded from. */
    private String file;
    /** The loaded documents; {@code null} until {@link #init()} has been called. */
    private List<Document> documents;

    /**
     * Creates the dataset without loading it.
     *
     * @param file
     *            path of a single dataset file or of a directory containing
     *            dataset files
     */
    public GERDAQDataset(String file) {
        this.file = file;
    }

    /**
     * @return the number of loaded documents
     */
    @Override
    public int size() {
        return documents.size();
    }

    /**
     * @return the list of loaded documents ({@code null} before
     *         {@link #init()} has been called)
     */
    @Override
    public List<Document> getInstances() {
        return documents;
    }

    /**
     * Loads the documents from the path handed over to the constructor.
     *
     * @throws GerbilException
     *             if the path does not exist or a file can not be read or
     *             parsed
     */
    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(file));
    }

    /**
     * Builds the common prefix of the URIs of all documents of a single file:
     * {@code http://<datasetName>/<fileName>_}, where blanks inside the dataset
     * name are replaced by underscores.
     *
     * @param datasetName
     *            name of the dataset
     * @param fileName
     *            name of the file the documents are read from
     * @return the URI prefix to which the index of a document is appended
     */
    protected static String generateDocumentUri(String datasetName, String fileName) {
        StringBuilder builder = new StringBuilder();
        builder.append("http://");
        builder.append(datasetName.replace(' ', '_'));
        builder.append('/');
        builder.append(fileName);
        builder.append('_');
        return builder.toString();
    }

    /**
     * Loads the documents of the given file or, if the given path is a
     * directory, of all files directly inside of it.
     *
     * @param filePath
     *            the file or directory that should be read
     * @return the documents of all parsed files
     * @throws GerbilException
     *             if the path does not exist, the directory can not be listed
     *             or one of the files can not be read or parsed
     */
    private List<Document> loadDocuments(File filePath) throws GerbilException {
        List<Document> docs = new ArrayList<>();
        if (!filePath.exists()) {
            throw new GerbilException("The given file (" + filePath.getAbsolutePath() + ") is not existing.",
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
        if (filePath.isDirectory()) {
            File[] children = filePath.getAbsoluteFile().listFiles();
            // listFiles() returns null if the directory could not be read
            if (children == null) {
                throw new GerbilException(
                        "Couldn't list the files of the given directory (" + filePath.getAbsolutePath() + ").",
                        ErrorTypes.DATASET_LOADING_ERROR);
            }
            for (File tmpFile : children) {
                docs.addAll(createDocument(tmpFile));
            }
        } else {
            docs.addAll(createDocument(filePath));
        }
        return docs;
    }

    /**
     * Parses a single XML file and creates one document per
     * {@value #DOCUMENT_TAG} element. The URI of a document is the prefix
     * generated by {@link #generateDocumentUri(String, String)} followed by the
     * index of the document inside the file.
     * <p>
     * Annotation elements without the {@code rank_0_title} attribute are
     * logged and skipped.
     *
     * @param file
     *            the file that should be parsed
     * @return the documents found inside the file
     * @throws GerbilException
     *             if the file can not be read or parsed
     */
    private List<Document> createDocument(File file) throws GerbilException {
        final List<Document> documents = new ArrayList<Document>();
        final String documentUriStart = generateDocumentUri(name, file.getName());
        InputStream inputStream = null;
        try {
            inputStream = new BufferedInputStream(new FileInputStream(file));
            InputSource is = new InputSource(inputStream);
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser saxParser = factory.newSAXParser();
            saxParser.parse(is, new DefaultHandler() {

                // Text collected for the current document. It is only cleared
                // at the end of a document element, so character data reported
                // between two document elements ends up at the beginning of
                // the following document.
                private StringBuilder text = new StringBuilder();
                // Position inside the text at which the current annotation starts
                private int markingStart;
                // Title of the current annotation or null if it has no title
                private String markingTitle;
                // Markings of the current document
                private List<Marking> markings;

                @Override
                public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
                        throws SAXException {
                    if (qName.equals(ANNOTATION_TAG)) {
                        markingTitle = atts.getValue(TITLE_ATTRIBUTE);
                        if (markingTitle != null) {
                            markingStart = text.length();
                            // The title is normalised only if it exists.
                            // Otherwise it stays null and the annotation is
                            // skipped at the end of the element.
                            markingTitle = markingTitle.replace(' ', '_');
                        } else {
                            LOGGER.error("Found a marking without the necessary \"rank_0_title\" attribute.");
                        }
                    } else if (qName.equals(DOCUMENT_TAG)) {
                        this.markings = new ArrayList<>();
                    }
                }

                @Override
                public void characters(char[] ch, int start, int length) {
                    text.append(ch, start, length);
                }

                @Override
                public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
                    if (qName.equals(DOCUMENT_TAG)) {
                        documents.add(new DocumentImpl(text.toString(), documentUriStart + documents.size(), markings));
                        text.delete(0, text.length());
                    } else if (qName.equals(ANNOTATION_TAG) && (markingTitle != null)) {
                        // The marking covers everything that has been appended
                        // since the start of the annotation element
                        markings.add(new NamedEntity(markingStart, text.length() - markingStart, new HashSet<String>(
                                Arrays.asList(DBPEDIA_URI + markingTitle, WIKIPEDIA_URI + markingTitle))));
                    }
                }
            });
        } catch (Exception e) {
            throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
        return documents;
    }
}