/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.iitb;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.aksw.gerbil.utils.WikipediaHelper;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Dataset implementation that builds {@link Document} instances from a
 * directory of plain text files and one annotation file that is parsed by
 * {@link IITB_XMLParser}.
 *
 * <p>
 * The constructor only stores the two paths. All file access happens in
 * {@link #init()}, which has to be called before the documents are available.
 * </p>
 */
public class IITBDataset extends AbstractDataset implements InitializableDataset {

    private static final Logger LOGGER = LoggerFactory.getLogger(IITBDataset.class);

    /** The loaded documents; {@code null} until {@link #init()} has completed. */
    protected List<Document> documents;
    /** Path of the directory containing the text files. */
    protected String textsDirectory;
    /**
     * Path of the annotations. Despite its name, {@link #loadDocuments(File, File)}
     * treats it as a single file that is handed to the annotation parser.
     */
    protected String annotationsDirectory;
    /** Counter used to create distinct URIs for entities without a generated URI. */
    protected int unknownEntitiesCount = 0;

    /**
     * Creates the dataset without touching the file system.
     *
     * @param textsDirectory
     *            path of the directory containing the text files
     * @param annotationsDirectory
     *            path of the annotation file
     * @throws GerbilException
     *             never thrown by this constructor; the clause is kept so that
     *             existing callers and subclasses keep compiling
     */
    public IITBDataset(String textsDirectory, String annotationsDirectory) throws GerbilException {
        this.textsDirectory = textsDirectory;
        this.annotationsDirectory = annotationsDirectory;
    }

    /**
     * Returns the number of loaded documents.
     *
     * @return the number of documents, or 0 if {@link #init()} has not been
     *         called yet
     */
    @Override
    public int size() {
        // The list is created by init(); avoid a NullPointerException before that.
        return (documents == null) ? 0 : documents.size();
    }

    /**
     * Returns the loaded documents.
     *
     * @return the documents, or {@code null} if {@link #init()} has not been
     *         called yet
     */
    @Override
    public List<Document> getInstances() {
        return documents;
    }

    /**
     * Loads all documents from the configured text directory and annotation file.
     *
     * @throws GerbilException
     *             if the directory or the file is missing, or a file can not be
     *             read or parsed
     */
    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(textsDirectory), new File(annotationsDirectory));
    }

    /**
     * Parses the annotation file and creates one document for every text file
     * that the annotations refer to.
     *
     * @param textDir
     *            the directory containing the text files
     * @param annoFile
     *            the annotation file
     * @return the list of created documents
     * @throws GerbilException
     *             if {@code textDir} is not an existing directory, if
     *             {@code annoFile} does not exist, if the annotations can not be
     *             parsed or if one of the text files can not be read
     */
    protected List<Document> loadDocuments(File textDir, File annoFile) throws GerbilException {
        if ((!textDir.exists()) || (!textDir.isDirectory())) {
            throw new GerbilException(
                    "The given text directory (" + textDir.getAbsolutePath() + ") is not existing or not a directory.",
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
        String textDirPath = textDir.getAbsolutePath();
        if (!textDirPath.endsWith(File.separator)) {
            textDirPath = textDirPath + File.separator;
        }
        if (!annoFile.exists()) {
            throw new GerbilException("The given annotation file (" + annoFile.getAbsolutePath() + ") does not exist.",
                    ErrorTypes.DATASET_LOADING_ERROR);
        }

        Map<String, Set<IITB_Annotation>> documentAnnotationsMap = loadAnnotations(annoFile);
        List<Document> loadedDocuments = new ArrayList<Document>(documentAnnotationsMap.size());
        for (Map.Entry<String, Set<IITB_Annotation>> entry : documentAnnotationsMap.entrySet()) {
            String textFile = entry.getKey();
            String text;
            // read the text file
            // NOTE(review): the file is decoded with the platform default charset.
            // The annotation offsets are character based, so the encoding of the
            // corpus should be confirmed before an explicit charset is set here.
            try {
                text = FileUtils.readFileToString(new File(textDirPath + textFile));
            } catch (IOException e) {
                throw new GerbilException("Couldn't read text file \"" + textDirPath + textFile + "\".", e,
                        ErrorTypes.DATASET_LOADING_ERROR);
            }
            // create document
            loadedDocuments.add(createDocument(textFile, text, entry.getValue()));
        }
        return loadedDocuments;
    }

    /**
     * Parses the given annotation file.
     *
     * @param annotationsFile
     *            the file that should be parsed
     * @return the annotations returned by the parser, keyed by the name of the
     *         text file they belong to
     * @throws GerbilException
     *             if the parser fails with any exception
     */
    protected Map<String, Set<IITB_Annotation>> loadAnnotations(File annotationsFile) throws GerbilException {
        IITB_XMLParser parser = new IITB_XMLParser();
        try {
            return parser.parseAnnotationsFile(annotationsFile);
        } catch (Exception e) {
            throw new GerbilException(
                    "Couldn't parse given annotation file (\"" + annotationsFile.getAbsolutePath() + "\").", e,
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
    }

    /**
     * Creates a document containing one {@link NamedEntity} for every annotation
     * that fits into the given text.
     *
     * <p>
     * Annotations whose span lies (partly) outside of the text are logged and
     * skipped instead of aborting the loading with a
     * {@link StringIndexOutOfBoundsException}. For all other annotations,
     * suspicious boundaries (adjacent letters, leading or trailing whitespace)
     * are logged as warnings but the annotation is kept unchanged.
     * </p>
     *
     * @param fileName
     *            the name of the text file, used to create the document URI
     * @param text
     *            the content of the text file
     * @param annotations
     *            the annotations of this text
     * @return the created document
     */
    protected Document createDocument(String fileName, String text, Set<IITB_Annotation> annotations) {
        String documentUri = generateDocumentUri(fileName);
        List<Marking> markings = new ArrayList<Marking>(annotations.size());
        for (IITB_Annotation annotation : annotations) {
            int start = annotation.offset;
            int length = annotation.length;
            // Written without "start + length" so that huge values can not overflow.
            if ((start < 0) || (length < 0) || (start > text.length()) || (length > (text.length() - start))) {
                LOGGER.warn("In document " + documentUri + ", the annotation with offset " + start + " and length "
                        + length + " does not fit into the text (" + text.length()
                        + " characters). It will be ignored.");
                continue;
            }
            logSuspiciousBoundaries(documentUri, text, start, start + length);

            Set<String> uris = WikipediaHelper.generateUriSet(annotation.wikiTitle);
            if (uris.size() == 0) {
                // no URI could be generated from the title, so create an artificial one
                uris.add(generateEntityUri());
            }
            markings.add(new NamedEntity(start, length, uris));
        }
        return new DocumentImpl(text, documentUri, markings);
    }

    /**
     * Logs warnings for entity spans that touch a letter or that start or end
     * with whitespace. The caller guarantees
     * {@code 0 <= start <= end <= text.length()}.
     */
    private void logSuspiciousBoundaries(String documentUri, String text, int start, int end) {
        String surfaceForm = text.substring(start, end);
        boolean isEmpty = (start == end);
        if ((start > 0) && Character.isAlphabetic(text.charAt(start - 1))) {
            LOGGER.warn("In document " + documentUri + ", the named entity \"" + surfaceForm
                    + "\" has an alphabetic character in front of it (\"" + text.charAt(start - 1) + "\").");
        }
        // An empty span has no first or last character that could be a whitespace.
        if ((!isEmpty) && Character.isWhitespace(text.charAt(start))) {
            LOGGER.warn("In document " + documentUri + ", the named entity \"" + surfaceForm
                    + "\" starts with a whitespace.");
        }
        if ((end < text.length()) && Character.isAlphabetic(text.charAt(end))) {
            LOGGER.warn("In document " + documentUri + ", the named entity \"" + surfaceForm
                    + "\" has an alphabetic character directly behind it (\"" + text.charAt(end) + "\").");
        }
        if ((!isEmpty) && Character.isWhitespace(text.charAt(end - 1))) {
            LOGGER.warn("In document " + documentUri + ", the named entity \"" + surfaceForm
                    + "\" ends with a whitespace.");
        }
    }

    /**
     * Creates a new artificial entity URI based on the dataset name and the
     * running {@link #unknownEntitiesCount}, which is increased afterwards.
     */
    private String generateEntityUri() {
        String uri = "http://" + name + "/notInWiki/entity_" + unknownEntitiesCount;
        ++unknownEntitiesCount;
        return uri;
    }

    /**
     * Creates the URI of a document from the dataset name and the file name.
     *
     * @param fileName
     *            the name of the text file
     * @return {@code "http://" + name + '/' + fileName}
     */
    protected String generateDocumentUri(String fileName) {
        return "http://" + name + '/' + fileName;
    }
}