/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.dataset.impl.msnbc;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.Span;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Dataset adapter for the MSNBC corpus. The corpus is stored as two
 * directories: one containing plain text files and one containing XML
 * annotation files (parsed by {@link MSNBC_XMLParser}); each annotation file
 * names the text file it belongs to.
 * <p>
 * The class also implements {@link Comparator} over {@link Span}s (ordering by
 * span length, ascending) so it can be passed directly to
 * {@link Arrays#sort(Object[], Comparator)} when merging nested entities.
 */
public class MSNBCDataset extends AbstractDataset implements InitializableDataset, Comparator<Span> {

    private static final Logger LOGGER = LoggerFactory.getLogger(MSNBCDataset.class);

    /** Documents of this dataset; populated by {@link #init()}. */
    protected List<Document> documents;
    /** Directory containing the plain text files. */
    protected String textsDirectory;
    /** Directory containing the XML annotation files. */
    protected String annotationsDirectory;

    /**
     * Constructor. Only stores the directory paths; the actual loading happens
     * in {@link #init()}.
     *
     * @param textsDirectory directory containing the text files
     * @param annotationsDirectory directory containing the annotation files
     * @throws GerbilException declared for interface compatibility; not thrown here
     */
    public MSNBCDataset(String textsDirectory, String annotationsDirectory) throws GerbilException {
        this.textsDirectory = textsDirectory;
        this.annotationsDirectory = annotationsDirectory;
    }

    @Override
    public int size() {
        // NOTE(review): NPE if called before init(); callers are expected to
        // initialize the dataset first.
        return documents.size();
    }

    @Override
    public List<Document> getInstances() {
        return documents;
    }

    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(textsDirectory), new File(annotationsDirectory));
    }

    /**
     * Loads all documents by iterating over the annotation files and reading
     * the text file each one references.
     *
     * @param textDir directory containing the text files
     * @param annoDir directory containing the annotation files
     * @return the loaded documents
     * @throws GerbilException if a directory is missing, a file can't be
     *             parsed/read, or an annotation file doesn't name a text file
     */
    protected List<Document> loadDocuments(File textDir, File annoDir) throws GerbilException {
        if ((!textDir.exists()) || (!textDir.isDirectory())) {
            throw new GerbilException(
                    "The given text directory (" + textDir.getAbsolutePath()
                            + ") is not existing or not a directory.",
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
        String textDirPath = textDir.getAbsolutePath();
        if (!textDirPath.endsWith(File.separator)) {
            textDirPath = textDirPath + File.separator;
        }
        if ((!annoDir.exists()) || (!annoDir.isDirectory())) {
            throw new GerbilException("The given annotation directory (" + annoDir.getAbsolutePath()
                    + ") is not existing or not a directory.", ErrorTypes.DATASET_LOADING_ERROR);
        }
        // listFiles() can return null on an I/O error even for an existing
        // directory; fail with a clear message instead of an NPE
        File annoFiles[] = annoDir.listFiles();
        if (annoFiles == null) {
            throw new GerbilException("Couldn't list the files of the annotation directory ("
                    + annoDir.getAbsolutePath() + ").", ErrorTypes.DATASET_LOADING_ERROR);
        }
        MSNBC_XMLParser parser = new MSNBC_XMLParser();
        MSNBC_Result parsedResult;
        String text;
        List<Document> documents = new ArrayList<Document>();
        for (File annoFile : annoFiles) {
            // parse the annotation file
            try {
                parsedResult = parser.parseAnnotationsFile(annoFile);
            } catch (Exception e) {
                throw new GerbilException(
                        "Couldn't parse given annotation file (\"" + annoFile.getAbsolutePath() + "\").", e,
                        ErrorTypes.DATASET_LOADING_ERROR);
            }
            if (parsedResult.getTextFileName() == null) {
                throw new GerbilException("The parsed annotation file (\"" + annoFile.getAbsolutePath()
                        + "\") did not define a text file name.", ErrorTypes.DATASET_LOADING_ERROR);
            }
            // read the text file
            try {
                // NOTE(review): charset-less readFileToString uses the platform
                // default charset (deprecated in newer commons-io); kept to
                // preserve existing behavior — confirm the corpus encoding
                // before switching to an explicit charset.
                text = FileUtils.readFileToString(new File(textDirPath + parsedResult.getTextFileName()));
            } catch (IOException e) {
                throw new GerbilException(
                        "Couldn't read text file \"" + textDirPath + parsedResult.getTextFileName()
                                + "\" mentioned in the annotations file \"" + annoFile.getAbsolutePath() + "\".",
                        e, ErrorTypes.DATASET_LOADING_ERROR);
            }
            // create document
            documents.add(createDocument(parsedResult.getTextFileName(), text, parsedResult));
        }
        return documents;
    }

    /**
     * Creates a {@link Document} from the given text and parsed annotations,
     * warning about surface forms that don't match the text, adding DBpedia
     * URIs and merging nested named entities.
     *
     * @param fileName name of the text file (used to build the document URI)
     * @param text the document text
     * @param parsedResult the parsed annotations
     * @return the created document
     */
    protected Document createDocument(String fileName, String text, MSNBC_Result parsedResult) {
        String documentUri = generateDocumentUri(fileName);
        List<Marking> markings = new ArrayList<Marking>(parsedResult.getMarkings().size());
        String retrievedSurfaceForm;
        for (MSNBC_NamedEntity ne : parsedResult.getMarkings()) {
            retrievedSurfaceForm = text.substring(ne.getStartPosition(), ne.getStartPosition() + ne.getLength());
            if (!retrievedSurfaceForm.equals(ne.getSurfaceForm())) {
                LOGGER.warn("In document {}, the expected surface form of the named entity {}"
                        + " does not fit the surface form derived from the text \"{}\".", documentUri, ne,
                        retrievedSurfaceForm);
            }
            addDBpediaUris(ne.getUris());
            markings.add(ne.toNamedEntity());
        }
        Document document = new DocumentImpl(text, documentUri, markings);
        mergeSubNamedEntity(document);
        return document;
    }

    /**
     * Merge {@link NamedEntity}s that are sub spans of another named entity and
     * that have the same URIs. The contained entity's URIs are added to the
     * containing entity and the contained entity is removed from the document.
     *
     * @param document the document whose named entities are merged in place
     */
    private void mergeSubNamedEntity(Document document) {
        List<NamedEntity> spanList = document.getMarkings(NamedEntity.class);
        NamedEntity nes[] = spanList.toArray(new NamedEntity[spanList.size()]);
        // sort ascending by length so a contained ("sub") span always comes
        // before any span that could contain it
        Arrays.sort(nes, this);
        Set<Marking> markingsToRemove = new HashSet<Marking>();
        boolean uriOverlapping;
        Iterator<String> uriIterator;
        for (int i = 0; i < nes.length; ++i) {
            uriOverlapping = false;
            for (int j = i + 1; (j < nes.length) && (!uriOverlapping); ++j) {
                // if nes[i] is a "sub span" of nes[j]
                if ((nes[i].getStartPosition() >= nes[j].getStartPosition())
                        && ((nes[i].getStartPosition() + nes[i].getLength()) <= (nes[j].getStartPosition()
                                + nes[j].getLength()))) {
                    // check whether the two entities share at least one URI
                    uriIterator = nes[i].getUris().iterator();
                    while ((!uriOverlapping) && (uriIterator.hasNext())) {
                        uriOverlapping = nes[j].containsUri(uriIterator.next());
                    }
                    if (uriOverlapping) {
                        // FIX: merge the sub span's URIs into the containing
                        // entity (the original added nes[j]'s URIs to itself,
                        // a no-op, silently dropping nes[i]'s URIs)
                        nes[j].getUris().addAll(nes[i].getUris());
                        markingsToRemove.add(nes[i]);
                    } else {
                        LOGGER.debug("There are two overlapping named entities with different URI sets. {}, {}",
                                nes[i], nes[j]);
                    }
                }
            }
        }
        document.getMarkings().removeAll(markingsToRemove);
    }

    /**
     * Builds the document URI as {@code http://<datasetName>/<fileName>}.
     *
     * @param fileName name of the text file
     * @return the generated document URI
     */
    protected String generateDocumentUri(String fileName) {
        StringBuilder builder = new StringBuilder();
        builder.append("http://");
        builder.append(name);
        builder.append('/');
        builder.append(fileName);
        return builder.toString();
    }

    /**
     * Adds DBpedia URIs by transforming Wikipedia URIs. The given set is
     * extended in place; the original Wikipedia URIs are kept.
     *
     * @param uris set of URIs to which the derived DBpedia URIs are added
     */
    protected static void addDBpediaUris(Set<String> uris) {
        List<String> dbpediaUris = new ArrayList<String>(uris.size());
        for (String uri : uris) {
            if (uri.contains("en.wikipedia.org/wiki")) {
                // English Wikipedia maps to the main DBpedia namespace
                dbpediaUris.add(uri.replace("en.wikipedia.org/wiki", "dbpedia.org/resource"));
            } else {
                // other language editions map to their localized DBpedia
                dbpediaUris.add(uri.replace("wikipedia.org/wiki", "dbpedia.org/resource"));
            }
        }
        uris.addAll(dbpediaUris);
    }

    /**
     * Compares two spans by their length (ascending). Used to sort entities so
     * that potentially contained spans precede their containers.
     */
    @Override
    public int compare(Span s1, Span s2) {
        // Integer.compare avoids the (theoretical) overflow of a subtraction
        return Integer.compare(s1.getLength(), s2.getLength());
    }
}