/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.dataset.impl.aida;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.aksw.gerbil.utils.WikipediaHelper;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import au.com.bytecode.opencsv.CSVReader;

/**
 * AIDA/CoNLL Dataset.
* * @author Michael Röder (roeder@informatik.uni-leipzig.de) * */ public class AIDACoNLLDataset extends AbstractDataset implements InitializableDataset { private static final Logger LOGGER = LoggerFactory.getLogger(AIDACoNLLDataset.class); private static final char SEPARATION_CHAR = '\t'; private static final char QUOTATION_CHAR = '\0'; private static final int TEXT_INDEX = 0; private static final int NE_TYPE_INDEX = 1; private static final int ANNOTATION_SURFACE_FORM_INDEX = 2; private static final int ANNOTATION_TITLE_INDEX = 3; private static final int ANNOTATION_URI_INDEX = 4; private static final String DOCUMENT_START_TAG = "-DOCSTART-"; private static final String ANNOTATION_FIRST_WORD_TAG = "B"; // private static final String ANNOTATION_NEXT_WORD_TAG = "I"; private static final String ANNOTATION_NOT_IN_WIKI_TAG = "--NME--"; private static final String WIKIPEDIA_URI_START = "http://en.wikipedia.org/wiki/"; private String file; private List<Document> documents; private int firstDocId; private int lastDocId; public AIDACoNLLDataset(String file) { this(file, -1, -1); } public AIDACoNLLDataset(String file, String firstDocId, String lastDocId) { this(file, Integer.parseInt(firstDocId), Integer.parseInt(lastDocId)); } public AIDACoNLLDataset(String file, int firstDocId, int lastDocId) { this.file = file; this.firstDocId = firstDocId; this.lastDocId = lastDocId; } @Override public int size() { return documents.size(); } @Override public List<Document> getInstances() { return documents; } @Override public void init() throws GerbilException { this.documents = loadDocuments(new File(file)); if ((firstDocId > 0) && (lastDocId > 0)) { this.documents = this.documents.subList(firstDocId - 1, lastDocId); } } protected List<Document> loadDocuments(File file) throws GerbilException { String documentUriPrefix = "http://" + getName() + "/"; BufferedReader bReader = null; CSVReader reader = null; List<Document> documents = new ArrayList<Document>(); try { bReader = new 
BufferedReader(new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8"))); reader = new CSVReader(bReader, SEPARATION_CHAR, QUOTATION_CHAR); String line[]; Document currentDoc = null; StringBuilder textBuilder = new StringBuilder(); List<Marking> markings = null; NamedEntity lastNE = null; Set<String> uris; line = reader.readNext(); boolean quoteCharSeenBefore = false; boolean whiteSpaceInFront, whiteSpaceBehind = true; while (line != null) { if (line.length > TEXT_INDEX) { // If a new document starts if (line[TEXT_INDEX].startsWith(DOCUMENT_START_TAG)) { if (currentDoc != null) { currentDoc.setText(textBuilder.toString().trim()); textBuilder.setLength(0); quoteCharSeenBefore = false; } markings = new ArrayList<Marking>(); currentDoc = new DocumentImpl(null, documentUriPrefix + documents.size(), markings); documents.add(currentDoc); } else { if (!line[TEXT_INDEX].isEmpty()) { // if we should insert a whitespace whiteSpaceInFront = whiteSpaceBehind; whiteSpaceBehind = true; if ((textBuilder.length() > 0) && (line[TEXT_INDEX].length() >= 1)) { if (line[TEXT_INDEX].length() == 1) { switch (line[TEXT_INDEX].charAt(0)) { case '?': // falls through case '!': case ',': case ')': case ']': case '}': case '.': { whiteSpaceInFront = false; break; } case '"': { // If we have seen another quote char // before if (!quoteCharSeenBefore) { whiteSpaceBehind = false; } else { whiteSpaceInFront = false; } quoteCharSeenBefore = !quoteCharSeenBefore; break; } case '(': // falls through case '[': case '{': { whiteSpaceBehind = false; break; } default: { break; } } } else if (!Character.isLetterOrDigit(line[TEXT_INDEX].charAt(0))) { whiteSpaceInFront = false; } if (whiteSpaceInFront) { textBuilder.append(' '); } } // If there is a named entity if ((line.length > NE_TYPE_INDEX) && !line[NE_TYPE_INDEX].isEmpty()) { // If this is a new named entity if (line[NE_TYPE_INDEX].equals(ANNOTATION_FIRST_WORD_TAG)) { if 
(line[ANNOTATION_TITLE_INDEX].equals(ANNOTATION_NOT_IN_WIKI_TAG)) { uris = generateArtificialUri(documentUriPrefix, line[ANNOTATION_SURFACE_FORM_INDEX]); } else { // Add the DBpdia URI if this is a wiki // URI if (line[ANNOTATION_URI_INDEX].startsWith(WIKIPEDIA_URI_START)) { uris = WikipediaHelper.generateUriSet( line[ANNOTATION_URI_INDEX].substring(WIKIPEDIA_URI_START.length())); } else { LOGGER.warn( "Found a URI that is not part of the English Wikipedia \"{}\". This was not expected.", line[ANNOTATION_URI_INDEX]); uris = new HashSet<String>(); } uris.add(line[ANNOTATION_URI_INDEX]); } lastNE = new NamedEntity(textBuilder.length(), 0, uris); markings.add(lastNE); } } else { lastNE = null; } textBuilder.append(line[TEXT_INDEX]); if (lastNE != null) { lastNE.setLength(textBuilder.length() - lastNE.getStartPosition()); } } } } line = reader.readNext(); } // set the text of the last document if (currentDoc != null) { currentDoc.setText(textBuilder.toString().trim()); textBuilder.setLength(0); } } catch (IOException e) { throw new GerbilException("Couldn't read dataset file.", e, ErrorTypes.DATASET_LOADING_ERROR); } finally { IOUtils.closeQuietly(reader); IOUtils.closeQuietly(bReader); } return documents; } protected Set<String> generateArtificialUri(String uriPrefix, String surfaceForm) throws GerbilException { StringBuilder builder = new StringBuilder(); builder.append(uriPrefix); builder.append("notInWiki/"); try { builder.append(URLEncoder.encode(surfaceForm, "UTF-8")); } catch (UnsupportedEncodingException e) { LOGGER.error("Couldn't encode surface form data.", e); throw new GerbilException("Couldn't encode surface form data.", e, ErrorTypes.DATASET_LOADING_ERROR); } Set<String> uris = new HashSet<String>(2); uris.add(builder.toString()); return uris; } }