/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.dataset.impl.micro;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.carrotsearch.hppc.IntArrayList;

import au.com.bytecode.opencsv.CSVReader;

/**
 * Dataset adapter for the Microposts 2014 challenge corpus. The corpus is a
 * tab-separated file in which every line holds a tweet id, the tweet text and
 * a sequence of (mention, entity URI) pairs — so each line must contain an
 * even number of cells.
 *
 * @author Giuseppe Rizzo (giuse.rizzo@gmail.com)
 * @author Michael Röder (roeder@informatik.uni-leipzig.de)
 */
public class Microposts2014Dataset extends AbstractDataset implements InitializableDataset {

    private static final Logger LOGGER = LoggerFactory.getLogger(Microposts2014Dataset.class);

    /** Cell separator of the corpus file (TSV). */
    private static final char SEPARATION_CHAR = '\t';
    private static final int TWEET_ID_INDEX = 0;
    private static final int TWEET_TEXT_INDEX = 1;
    /** Index of the first (mention, URI) pair; pairs follow in steps of 2. */
    private static final int FIRST_ANNOTATION_INDEX = 2;

    /** Documents of this dataset; filled by {@link #init()}. */
    protected List<Document> documents;
    private String tweetsFile;

    public Microposts2014Dataset(String tweetsFile) {
        this.tweetsFile = tweetsFile;
    }

    @Override
    public int size() {
        return documents.size();
    }

    @Override
    public List<Document> getInstances() {
        return documents;
    }

    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(tweetsFile));
    }

    /**
     * Reads the given TSV corpus file and creates one {@link Document} per
     * line, using "http://&lt;dataset name&gt;/&lt;tweet id&gt;" as document
     * URI.
     *
     * @param tweetsFile the corpus file (UTF-8, tab separated)
     * @return the parsed documents
     * @throws GerbilException if the file can not be read or a line has an odd
     *             number of cells
     */
    protected List<Document> loadDocuments(File tweetsFile) throws GerbilException {
        BufferedReader bReader = null;
        CSVReader reader = null;
        List<Document> documents = new ArrayList<Document>();
        String documentUriPrefix = "http://" + getName() + "/";
        try {
            bReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(tweetsFile), StandardCharsets.UTF_8));
            reader = new CSVReader(bReader, SEPARATION_CHAR);
            String line[] = reader.readNext();
            String text;
            int start, end;
            List<Marking> markings;
            while (line != null) {
                // id + text + (mention, URI) pairs --> even number of cells
                if ((line.length & 1) == 0) {
                    // strip a single pair of surrounding quote characters, if present
                    start = line[TWEET_TEXT_INDEX].startsWith("\"") ? 1 : 0;
                    end = line[TWEET_TEXT_INDEX].endsWith("\"") ? (line[TWEET_TEXT_INDEX].length() - 1)
                            : line[TWEET_TEXT_INDEX].length();
                    text = line[TWEET_TEXT_INDEX].substring(start, end).trim();
                    markings = findMarkings(line, text);
                    documents.add(new DocumentImpl(text, documentUriPrefix + line[TWEET_ID_INDEX], markings));
                } else {
                    throw new GerbilException(
                            "Dataset is malformed. Each line should have an even number of cells. Malformed line = "
                                    + Arrays.toString(line),
                            ErrorTypes.DATASET_LOADING_ERROR);
                }
                line = reader.readNext();
            }
        } catch (IOException e) {
            throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
        } finally {
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(bReader);
        }
        return documents;
    }

    /**
     * Searches the given tweet text for the mentions contained in the given
     * line (cells {@link #FIRST_ANNOTATION_INDEX}, +2, ...) and creates a
     * {@link NamedEntity} for every mention that could be located. Mentions
     * are searched left to right, each search starting at the end of the
     * previous match. If a mention can not be found directly, the search is
     * retried on a copy of the text with all {@code '#'} characters removed
     * (the corpus sometimes annotates hash tag contents without the hash) and
     * the found offsets are mapped back to the original text. Mentions that
     * still can not be found are logged and skipped.
     *
     * @param line the corpus line (mention/URI pairs start at index 2)
     * @param text the cleaned tweet text
     * @return the markings found inside the text
     */
    protected static List<Marking> findMarkings(String line[], String text) {
        List<Marking> markings = new ArrayList<Marking>(line.length / 2);
        String textWithoutHashes = null;
        int start, pos;
        IntArrayList hashes = new IntArrayList();
        int end = 0;
        for (int i = FIRST_ANNOTATION_INDEX; i < line.length; i = i + 2) {
            start = text.indexOf(line[i], end);
            // The mentioned entity couldn't be found. Let's search
            // in a text that contains no hashes.
            if (start < 0) {
                if (textWithoutHashes == null) {
                    /*
                     * A very simple workaround to search for a mention without
                     * hashes. Note that this only works, if the mention
                     * couldn't be found because the tweets contains hash tags
                     * that should be part of the mentions.
                     */
                    pos = text.indexOf('#');
                    while (pos >= 0) {
                        hashes.add(pos);
                        pos = text.indexOf('#', pos + 1);
                    }
                    textWithoutHashes = text.replaceAll("#", "");
                }
                // The offset might have been moved through the removing of the
                // hashes. Shift the search start left by one for every hash
                // that occurred before it.
                // FIX: the loop bound used the annotation index i instead of
                // the loop counter j, so the correction never ran correctly.
                for (int j = 0; (j < hashes.elementsCount) && (hashes.buffer[j] < end); ++j) {
                    --end;
                }
                // search again
                start = textWithoutHashes.indexOf(line[i], end);
                if (start >= 0) {
                    // find the start and end positions of the mention inside
                    // the original tweet by looking at the list of hashes
                    end = start + line[i].length();
                    for (int j = 0; (j < hashes.elementsCount) && (hashes.buffer[j] < end); ++j) {
                        ++end;
                        if (hashes.buffer[j] < start) {
                            ++start;
                        }
                    }
                }
            } else {
                end = start + line[i].length();
            }
            if (start < 0) {
                LOGGER.warn("Couldn't find \"{}\" inside \"{}\". This annotation will be ignored.", line[i], text);
            } else {
                markings.add(new NamedEntity(start, end - start, line[i + 1]));
            }
        }
        return markings;
    }
}