/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.dataset.impl.micro;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
import org.apache.commons.io.IOUtils;

import au.com.bytecode.opencsv.CSVReader;

/**
 * Based upon Microposts2014Dataset.
 *
 * @author Felix Conrads (conrads@informatik.uni-leipzig.de)
 *
 *         Microposts2014Dataset:
 * @author Giuseppe Rizzo (giuse.rizzo@gmail.com)
 * @author Michael Röder (roeder@informatik.uni-leipzig.de)
 */
public class Microposts2013Dataset extends AbstractDataset implements InitializableDataset {

    private static final char SEPARATION_CHAR = '\t';
    private static final String SEPARATION_CHAR_ANNOTATION = ";";
    private static final int TWEET_ID_INDEX = 0;
    private static final int ANNOTATION_INDEX = 1;
    private static final int TWEET_TEXT_INDEX = 2;
    private static final String HASHTAG = "_HASHTAG_";

    protected List<Document> documents;
    private String tweetsFile;

    public Microposts2013Dataset(String tweetsFile) {
        this.tweetsFile = tweetsFile;
    }

    @Override
    public int size() {
        return documents.size();
    }

    @Override
    public List<Document> getInstances() {
        return documents;
    }

    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(tweetsFile));
    }

    protected List<Document> loadDocuments(File tweetsFile) throws GerbilException {
        BufferedReader bReader = null;
        CSVReader reader = null;
        List<Document> documents = new ArrayList<Document>();
        String documentUriPrefix = "http://" + getName() + "/";
        try {
            bReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(tweetsFile), Charset.forName("UTF-8")));
            reader = new CSVReader(bReader, SEPARATION_CHAR);

            String[] line = reader.readNext();
            String text;
            int start, end;
            List<Marking> markings = null;
            while (line != null) {
                if (line.length == 3) {
                    // Remove quotes surrounding the tweet text, if present.
                    start = line[TWEET_TEXT_INDEX].startsWith("\"") ? 1 : 0;
                    end = line[TWEET_TEXT_INDEX].endsWith("\"") ? (line[TWEET_TEXT_INDEX].length() - 1)
                            : line[TWEET_TEXT_INDEX].length();
                    text = line[TWEET_TEXT_INDEX].substring(start, end).trim();
                    markings = findMarkings(line[ANNOTATION_INDEX], text);
                    // Replace the hashtag placeholder so that the document text matches
                    // the text inside which the markings were searched.
                    text = text.replaceAll(HASHTAG + " ", "#");
                    text = text.replaceAll(HASHTAG, "#");
                    documents.add(new DocumentImpl(text, documentUriPrefix + line[TWEET_ID_INDEX], markings));
                } else {
                    throw new GerbilException(
                            "Dataset is malformed. Each line should have exactly 3 cells. Malformed line = "
                                    + Arrays.toString(line),
                            ErrorTypes.DATASET_LOADING_ERROR);
                }
                line = reader.readNext();
            }
        } catch (IOException e) {
            throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
        } finally {
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(bReader);
        }
        return documents;
    }

    protected static List<Marking> findMarkings(String line, String text) {
        String[] annotations = line.split(SEPARATION_CHAR_ANNOTATION);
        List<Marking> markings = new ArrayList<Marking>(annotations.length);
        if (line.isEmpty()) {
            return markings;
        }
        // Normalize the hashtag placeholder once, so that the positions determined
        // below match the final document text created in loadDocuments().
        text = text.replaceAll(HASHTAG + " ", "#");
        text = text.replaceAll(HASHTAG, "#");
        int start;
        int end = 0;
        for (int i = 0; i < annotations.length; i++) {
            int offset = 0;
            String annotation = null;
            // Annotations have the form TYPE/mention, e.g., PER/PersonName.
            annotations[i] = annotations[i].replace("MISC:", "MISC/");
            // Each marking gets its own type set.
            Set<String> types = new HashSet<String>();
            if (annotations[i].contains("/")) {
                String[] parts = annotations[i].split("/");
                switch (parts[0]) {
                case "PER":
                    types.add("http://dbpedia.org/ontology/Person");
                    break;
                case "ORG":
                    types.add("http://dbpedia.org/ontology/Organisation");
                    break;
                case "LOC":
                    types.add("http://dbpedia.org/ontology/Place");
                    break;
                case "MISC":
                    // TODO
                }
                if (parts.length > 1) {
                    annotation = parts[1];
                }
            } else {
                annotation = annotations[i];
            }
            // Skip annotations without a mention (e.g., "PER/").
            if (annotation == null) {
                continue;
            }
            start = text.indexOf(annotation, end);
            // Skip mentions that cannot be located in the tweet text.
            if (start < 0) {
                continue;
            }
            // If the mention is directly preceded by a hashtag, include the '#'.
            if (start > 0 && text.charAt(start - 1) == '#') {
                start -= 1;
                offset = 1;
            }
            end = start + annotation.length() + offset;
            markings.add(new TypedNamedEntity(start, end - start, "", types));
        }
        return markings;
    }
}
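/*
 * Minimal usage sketch (illustrative only, not part of the original class).
 * The file name and dataset name below are assumptions; setName(String) is
 * inherited from AbstractDataset and determines the document URI prefix used
 * in loadDocuments().
 *
 *   Microposts2013Dataset dataset = new Microposts2013Dataset("micro2013-train.tsv");
 *   dataset.setName("Microposts2013-Train");
 *   dataset.init(); // parses the TSV file; throws GerbilException on malformed lines
 *   for (Document d : dataset.getInstances()) {
 *       System.out.println(d.getDocumentURI() + ": " + d.getText());
 *   }
 */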