package org.aksw.gerbil.dataset.impl.micro;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Dataset adapter for the #Microposts2015 NEEL challenge corpus.
 *
 * <p>The corpus is split over two tab-separated files: a tweets file with
 * lines of the form {@code <tweetId>\t<text>} and an annotations file with
 * lines of the form {@code <tweetId>\t<start>\t<end>\t<uri>\t<type>}. The
 * annotations file is expected to be ordered by tweet id (see
 * {@link #getMarkingLines(File, String)}).
 */
public class Microposts2015Dataset extends AbstractDataset implements
        InitializableDataset {

    // FIX: the logger was previously created with Microposts2016Dataset.class,
    // attributing this class's log output to the wrong dataset class.
    private static final Logger LOGGER = LoggerFactory
            .getLogger(Microposts2015Dataset.class);

    protected List<Document> documents;
    private final String annotatedFile;
    private final String tweetsFile;
    // Column index of the entity type inside an annotation line; protected so
    // that subclasses with a different column layout can adjust it.
    protected static int typeIndex = 4;

    /**
     * Constructor.
     *
     * @param annotatedFile path to the tab-separated annotations file
     * @param tweetsFile path to the tab-separated tweets file
     */
    public Microposts2015Dataset(String annotatedFile, String tweetsFile) {
        this.annotatedFile = annotatedFile;
        this.tweetsFile = tweetsFile;
    }

    @Override
    public int size() {
        return documents.size();
    }

    @Override
    public List<Document> getInstances() {
        return documents;
    }

    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(annotatedFile), new File(
                tweetsFile));
    }

    /**
     * Reads the tweets file line by line and creates one {@link Document} per
     * tweet, attaching the markings found in the annotations file.
     *
     * @param annotations the annotations file
     * @param tweetsFile the tweets file
     * @return the list of parsed documents
     * @throws GerbilException if reading the tweets file fails
     */
    protected List<Document> loadDocuments(File annotations, File tweetsFile)
            throws GerbilException {
        List<Document> documents = new ArrayList<Document>();
        String documentUriPrefix = "http://" + getName() + "/";
        try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
                new FileInputStream(tweetsFile), Charset.forName("UTF-8")))) {
            String line;
            List<Marking> markings;
            while ((line = bReader.readLine()) != null) {
                String[] tweet = line.split("\t");
                // Skip malformed lines that do not contain both id and text.
                if (tweet.length < 2) {
                    continue;
                }
                String id = tweet[0];
                String text = tweet[1];
                markings = findMarkings(getMarkingLines(annotations, id), text);
                documents.add(new DocumentImpl(text, documentUriPrefix + id,
                        markings));
            }
        } catch (IOException e) {
            throw new GerbilException("Exception while reading dataset.", e,
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
        return documents;
    }

    /**
     * Parses the given annotation lines into {@link TypedNamedEntity}
     * markings. NIL links (URIs starting with {@code "NIL"}) are mapped to an
     * empty URI.
     *
     * @param lines annotation lines belonging to a single tweet
     * @param text the tweet text the markings refer to (currently unused but
     *        kept for subclasses that need it)
     * @return the parsed markings
     */
    protected static List<Marking> findMarkings(Set<String> lines, String text) {
        List<Marking> markings = new ArrayList<Marking>();
        for (String line : lines) {
            String[] annotation = line.split("\t");
            int start = Integer.parseInt(annotation[1]);
            int end = Integer.parseInt(annotation[2]);
            int length = end - start;
            String uri = annotation[3];
            // "NIL..." marks an entity without a knowledge-base entry.
            if (uri.startsWith("NIL")) {
                uri = "";
            }
            Set<String> types = new HashSet<String>();
            types.add(getTypeURI(annotation[typeIndex]));
            markings.add(new TypedNamedEntity(start, length, uri, types));
        }
        return markings;
    }

    /**
     * Collects all lines of the annotations file that belong to the tweet with
     * the given id. Relies on the file being ordered by tweet id so reading
     * can stop at the first non-matching line after a match was seen.
     *
     * @param annotations the annotations file
     * @param id the tweet id to look up
     * @return the matching lines (empty on I/O error, which is only logged)
     */
    private static Set<String> getMarkingLines(File annotations, String id) {
        Set<String> lines = new HashSet<String>();
        try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
                new FileInputStream(annotations), Charset.forName("UTF-8")))) {
            String line;
            // FIX: use the primitive boolean instead of the boxed Boolean to
            // avoid needless autoboxing of a local flag.
            boolean annotationSeen = false;
            while ((line = bReader.readLine()) != null) {
                String[] annotation = line.split("\t");
                if (id.equals(annotation[0])) {
                    annotationSeen = true;
                    lines.add(line);
                } else if (annotationSeen) {
                    // as the annotations are ordered by id, the last annotation
                    // was added
                    return lines;
                }
            }
        } catch (IOException e) {
            LOGGER.error("Could not find Markings due to ", e);
        }
        return lines;
    }

    /**
     * Maps a dataset type label (case-insensitive) to its DBpedia ontology
     * URI; returns an empty string for unknown labels.
     *
     * @param type the type label from the annotations file
     * @return the DBpedia ontology URI or {@code ""} if unknown
     */
    protected static String getTypeURI(String type) {
        switch (type.toLowerCase()) {
        case "thing":
            return "http://dbpedia.org/ontology/Thing";
        case "person":
            return "http://dbpedia.org/ontology/Person";
        case "organization":
            return "http://dbpedia.org/ontology/Organisation";
        case "location":
            return "http://dbpedia.org/ontology/Place";
        case "event":
            return "http://dbpedia.org/ontology/Event";
        case "product":
            return "http://dbpedia.org/ontology/Product";
        }
        return "";
    }
}