package org.aksw.gerbil.dataset.impl.derczysnki;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
import org.apache.commons.io.IOUtils;
/**
 * Dataset that reads tweets from a tab-separated, one-token-per-line file.
 *
 * <p>
 * As read by this class, every non-blank line holds up to three tab-separated
 * columns: the token, the URI of the entity the token belongs to, and a tag
 * of the form {@code B-<type>} (first token of an entity) or
 * {@code I-<type>} (continuation of an entity). Lines that are blank after
 * trimming separate tweets. Each tweet becomes one {@link Document} whose
 * text is the tokens joined by single spaces (with a trailing space) and
 * whose markings are {@link TypedNamedEntity} instances.
 * </p>
 */
public class DerczynskiDataset extends AbstractDataset implements
        InitializableDataset {

    /** Path of the file the tweets are read from. */
    private String file;
    /** Documents loaded by {@link #init()}; {@code null} before that call. */
    private List<Document> documents;
    // NOTE(review): firstDocId and lastDocId are never assigned in this
    // class, so the sub-list selection in init() is currently inactive.
    private int firstDocId;
    private int lastDocId;

    /**
     * Creates a dataset backed by the given file. Nothing is read until
     * {@link #init()} is called.
     *
     * @param file
     *            path of the file containing the tweets
     */
    public DerczynskiDataset(String file) {
        this.file = file;
    }

    /**
     * @return the number of loaded documents
     */
    @Override
    public int size() {
        return documents.size();
    }

    /**
     * @return the loaded documents
     */
    @Override
    public List<Document> getInstances() {
        return documents;
    }

    /**
     * Loads all documents from the configured file. If both
     * {@code firstDocId} and {@code lastDocId} are positive, only the
     * documents in that 1-based, inclusive range are kept.
     *
     * @throws GerbilException
     *             if the file cannot be read
     */
    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(file));
        if ((firstDocId > 0) && (lastDocId > 0)) {
            this.documents = this.documents.subList(firstDocId - 1, lastDocId);
        }
    }

    /**
     * Reads the given file as UTF-8 and turns every block of consecutive
     * non-blank lines into one document. Document URIs are
     * {@code "http://" + getName() + "/" + index}, with a 0-based index.
     *
     * @param tweetsFile
     *            the file to read
     * @return the documents in file order
     * @throws GerbilException
     *             with {@link ErrorTypes#DATASET_LOADING_ERROR} if reading
     *             fails
     */
    protected List<Document> loadDocuments(File tweetsFile)
            throws GerbilException {
        BufferedReader reader = null;
        List<Document> documents = new ArrayList<Document>();
        String documentUriPrefix = "http://" + getName() + "/";
        try {
            reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(tweetsFile), Charset.forName("UTF-8")));
            StringBuilder tweet = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    // A blank line terminates the current tweet.
                    flushTweet(tweet, documentUriPrefix, documents);
                } else {
                    tweet.append(line).append('\n');
                }
            }
            // The file may end without a trailing blank line; make sure the
            // last tweet is not lost.
            flushTweet(tweet, documentUriPrefix, documents);
        } catch (IOException e) {
            throw new GerbilException("Exception while reading dataset.", e,
                    ErrorTypes.DATASET_LOADING_ERROR);
        } finally {
            IOUtils.closeQuietly(reader);
        }
        return documents;
    }

    /**
     * Converts the buffered tweet lines into a document, appends it to
     * {@code documents} and clears the buffer. Does nothing if the buffer is
     * empty, so leading or repeated blank lines do not create documents.
     */
    private static void flushTweet(StringBuilder tweet,
            String documentUriPrefix, List<Document> documents) {
        if (tweet.length() == 0) {
            return;
        }
        StringBuilder text = new StringBuilder();
        List<Marking> markings = parseTweet(tweet.toString(), text);
        // The index of the new document equals the number of documents
        // emitted so far.
        documents.add(new DocumentImpl(text.toString(), documentUriPrefix
                + documents.size(), markings));
        tweet.setLength(0);
    }

    /**
     * Extracts the entity markings of a single tweet.
     *
     * @param tweet
     *            the lines of one tweet, separated by {@code '\n'}
     * @return one marking per line whose third column starts with
     *         {@code "B-"}; positions refer to the text obtained by joining
     *         the first columns with single spaces
     */
    public static List<Marking> findMarkings(String tweet) {
        // The reconstructed text is not needed by callers of this method.
        return parseTweet(tweet, new StringBuilder());
    }

    /**
     * Parses the lines of one tweet, appending every token followed by a
     * space to {@code text} and returning the markings found. Lines that are
     * blank after trimming are ignored.
     */
    private static List<Marking> parseTweet(String tweet, StringBuilder text) {
        List<Marking> markings = new ArrayList<Marking>();
        String[] lines = tweet.split("\n");
        // Offset of the current token inside the reconstructed text.
        int start = 0;
        for (int i = 0; i < lines.length; i++) {
            if (lines[i].trim().isEmpty()) {
                continue;
            }
            String[] token = lines[i].split("\t+");
            text.append(token[0]).append(' ');
            if (column(token, 2).startsWith("B-")) {
                String[] marking = getWholeMarking(lines, i);
                Set<String> types = new HashSet<String>();
                types.add(marking[2]);
                markings.add(new TypedNamedEntity(start, marking[0].length(),
                        marking[1], types));
            }
            // +1 accounts for the space appended after the token.
            start += token[0].length() + 1;
        }
        return markings;
    }

    /**
     * Collects the entity that starts at line {@code pos}.
     *
     * @return a three element array: [0] the surface form (the first token
     *         and all directly following {@code "I-"} tokens joined by
     *         spaces), [1] the URI from the first line or {@code ""} if that
     *         column is missing, empty, {@code "O"} or {@code "NIL"}, [2]
     *         the type URI derived from the tag of the first line
     */
    private static String[] getWholeMarking(String[] line, int pos) {
        String[] ret = new String[3];
        String[] token = line[pos].split("\t+");
        StringBuilder name = new StringBuilder(token[0]);

        String uri = column(token, 1);
        boolean unlinked = uri.isEmpty() || uri.equals("O")
                || uri.equals("NIL");
        ret[1] = unlinked ? "" : uri;

        // Strip the two character "B-" prefix to obtain the raw type name.
        String tag = column(token, 2);
        ret[2] = getType(tag.length() > 2 ? tag.substring(2) : "");

        for (int i = pos + 1; i < line.length; i++) {
            token = line[i].split("\t+");
            if (!column(token, 2).startsWith("I-")) {
                break;
            }
            name.append(" ").append(token[0]);
        }
        ret[0] = name.toString();
        return ret;
    }

    /**
     * Returns the trimmed content of the given column, or {@code ""} if the
     * line has fewer columns.
     */
    private static String column(String[] columns, int index) {
        return index < columns.length ? columns[index].trim() : "";
    }

    /**
     * Maps a type name taken from a tag to a type URI.
     *
     * @return the URI for a known type name, otherwise {@code ""}
     */
    private static String getType(String type) {
        switch (type) {
        case "sportsteam":
            return "http://dbpedia.org/ontology/SportsTeam";
        case "person":
            return "http://dbpedia.org/ontology/Person";
        case "geo-loc":
            return "http://dbpedia.org/ontology/Place";
        case "facility":
            return "http://dbpedia.org/ontology/Place";
        case "movie":
            return "http://dbpedia.org/ontology/Film";
        case "tv-show":
            return "http://dbpedia.org/ontology/TelevisionShow";
        case "company":
            return "http://dbpedia.org/ontology/company";
        case "product":
            return "http://dbpedia.org/ontology/product";
        default:
            return "";
        }
    }
}