package org.aksw.gerbil.dataset.impl.wsdm;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.Annotation;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.aksw.gerbil.utils.WikipediaHelper;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Dataset adapter for the WSDM dataset. The dataset comprises two files: a
 * tweets file containing one JSON object per line (with at least the
 * {@code "id_str"} and {@code "text"} attributes) and a tab-separated
 * annotation file whose first column is the tweet id and whose third column is
 * a Wikipedia page title. The annotation file is expected to be ordered by
 * tweet id (see {@link #getMarkingLines(File, String)}).
 */
public class WSDMDataset extends AbstractDataset implements
        InitializableDataset {

    private static final Logger LOGGER = LoggerFactory
            .getLogger(WSDMDataset.class);

    // FIX: was a mutable static field; it is never reassigned, so make it final.
    private static final String WIKIPEDIA_DOMAIN = "en.wikipedia.org";

    /** Documents created by {@link #init()}; {@code null} until then. */
    protected List<Document> documents;

    // FIX: both paths are assigned only in the constructor — made final.
    private final String annotatedFile;
    private final String tweetsFile;

    /**
     * Constructor.
     *
     * @param annotatedFile path to the tab-separated annotation file (column 0:
     *            tweet id, column 2: Wikipedia page title; ordered by tweet id)
     * @param tweetsFile path to the tweets file (one JSON object per line)
     */
    public WSDMDataset(String annotatedFile, String tweetsFile) {
        this.annotatedFile = annotatedFile;
        this.tweetsFile = tweetsFile;
    }

    /**
     * Returns the number of documents. Must not be called before
     * {@link #init()} (documents is {@code null} until then).
     */
    @Override
    public int size() {
        return documents.size();
    }

    /**
     * Returns the loaded documents. Must not be called before {@link #init()}.
     */
    @Override
    public List<Document> getInstances() {
        return documents;
    }

    /**
     * Loads the documents from the two configured files.
     *
     * @throws GerbilException if reading the tweets file fails
     */
    @Override
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(annotatedFile), new File(tweetsFile));
    }

    /**
     * Reads the tweets file line by line, parses every line as a JSON object
     * and creates one {@link Document} per tweet, attaching the annotations
     * found for the tweet's id in the annotation file.
     *
     * @param annotations the tab-separated annotation file
     * @param tweets the tweets file (one JSON object per line)
     * @return the list of created documents
     * @throws GerbilException if reading the tweets file fails
     */
    private List<Document> loadDocuments(File annotations, File tweets)
            throws GerbilException {
        List<Document> documents = new ArrayList<Document>();
        String documentUriPrefix = "http://" + getName() + "/";
        // The tweets file contains one JSON object per line.
        // FIX: use StandardCharsets.UTF_8 instead of Charset.forName("UTF-8")
        // (no runtime charset lookup).
        try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
                new FileInputStream(tweets), StandardCharsets.UTF_8))) {
            String line;
            List<Marking> markings;
            while ((line = bReader.readLine()) != null) {
                JSONObject json = new JSONObject(line);
                String id = json.getString("id_str");
                String text = json.getString("text");
                // NOTE(review): the annotation file is re-read for every single
                // tweet, i.e., O(#tweets * #annotations) I/O. Acceptable for
                // small datasets; consider a single pass if this becomes slow.
                markings = findMarkings(getMarkingLines(annotations, id), text);
                documents.add(new DocumentImpl(text, documentUriPrefix + id,
                        markings));
            }
        } catch (IOException e) {
            throw new GerbilException("Exception while reading dataset.", e,
                    ErrorTypes.DATASET_LOADING_ERROR);
        }
        return documents;
    }

    /**
     * Transforms the given annotation lines into {@link Annotation} markings by
     * resolving the Wikipedia page title in the third column to a URI.
     *
     * @param lines annotation lines of a single tweet (tab-separated)
     * @param text the tweet text (currently unused; kept for overriding
     *            subclasses that may derive positions from it)
     * @return the markings created from the given lines
     */
    protected static List<Marking> findMarkings(Set<String> lines, String text) {
        List<Marking> markings = new ArrayList<Marking>();
        for (String line : lines) {
            String[] annotation = line.split("\t");
            // FIX: guard against malformed lines instead of failing with an
            // ArrayIndexOutOfBoundsException when column 2 is missing.
            if (annotation.length < 3) {
                LOGGER.warn("Got a malformed annotation line \"{}\". It will be ignored.", line);
                continue;
            }
            String uri = WikipediaHelper.getWikipediaUri(WIKIPEDIA_DOMAIN, annotation[2]);
            markings.add(new Annotation(uri));
        }
        return markings;
    }

    /**
     * Collects all lines of the annotation file whose first column equals the
     * given tweet id. Relies on the file being ordered by tweet id: scanning
     * stops at the first non-matching line after a match has been seen.
     *
     * @param annotations the tab-separated annotation file
     * @param id the tweet id to search for
     * @return the matching lines; empty (possibly partial, see catch clause) if
     *         an I/O error occurs
     */
    private static Set<String> getMarkingLines(File annotations, String id) {
        Set<String> lines = new HashSet<String>();
        try (BufferedReader bReader = new BufferedReader(new InputStreamReader(
                new FileInputStream(annotations), StandardCharsets.UTF_8))) {
            String line;
            // FIX: was a boxed Boolean; a primitive boolean is sufficient and
            // avoids needless autoboxing.
            boolean annotationSeen = false;
            while ((line = bReader.readLine()) != null) {
                String[] annotation = line.split("\t");
                if (id.equals(annotation[0])) {
                    annotationSeen = true;
                    lines.add(line);
                } else if (annotationSeen) {
                    // As the annotations are ordered by id, the last annotation
                    // for this id has already been added.
                    return lines;
                }
            }
        } catch (IOException e) {
            // Best effort: log and return whatever has been collected so far.
            LOGGER.error("Could not find Markings due to ", e);
        }
        return lines;
    }
}