package org.wikibrain.sr.wikify;
import org.apache.commons.io.IOUtils;
import org.wikibrain.core.lang.Language;
import org.wikibrain.utils.WpIOUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
/**
* @author Shilad Sen
*/
public class WbCorpusLineReader implements Iterable<WbCorpusLineReader.Line> {
/** Corpus file that each iterator created by this reader opens. */
private final File path;

/**
 * Creates a reader over a corpus file. The file is not touched here;
 * it is only opened when {@link #iterator()} is called.
 *
 * @param path the corpus file to read
 */
public WbCorpusLineReader(File path) {
    this.path = path;
}
/**
 * Opens the corpus file and returns a fresh iterator over its lines.
 * Every call opens the file again.
 *
 * @return an iterator over the content lines of the corpus file
 * @throws IllegalArgumentException if the file cannot be opened; the
 *         underlying {@link IOException} is attached as the cause
 */
@Override
public Iterator<Line> iterator() {
    final Iterator<Line> lines;
    try {
        lines = new WBCorpusLineIterator(path);
    } catch (IOException e) {
        throw new IllegalArgumentException("Could not open: " + path, e);
    }
    return lines;
}
/**
 * Parsed form of a corpus header line. The header must start with
 * {@code @WikiBrainCorpus} and split on tabs into exactly five fields:
 * the marker, a language code, the corpus class, the wikifier class and
 * the creation time.
 */
public static class CorpusInfo {
    private final Language language;
    private final String corpusClass;
    private final String wikifierClass;
    private final String creationTime;

    /**
     * Parses a corpus header line.
     *
     * @param line the raw header line
     * @throws IllegalArgumentException if the line does not start with the
     *         corpus marker or does not have exactly five tab-separated fields
     */
    public CorpusInfo(String line) {
        // Only split once the marker has been recognised; both failure modes
        // are reported with the same message.
        String[] fields = line.startsWith("@WikiBrainCorpus") ? line.split("\t") : null;
        if (fields == null || fields.length != 5) {
            throw new IllegalArgumentException("Invalid corpus line: " + line);
        }
        // The language code is handed over as-is; the other fields are trimmed.
        this.language = Language.getByLangCode(fields[1]);
        this.corpusClass = fields[2].trim();
        this.wikifierClass = fields[3].trim();
        this.creationTime = fields[4].trim();
    }

    /** @return the language looked up from the header's language code */
    public Language getLanguage() {
        return language;
    }

    /** @return the trimmed corpus class field of the header */
    public String getCorpusClass() {
        return corpusClass;
    }

    /** @return the trimmed wikifier class field of the header */
    public String getWikifierClass() {
        return wikifierClass;
    }

    /** @return the trimmed creation time field of the header, as text */
    public String getCreationTime() {
        return creationTime;
    }
}
/**
 * Parsed form of a document header line. The header must start with
 * {@code @WikiBrainDoc} and split on tabs into exactly three fields:
 * the marker, an integer document id and the document title.
 *
 * Equality and hashing are based on the document id only.
 */
public static class DocInfo {
    private final int id;
    private final String title;

    // Running position within this document. WBCorpusLineIterator advances
    // these as it hands out the document's lines.
    int lineCounter = 0;
    int charCounter = 0;

    /**
     * Parses a document header line.
     *
     * @param line the raw header line
     * @throws IllegalArgumentException if the line does not start with the
     *         doc marker or does not have exactly three tab-separated fields
     * @throws NumberFormatException if the id field is not a valid integer
     */
    public DocInfo(String line) {
        if (!line.startsWith("@WikiBrainDoc")) {
            throw new IllegalArgumentException("Invalid doc line: " + line);
        }
        String[] tokens = line.split("\t");
        if (tokens.length != 3) {
            // This is a doc header, so report it as one (not as a corpus line).
            throw new IllegalArgumentException("Invalid doc line: " + line);
        }
        // parseInt yields the primitive directly, avoiding a needless box/unbox.
        id = Integer.parseInt(tokens[1].trim());
        title = tokens[2].trim();
    }

    /** @return the document id parsed from the header */
    public int getId() {
        return id;
    }

    /** @return the trimmed document title parsed from the header */
    public String getTitle() {
        return title;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        return id == ((DocInfo) o).id;
    }

    @Override
    public int hashCode() {
        return id;
    }

    @Override
    public String toString() {
        return "DocInfo{" + "id=" + id + ", title='" + title + '\'' + '}';
    }
}
/**
 * One content line of a corpus, together with the corpus and document
 * headers it was read under and its position within the document.
 */
public static class Line {
    private final CorpusInfo corpus;
    private final DocInfo doc;
    private final String line;
    private final int lineNumber;
    private final int charNumber;

    /**
     * @param corpus     header of the corpus the line belongs to
     * @param doc        header of the document the line belongs to
     * @param line       the text of the line
     * @param lineNumber position of the line within its document
     * @param charNumber character offset of the line within its document
     */
    public Line(CorpusInfo corpus, DocInfo doc, String line, int lineNumber, int charNumber) {
        this.corpus = corpus;
        this.doc = doc;
        this.line = line;
        this.lineNumber = lineNumber;
        this.charNumber = charNumber;
    }

    /** @return the text of the line */
    public String getLine() {
        return line;
    }

    /** @return the line number supplied at construction */
    public int getLineNumber() {
        return lineNumber;
    }

    /** @return the character offset supplied at construction */
    public int getCharNumber() {
        return charNumber;
    }

    /** @return the corpus header this line was read under */
    public CorpusInfo getCorpus() {
        return corpus;
    }

    /** @return the document header this line was read under */
    public DocInfo getDoc() {
        return doc;
    }

    /** @return the id of the owning document; shorthand for {@code getDoc().getId()} */
    public int getDocId() {
        return doc.getId();
    }

    /** @return the title of the owning document; shorthand for {@code getDoc().getTitle()} */
    public String getTitle() {
        return doc.getTitle();
    }
}
/**
 * Iterates over the content lines of a corpus file.
 *
 * Blank lines are skipped. A line starting with {@code @WikiBrainCorpus}
 * replaces the current corpus header and a line starting with
 * {@code @WikiBrainDoc} replaces the current document header; neither is
 * returned to the caller. Every other line is returned as a {@link Line}
 * tagged with the headers in effect when it was read.
 *
 * The underlying reader is closed automatically when the end of the file is
 * reached or when reading fails. Callers that stop early should call
 * {@link #close()} (the class is {@link java.io.Closeable}, so
 * try-with-resources works).
 */
public static class WBCorpusLineIterator implements Iterator<Line>, java.io.Closeable {
    private final File path;
    private BufferedReader reader;  // null once closed
    private CorpusInfo corpus;      // most recent corpus header, if any
    private DocInfo doc;            // most recent doc header, if any
    private String line;            // buffered content line not yet handed out

    /**
     * Opens the corpus file for reading.
     *
     * @param path the corpus file
     * @throws IOException if the file cannot be opened
     */
    public WBCorpusLineIterator(File path) throws IOException {
        this.path = path;
        reader = WpIOUtils.openBufferedReader(path);
    }

    /**
     * @return true if another content line is available
     * @throws IllegalStateException if reading fails or the file is missing
     *         a required corpus or doc header; the reader is closed first
     * @throws IllegalArgumentException if a header line is malformed; the
     *         reader is closed first
     */
    @Override
    public synchronized boolean hasNext() {
        if (reader == null && line == null) {
            return false;
        }
        advanceOrClose();
        return line != null;
    }

    /**
     * Returns the next content line and advances the owning document's line
     * and character counters.
     *
     * @return the next content line
     * @throws java.util.NoSuchElementException if no lines remain
     * @throws IllegalStateException if reading fails or the file is missing
     *         a required corpus or doc header; the reader is closed first
     * @throws IllegalArgumentException if a header line is malformed; the
     *         reader is closed first
     */
    @Override
    public synchronized Line next() {
        if (reader != null || line != null) {
            advanceOrClose();
        }
        if (line == null) {
            // The Iterator contract requires an exception here, not null.
            throw new java.util.NoSuchElementException("No more lines in " + path);
        }
        // line is only ever buffered after a doc header was seen, so doc is non-null.
        int lineNum = doc.lineCounter++;
        int charNum = doc.charCounter;
        doc.charCounter += line.length();
        Line res = new Line(corpus, doc, line, lineNum, charNum);
        line = null; // consume buffer
        return res;
    }

    /** Removal is not supported. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /** Fills the buffer if needed, closing the reader if that fails for any reason. */
    private void advanceOrClose() {
        boolean success = false;
        try {
            advanceIfNecessary();
            success = true;
        } finally {
            if (!success) {
                close();
            }
        }
    }

    /**
     * Reads forward until a content line is buffered or the end of the file
     * is reached (in which case the reader is closed and the buffer stays
     * empty). Does nothing if a line is already buffered.
     */
    private synchronized void advanceIfNecessary() {
        if (line != null) {
            return;
        }
        if (reader == null) {
            throw new IllegalStateException("Reader already closed for " + path);
        }
        while (line == null) {
            String s;
            try {
                s = reader.readLine();
            } catch (IOException e) {
                throw new IllegalStateException("Unexpected IO Exception: ", e);
            }
            if (s == null) {
                close();
                return;
            }
            // Ignore blank lines.
            if (s.trim().isEmpty()) {
                continue;
            }
            if (s.startsWith("@WikiBrainCorpus")) {
                corpus = new CorpusInfo(s);
            } else if (corpus == null) {
                throw new IllegalStateException("Did not find corpus header in first line of " + path);
            } else if (s.startsWith("@WikiBrainDoc")) {
                doc = new DocInfo(s);
            } else if (doc == null) {
                throw new IllegalStateException("Did not find doc header in second line of " + path);
            } else {
                line = s;
            }
        }
    }

    /** Closes the underlying reader, ignoring any error. Safe to call repeatedly. */
    @Override
    public synchronized void close() {
        if (reader != null) {
            IOUtils.closeQuietly(reader);
            reader = null;
        }
    }
}
}