package is2.io;
import is2.data.PSTree;
import is2.parser.Parser;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Stack;
import java.util.StringTokenizer;
/**
* @author Dr. Bernd Bohnet, 17.01.2011
*
* Reads a sentences in Penn Tree Bank bracket style and return sentences.
*/
public class TigerReader implements PSReader {
BufferedReader inputReader;
ArrayList<File> psFiles = new ArrayList<>();
ArrayList<PSTree> psCache = new ArrayList<>();
String filter[] = null;
int startFilter = -1;
int endFilter = -1;
public TigerReader() {
}
public TigerReader(String file) {
try {
inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO-8859-1"), 32768);
} catch (FileNotFoundException | UnsupportedEncodingException e) {
e.printStackTrace();
}
}
/**
* @param ps
*/
@Override
public void startReading(String file, String[] filter) {
try {
this.filter = filter;
startFilter = filter == null ? -1 : 1;
endFilter = filter == null ? -1 : 1;
inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO-8859-1"), 32768);
} catch (Exception e) {
e.printStackTrace();
}
}
public static class Line {
String form;
String lemma;
String morph;
String pos;
int parent;
String edge;
}
static int stop = 0;
/**
* @return
*/
@Override
public PSTree getNext() {
PSTree ps = null;
String l;
ArrayList<Line> lines = new ArrayList<>();
try {
int state = 1, terminals = 0, nonterminals = 0;
while ((l = inputReader.readLine()) != null) {
if (startFilter == 1 && l.startsWith("#BOS " + filter[0])) {
Parser.out.println("found start " + l);
startFilter = 2;
}
if (endFilter == 1 && l.startsWith("#EOS " + filter[1])) {
Parser.out.println("found end " + l);
endFilter = 2;
}
if (startFilter == 1 || endFilter == 2) {
continue;
}
if (l.startsWith("#BOS")) {
state = 2;
continue;
}
if (l.startsWith("#500")) {
state = 3;
}
if (l.startsWith("#EOS")) {
state = 4;
}
if (state < 2) {
continue;
}
if (state == 4) {
ps = new PSTree();
ps.create(terminals, nonterminals);
// Parser.out.println("terminals "+terminals);
//build ps tree
int cnt = 0;
// ps.entries[0] =CONLLReader09.ROOT;
// ps.head[0]=-1;
int root = -1;
for (Line line : lines) {
/*
* if (cnt==terminals) { // insert root root =cnt;
* cnt++; }
*/
ps.entries[cnt] = line.form;
if (cnt < terminals) {
ps.pos[cnt] = line.pos;
} else {
ps.entries[cnt] = line.pos;
}
ps.lemmas[cnt] = line.lemma;
ps.head[cnt] = line.parent == 0 ? lines.size() - 1 : line.parent >= 500 ? line.parent - 500 + terminals : line.parent;
// ps.head[cnt] = line.parent==0?lines.size()-1:line.parent>=500?line.parent-500+terminals:line.parent;
ps.morph[cnt] = line.morph;
cnt++;
}
if (root == -1) {
root = terminals;
}
ps.head[cnt - 1] = 0; // root
ps.terminalCount = terminals;
lines.clear();
state = 1;
/*
* for(int k=0;k<ps.head.length;k++) { if
* (ps.head[k]<terminals && k!=root) { ps.head[k]=root; //
* DB.println("error "+k+" "+ps.head[k]); } }
*/
// Parser.out.println(""+ps.toString());
// if (stop++ == 4)System.exit(0);
return ps;
}
StringTokenizer t = new StringTokenizer(l, "\t");
int tc = 0;
Line line = new Line();
lines.add(line);
while (t.hasMoreTokens()) {
String token = t.nextToken();
if (token.equals("\t")) {
continue;
}
if (tc == 0) {
if (token.startsWith("#5") || token.startsWith("#6")) {
nonterminals++;
} else {
terminals++;
//change it back to the wrong format since the conll stuff was derived from this.
// if (token.equals("durchblicken")) token="durchblikken";
line.form = token;
}
} else if (tc == 1) {
line.lemma = token;
} else if (tc == 2) {
line.pos = token;
} else if (tc == 3) {
line.morph = token;
} else if (tc == 4) {
line.edge = token;
} else if (tc == 5) {
line.parent = Integer.parseInt(token);
}
if (token.length() > 0) {
tc++;
}
}
// read till #EOS
}
} catch (IOException | NumberFormatException e) {
e.printStackTrace();
}
return ps;
}
/**
* @param tree
*/
private void removeTraces(ArrayList<Object> tree) {
Stack<ArrayList<Object>> s = new Stack<>();
s.push(tree);
ArrayList<Object> list = null;
while (!s.isEmpty()) {
ArrayList<Object> last = list;
list = s.pop();
for (int k = 0; k < list.size(); k++) {
Object o = list.get(k);
if (o instanceof String) {
String t = (String) o;
if ((t.endsWith("-1") || t.endsWith("-2") || t.endsWith("-3") || t.endsWith("-4")) && list.size() > (k + 1)) {
t = t.substring(0, t.length() - 2);
list.set(k, t);
}
if (t.startsWith("-NONE-")) {
// remove the bigger surrounding phrase, e.g. (NP (-NONE- *))
if (last.size() == 2 && last.get(0) instanceof String && last.contains(list)) {
ArrayList<Object> rest = remove(tree, last);
if (rest != null && rest.size() == 1) {
rest = remove(tree, rest);
}
} // remove the phrase only, e.g. (NP (AP nice small) (-NONE- *))
else {
// there might a phrase with two empty elements (VP (-NONE- *) (-NONE- ...))
// Parser.out.println("last "+last+" list "+list );
ArrayList<Object> rest = remove(tree, list);
removeTraces(rest);
if (rest.size() == 1) {
rest = remove(tree, rest);
if (rest != null && rest.size() == 1) {
Parser.out.println("rest " + rest);
System.exit(0);
}
}
}
continue;
}
}
if (o instanceof ArrayList) {
s.push((ArrayList<Object>) o);
}
}
}
}
/**
* Remove from tree p
*
* @param tree phrase structure tree
* @param p elment to remove
*/
private ArrayList<Object> remove(ArrayList<Object> tree, Object p) {
Stack<ArrayList<Object>> s = new Stack<>();
s.push(tree);
while (!s.isEmpty()) {
ArrayList<Object> list = s.pop();
for (int k = 0; k < list.size(); k++) {
Object o = list.get(k);
if (o == p) {
list.remove(p);
return list;
}
if (o instanceof ArrayList) {
s.push((ArrayList<Object>) o);
}
}
}
return null;
}
/**
* Count the terminals
*
* @param current
* @return
*/
private int countTerminals(ArrayList<Object> current) {
int count = 0;
boolean found = false, all = true;
for (Object o : current) {
if (o instanceof String) {
found = true;
} else {
all = false;
if (o instanceof ArrayList) {
count += countTerminals((ArrayList<Object>) o);
}
}
}
if (found && all) {
// Parser.out.println(""+current);
count++;
}
return count;
}
/**
* Count the terminals
*
* @param current
* @return
*/
private int insert(PSTree ps, ArrayList<Object> current, Integer terminal, Integer xxx, int head) {
boolean found = false, all = true;
String term = null;
String pos = null;
for (Object o : current) {
if (o instanceof String) {
if (found) {
term = (String) o;
}
if (!found) {
pos = (String) o;
}
found = true;
} else {
all = false;
// if (o instanceof ArrayList) count +=countTerminals((ArrayList<Object>)o);
}
}
if (found && all) {
if (term.equals("-LRB-")) {
term = "(";
}
if (term.equals("-RRB-")) {
term = ")";
}
if (term.equals("-LCB-")) {
term = "{";
}
if (term.equals("-RCB-")) {
term = "}";
}
if (term.contains("1\\/2-year")) {
term = term.replace("\\/", "/");
}
if (term.contains("1\\/2-foot-tall")) {
term = term.replace("\\/", "/");
}
ps.entries[ps.terminalCount] = term;
ps.pos[ps.terminalCount] = pos;
ps.head[ps.terminalCount] = head;
// Parser.out.println("terminal "+term+" "+ps.terminal+" head "+head);
ps.terminalCount++;
} else if (found && !all) {
if (pos.startsWith("NP-SBJ")) {
pos = "NP-SBJ";
}
if (pos.startsWith("WHNP")) {
pos = "WHNP";
}
ps.entries[ps.non] = pos;
ps.head[ps.non] = head;
// Parser.out.println("non terminal "+pos+" "+ps.non+" head "+ head);
int non = ps.non++;
for (Object o : current) {
if (o instanceof ArrayList) {
insert(ps, (ArrayList<Object>) o, terminal, ps.non, non);
}
}
}
if (!all && !found) {
for (Object o : current) {
if (o instanceof ArrayList) {
insert(ps, (ArrayList<Object>) o, terminal, 0, ps.non - 1);
}
}
}
return terminal;
}
/**
* Count the terminals
*
* @param current
* @return
*/
private int countNonTerminals(ArrayList<Object> current) {
int count = 0;
boolean found = false, all = true;
for (Object o : current) {
if (o instanceof String) {
found = true;
} else {
all = false;
if (o instanceof ArrayList) {
count += countNonTerminals((ArrayList<Object>) o);
}
}
}
if (found && !all) {
count++;
}
return count;
}
}