package is2.data;
import is2.parser.Parser;
import is2.util.DB;
import java.io.*;
import java.util.ArrayList;
/**
* @author Dr. Bernd Bohnet, 28.10.2010
*
*
*/
final public class Thesaurus {
public static final String LPATH = "LP";
public static final String SPATH = "SP";
// [word][p] p = [0:long-path | 1:short-path]
final private int[][] word2path;
public Thesaurus() {
word2path = new int[0][];
}
/**
* @param clusterFile
* @param mf
*
*/
public Thesaurus(String clusterFile, IEncoderPlus mf, int ls) {
final String REGEX = "\t";
// register words
try {
try (BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), 32768)) {
int cnt = 0;
String line;
while ((line = inputReader.readLine()) != null) {
cnt++;
try {
String[] split = line.split(REGEX);
// mf.register(LPATH, split[0].length()<ls?split[0]:split[0].substring(0,ls));
mf.register(PipeGen.WORD, split[0]);
mf.register(PipeGen.WORD, split[1]);
} catch (Exception e) {
Parser.out.println("Error in cluster line " + cnt + " error: " + e.getMessage());
}
}
Parser.out.println("read number of thesaury entries " + cnt);
}
} catch (Exception e) {
e.printStackTrace();
}
word2path = new int[mf.getFeatureCounter().get(PipeGen.WORD)][];
// insert words
try {
String line;
try (BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), 32768)) {
int startWd = -1;
ArrayList<Integer> wrds = new ArrayList<>();
while ((line = inputReader.readLine()) != null) {
String[] split = line.split(REGEX);
int wd = mf.getValue(PipeGen.WORD, split[0]);
// DB.println("wd "+wd+" "+startWd);
if (startWd == wd) {
int thesaurusWrd = mf.getValue(PipeGen.WORD, split[1]);
if (thesaurusWrd != wd) {
wrds.add(thesaurusWrd);
}
} else if (startWd != -1) {
int[] ths = new int[wrds.size()];
for (int k = 0; k < ths.length; k++) {
ths[k] = wrds.get(k);
}
word2path[startWd] = ths;
// DB.println(""+wrds+" size "+ths.length);
wrds.clear();
int thesaurusWrd = mf.getValue(PipeGen.WORD, split[1]);
if (thesaurusWrd != wd) {
wrds.add(thesaurusWrd);
}
}
startWd = wd;
}
if (!wrds.isEmpty()) {
// put rest of the words
int[] ths = new int[wrds.size()];
for (int k = 0; k < ths.length; k++) {
ths[k] = wrds.get(k);
}
word2path[startWd] = ths;
// DB.println(""+wrds+" size "+ths.length);
wrds.clear();
}
}
int fill = 0;
for (int l = 0; l < word2path.length; l++) {
if (word2path[l] != null) {
fill++;
}
}
/*
* for(int l = 0; l<word2path.length; l++ ){ if (word2path[l][1]!=0)
* fillL++; if (word2path[l][1]<-1) Parser.out.println("lower
* "+word2path[l][1]); }
*/
Parser.out.println("filled " + fill + " of " + word2path.length);
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Read the cluster
*
* @param dos
* @throws IOException
*/
public Thesaurus(DataInputStream dis) throws IOException {
word2path = new int[dis.readInt()][];
for (int i = 0; i < word2path.length; i++) {
int len = dis.readInt();
if (len > 0) {
word2path[i] = new int[len];
for (int j = 0; j < len; j++) {
word2path[i][j] = dis.readInt();
}
}
word2path[i][0] = dis.readShort();
}
DB.println("Read cluster with " + word2path.length + " words ");
}
/**
* Write the cluster
*
* @param dos
* @throws IOException
*/
public void write(DataOutputStream dos) throws IOException {
dos.writeInt(word2path.length);
for (int[] i : word2path) {
dos.writeInt(i == null ? 0 : i.length);
if (i != null) {
for (int j = 0; j < i.length; j++) {
dos.writeInt(i[j]);
}
}
}
}
/**
* @param form the id of a word form
* @return the short path to the word form in the cluster
*
* final public int getSP(int form) { if (word2path.length<form) return -1;
* return word2path[form][0]; }
*/
/**
* get the long path to a word form in the cluster
*
* @param form the id of a word form
* @return the long path to the word
*/
final public int get(int form, int k) {
if (word2path.length < form || word2path[form] == null) {
return -1;
}
return word2path[form][k];
}
}