package is2.tag;
import is2.data.IEncoderPlus;
import is2.data.PipeGen;
import is2.parser.Parser;
import is2.util.DB;
import java.io.*;
/**
* @author Dr. Bernd Bohnet, 07.01.2011
*
*
*/
public class Lexicon {
public static final String FR = "FR", TAG = "TAG";
final byte[][] word2tag;
public Lexicon(byte[][] w2t) {
word2tag = w2t;
}
public Lexicon(String clusterFile, IEncoderPlus mf) {
final String REGEX = "\t";
// register words
try {
BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), 32768);
int cnt = 0;
String line;
while ((line = inputReader.readLine()) != null) {
try {
String[] split = line.split(REGEX);
// int f = Integer.parseInt(split[2]);
// if (f>2) {
cnt++;
mf.register(PipeGen.WORD, split[0]);
mf.register(TAG, split[1]); //tag
if (split.length > 1) {
mf.register(FR, split[1]); // frequency
}// }
} catch (Exception e) {
Parser.out.println("Error in lexicon line " + cnt + " error: " + e.getMessage());
}
}
Parser.out.println("read number of words from lexicon " + cnt);
inputReader.close();
} catch (Exception e) {
e.printStackTrace();
}
word2tag = new byte[mf.getFeatureCounter().get(PipeGen.WORD)][1];
// insert words
try {
String line;
BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), 32768);
while ((line = inputReader.readLine()) != null) {
String[] split = line.split(REGEX);
int w = mf.getValue(PipeGen.WORD, split[0]);
if (w < 0) {
continue;
}
word2tag[w][0] = (byte) mf.getValue(TAG, split[1]);
// if (split.length>1) word2tag[w][1]= (byte)mf.getValue(FR, split[2]); // frequency
}
inputReader.close();
int fill = 0;
for (int l = 0; l < word2tag.length; l++) {
if (word2tag[l][0] != 0) {
fill++;
}
}
Parser.out.println("filled " + fill + " of " + word2tag.length);
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Read the cluster
*
* @param dos
* @throws IOException
*/
public Lexicon(DataInputStream dis) throws IOException {
word2tag = new byte[dis.readInt()][1];
for (int i = 0; i < word2tag.length; i++) {
word2tag[i][0] = dis.readByte();
// word2tag[i][1]=dis.readByte();
}
DB.println("Read lexicon with " + word2tag.length + " words ");
}
/**
* Write the cluster
*
* @param dos
* @throws IOException
*/
public void write(DataOutputStream dos) throws IOException {
dos.writeInt(word2tag.length);
for (byte[] i : word2tag) {
dos.writeByte(i[0]);
// dos.writeByte(i[1]);
}
}
/**
* @param form
* @return
*/
public int getTag(int form) {
if (word2tag.length < form || form < 0) {
return -1;
}
return word2tag[form][0];
}
/**
* @param form
* @return
*/
public int getConf(int form) {
if (word2tag.length < form || form < 0) {
return -1;
}
return word2tag[form][1];
}
}