package is2.data;

import is2.parser.Parser;
import is2.util.DB;

import java.io.*;

/**
 * Lookup table from word-form ids to the ids of the cluster paths of those words.
 *
 * <p>The table is either built from a tab-separated cluster file (first column: cluster path,
 * second column: word form) or restored from a stream previously produced by
 * {@link #write(DataOutputStream)}. A stored id of {@code 0} is treated as "no entry" by the
 * lookup methods, which report it as {@code -1}.
 *
 * @author Dr. Bernd Bohnet, 28.10.2010
 */
final public class Cluster {

    public static final String LPATH = "LP";
    public static final String SPATH = "SP";

    /** Column separator (a regular expression) of the cluster file. */
    private static final String FIELD_SEPARATOR = "\t";

    /** Buffer size, in characters, used when reading the cluster file. */
    private static final int READ_BUFFER_SIZE = 32768;

    // [word][p] p = [0: short-path (SPATH id) | 1: long-path (LPATH id)]
    final private short[][] word2path;

    /**
     * Creates an empty cluster table; every lookup returns {@code -1}.
     */
    public Cluster() {
        word2path = new short[0][0];
    }

    /**
     * Builds the table from a tab-separated, UTF-8 encoded cluster file.
     *
     * <p>The file is read twice: the first pass registers every path and word with {@code mf},
     * the second pass stores the ids that {@code mf} assigned. I/O problems are reported on
     * standard error and leave the table (partially) unfilled rather than failing construction.
     *
     * @param clusterFile path of the cluster file; column 0 is the cluster path, column 1 the word
     * @param mf          encoder used to register the strings and to look up their ids
     * @param ls          maximal length of the short path; longer paths are cut to this prefix
     */
    public Cluster(String clusterFile, IEncoderPlus mf, int ls) {
        registerEntries(clusterFile, mf, ls);

        word2path = new short[mf.getFeatureCounter().get(PipeGen.WORD)][2];

        fillTable(clusterFile, mf, ls);
    }

    /**
     * Read the cluster
     *
     * <p>Expects the layout produced by {@link #write(DataOutputStream)}: an int with the number
     * of words followed by two shorts per word.
     *
     * @param dis the stream to read the table from
     * @throws IOException if reading from the stream fails
     */
    public Cluster(DataInputStream dis) throws IOException {
        word2path = new short[dis.readInt()][2];
        for (int i = 0; i < word2path.length; i++) {
            word2path[i][0] = dis.readShort();
            word2path[i][1] = dis.readShort();
        }
        DB.println("Read cluster with " + word2path.length + " words ");
    }

    /**
     * Write the cluster
     *
     * <p>Writes the number of words as an int followed by the first two entries of every row.
     *
     * @param dos the stream to write the table to
     * @throws IOException if writing to the stream fails
     */
    public void write(DataOutputStream dos) throws IOException {
        dos.writeInt(word2path.length);
        for (short[] i : word2path) {
            dos.writeShort(i[0]);
            dos.writeShort(i[1]);
        }
    }

    /**
     * Returns the path id stored in column 0 for a word form.
     *
     * <p>NOTE(review): despite the method name, column 0 is the one the file-reading constructor
     * fills with the {@link #SPATH} (short path) id. The behaviour is kept as is because callers
     * may depend on it; use {@link #getLP(int, int)} with {@code l == 1} for the long path.
     *
     * @param form the id of a word form
     * @return the stored path id, or {@code -1} if {@code form} is outside the table or no path
     *         is stored for it
     */
    final public int getLP(int form) {
        if (form < 0 || word2path.length <= form || word2path[form].length <= 0) {
            return -1;
        }
        return word2path[form][0] == 0 ? -1 : word2path[form][0];
    }

    /**
     * Returns the path id stored in column {@code l} for a word form.
     *
     * @param form the id of a word form
     * @param l    the column to read: {@code 0} for the short path, {@code 1} for the long path
     * @return the stored path id, or {@code -1} if {@code form} is outside the table or no path
     *         is stored for it
     */
    final public int getLP(int form, int l) {
        // "<=" rather than "<": form == word2path.length is already past the last row.
        if (form < 0 || word2path.length <= form) {
            return -1;
        }
        return word2path[form][l] == 0 ? -1 : word2path[form][l];
    }

    /**
     * @return the number of word forms the table has rows for
     */
    final public int size() {
        return word2path.length;
    }

    /**
     * First pass over the cluster file: registers short path, long path and word of every line.
     * A line that cannot be processed is reported and skipped.
     */
    private static void registerEntries(String clusterFile, IEncoderPlus mf, int ls) {
        try {
            BufferedReader inputReader = openReader(clusterFile);
            try {
                int cnt = 0;
                String line;
                while ((line = inputReader.readLine()) != null) {
                    cnt++;
                    try {
                        String[] split = line.split(FIELD_SEPARATOR);
                        mf.register(SPATH, shortPath(split[0], ls));
                        mf.register(LPATH, split[0]);
                        mf.register(PipeGen.WORD, split[1]);
                    } catch (Exception e) {
                        Parser.out.println("Error in cluster line " + cnt + " error: " + e.getMessage());
                    }
                }
                Parser.out.println("read number of clusters " + cnt);
            } finally {
                // Close even when reading fails, so the file handle is not leaked.
                inputReader.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Second pass over the cluster file: stores the ids of the short and the long path of every
     * word in {@link #word2path} and reports how many rows received a short path.
     */
    private void fillTable(String clusterFile, IEncoderPlus mf, int ls) {
        try {
            BufferedReader inputReader = openReader(clusterFile);
            try {
                String line;
                while ((line = inputReader.readLine()) != null) {
                    String[] split = line.split(FIELD_SEPARATOR);
                    if (split.length < 2) {
                        // Malformed line; it was already reported by the first pass. Skipping it
                        // keeps one bad line from aborting the fill of all following words.
                        continue;
                    }
                    int wd = mf.getValue(PipeGen.WORD, split[1]);
                    if (wd < 0 || wd >= word2path.length) {
                        // No row for this word id (presumably the word could not be registered).
                        continue;
                    }
                    // NOTE(review): ids are narrowed to short; assumes there are few enough
                    // distinct paths for the ids to fit - TODO confirm.
                    word2path[wd][0] = (short) mf.getValue(SPATH, shortPath(split[0], ls));
                    word2path[wd][1] = (short) mf.getValue(LPATH, split[0]);
                }
            } finally {
                // Close even when reading fails, so the file handle is not leaked.
                inputReader.close();
            }

            int fill = 0;
            for (short[] paths : word2path) {
                if (paths[0] != 0) {
                    fill++;
                }
            }
            Parser.out.println("filled " + fill + " of " + word2path.length);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Opens the cluster file as a buffered UTF-8 reader. */
    private static BufferedReader openReader(String clusterFile) throws IOException {
        return new BufferedReader(
                new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), READ_BUFFER_SIZE);
    }

    /** Returns {@code path} itself if it is shorter than {@code ls}, else its first {@code ls} characters. */
    private static String shortPath(String path, int ls) {
        return path.length() < ls ? path : path.substring(0, ls);
    }
}