/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.lemmatizer.model;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import java.io.IOException;
import java.util.*;
/**
* @version $Revision$
*/
public class LemmaDeSerializer {
private static final int INT_ZK = 1;
private static final int CHAR_ZK = 2;
private static final char CHAR_OFFSET = 'a';
protected static byte[] createLemmasData(Iterable<LemmaSuffix> suffixes) throws IOException {
final IntList cuts = new IntArrayList();
final IntList lens = new IntArrayList();
final List<CharSequence> suffs = new ArrayList<CharSequence>();
for (final LemmaSuffix suf : suffixes) {
cuts.add(suf.getCut());
final CharSequence suffix = suf.getSuffix();
lens.add(suffix.length());
suffs.add(suffix);
}
final byte[] buf = new byte[4096];
final OutputBitStream out = new OutputBitStream(buf);
out.writeZeta(cuts.size() - 1, INT_ZK);
writeInts(out, lens);
for (CharSequence suffix : suffs) {
writeSuffix(out, suffix);
}
writeInts(out, cuts);
out.flush();
return Arrays.copyOfRange(buf, 0, (int) (out.writtenBits() / Byte.SIZE));
}
private static void writeSuffix(OutputBitStream out, CharSequence suffix) throws IOException {
for (int i = 0; i < suffix.length(); i++) {
out.writeZeta(suffix.charAt(i) - CHAR_OFFSET, CHAR_ZK);
}
}
private static void writeInts(OutputBitStream out, IntList lens) throws IOException {
for (IntIterator it = lens.iterator(); it.hasNext();) {
out.writeZeta(it.nextInt(), INT_ZK);
}
}
public static Iterator<String> createIterator(CharSequence lemma, byte[] data) throws IOException {
return new LemmaIterator(lemma, data);
}
private static class LemmaIterator implements Iterator<String> {
private final CharSequence lemma;
private final int count;
private final int[] cuts;
private final char[][] suffixes;
private int idx = 0;
public LemmaIterator(CharSequence lemma, byte[] data) throws IOException {
this.lemma = lemma;
final InputBitStream in = new InputBitStream(data);
count = in.readZeta(INT_ZK) + 1;
cuts = new int[count];
in.readZetas(INT_ZK, cuts, count);
suffixes = new char[count][];
for (int j = 0; j < cuts.length; j++) {
int len = cuts[j];
final char[] suf = new char[len];
suffixes[j] = suf;
for (int i = 0; i < len; i++) {
suf[i] = (char) (in.readZeta(CHAR_ZK) + CHAR_OFFSET);
}
}
in.readZetas(INT_ZK, cuts, count);
}
@Override
public boolean hasNext() {
return idx < count;
}
@Override
public String next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
final MutableString l = new MutableString(lemma.length() - cuts[idx] + suffixes[idx].length);
l.append(lemma, 0, lemma.length() - cuts[idx]);
l.append(suffixes[idx++]);
return l.toString();
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
}