/* * Copyright 2007 T-Rank AS * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package no.trank.openpipe.lemmatizer.model; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.Iterator; import java.util.concurrent.TimeUnit; import it.unimi.dsi.io.InputBitStream; import it.unimi.dsi.io.OutputBitStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import no.trank.openpipe.lemmatizer.util.TernarySearchTree; import no.trank.openpipe.lemmatizer.util.TreeValue; import no.trank.openpipe.lemmatizer.util.TreeValueFactory; import no.trank.openpipe.util.Iterators; import no.trank.openpipe.util.log.DefaultTimedLogger; import no.trank.openpipe.util.log.TimedLogger; /** * @version $Revision$ */ public class LemmatizeModel { private static final Logger log = LoggerFactory.getLogger(LemmatizeModel.class); private final TimedLogger tlA = createTimedLogger("Added %1$d (%3$d) lemmas at %2$.2f (%4$.2f) micros/lemma"); private final TimedLogger tlC = createTimedLogger("Created %1$d (%3$d) kb at %2$.2f (%4$.2f) kb/sec", TimeUnit.SECONDS, DefaultTimedLogger.Calculator.UNIT_PER_TIME); private final TimedLogger tlG = createTimedLogger("Got %1$d (%3$d) lemmas at %2$.2f (%4$.2f) micros/lemma"); private final TernarySearchTree<Lemmas> lemmas; private final LemmasFactory factory; private int rest = 0; private static TimedLogger createTimedLogger(String format) { return createTimedLogger(format, TimeUnit.MICROSECONDS, DefaultTimedLogger.Calculator.TIME_PER_UNIT); } private static TimedLogger createTimedLogger(String format, TimeUnit unit, DefaultTimedLogger.Calculator calculator) { final DefaultTimedLogger logger = new DefaultTimedLogger(log, format, unit, calculator); logger.setLogPeriodInSeconds(60); return logger; } public LemmatizeModel() { factory = new LemmasFactory(); lemmas = new TernarySearchTree<Lemmas>(factory); } public void add(CharSequence term, Iterable<LemmaSuffix> suffixes) { try { tlC.startTimer(); final byte[] data = LemmaDeSerializer.createLemmasData(suffixes); final int len = data.length + rest; tlC.stopTimerAndIncrement(len / 1024); rest = len % 1024; tlA.startTimer(); lemmas.put(term, factory.newValue(data)); tlA.stopTimerAndIncrement(); } catch (IOException e) { throw new RuntimeException(e); } } public Iterator<String> get(CharSequence term) { tlG.startTimer(); try { final Lemmas lemmas = this.lemmas.get(term); if (lemmas != null) { return lemmas.getLemmas(term); } return Iterators.emptyIterator(); } finally { tlG.stopTimerAndIncrement(); } } public void log() { tlA.log(); tlC.log(); tlG.log(); } public void write(OutputStream out) throws IOException { lemmas.write(out); } public void read(InputStream in) throws IOException { lemmas.read(in); reset(); } public void reset() { tlA.reset(); tlC.reset(); tlG.reset(); } private static class LemmasFactory implements TreeValueFactory<Lemmas> { private int zetaK = 2; @Override public Lemmas newValue() { return new Lemmas(this); } public Lemmas newValue(byte[] data) { return new Lemmas(data, this); } @Override public void writeHeader(OutputBitStream out) throws IOException { out.writeNibble(zetaK); } @Override public void readHeader(InputBitStream in) throws IOException { zetaK = in.readNibble(); } @Override public long getSerialVersionUID() { return 16092084964425728L; } public int getZetaK() { return zetaK; } } public static class Lemmas implements TreeValue { private final LemmasFactory factory; private byte[] data; private Lemmas(LemmasFactory factory) { this.factory = factory; } private Lemmas(byte[] data, LemmasFactory factory) { this.data = data; this.factory = factory; } public Iterator<String> getLemmas(final CharSequence lemma) { try { return LemmaDeSerializer.createIterator(lemma, data); } catch (IOException e) { throw new RuntimeException(e); } } @Override public void write(OutputBitStream out) throws IOException { out.writeZeta(data.length - 1, factory.getZetaK()); out.write(data, data.length * Byte.SIZE); } @Override public void read(InputBitStream in) throws IOException { data = new byte[in.readZeta(factory.getZetaK()) + 1]; in.read(data, data.length * Byte.SIZE); } } }