package org.fnlp.app.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import edu.fudan.nlp.cn.CNFactory;
import edu.fudan.nlp.cn.tag.CWSTagger;
import edu.fudan.nlp.cn.tag.POSTagger;

/**
 * A {@link TokenFilter} that splits each incoming token (treated as one sentence)
 * into words using {@link CNFactory#seg}, and attaches the part-of-speech tag
 * returned by {@link CNFactory#tag} to every emitted word via {@link POSAttribute}.
 *
 * <p>Every emitted token has type {@code "word"}.
 */
public final class WordTokenFilter extends TokenFilter {

  /** Iterator over the words of the sentence currently being emitted. */
  private Iterator<String> tokenIter;
  private List<String> tokenBuffer;

  /** Iterator over the POS tags, advanced in lock-step with {@link #tokenIter}. */
  private Iterator<String> posIter;
  private List<String> posBuffer;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final POSAttribute posAtt = addAttribute(POSAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  /** Start offset of the current sentence as reported by the upstream stream. */
  private int tokStart;
  /** End offset of the current sentence as reported by the upstream stream. */
  private int tokEnd;
  /** True when the sentence length does not match its offsets (length changed before this filter). */
  private boolean hasIllegalOffsets;
  /** Character position, relative to the start of the current sentence, of the next word. */
  private int idx = 0;

  CNFactory factory;

  /**
   * Construct a new WordTokenFilter.
   *
   * @param in {@link TokenStream} of sentences
   */
  public WordTokenFilter(TokenStream in) {
    super(in);
    factory = CNFactory.getInstance();
  }

  /**
   * Emits the next word of the current sentence, pulling and segmenting further
   * sentences from the input as needed.
   *
   * @return {@code true} if a word token was produced, {@code false} once the
   *         input is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    // Keep pulling sentences until one yields at least one word. A sentence with
    // no words is skipped instead of being reported as end-of-stream, which would
    // silently drop every sentence that follows it.
    while (tokenIter == null || !tokenIter.hasNext()) {
      if (!input.incrementToken()) {
        return false; // no more sentences, end of stream
      }
      loadSentence();
    }

    // WordTokenFilter must clear attributes, as it is creating new tokens.
    clearAttributes();

    String nextWord = tokenIter.next();
    termAtt.append(nextWord);

    // If the tagger produced fewer tags than words, leave the POS attribute in its
    // cleared state rather than failing with NoSuchElementException.
    if (posIter.hasNext()) {
      posAtt.setPartOfSpeech(posIter.next());
    }

    int end = idx + nextWord.length();
    if (hasIllegalOffsets) {
      // Length changed upstream: word positions cannot be mapped back to the
      // original text, so every word reports the whole sentence's offsets.
      offsetAtt.setOffset(tokStart, tokEnd);
    } else {
      // Offsets are relative to the whole input, so shift by the sentence start;
      // the end offset is exclusive (one past the last character of the word).
      // NOTE(review): assumes the segmenter's words concatenate back to the
      // sentence text (no characters dropped or inserted) - confirm.
      offsetAtt.setOffset(tokStart + idx, tokStart + end);
    }
    idx = end;
    typeAtt.setType("word");
    return true;
  }

  /**
   * Segments and tags the sentence currently held in the shared attributes and
   * primes the word/POS iterators. A null result from the segmenter or tagger is
   * treated as an empty array.
   */
  private void loadSentence() {
    tokStart = offsetAtt.startOffset();
    tokEnd = offsetAtt.endOffset();
    // If length by start + end offsets doesn't match the term text then assume
    // this is a synonym and don't adjust the offsets.
    hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
    idx = 0;

    String[] words = factory.seg(termAtt.toString());
    if (words == null) {
      words = new String[0];
    }
    // Only tag when there is something to tag.
    String[] tags = (words.length == 0) ? null : factory.tag(words);
    if (tags == null) {
      tags = new String[0];
    }

    tokenBuffer = Arrays.asList(words);
    posBuffer = Arrays.asList(tags);
    tokenIter = tokenBuffer.iterator();
    posIter = posBuffer.iterator();
  }

  /**
   * Resets the wrapped stream and discards any words still buffered from a
   * previously processed sentence.
   *
   * @throws IOException if the wrapped stream fails to reset
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    tokenIter = null;
    posIter = null;
    tokenBuffer = null;
    posBuffer = null;
    idx = 0;
  }
}