// WordTokenFilter.java example
//
// Explorer
// fudannlp-master
package org.fnlp.app.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import edu.fudan.nlp.cn.CNFactory;
import edu.fudan.nlp.cn.tag.CWSTagger;
import edu.fudan.nlp.cn.tag.POSTagger;

/**
 * A {@link TokenFilter} that consumes sentence tokens from the wrapped stream and
 * re-emits each sentence as a sequence of word tokens.
 *
 * <p>For every incoming token the term text is passed to {@code CNFactory.seg} and the
 * resulting words to {@code CNFactory.tag}. Each emitted token carries the word text,
 * its tag (via {@link POSAttribute}), character offsets and the type {@code "word"}.
 *
 * <p>NOTE(review): {@code CNFactory.getInstance()} is called without arguments here;
 * presumably the segmentation/tagging models must already be loaded by the caller -
 * confirm against CNFactory.
 */
public final class WordTokenFilter extends TokenFilter {

	/** Shared empty result used when the segmenter or tagger yields nothing. */
	private static final String[] NO_STRINGS = new String[0];

	/** Remaining words of the sentence currently being emitted; null before the first sentence. */
	private Iterator<String> tokenIter;

	/** Tags parallel to {@link #tokenIter}; may run out early if the tagger returned fewer tags. */
	private Iterator<String> posIter;

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final POSAttribute posAtt = addAttribute(POSAttribute.class);

	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

	/** Start offset of the current sentence token as reported by the upstream stream. */
	private int tokStart;
	/** End offset (exclusive) of the current sentence token as reported by the upstream stream. */
	private int tokEnd;
	/** True when the upstream offsets do not span exactly the term text (length changed upstream). */
	private boolean hasIllegalOffsets;
	/** Number of sentence characters already consumed by the words emitted so far. */
	private int idx = 0;

	// Kept package-private and non-final so existing same-package code that touches it still compiles.
	CNFactory factory;

	/**
	 * Construct a new WordTokenFilter.
	 *
	 * @param in {@link TokenStream} of sentences
	 */
	public WordTokenFilter(TokenStream in) {
		super(in);
		factory = CNFactory.getInstance();
	}

	/**
	 * Advances to the next word token.
	 *
	 * <p>When the current sentence is exhausted, sentences are pulled from the wrapped
	 * stream until one produces at least one word; a sentence that segments to zero
	 * words is skipped rather than treated as end of stream.
	 *
	 * @return {@code true} if a word token was produced, {@code false} once the wrapped
	 *         stream has no more sentences
	 * @throws IOException if the wrapped stream fails
	 */
	@Override
	public boolean incrementToken() throws IOException {
		while (tokenIter == null || !tokenIter.hasNext()) {
			if (!input.incrementToken()) {
				return false; // no more sentences, end of stream
			}
			loadSentence();
		}

		// This filter creates new tokens, so the sentence's attribute state must be cleared first.
		clearAttributes();

		String nextWord = tokenIter.next();
		termAtt.append(nextWord);
		// If the tagger returned fewer tags than words, leave the POS attribute in its cleared state.
		if (posIter.hasNext()) {
			posAtt.setPartOfSpeech(posIter.next());
		}

		int end = idx + nextWord.length();
		if (hasIllegalOffsets) {
			// Upstream changed the text length: word positions cannot be mapped back, reuse the sentence span.
			offsetAtt.setOffset(tokStart, tokEnd);
		} else {
			// Offsets are relative to the original input, so shift by the sentence start; the end
			// offset is exclusive. Clamp to the sentence span in case the segmenter altered the text.
			offsetAtt.setOffset(Math.min(tokStart + idx, tokEnd), Math.min(tokStart + end, tokEnd));
		}
		idx = end;
		typeAtt.setType("word");
		return true;
	}

	/**
	 * Captures offsets of the sentence token currently held by the attributes, then
	 * segments and tags its text, replacing the word and tag iterators.
	 */
	private void loadSentence() {
		tokStart = offsetAtt.startOffset();
		tokEnd = offsetAtt.endOffset();
		// If the length implied by start/end offsets does not match the term text, assume the
		// text was rewritten upstream (e.g. a synonym) and do not compute per-word offsets.
		hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;

		String[] words = factory.seg(termAtt.toString());
		if (words == null) {
			words = NO_STRINGS;
		}
		// Skip tagging when there is nothing to tag.
		String[] tags = words.length == 0 ? NO_STRINGS : factory.tag(words);
		if (tags == null) {
			tags = NO_STRINGS;
		}

		tokenIter = Arrays.asList(words).iterator();
		posIter = Arrays.asList(tags).iterator();
		idx = 0;
	}

	/**
	 * Resets the wrapped stream and discards all per-sentence state so the filter can be reused.
	 *
	 * @throws IOException if the wrapped stream fails to reset
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		tokenIter = null;
		posIter = null;
		idx = 0;
		tokStart = 0;
		tokEnd = 0;
		hasIllegalOffsets = false;
	}
}