/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.analysis; import java.io.*; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.*; /** The tokenizer used for Nutch document text. Implemented in terms of our * JavaCC-generated lexical analyzer, {@link NutchAnalysisTokenManager}, shared * with the query parser. */ public final class NutchDocumentTokenizer extends Tokenizer implements NutchAnalysisConstants { private final NutchAnalysisTokenManager tokenManager; private final TermAttribute termAtt; private final PositionIncrementAttribute posIncrAtt; private final TypeAttribute typeAtt; private final OffsetAttribute offsetAtt; /** Construct a tokenizer for the text in a Reader. */ public NutchDocumentTokenizer(Reader reader) { super(reader); tokenManager = new NutchAnalysisTokenManager(reader); this.termAtt = addAttribute(TermAttribute.class); this.offsetAtt = addAttribute(OffsetAttribute.class); this.posIncrAtt = addAttribute(PositionIncrementAttribute.class); this.typeAtt = addAttribute(TypeAttribute.class); } /** Returns the next token in the stream, or null at EOF. */ private final Token next() throws IOException { org.apache.nutch.analysis.Token t; try { loop: { while (true) { t = tokenManager.getNextToken(); switch (t.kind) { // skip query syntax tokens case EOF: case WORD: case ACRONYM: case SIGRAM: break loop; default: } } } } catch (TokenMgrError e) { // translate exceptions throw new IOException("Tokenizer error:" + e); } if (t.kind == EOF) // translate tokens return null; else { return new Token(t.image,t.beginColumn,t.endColumn,tokenImage[t.kind]); } } /** Lucene 3.0 API. */ public boolean incrementToken() throws IOException { clearAttributes(); final Token t = next(); if (t != null) { termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength()); offsetAtt.setOffset(t.startOffset(), t.endOffset()); posIncrAtt.setPositionIncrement(t.getPositionIncrement()); typeAtt.setType(t.type()); return true; } else { return false; } } /** For debugging. */ public static void main(String[] args) throws Exception { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); while (true) { System.out.print("Text: "); String line = in.readLine(); Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line)); TermAttribute termAtt = tokenizer.getAttribute(TermAttribute.class); System.out.print("Tokens: "); while (tokenizer.incrementToken()) { System.out.print(termAtt.term()); System.out.print(" "); } System.out.println(); } } }