/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.topicmaps.classify;

import java.util.ArrayList;
import java.util.List;

/**
 * INTERNAL: Tokenizes the text blocks of a document into terms. Each
 * raw token is stripped of leading and trailing delimiter characters
 * and run through the registered term normalizers before being stored
 * in the term database.
 */
public class DocumentTokenizer {

  TermDatabase tdb;
  TokenizerIF tokenizer;
  DelimiterTrimmerIF delimiterTrimmer;
  List<TermNormalizerIF> termNormalizers = new ArrayList<TermNormalizerIF>();

  public DocumentTokenizer(TermDatabase tdb) {
    this.tdb = tdb;
  }

  public TermDatabase getTermDatabase() {
    return tdb;
  }

  public void setTermDatabase(TermDatabase tdb) {
    this.tdb = tdb;
  }

  // --------------------------------------------------------------------------
  // configuration
  // --------------------------------------------------------------------------

  public void setTokenizer(TokenizerIF tokenizer) {
    this.tokenizer = tokenizer;
  }

  public void setDelimiterTrimmer(DelimiterTrimmerIF trimmer) {
    this.delimiterTrimmer = trimmer;
  }

  public void addTermNormalizer(TermNormalizerIF normalizer) {
    this.termNormalizers.add(normalizer);
  }
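
  // The tokenization pipeline runs in three steps per token:
  //  1. the TokenizerIF splits each text block into raw tokens,
  //  2. the DelimiterTrimmerIF splits off leading and trailing delimiter
  //     characters (punctuation, sentence boundaries) from the token core,
  //  3. the TermNormalizerIFs rewrite the core in registration order
  //     (stemming, junk filtering, synonyms etc.); a null result marks
  //     the token as junk.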
  // --------------------------------------------------------------------------
  // document tokenization
  // --------------------------------------------------------------------------

  public void tokenize(Document doc) {
    // turn text blocks into lists of tokens
    tokenize(doc.getRoot());
    doc.setTokenized(true);
  }

  protected void tokenize(Region region) {
    // loop over the region's children
    for (Object child : region.getChildren()) {
      if (child instanceof TextBlock) {
        TextBlock tb = (TextBlock) child;
        tokenize(region, tb);
      } else {
        Region tr = (Region) child;
        tokenize(tr);
      }
    }
  }

  protected void tokenize(Region parent, TextBlock tb) {
    String text = tb.getText();

    // tokenize
    tokenizer.setText(text);
    while (tokenizer.next()) {
      // trim delimiters, normalize (stemming, junk filter, synonyms
      // etc.) and register the token
      tokenize(tb, tokenizer.getToken());
    }
  }

  protected void tokenize(TextBlock tb, String token) {
    if (token == null) {
      return;
    }

    // detect sentence boundaries; extract leading/trailing delimiters
    String delimiterBefore = null;
    String delimiterAfter = null;
    int six = delimiterTrimmer.trimStart(token);
    int eix = delimiterTrimmer.trimEnd(token);
    if (six > 0 && eix > six && eix < token.length() - 1) {
      // delimiters on both sides
      delimiterBefore = token.substring(0, six);
      delimiterAfter = token.substring(eix + 1);
      token = token.substring(six, eix + 1);
    } else if (six > 0) {
      // leading delimiter only
      delimiterBefore = token.substring(0, six);
      token = token.substring(six);
    } else if (eix < token.length() - 1) {
      // trailing delimiter only
      delimiterAfter = token.substring(eix + 1);
      token = token.substring(0, eix + 1);
    }

    // normalize token
    String normalized = token;
    if (termNormalizers != null && !termNormalizers.isEmpty()) {
      int size = termNormalizers.size();
      for (int i = 0; i < size; i++) {
        TermNormalizerIF normalizer = termNormalizers.get(i);
        normalized = normalizer.normalize(normalized);
        if (normalized == null) {
          break;
        }
      }
    }

    // create token object
    Token t;
    if (normalized == null) {
      // found junk; register the trimmed token as a delimiter
      t = tdb.createDelimiter(token);
    } else {
      // found variant
      t = tdb.createVariant(normalized);
    }

    // add before delimiter
    if (delimiterBefore != null) {
      tb.addToken(tdb.createDelimiter(delimiterBefore));
    }

    // add token to text block
    tb.addToken(t);

    // add after delimiter
    if (delimiterAfter != null) {
      tb.addToken(tdb.createDelimiter(delimiterAfter));
    }
  }

}
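
// --------------------------------------------------------------------------
// Illustrative wiring sketch, not part of the original class. The anonymous
// TokenizerIF, DelimiterTrimmerIF and TermNormalizerIF implementations below
// are hypothetical stand-ins written only against the methods that
// DocumentTokenizer itself calls; the real pipeline wires in the project's
// own implementations, and how a Document instance is obtained is
// format-specific and omitted here.
// --------------------------------------------------------------------------

class DocumentTokenizerSketch {

  static DocumentTokenizer configure(TermDatabase tdb) {
    DocumentTokenizer dt = new DocumentTokenizer(tdb);

    // hypothetical tokenizer: splits text on runs of whitespace
    dt.setTokenizer(new TokenizerIF() {
      private String[] tokens = new String[0];
      private int ix = -1;
      public void setText(String text) {
        tokens = text.split("\\s+");
        ix = -1;
      }
      public boolean next() {
        return ++ix < tokens.length;
      }
      public String getToken() {
        return tokens[ix];
      }
    });

    // hypothetical trimmer: treats anything that is not a letter or a
    // digit as a delimiter character
    dt.setDelimiterTrimmer(new DelimiterTrimmerIF() {
      public int trimStart(String token) {
        int i = 0;
        while (i < token.length() && !Character.isLetterOrDigit(token.charAt(i))) i++;
        return i; // index of the first non-delimiter character
      }
      public int trimEnd(String token) {
        int i = token.length() - 1;
        while (i >= 0 && !Character.isLetterOrDigit(token.charAt(i))) i--;
        return i; // index of the last non-delimiter character
      }
    });

    // hypothetical normalizer: lower-cases the term; returning null
    // would flag the term as junk
    dt.addTermNormalizer(new TermNormalizerIF() {
      public String normalize(String term) {
        String lower = term.toLowerCase();
        return lower.isEmpty() ? null : lower;
      }
    });

    // dt.tokenize(doc) would now turn each TextBlock of doc into a list
    // of Token objects registered in tdb
    return dt;
  }

}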