/*
* #!
* Ontopia Classify
* #-
* Copyright (C) 2001 - 2013 The Ontopia Project
* #-
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* !#
*/
package net.ontopia.topicmaps.classify;
import java.util.ArrayList;
import java.util.List;
/**
 * INTERNAL: Splits the text blocks of a {@link Document} into tokens,
 * separating leading/trailing delimiter characters from each token and
 * running the configured term normalizers over the remaining term text.
 */
public class DocumentTokenizer {

  TermDatabase tdb;
  TokenizerIF tokenizer;
  DelimiterTrimmerIF delimiterTrimmer;
  List<TermNormalizerIF> termNormalizers = new ArrayList<>();

  /**
   * Creates a tokenizer whose token objects are registered in the given
   * term database.
   *
   * @param tdb the term database used to create delimiter and variant tokens
   */
  public DocumentTokenizer(TermDatabase tdb) {
    this.tdb = tdb;
  }

  /**
   * Returns the term database tokens are registered in.
   */
  public TermDatabase getTermDatabase() {
    return tdb;
  }

  /**
   * Replaces the term database tokens are registered in.
   */
  public void setTermDatabase(TermDatabase tdb) {
    this.tdb = tdb;
  }

  // --------------------------------------------------------------------------
  // configuration
  // --------------------------------------------------------------------------

  /**
   * Sets the tokenizer used to split text blocks into raw tokens.
   */
  public void setTokenizer(TokenizerIF tokenizer) {
    this.tokenizer = tokenizer;
  }

  /**
   * Sets the trimmer used to detect delimiter characters at the start and
   * end of each raw token.
   */
  public void setDelimiterTrimmer(DelimiterTrimmerIF trimmer) {
    this.delimiterTrimmer = trimmer;
  }

  /**
   * Appends a normalizer to the normalization chain (stemming, junk
   * filtering, synonym mapping etc.). Normalizers run in registration order.
   */
  public void addTermNormalizer(TermNormalizerIF normalizer) {
    this.termNormalizers.add(normalizer);
  }

  // --------------------------------------------------------------------------
  // document tokenization
  // --------------------------------------------------------------------------

  /**
   * Tokenizes all text blocks in the document and marks the document as
   * tokenized.
   *
   * @param doc the document to tokenize
   */
  public void tokenize(Document doc) {
    // turn text blocks into lists of tokens
    tokenize(doc.getRoot());
    doc.setTokenized(true);
  }

  /**
   * Recursively tokenizes every text block inside the given region.
   */
  protected void tokenize(Region region) {
    // a region's children are either nested regions or text blocks
    for (Object child : region.getChildren()) {
      if (child instanceof TextBlock) {
        TextBlock tb = (TextBlock) child;
        tokenize(region, tb);
      } else {
        Region tr = (Region) child;
        tokenize(tr);
      }
    }
  }

  /**
   * Splits the text of a single text block into raw tokens and processes
   * each one.
   */
  protected void tokenize(Region parent, TextBlock tb) {
    String text = tb.getText();
    // tokenize
    tokenizer.setText(text);
    while (tokenizer.next()) {
      // normalize term (stemming, junk filter, synonyms etc.)
      tokenize(tb, tokenizer.getToken());
    }
  }

  /**
   * Processes one raw token: strips leading/trailing delimiter characters,
   * runs the normalizer chain on the core term, and adds the resulting
   * delimiter/variant token objects to the text block in original order.
   *
   * @param tb the text block the resulting tokens are appended to
   * @param token the raw token text; ignored when null
   */
  protected void tokenize(TextBlock tb, String token) {
    if (token == null) {
      return;
    }
    // sentence boundaries; extract delimiters
    // six/eix are presumably the indexes of the first and last
    // non-delimiter characters — TODO confirm against DelimiterTrimmerIF
    String delimiterBefore = null;
    String delimiterAfter = null;
    int six = delimiterTrimmer.trimStart(token);
    int eix = delimiterTrimmer.trimEnd(token);
    if (six > 0 && eix > six && eix < token.length() - 1) {
      // delimiters on both sides of the core term
      delimiterBefore = token.substring(0, six);
      delimiterAfter = token.substring(eix + 1);
      token = token.substring(six, eix + 1);
    } else if (six > 0) {
      // leading delimiters only
      delimiterBefore = token.substring(0, six);
      token = token.substring(six);
    } else if (eix < token.length() - 1) {
      // trailing delimiters only
      delimiterAfter = token.substring(eix + 1);
      token = token.substring(0, eix + 1);
    }
    // normalize token; a normalizer returning null marks the term as junk
    // and short-circuits the rest of the chain
    String normalized = token;
    if (termNormalizers != null && !termNormalizers.isEmpty()) {
      for (TermNormalizerIF normalizer : termNormalizers) {
        normalized = normalizer.normalize(normalized);
        if (normalized == null) {
          break;
        }
      }
    }
    // create token object
    Token t;
    if (normalized == null) {
      // found junk; register the raw token text as a delimiter
      // (bug fix: previously passed the null `normalized` here)
      t = tdb.createDelimiter(token);
    } else {
      // found variant
      t = tdb.createVariant(normalized);
    }
    // add before delimiter
    if (delimiterBefore != null) {
      tb.addToken(tdb.createDelimiter(delimiterBefore));
    }
    // add token to text block
    tb.addToken(t);
    // add after delimiter
    if (delimiterAfter != null) {
      tb.addToken(tdb.createDelimiter(delimiterAfter));
    }
  }
}