/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */
package net.ontopia.topicmaps.classify;

import java.util.ArrayList;
import java.util.List;

/**
 * INTERNAL: Extracts stemmed terms from a tokenized document and runs
 * the configured document and term analyzers over them.
 */
public class DocumentClassifier {

  TermDatabase tdb;
  TermStemmerIF termStemmer;

  List<DocumentAnalyzerIF> docAnalyzers = new ArrayList<DocumentAnalyzerIF>();
  List<TermAnalyzerIF> termAnalyzers = new ArrayList<TermAnalyzerIF>();

  public DocumentClassifier(TermDatabase tdb) {
    this.tdb = tdb;
  }

  public TermDatabase getTermDatabase() {
    return tdb;
  }

  public void setTermDatabase(TermDatabase tdb) {
    this.tdb = tdb;
  }

  // --------------------------------------------------------------------------
  // configuration
  // --------------------------------------------------------------------------

  public void setTermStemmer(TermStemmerIF stemmer) {
    this.termStemmer = stemmer;
  }

  public void addDocumentAnalyzer(DocumentAnalyzerIF analyzer) {
    this.docAnalyzers.add(analyzer);
  }

  public void addTermAnalyzer(TermAnalyzerIF analyzer) {
    this.termAnalyzers.add(analyzer);
  }

  // --------------------------------------------------------------------------
  // term extraction
  // --------------------------------------------------------------------------

  protected void extractTerms(Document doc) {
    // turn text blocks into lists of terms
    extractTerms(doc.getRoot());
  }

  protected void extractTerms(Region region) {
    // loop over region's children
    for (Object child : region.getChildren()) {
      if (child instanceof TextBlock) {
        TextBlock tb = (TextBlock)child;
        extractTerms(region, tb);
      } else {
        Region tr = (Region)child;
        extractTerms(tr);
      }
    }
  }

  protected void extractTerms(Region parent, TextBlock tb) {
    for (Token token : tb.getTokens()) {
      if (token.getType() == Token.TYPE_VARIANT) {
        Variant variant = (Variant)token;
        Term term = variant.getTerm();
        if (term == null) {
          String normalized = token.getValue();
          String stem = termStemmer.stem(normalized);
          term = tdb.createTerm(stem);
          variant.setTerm(term);
        }
        term.addVariant(variant);
      }
    }
  }

  // --------------------------------------------------------------------------
  // document analysis
  // --------------------------------------------------------------------------

  public void analyzeDocument(Document doc) {
    // turn text blocks into lists of terms
    extractTerms(doc);

    // do document analysis
    if (docAnalyzers != null && !docAnalyzers.isEmpty()) {
      Region root = doc.getRoot();
      for (DocumentAnalyzerIF analyzer : docAnalyzers) {
        analyzer.startAnalysis();
        try {
          while (analyzer.doDocumentAnalysis()) {
            analyzer.startDocument(doc);
            analyzeRegion(root, analyzer);
            analyzer.endDocument(doc);
          }
        } finally {
          analyzer.endAnalysis();
        }
      }
    }
  }

  protected void analyzeRegion(Region region, DocumentAnalyzerIF analyzer) {
    analyzer.startRegion(region);
    // loop over region's children
    for (Object child : region.getChildren()) {
      if (child instanceof TextBlock) {
        TextBlock tb = (TextBlock)child;
        analyzeTextBlock(region, tb, analyzer);
      } else {
        Region tr = (Region)child;
        analyzeRegion(tr, analyzer);
      }
    }
    analyzer.endRegion(region);
  }

  protected void analyzeTextBlock(Region parent, TextBlock tb, DocumentAnalyzerIF analyzer) {
    // loop over terms in text block
    List<Token> tokens = tb.getTokens();
    int size = tokens.size();
    for (int i=0; i < size; i++) {
      Token t = tokens.get(i);
      analyzer.analyzeToken(tb, t, i);
    }
  }

  // --------------------------------------------------------------------------
  // term analysis
  // --------------------------------------------------------------------------

  public void analyzeTerms() {
    if (termAnalyzers != null && !termAnalyzers.isEmpty()) {
      for (TermAnalyzerIF analyzer : termAnalyzers) {
        analyzer.startAnalysis(tdb);
        try {
          Term[] terms = tdb.getTerms().toArray(new Term[] {}); // create array to avoid CME
          for (int x=0; x < terms.length; x++)
            analyzer.analyzeTerm(terms[x]);
        } finally {
          analyzer.endAnalysis();
        }
      }
    }
  }

  // --------------------------------------------------------------------------
  // debug
  // --------------------------------------------------------------------------

  public void dump() {
    for (TermAnalyzerIF ta : termAnalyzers) {
      if (ta instanceof CompoundAnalyzer) {
        CompoundAnalyzer ca = (CompoundAnalyzer)ta;
        Term[] terms = tdb.getTermsByRank();
        for (int i=0; i < terms.length; i++) {
          ca.dump(terms[i]);
        }
      }
    }
  }

}
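
// A minimal usage sketch of this class, kept in comment form since it does not
// belong to the class itself. The variables "tdb", "doc", "stemmer",
// "docAnalyzer" and "termAnalyzer" are assumptions: a populated TermDatabase,
// a Document whose text blocks have already been tokenized, and concrete
// TermStemmerIF, DocumentAnalyzerIF and TermAnalyzerIF implementations
// obtained elsewhere. Only the calls on DocumentClassifier come from this file.
//
//   DocumentClassifier classifier = new DocumentClassifier(tdb);
//   classifier.setTermStemmer(stemmer);
//   classifier.addDocumentAnalyzer(docAnalyzer);
//   classifier.addTermAnalyzer(termAnalyzer);
//   classifier.analyzeDocument(doc);  // extracts terms, then runs document analyzers
//   classifier.analyzeTerms();        // runs term analyzers over the populated term database
//
// analyzeDocument(doc) must run before analyzeTerms(), since term extraction is
// what fills the TermDatabase that the term analyzers iterate over.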