/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */
package net.ontopia.topicmaps.classify;

import java.util.ArrayList;
import java.util.List;

/**
 * INTERNAL: Extracts stemmed terms from a tokenized document and runs
 * the configured document and term analyzers over them.
 */
public class DocumentClassifier {

  TermDatabase tdb;
  TermStemmerIF termStemmer;

  List<DocumentAnalyzerIF> docAnalyzers = new ArrayList<DocumentAnalyzerIF>();
  List<TermAnalyzerIF> termAnalyzers = new ArrayList<TermAnalyzerIF>();

  public DocumentClassifier(TermDatabase tdb) {
    this.tdb = tdb;
  }

  public TermDatabase getTermDatabase() {
    return tdb;
  }

  public void setTermDatabase(TermDatabase tdb) {
    this.tdb = tdb;
  }

  // --------------------------------------------------------------------------
  // configuration
  // --------------------------------------------------------------------------

  public void setTermStemmer(TermStemmerIF stemmer) {
    this.termStemmer = stemmer;
  }

  public void addDocumentAnalyzer(DocumentAnalyzerIF analyzer) {
    this.docAnalyzers.add(analyzer);
  }

  public void addTermAnalyzer(TermAnalyzerIF analyzer) {
    this.termAnalyzers.add(analyzer);
  }

  // --------------------------------------------------------------------------
  // term extraction
  // --------------------------------------------------------------------------

  protected void extractTerms(Document doc) {
    // turn text blocks into lists of terms
    extractTerms(doc.getRoot());
  }

  protected void extractTerms(Region region) {
    // loop over region's children
    for (Object child : region.getChildren()) {
      if (child instanceof TextBlock) {
        TextBlock tb = (TextBlock)child;
        extractTerms(region, tb);
      } else {
        Region tr = (Region)child;
        extractTerms(tr);
      }
    }
  }

  protected void extractTerms(Region parent, TextBlock tb) {
    for (Token token : tb.getTokens()) {
      if (token.getType() == Token.TYPE_VARIANT) {
        Variant variant = (Variant)token;
        Term term = variant.getTerm();
        if (term == null) {
          String normalized = token.getValue();
          String stem = termStemmer.stem(normalized);
          term = tdb.createTerm(stem);
          variant.setTerm(term);
        }
        term.addVariant(variant);
      }
    }
  }

  // --------------------------------------------------------------------------
  // document analysis
  // --------------------------------------------------------------------------

  public void analyzeDocument(Document doc) {
    // turn text blocks into lists of terms
    extractTerms(doc);

    // do document analysis
    if (docAnalyzers != null && !docAnalyzers.isEmpty()) {
      Region root = doc.getRoot();
      for (DocumentAnalyzerIF analyzer : docAnalyzers) {
        analyzer.startAnalysis();
        try {
          while (analyzer.doDocumentAnalysis()) {
            analyzer.startDocument(doc);
            analyzeRegion(root, analyzer);
            analyzer.endDocument(doc);
          }
        } finally {
          analyzer.endAnalysis();
        }
      }
    }
  }

  protected void analyzeRegion(Region region, DocumentAnalyzerIF analyzer) {
    analyzer.startRegion(region);
    // loop over region's children
    for (Object child : region.getChildren()) {
      if (child instanceof TextBlock) {
        TextBlock tb = (TextBlock)child;
        analyzeTextBlock(region, tb, analyzer);
      } else {
        Region tr = (Region)child;
        analyzeRegion(tr, analyzer);
      }
    }
    analyzer.endRegion(region);
  }

  protected void analyzeTextBlock(Region parent, TextBlock tb, DocumentAnalyzerIF analyzer) {
    // loop over terms in text block
    List<Token> tokens = tb.getTokens();
    int size = tokens.size();
    for (int i=0; i < size; i++) {
      Token t = tokens.get(i);
      analyzer.analyzeToken(tb, t, i);
    }
  }

  // --------------------------------------------------------------------------
  // term analysis
  // --------------------------------------------------------------------------

  public void analyzeTerms() {
    if (termAnalyzers != null && !termAnalyzers.isEmpty()) {
      for (TermAnalyzerIF analyzer : termAnalyzers) {
        analyzer.startAnalysis(tdb);
        try {
          Term[] terms = tdb.getTerms().toArray(new Term[] {}); // create array to avoid CME
          for (int x=0; x < terms.length; x++)
            analyzer.analyzeTerm(terms[x]);
        } finally {
          analyzer.endAnalysis();
        }
      }
    }
  }

  // --------------------------------------------------------------------------
  // debug
  // --------------------------------------------------------------------------

  public void dump() {
    for (TermAnalyzerIF ta : termAnalyzers) {
      if (ta instanceof CompoundAnalyzer) {
        CompoundAnalyzer ca = (CompoundAnalyzer)ta;
        Term[] terms = tdb.getTermsByRank();
        for (int i=0; i < terms.length; i++) {
          ca.dump(terms[i]);
        }
      }
    }
  }

}
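
// A minimal usage sketch of this class, kept in comment form since it does not
// belong to the class itself. The variables "tdb", "doc", "stemmer",
// "docAnalyzer" and "termAnalyzer" are assumptions: a populated TermDatabase,
// a Document whose text blocks have already been tokenized, and concrete
// TermStemmerIF, DocumentAnalyzerIF and TermAnalyzerIF implementations
// obtained elsewhere. Only the calls on DocumentClassifier come from this file.
//
//   DocumentClassifier classifier = new DocumentClassifier(tdb);
//   classifier.setTermStemmer(stemmer);
//   classifier.addDocumentAnalyzer(docAnalyzer);
//   classifier.addTermAnalyzer(termAnalyzer);
//   classifier.analyzeDocument(doc);  // extracts terms, then runs document analyzers
//   classifier.analyzeTerms();        // runs term analyzers over the populated term database
//
// analyzeDocument(doc) must run before analyzeTerms(), since term extraction is
// what fills the TermDatabase that the term analyzers iterate over.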