TopicMapClassification.java example

Explorer
ontopia-master
/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.topicmaps.classify;

import java.util.Collection;
import net.ontopia.topicmaps.core.TopicIF;
import net.ontopia.topicmaps.core.TopicMapIF;

/**
 * INTERNAL: 
 */
public class TopicMapClassification {
  TermDatabase tdb;
  TopicMapAnalyzer ta;
  TermAnalyzerIF customTermAnalyzer;
  
  public TopicMapClassification() {
    this.tdb = new TermDatabase();    
  }
  
  public TopicMapClassification(TopicMapIF topicmap) {
    this.tdb = new TermDatabase();    
    this.ta = new TopicMapAnalyzer(topicmap);
  }

  public void setCustomTermAnalyzer(TermAnalyzerIF customTermAnalyzer) {
    this.customTermAnalyzer = customTermAnalyzer;
  }
  
  public void classify(ClassifiableContentIF cc) {
    // detect document format and read document
    Document doc = new Document();
    new FormatModule().readContent(cc, doc);

    // tokenize document
    DocumentTokenizer dt = new DocumentTokenizer(tdb);
    dt.setTokenizer(new DefaultTokenizer());
    
    SpecialCharNormalizer specialChars = new SpecialCharNormalizer();
    dt.setDelimiterTrimmer(specialChars);
    
    dt.addTermNormalizer(new JunkNormalizer());
    dt.addTermNormalizer(specialChars);

    dt.tokenize(doc);

    // detect language
    Language language = Language.detectLanguage(doc);
    
    // set up document classifier and term database
    DocumentClassifier dc = new DocumentClassifier(tdb);   
    
    TermStemmerIF stemmer = language.getStemmer();
    dc.setTermStemmer(stemmer);
    
    dc.addDocumentAnalyzer(new DistanceAnalyzer());
    CompoundAnalyzer ca = new CompoundAnalyzer();
    ca.setTermStemmer(stemmer);
    dc.addDocumentAnalyzer(ca);
    
    RegionBooster rb = new RegionBooster();
    rb.addBoost("title", 1.15d);
    //! rb.addBoost("abstract", 1.05d);
    //! rb.addBoost("keyword", 1.10d);
    //! rb.addBoost("para", 1.01d);
    //! dc.addDocumentAnalyzer(rb);
    
    dc.addTermAnalyzer(CharacterAnalyzer.getInstance());
    dc.addTermAnalyzer(language.getFrequencyAnalyzer());
    dc.addTermAnalyzer(new RegexpTermAnalyzer());

    // FIXME: wrap and hand over to compound analyzer instead?
    dc.addTermAnalyzer(language.getStopListAnalyzer());
    if (customTermAnalyzer != null) dc.addTermAnalyzer(customTermAnalyzer); // blacklist

    dc.addTermAnalyzer(ca);

    // run stop list analyzer after compounds have been made    
    dc.addTermAnalyzer(language.getStopListAnalyzer());
    if (customTermAnalyzer != null) dc.addTermAnalyzer(customTermAnalyzer); // blacklist
    
    if (ta != null) dc.addTermAnalyzer(ta);
    dc.addTermAnalyzer(new RelativeScore());
    
    // analyze document
    dc.analyzeDocument(doc);
    dc.analyzeTerms();    
  }
  
  public TermDatabase getTermDatabase() {
    return tdb;
  }

  /**
   * INTERNAL: Returns the topics that matches the given variant.
   */
  public Collection<TopicIF> getTopics(Variant variant) {
    return ta.getTopics(variant);
  }

  public Collection<TopicIF> getCandidateTypes() {
    return ta.getCandidateTypes();
  }

  public Collection<TopicMapAnalyzer.AssociationType> getAssociationTypes() {
    return ta.getAssociationTypes();
  }
  
}