/* * #! * Ontopia Classify * #- * Copyright (C) 2001 - 2013 The Ontopia Project * #- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * !# */ package net.ontopia.topicmaps.classify; import java.util.Map; import java.util.Arrays; import java.util.HashMap; import java.util.Collection; /** * PUBLIC: A collection of terms representing the result of * classifying a piece of content. The terms have scores indicating * their importance within the content, and variants, indicating * different spellings for the same term within the content. * * <p>Use SimpleClassifier to create TermDatabase objects. */ public class TermDatabase { protected Map<String, Term> terms; protected Map<String, Variant> variants; protected Map<String, Token> delimiter_terms; TermDatabase() { this.terms = new HashMap<String, Term>(); this.variants = new HashMap<String, Variant>(); this.delimiter_terms = new HashMap<String, Token>(); } /** * PUBLIC: Returns all terms found in the classified content. */ public Collection<Term> getTerms() { return terms.values(); } /** * PUBLIC: Returns all terms found in the classified content sorted * by score. */ public Term[] getTermsByRank() { Term[] ranked = terms.values().toArray(new Term[] {}); Arrays.sort(ranked, Term.SCORE_COMPARATOR); return ranked; } /** * PUBLIC: Returns the number of terms in the database. */ public int getTermCount() { return terms.size(); } /** * PUBLIC: Looks up a particular term by its stem. Returns null if * no term is found. */ public Term getTerm(String stem) { return terms.get(stem); } /** * PUBLIC: Looks up a particular variant by its string representation. * Returns null if no variant is found. */ public Variant getVariant(String variant) { return variants.get(variant); } // -------------------------------------------------------------------------- // package internal // -------------------------------------------------------------------------- protected Token createDelimiter(String delimiter) { Token token = delimiter_terms.get(delimiter); if (token == null) { token = new Token(delimiter, Token.TYPE_DELIMITER); delimiter_terms.put(delimiter, token); } return token; } protected void mergeTerms(Term t1, Term t2) { if (t1 == t2) return; t1.merge(t2); terms.remove(t2.getStem()); } protected Term createTerm(String stem) { Term term = terms.get(stem); if (term == null) { term = new Term(stem); terms.put(stem, term); } return term; } protected double getMaxScore() { Term[] terms = getTermsByRank(); if (terms.length == 0) return 0; else return terms[0].getScore(); } protected Variant createVariant(String variant) { Variant v = variants.get(variant); if (v == null) { v = new Variant(variant); variants.put(variant, v); } return v; } // -------------------------------------------------------------------------- // debug // -------------------------------------------------------------------------- /** * INTERNAL: Writes the contents of the term database out to * System.out for debugging purposes. */ public void dump() { dump(-1); } /** * INTERNAL: Writes the contents of the term database out to * System.out for debugging purposes. * @param firstN how many terms to output */ public void dump(int firstN) { // rank terms by score Term[] terms = getTermsByRank(); // output top N terms int num = (firstN <= 0 ? terms.length : Math.min(terms.length, firstN)); for (int i=0; i < num; i++) { Term t = terms[i]; System.out.println(Integer.toString(i+1) + ": " + t.getPreferredName() + " " + t.getScore() + ", " + t.getOccurrences()); } System.out.println("Total: " + terms.length + " terms."); } }