/* * #! * Ontopia Classify * #- * Copyright (C) 2001 - 2013 The Ontopia Project * #- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * !# */ package net.ontopia.topicmaps.classify; import java.util.Collection; import net.ontopia.topicmaps.core.TopicIF; import net.ontopia.topicmaps.core.TopicMapIF; /** * INTERNAL: */ public class TopicMapClassification { TermDatabase tdb; TopicMapAnalyzer ta; TermAnalyzerIF customTermAnalyzer; public TopicMapClassification() { this.tdb = new TermDatabase(); } public TopicMapClassification(TopicMapIF topicmap) { this.tdb = new TermDatabase(); this.ta = new TopicMapAnalyzer(topicmap); } public void setCustomTermAnalyzer(TermAnalyzerIF customTermAnalyzer) { this.customTermAnalyzer = customTermAnalyzer; } public void classify(ClassifiableContentIF cc) { // detect document format and read document Document doc = new Document(); new FormatModule().readContent(cc, doc); // tokenize document DocumentTokenizer dt = new DocumentTokenizer(tdb); dt.setTokenizer(new DefaultTokenizer()); SpecialCharNormalizer specialChars = new SpecialCharNormalizer(); dt.setDelimiterTrimmer(specialChars); dt.addTermNormalizer(new JunkNormalizer()); dt.addTermNormalizer(specialChars); dt.tokenize(doc); // detect language Language language = Language.detectLanguage(doc); // set up document classifier and term database DocumentClassifier dc = new DocumentClassifier(tdb); TermStemmerIF stemmer = language.getStemmer(); dc.setTermStemmer(stemmer); dc.addDocumentAnalyzer(new DistanceAnalyzer()); CompoundAnalyzer ca = new CompoundAnalyzer(); ca.setTermStemmer(stemmer); dc.addDocumentAnalyzer(ca); RegionBooster rb = new RegionBooster(); rb.addBoost("title", 1.15d); //! rb.addBoost("abstract", 1.05d); //! rb.addBoost("keyword", 1.10d); //! rb.addBoost("para", 1.01d); //! dc.addDocumentAnalyzer(rb); dc.addTermAnalyzer(CharacterAnalyzer.getInstance()); dc.addTermAnalyzer(language.getFrequencyAnalyzer()); dc.addTermAnalyzer(new RegexpTermAnalyzer()); // FIXME: wrap and hand over to compound analyzer instead? dc.addTermAnalyzer(language.getStopListAnalyzer()); if (customTermAnalyzer != null) dc.addTermAnalyzer(customTermAnalyzer); // blacklist dc.addTermAnalyzer(ca); // run stop list analyzer after compounds have been made dc.addTermAnalyzer(language.getStopListAnalyzer()); if (customTermAnalyzer != null) dc.addTermAnalyzer(customTermAnalyzer); // blacklist if (ta != null) dc.addTermAnalyzer(ta); dc.addTermAnalyzer(new RelativeScore()); // analyze document dc.analyzeDocument(doc); dc.analyzeTerms(); } public TermDatabase getTermDatabase() { return tdb; } /** * INTERNAL: Returns the topics that matches the given variant. */ public Collection<TopicIF> getTopics(Variant variant) { return ta.getTopics(variant); } public Collection<TopicIF> getCandidateTypes() { return ta.getCandidateTypes(); } public Collection<TopicMapAnalyzer.AssociationType> getAssociationTypes() { return ta.getAssociationTypes(); } }