/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.topicmaps.classify;

import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import net.ontopia.utils.ObjectUtils;

import gnu.trove.map.hash.TObjectIntHashMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * INTERNAL: Analyzer that records, for every variant token seen in the
 * analyzed text blocks, which tokens directly follow it, and then uses
 * those follower statistics to create compound terms (for example
 * "variant1 variant2") in the term database.
 *
 * <p>Token statistics are collected through
 * {@link #analyzeToken(TextBlock, Token, int)}; compound terms are
 * created through {@link #analyzeTerm(Term)} between
 * {@link #startAnalysis(TermDatabase)} and {@link #endAnalysis()}.</p>
 */
public class CompoundAnalyzer extends AbstractDocumentAnalyzer implements TermAnalyzerIF {

  // Define a logging category.
  static Logger log = LoggerFactory.getLogger(CompoundAnalyzer.class.getName());

  /** Term database of the running term analysis; null outside of it. */
  TermDatabase tdb;

  /** Optional stemmer applied to the compound text; may be null. */
  TermStemmerIF termStemmer;

  /** Follower statistics, keyed by the variant that is being followed. */
  Map<Variant, Followers> followers = new HashMap<Variant, Followers>();

  /** Composites are only extended while their length is below this value. */
  int maxLength = 3;

  /** Minimum score the leading term must have to be considered. */
  double term1ScoreThreshold = 0.02d;

  /** Minimum score the following term must have to be considered. */
  double term2ScoreThreshold = 0.02d;

  /** Minimum number of times the following term must follow the variant. */
  int compositeOccsThreshold = 2;

  /** Factor applied to the combined score of a compound. */
  double compoundFactor = 2.0d; // 0.6d

  public CompoundAnalyzer() {
    super(1);
  }

  public void setMaxLength(int maxLength) {
    this.maxLength = maxLength;
  }

  public void setTerm1ScoreThreshold(double term1ScoreThreshold) {
    this.term1ScoreThreshold = term1ScoreThreshold;
  }

  public void setTerm2ScoreThreshold(double term2ScoreThreshold) {
    this.term2ScoreThreshold = term2ScoreThreshold;
  }

  public void setCompositeOccurrencesThreshold(int compositeOccsThreshold) {
    this.compositeOccsThreshold = compositeOccsThreshold;
  }

  public void setTermStemmer(TermStemmerIF stemmer) {
    this.termStemmer = stemmer;
  }

  // --------------------------------------------------------------------------
  // variant followers
  // --------------------------------------------------------------------------

  /**
   * Registers a single occurrence of <code>token</code> directly after
   * <code>variant</code>.
   */
  protected void addFollower(Variant variant, Token token) {
    addFollower(variant, token, 1);
  }

  /**
   * Registers <code>counts</code> occurrences of <code>token</code>
   * directly after <code>variant</code>, creating the follower record
   * for the variant on first use.
   */
  protected void addFollower(Variant variant, Token token, int counts) {
    Followers f = followers.get(variant);
    if (f == null) {
      f = new Followers();
      followers.put(variant, f);
    }
    f.addFollower(token, counts);
  }

  // --------------------------------------------------------------------------
  // document analyzer
  // --------------------------------------------------------------------------

  /**
   * Records the token that comes right after <code>token</code> in its
   * text block as a follower. Tokens that are not variants, and the last
   * token of a block, are ignored.
   *
   * @param parent the text block containing the token
   * @param token the token being analyzed
   * @param index the position of the token in the block's token list
   */
  public void analyzeToken(TextBlock parent, Token token, int index) {
    // ignore non variant tokens
    if (token.getType() != Token.TYPE_VARIANT) {
      return;
    }
    List<Token> tokens = parent.getTokens();
    if (tokens.size() - 1 > index) {
      addFollower((Variant) token, tokens.get(index + 1));
    }
  }

  // --------------------------------------------------------------------------
  // term analyzer
  // --------------------------------------------------------------------------

  /**
   * Creates the compound terms that start with <code>term</code>,
   * beginning with composites of length two.
   */
  public void analyzeTerm(Term term) {
    addComposites(tdb, term, 2);
  }

  public void startAnalysis(TermDatabase tdb) {
    this.tdb = tdb;
  }

  public void endAnalysis() {
    this.tdb = null;
  }

  /**
   * Creates compound terms whose first part is a variant of
   * <code>t1</code> and whose second part is one of the recorded
   * followers of that variant.
   *
   * <p>A compound is created when the leading term and the following
   * term both reach their score thresholds, the following term makes up
   * a large enough share of the variant's followers (see
   * {@link Followers#getLimit()}) and follows it at least
   * <code>compositeOccsThreshold</code> times. The scores of the two
   * individual terms are then scaled down, and while
   * <code>length</code> is below <code>maxLength</code> the new compound
   * is itself extended recursively.</p>
   *
   * @param tdb the term database in which compounds are created
   * @param t1 the term whose variants form the first part of a compound
   * @param length the length of the composites built by this call
   */
  public void addComposites(TermDatabase tdb, Term t1, int length) {
    double t1Score = t1.getScore();
    if (t1Score < term1ScoreThreshold) {
      return;
    }

    // evaluated once so that no debug message is built when debug is off
    final boolean debug = log.isDebugEnabled();

    // loop over all variants and look at their followers
    for (Variant v1 : t1.getVariants()) {
      Followers f1 = followers.get(v1);
      if (f1 == null) {
        continue;
      }
      double limit = f1.getLimit();

      for (Variant v2 : f1.getFollowers()) {
        Term t2 = v2.getTerm();
        if (t1.equals(t2)) {
          continue; // ignore repeated terms
        }

        double t2Score = t2.getScore();
        if (t2Score < term2ScoreThreshold) {
          continue;
        }

        // check threshold by term, not by variant
        double compositeScoreTerm = f1.getScore(t2);
        int compositeOccsTerm = f1.getFollowerOccurrences(t2);

        String composite = v1.getValue() + " " + v2.getValue();
        if (debug) {
          log.debug("k:" + composite + " " + (compositeScoreTerm - limit) + ", "
              + compositeOccsTerm + "/" + f1.getTotalFollowerOccurences());
        }

        // ignore composites below the thresholds
        boolean accepted = compositeScoreTerm >= limit
            && compositeOccsTerm >= compositeOccsThreshold;
        if (!accepted) {
          continue;
        }

        double compositeScore = f1.getScore(v2);
        int compositeOccs = f1.getFollowerOccurrences(v2);

        // create new composite term
        Variant v3 = tdb.createVariant(composite);
        Term t3 = v3.getTerm();
        if (t3 == null) {
          String stem = (termStemmer != null ? termStemmer.stem(composite) : composite);
          double newScore = (t1Score + (t2Score * compositeScore)) * compoundFactor;
          t3 = tdb.getTerm(stem);
          if (t3 == null) {
            t3 = tdb.createTerm(stem);
            // calculate first time score
            t3.setScore(newScore, "new compound score");
            if (debug) {
              log.debug("c:" + t3.getStem() + " " + t3.getScore() + ", " + compositeOccs
                  + "\n : (" + t1Score + " + (" + t2Score + " * " + compositeScore
                  + ")) * " + compoundFactor + ")");
            }
          } else {
            // adjust term score
            if (debug) {
              log.debug("d:" + compositeOccs + " * " + compositeScore);
            }
            t3.addScore(newScore, "compound adjustment");
          }
          v3.setTerm(t3);
        }
        t3.addVariant(v3, compositeOccs);

        // register the followers of the new composite
        copyFollowers(v2, v3);

        // score down individual terms
        if (debug) {
          log.debug(" b: " + t1.getScore() + " " + t2.getScore());
        }
        scoreDownParts(t1, t2, compositeOccs);
        if (debug) {
          log.debug(" a: " + t1.getScore() + " " + t2.getScore());
        }

        // find more complex composites; pass length + 1 rather than
        // incrementing the local, so that every composite found by this
        // call is extended under the same length limit
        if (length < maxLength) {
          addComposites(tdb, t3, length + 1);
        }
      }
    }
  }

  /**
   * Registers every recorded follower of <code>source</code>, with its
   * occurrence count, as a follower of <code>target</code>.
   */
  private void copyFollowers(Variant source, Variant target) {
    Followers sourceFollowers = followers.get(source);
    if (sourceFollowers == null) {
      return;
    }
    for (Variant follower : sourceFollowers.getFollowers()) {
      addFollower(target, follower, sourceFollowers.getFollowerOccurrences(follower));
    }
  }

  /**
   * Scales down the scores of the two parts of a compound by the share
   * of their occurrences that the compound accounts for. A term is left
   * untouched when that share is not below 1.
   */
  private void scoreDownParts(Term t1, Term t2, int compositeOccs) {
    double ns1 = ((1.0d * compositeOccs) / t1.getOccurrences());
    double ns2 = ((1.0d * compositeOccs) / t2.getOccurrences());
    if (ns1 < 1.0d) {
      t1.multiplyScore((1.0d - ns1), "compound individiual adjustment");
    }
    if (ns2 < 1.0d) {
      t2.multiplyScore((1.0d - ns2), "compound individiual adjustment");
    }
  }

  // --------------------------------------------------------------------------
  // debug
  // --------------------------------------------------------------------------

  /**
   * Prints the term, its variants by rank and the ranked followers of
   * each variant to <code>System.out</code>.
   */
  public void dump(Term t) {
    double termScore = t.getScore();
    System.out.println("t:" + t.getPreferredName() + " " + termScore + ", " + t.getOccurrences());

    Object[] variants = t.getVariantsByRank();
    for (int x = 0; x < variants.length; x++) {
      Variant v = (Variant) variants[x];
      System.out.println(" v:" + v + ":" + t.getOccurrences(v));

      Followers f = followers.get(v);
      if (f == null) {
        System.out.println(" f:null");
        continue;
      }

      System.out.println(" f:delimiters: " + f.getFollowedByDelimiter());
      Variant[] ranked = f.getFollowersByRank();
      for (int z = 0; z < ranked.length; z++) {
        Variant next = ranked[z];
        double nextScore = next.getTerm().getScore();
        int nextOccs = f.getFollowerOccurrences(next);
        double nextCompositeScore = f.getScore(next);
        System.out.println(" f:" + next.getValue() + " " + nextScore + ", " + nextOccs
            + ", " + nextCompositeScore);
      }
    }
  }

  // --------------------------------------------------------------------------
  // nested classes
  // --------------------------------------------------------------------------

  /**
   * Orders variants by their follower score in the given follower
   * record, highest score first.
   */
  private static class CompositeScoreComparator implements Comparator<Variant> {

    Followers f;

    CompositeScoreComparator(Followers f) {
      this.f = f;
    }

    @Override
    public int compare(Variant v1, Variant v2) {
      // arguments are swapped to get descending order
      return ObjectUtils.compare(f.getScore(v2), f.getScore(v1));
    }
  }

  /**
   * Statistics about the tokens that directly follow one variant:
   * occurrence counts per following variant, the total of those counts,
   * and the number of times the variant was followed by a token that is
   * not a variant.
   */
  private static class Followers {

    TObjectIntHashMap<Variant> followers = new TObjectIntHashMap<Variant>();
    int followedByDelimiter;
    int totalFollowerOccurrences;

    /**
     * Adds <code>counts</code> occurrences of the token. Variant tokens
     * are counted per variant and in the total; any other token type
     * only increases the delimiter count.
     */
    public void addFollower(Token token, int counts) {
      if (token.getType() == Token.TYPE_VARIANT) {
        Variant variant = (Variant) token;
        if (followers.get(variant) > 0) {
          followers.adjustValue(variant, counts);
        } else {
          followers.put(variant, counts);
        }
        totalFollowerOccurrences += counts;
      } else {
        followedByDelimiter += counts;
      }
    }

    /** Returns the following variants in no particular order. */
    public Variant[] getFollowers() {
      return followers.keys(new Variant[followers.size()]);
    }

    /** Returns the following variants, highest follower score first. */
    public Variant[] getFollowersByRank() {
      Variant[] ranked = getFollowers();
      Arrays.sort(ranked, new CompositeScoreComparator(this));
      return ranked;
    }

    public int getTotalFollowerOccurences() {
      return totalFollowerOccurrences;
    }

    /** Returns the count recorded for the given following variant. */
    public int getFollowerOccurrences(Variant v) {
      return followers.get(v);
    }

    /**
     * Returns the sum of the counts recorded for the variants of the
     * given term; only counts greater than zero contribute.
     */
    public int getFollowerOccurrences(Term t) {
      int occs = 0;
      for (Variant variant : t.getVariants()) {
        int count = followers.get(variant);
        if (count > 0) {
          occs += count;
        }
      }
      return occs;
    }

    public int getFollowedByDelimiter() {
      return followedByDelimiter;
    }

    /**
     * Returns the count of the given following variant divided by the
     * total number of follower occurrences.
     */
    public double getScore(Variant v) {
      return (1.0d * getFollowerOccurrences(v)) / totalFollowerOccurrences;
    }

    /**
     * Returns the sum of the follower scores of the variants of the
     * given term; only variants with a count greater than zero
     * contribute.
     */
    public double getScore(Term t) {
      double score = 0d;
      for (Variant variant : t.getVariants()) {
        if (followers.get(variant) > 0) {
          score += getScore(variant);
        }
      }
      return score;
    }

    /**
     * Returns the minimum follower score a following term needs to form
     * a compound. The limit decreases with the logarithm of the total
     * number of follower occurrences.
     */
    public double getLimit() {
      return 0.64d - (Math.log(totalFollowerOccurrences) / 15.0); // NOTE: empirical
    }
  }
}