/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.topicmaps.classify;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import net.ontopia.topicmaps.core.TopicIF;
import net.ontopia.topicmaps.core.TopicMapIF;
import net.ontopia.topicmaps.core.TopicNameIF;
import net.ontopia.topicmaps.core.VariantNameIF;
import net.ontopia.topicmaps.query.core.ParsedQueryIF;
import net.ontopia.topicmaps.query.core.QueryProcessorIF;
import net.ontopia.topicmaps.query.core.QueryResultIF;
import net.ontopia.topicmaps.query.utils.QueryUtils;
import net.ontopia.topicmaps.utils.TopicStringifiers;
import net.ontopia.utils.OntopiaRuntimeException;

/**
 * INTERNAL: Term analyzer that looks up term variants among the topic
 * names of a topic map. Terms with at least one variant matching the
 * name of a topic whose type is a known candidate type get their score
 * multiplied by {@link #matchFactor}. When the analysis ends, the terms
 * found for the names and variant names of each matched topic are
 * merged in the term database.
 */
public class TopicMapAnalyzer implements TermAnalyzerIF {

  // term database of the analysis in progress; set by startAnalysis,
  // reset to null by endAnalysis
  TermDatabase tdb;

  TopicMapIF topicmap;
  QueryProcessorIF qp;
  // pre-parsed query finding topics by topic name value (%VALUE%)
  ParsedQueryIF pq_byName;

  // candidate topic types returned by the classification query
  Collection<TopicIF> ctypes;
  // same topic types, in the order the query returned them
  List<TopicIF> ctypes_sorted;
  // association type topic -> collected classification settings
  Map<TopicIF, AssociationType> atypes;

  // topics matched by analyzeTerm so far
  Collection<TopicIF> atopics = new HashSet<TopicIF>();
  // variant value -> variant that produced a match
  Map<String, Variant> smap = new HashMap<String, Variant>();
  // variant value -> topics matched by that value
  Map<String, Collection<TopicIF>> vtopics = new HashMap<String, Collection<TopicIF>>();

  // factor applied to the score of terms found in the topic map
  double matchFactor = 4.0d;

  /**
   * INTERNAL: Creates an analyzer for the given topic map. Parses the
   * by-name lookup query and runs the classification query once,
   * collecting the candidate topic types and the association types
   * (with their score thresholds) from its result.
   *
   * @param topicmap the topic map to look terms up in
   * @throws OntopiaRuntimeException wrapping any exception raised while
   *         parsing or executing the queries
   */
  public TopicMapAnalyzer(TopicMapIF topicmap) {
    this.topicmap = topicmap;
    try {
      this.qp = QueryUtils.getQueryProcessor(topicmap);
      this.pq_byName = qp.parse("select $T from topic-name($T, $N), value($N, %VALUE%)?");

      this.ctypes = new HashSet<TopicIF>();
      this.ctypes_sorted = new ArrayList<TopicIF>();
      this.atypes = new HashMap<TopicIF, AssociationType>();

      QueryResultIF qr = qp.execute("/* #OPTION: optimizer.reorder = false */ " +
          "using on for i\"http://psi.ontopia.net/ontology/\" " +
          "using cl for i\"http://psi.ontopia.net/classify/\" " +
          "using xtm for i\"http://www.topicmaps.org/xtm/1.0/core.xtm#\" " +
          "descendant-of($ANC, $DES) :- " +
          " { xtm:superclass-subclass($ANC : xtm:superclass, $DES : xtm:subclass) " +
          " | xtm:superclass-subclass($ANC : xtm:superclass, $MID : xtm:subclass), descendant-of($MID, $DES) }. " +
          "has-role-field($PT, $AT, $RT) :- " +
          " { on:has-field($AT : on:field, $PT : on:topic-type, $RT : on:role-type) " +
          " | xtm:superclass-subclass($PT : xtm:subclass, $XT : xtm:superclass), has-role-field($XT, $AT, $RT) }. " +
          "select $CTYPE, $AT, $PRT, $CRT, $ASCORE, $USCORE from " +
          "subject-identifier($CT, \"http://psi.ontopia.net/classify/classification-type\"), type($A, $CT), " +
          "association-role($A, $R1), type($R1, $CAT), subject-identifier($CAT, \"http://psi.ontopia.net/classify/classified-association-type\"), " +
          "association-role($A, $R2), type($R2, $CTT), subject-identifier($CTT, \"http://psi.ontopia.net/classify/classified-topic-type\"), " +
          "role-player($R1, $AT), role-player($R2, $PTYPE), " +
          "{ occurrence($AT, $O1), type($O1, $OT1), subject-identifier($OT1, \"http://psi.ontopia.net/classify/score-threshold-with-candidates\"), value($O1, $ASCORE)}, " +
          "{ occurrence($AT, $O2), type($O2, $OT2), subject-identifier($OT2, \"http://psi.ontopia.net/classify/score-threshold\"), value($O2, $USCORE)}, " +
          "has-role-field($PTYPE, $AT, $PRT), " +
          "has-role-field($CTYPE, $AT, $CRT), " +
          "$PRT /= $CRT, topic($CTYPE)" +
          "order by $CTYPE?");
      // release the query result even if reading a row fails, the same
      // way analyzeTerm does for its lookups
      try {
        while (qr.next()) {
          TopicIF ctype = (TopicIF) qr.getValue(0);
          TopicIF atype = (TopicIF) qr.getValue(1);
          TopicIF prtype = (TopicIF) qr.getValue(2);
          TopicIF crtype = (TopicIF) qr.getValue(3);
          String asc = (String) qr.getValue(4);
          String usc = (String) qr.getValue(5);

          // keep first-seen order of the distinct candidate types
          if (ctypes.add(ctype)) {
            ctypes_sorted.add(ctype);
          }

          // the first row seen for an association type decides its role
          // types and thresholds; later rows only add candidate types
          AssociationType at = atypes.get(atype);
          if (at == null) {
            at = new AssociationType(atype, prtype, crtype,
                                     parseScore(asc), parseScore(usc));
            atypes.put(atype, at);
          }
          at.addCandidateType(ctype);
        }
      } finally {
        qr.close();
      }
    } catch (Exception e) {
      throw new OntopiaRuntimeException(e);
    }
  }

  /**
   * Parses a score threshold value.
   *
   * @param value the string to parse, may be null
   * @return the parsed number, or -1.0 if the value is null or is not
   *         a valid double
   */
  private static double parseScore(String value) {
    if (value == null) {
      return -1.0d;
    }
    try {
      return Double.parseDouble(value);
    } catch (NumberFormatException ignored) {
      // best effort: an unparseable threshold is treated as not set
      return -1.0d;
    }
  }

  /**
   * INTERNAL: Looks up every variant of the term among the topic names
   * in the topic map. Each matching topic that has at least one
   * candidate type is recorded against the variant value; if there was
   * any such match the term score is multiplied by the match factor.
   *
   * @param term the term to analyze
   * @throws OntopiaRuntimeException wrapping any exception raised
   *         during the lookup
   */
  public void analyzeTerm(Term term) {
    try {
      int foundMatches = 0;

      // look up term by name
      Object[] variants = term.getVariants();
      for (int i = 0; i < variants.length; i++) {
        Variant variant = (Variant) variants[i];
        String value = variant.getValue();

        QueryResultIF qr = pq_byName.execute(Collections.singletonMap("VALUE", value));
        try {
          while (qr.next()) {
            TopicIF topic = (TopicIF) qr.getValue(0);

            // ignore topic if topic type is unknown
            if (!hasCandidateType(topic)) {
              continue;
            }

            smap.put(value, variant);

            Collection<TopicIF> matching = vtopics.get(value);
            if (matching == null) {
              matching = new HashSet<TopicIF>();
              vtopics.put(value, matching);
            }
            matching.add(topic);
            atopics.add(topic);
            foundMatches++;
          }
        } finally {
          qr.close();
        }
      }

      // adjust score if term found in topic map
      if (foundMatches > 0) {
        term.multiplyScore(matchFactor, "found in topic map");
      }
    } catch (Exception e) {
      throw new OntopiaRuntimeException(e);
    }
  }

  /**
   * Tells whether the topic has at least one type that is among the
   * candidate types collected by the constructor.
   */
  private boolean hasCandidateType(TopicIF topic) {
    for (TopicIF type : topic.getTypes()) {
      if (ctypes.contains(type)) {
        return true;
      }
    }
    return false;
  }

  /**
   * INTERNAL: Stores the term database that {@link #endAnalysis()}
   * works against.
   *
   * @param tdb the term database of the analysis being started
   */
  public void startAnalysis(TermDatabase tdb) {
    this.tdb = tdb;
  }

  /**
   * INTERNAL: For every topic matched during the analysis, merges the
   * terms found in the term database for the topic's names and variant
   * names, then drops the reference to the term database.
   */
  public void endAnalysis() {
    // merge terms that are synonyms
    for (TopicIF topic : atopics) {
      Term term = null;
      for (TopicNameIF bname : topic.getTopicNames()) {
        term = createTerm(term, bname.getValue());
        for (VariantNameIF vname : bname.getVariants()) {
          term = createTerm(term, vname.getValue());
        }
      }
    }

    // NOTE(review): the two steps below are only named here, no code
    // implements them in this method:
    // boost score by topic type
    // store information about which terms map to which topics

    this.tdb = null;
  }

  /**
   * Folds the term owning the given variant value into the running
   * term.
   *
   * @param term the term accumulated so far, or null if none yet
   * @param value the name value to look up as a variant, may be null
   * @return the given term if it is non-null; otherwise the term of the
   *         variant found for the value, or null if there is none
   */
  private Term createTerm(Term term, String value) {
    if (value == null) {
      return term;
    }
    Variant variant = tdb.getVariant(value);
    if (variant == null) {
      return term;
    }
    if (term == null) {
      // first known variant: its term becomes the merge target
      return variant.getTerm();
    }
    tdb.mergeTerms(term, variant.getTerm());
    return term;
  }

  // -- inner classes

  /**
   * INTERNAL: Holds one association type returned by the
   * classification query together with its two role types, its score
   * thresholds and the candidate topic types seen for it.
   */
  public static class AssociationType {
    public TopicIF atype;
    public TopicIF prtype;
    public TopicIF crtype;
    // threshold from the score-threshold-with-candidates occurrence;
    // -1.0 when missing or unparseable
    public double ascore;
    // threshold from the score-threshold occurrence; -1.0 when missing
    // or unparseable
    public double uscore;
    public Collection<TopicIF> ctypes = new HashSet<TopicIF>();

    AssociationType(TopicIF atype, TopicIF prtype, TopicIF crtype,
                    double ascore, double uscore) {
      this.atype = atype;
      this.prtype = prtype;
      this.crtype = crtype;
      this.ascore = ascore;
      this.uscore = uscore;
    }

    /**
     * Adds a candidate topic type; adding the same type again has no
     * effect since the types are kept in a set.
     */
    public void addCandidateType(TopicIF ctype) {
      this.ctypes.add(ctype);
    }

    /**
     * Returns the object ids of the association type and the two role
     * types joined by colons.
     */
    public String getKey() {
      return atype.getObjectId() + ":" + prtype.getObjectId() + ":" + crtype.getObjectId();
    }

    /** Returns the object id of the association type. */
    public String getAssociationTypeId() {
      return atype.getObjectId();
    }

    /** Returns the object id of the <code>prtype</code> role type. */
    public String getContentRoleTypeId() {
      return prtype.getObjectId();
    }

    /** Returns the object id of the <code>crtype</code> role type. */
    public String getTopicRoleTypeId() {
      return crtype.getObjectId();
    }

    /**
     * Returns the string produced by {@link TopicStringifiers} for the
     * association type and the <code>prtype</code> role type.
     */
    public String getName() {
      return TopicStringifiers.toString(atype, prtype);
    }

    /**
     * Returns the score threshold to apply.
     *
     * @param hasCandidates selects which threshold is preferred:
     *        <code>ascore</code> when true, <code>uscore</code> when
     *        false
     * @return the preferred threshold if it is non-negative, otherwise
     *         the other one (which may itself be negative)
     */
    public double getScoreThreshold(boolean hasCandidates) {
      double preferred = hasCandidates ? ascore : uscore;
      double fallback = hasCandidates ? uscore : ascore;
      return (preferred >= 0 ? preferred : fallback);
    }
  }

  // -- public methods

  /**
   * INTERNAL: Returns the topics matched for the value of the given
   * variant.
   *
   * @param variant the variant whose value is looked up
   * @return the recorded topics, or a new empty set if there are none
   */
  public Collection<TopicIF> getTopics(Variant variant) {
    Collection<TopicIF> result = vtopics.get(variant.getValue());
    return (result == null ? new HashSet<TopicIF>() : result);
  }

  /**
   * INTERNAL: Returns the candidate topic types in the order the
   * classification query returned them.
   */
  public Collection<TopicIF> getCandidateTypes() {
    return ctypes_sorted;
  }

  /**
   * INTERNAL: Returns the association types collected from the
   * classification query.
   */
  public Collection<AssociationType> getAssociationTypes() {
    return atypes.values();
  }
}