/* * DocumentTagger.java * Copyright (C) 2007 David Milne, d.n.milne@gmail.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.wikipedia.miner.annotation.tagging; import gnu.trove.map.hash.TIntDoubleHashMap; import java.util.*; import org.wikipedia.miner.annotation.* ; import org.wikipedia.miner.annotation.preprocessing.* ; import org.wikipedia.miner.util.*; /** * Tags documents by adding markup to to the topics it mentions. * <p> * You can use this to tag all topics by giving it the output of the TopicDetector, or * just those that should be linked to (from LinkDetector) or those that the document * is about (from Indexers). If using the latter two, then you should first modify the list * of topics to include only those that are most likely to be links or key topics * (i.e those above a certain weight). * * @author David Milne */ public abstract class DocumentTagger { /** * Options for tagging or ignoring repeat mentions of topics */ public enum RepeatMode { /** * All mentions of a topic will be tagged */ ALL, /** * Only the first mention of a topic will be tagged */ FIRST, /** * Only the first mention within each region (e.g. DIV) will be tagged */ FIRST_IN_REGION } ; /** * Specifies how terms in the document will be replaced by tags. A tagger for html, for example, might return a link to the relevant Wikipedia article. * * @param term the text in the original document that will be tagged. * @param topic the relevant topic that the term was disambiguated to. * @return the tag that will replace the given term. */ public abstract String getTag(String term, Topic topic) ; /** * Tags the given text with occurrences of the given topics. * * @param doc the document to be tagged * @param topics a set of automatically detected topics, i.e. from TopicDetector or LinkDetector * @param repeatMode ALL, FIRST, or FIRST_IN_REGION * @return the tagged text */ public String tag(PreprocessedDocument doc, Collection<Topic> topics, RepeatMode repeatMode) { doc.resetRegionTracking() ; HashMap<Integer,Topic> topicsById = new HashMap<Integer, Topic>() ; for (Topic topic: topics) topicsById.put(topic.getId(), topic) ; ArrayList<TopicReference> references = resolveCollisions(topics) ; String originalText = doc.getOriginalText() ; StringBuffer wikifiedText = new StringBuffer() ; int lastIndex = 0 ; HashSet<Integer> doneIds = new HashSet<Integer>() ; for (TopicReference reference:references) { int start = reference.getPosition().getStart() ; int end = reference.getPosition().getEnd() ; int id = reference.getTopicId() ; Topic topic = topicsById.get(id) ; //System.out.println("considering tagging " + topic + " at " + reference.getPosition()) ; if (repeatMode == RepeatMode.FIRST_IN_REGION) doneIds = doc.getDoneIdsInCurrentRegion(start) ; if (topic != null && (repeatMode == RepeatMode.ALL || !doneIds.contains(id))) { doneIds.add(id) ; wikifiedText.append(originalText.substring(lastIndex, start)) ; wikifiedText.append(getTag(originalText.substring(start, end), topic)) ; lastIndex = end ; //System.out.println(" - tagged") ; } } wikifiedText.append(originalText.substring(lastIndex)) ; return wikifiedText.toString() ; } private ArrayList<TopicReference> resolveCollisions(Collection<Topic> topics) { //build up a list of topic references and hashmap of topic weights ArrayList<TopicReference> references = new ArrayList<TopicReference>() ; TIntDoubleHashMap topicWeights = new TIntDoubleHashMap() ; for(Topic topic: topics) { for (Position pos: topic.getPositions()) { topicWeights.put(topic.getId(), topic.getWeight()) ; TopicReference tr = new TopicReference(null, topic.getId(), pos) ; references.add(tr) ; } } //sort references Collections.sort(references) ; for (int i=0 ; i<references.size(); i++) { TopicReference outerRef = references.get(i) ; //identify weight of this reference double outerWeight = topicWeights.get(outerRef.getTopicId()); //identify references overlapped by this one, and their total weight Vector<TopicReference> innerReferences = new Vector<TopicReference>() ; double maxInnerWeight = 0 ; for (int j=i+1 ; j<references.size(); j++){ TopicReference innerRef = references.get(j) ; if (outerRef.overlaps(innerRef)) { innerReferences.add(innerRef) ; double innerWeight = topicWeights.get(innerRef.getTopicId()); if (innerWeight > maxInnerWeight) maxInnerWeight = innerWeight ; } else { break ; } } if ((maxInnerWeight*0.8) > outerWeight) { // want to keep the inner references references.remove(i) ; i = i-1 ; } else { //want to keep the outer reference for (int j=0 ; j<innerReferences.size() ; j++) { references.remove(i+1) ; } } } return references ; } /* private Vector<TopicReference> resolveCollisions(Collection<Topic> topics) { TIntDoubleHashMap topicWeights = new TIntDoubleHashMap() ; TreeSet<TopicReference> temp = new TreeSet<TopicReference>() ; for(Topic topic: topics) { for (Position pos: topic.getPositions()) { topicWeights.put(topic.getId(), topic.getWeight()) ; TopicReference tr = new TopicReference(null, topic.getId(), pos) ; temp.add(tr) ; } } Vector<TopicReference> references = new Vector<TopicReference>() ; references.addAll(temp) ; for (int i=0 ; i<references.size(); i++) { TopicReference reference = references.elementAt(i) ; Vector<TopicReference> overlappedTopics = new Vector<TopicReference>() ; for (int j=i+1 ; j<references.size(); j++){ TopicReference reference2 = references.elementAt(j) ; if (reference.overlaps(reference2)) overlappedTopics.add(reference2) ; } for (int j=0 ; j<overlappedTopics.size() ; j++) { references.removeElementAt(i+1) ; } /* //TODO: why is all of this blanked out?? double refWeight = 0 ; Integer refId = reference.getTopicId() ; if (topicWeights.containsKey(refId)) refWeight = topicWeights.get(refId) ; double overlapWeight = 0 ; for (int j=i+1 ; j<references.size(); j++){ TopicReference reference2 = references.elementAt(j) ; if (reference.overlaps(reference2)) { //System.out.println("--" + getNGram(words, c.getStartIndex(), c.getEndIndex()) + " overlaps " + getNGram(words, c1.getStartIndex(), c1.getEndIndex())); overlappedTopics.add(reference2) ; double ref2Weight = 0 ; Integer ref2Id = reference2.getTopicId() ; if (topicWeights.containsKey(ref2Id)) ref2Weight = topicWeights.get(ref2Id) ; overlapWeight = overlapWeight + ref2Weight ; } else { break ; } } if (overlappedTopics.size() > 0) overlapWeight = overlapWeight / overlappedTopics.size() ; if (overlapWeight > refWeight) { // want to keep the overlapped items references.removeElementAt(i) ; i = i-1 ; } else { //want to keep the overlapping item for (int j=0 ; j<overlappedTopics.size() ; j++) { references.removeElementAt(i+1) ; } } } return references ; }*/ }