package maui.util; import java.sql.SQLException; import java.util.HashMap; import java.util.Vector; import org.wikipedia.miner.model.Anchor; import org.wikipedia.miner.model.Article; import org.wikipedia.miner.model.Anchor.Sense; public class Candidate { /** Normalized string or vocabulary id */ String name; /** The original full form as it appears in the document */ String fullForm; /** The title of the descriptor in the vocabulary */ String title; /** Number of occurrences of the candidate in the document */ int frequency; /** Normalized frequenc */ double termFrequency; /** Position of the first occurrence */ double firstOccurrence; /** Position of the last occurrence */ double lastOccurrence; /** Wikipedia keyphraseness */ double wikipKeyphraseness = 0; /** Total wikipedia keyphraseness */ double totalWikipKeyphraseness = 0; Anchor anchor = null; Vector<Anchor> anchors = null; /** * HashMap to store occurrence frequencies of all full forms */ HashMap<String,Counter> fullForms; /** * Constructor for the first occurrence of a candidate */ public Candidate (String name, String fullForm, int firstOccurrence) { this.name = name; this.frequency = 1; this.firstOccurrence = (double)firstOccurrence; this.lastOccurrence = (double)firstOccurrence; this.fullForm = fullForm; fullForms = new HashMap<String,Counter>(); fullForms.put(fullForm,new Counter()); } public Candidate(String name, String fullForm, int firstOccurrence, Anchor anchor, double probability) { this.name = name; this.frequency = 1; this.firstOccurrence = (double)firstOccurrence; this.lastOccurrence = (double)firstOccurrence; this.fullForm = fullForm; fullForms = new HashMap<String,Counter>(); fullForms.put(fullForm,new Counter()); this.totalWikipKeyphraseness = probability; this.wikipKeyphraseness = probability; this.anchor = anchor; } public Candidate getCopy() { Candidate newCandidate = new Candidate(this.name, this.fullForm, (int) this.firstOccurrence); newCandidate.frequency = this.frequency; newCandidate.termFrequency = this.termFrequency; newCandidate.firstOccurrence = this.firstOccurrence; newCandidate.lastOccurrence = this.lastOccurrence; newCandidate.fullForms = this.fullForms; newCandidate.totalWikipKeyphraseness = this.totalWikipKeyphraseness; newCandidate.wikipKeyphraseness = this.wikipKeyphraseness; newCandidate.anchor = this.anchor; return newCandidate; } public void setTitle(String title) { this.title = title; } public String getTitle() { return title; } public void setName(String name) { this.name = name; } public Anchor getAnchor() { return anchor; } public double getWikipKeyphraseness() { return wikipKeyphraseness; } public double getTotalWikipKeyphraseness() { return totalWikipKeyphraseness; } /** Returns all document phrases that were mapped to this candidate. * * @return HashMap in which the keys are the full forms and the values are their frequencies */ public HashMap<String,Counter> getFullForms() { return fullForms; } /** * Records the occurrence position and the full form of a candidate * * @param fullForm * @param occurrence */ public void recordOccurrence(String fullForm, int occurrence) { frequency++; lastOccurrence = occurrence; if (fullForms.containsKey(fullForm)) { fullForms.get(fullForm).increment(); } else { fullForms.put(fullForm, new Counter()); } if (totalWikipKeyphraseness != 0) { totalWikipKeyphraseness += wikipKeyphraseness; } } /** * In case of free indexing, e.g. tagging or keyphrase extraction, * retrieves the most frequent full form * for a given candidate. * @return best full form of a candidate */ public String getBestFullForm() { int maxFrequency = 0; String bestFullForm = ""; for (String form : fullForms.keySet()) { int formFrequency = fullForms.get(form).value(); if (formFrequency > maxFrequency) { bestFullForm = form; maxFrequency = formFrequency; } } return bestFullForm; } public String getName() { return name; } public double getFrequency() { return frequency; } public double getTermFrequency() { return termFrequency; } public double getFirstOccurrence() { return firstOccurrence; } public double getLastOccurrence() { return lastOccurrence; } public double getSpread() { return lastOccurrence - firstOccurrence; } /** * Normalizes all occurrence positions and frequencies by the total values in the given document */ public void normalize(int totalFrequency, int documentLength) { termFrequency = frequency/(double)totalFrequency; firstOccurrence = firstOccurrence/(double)documentLength; lastOccurrence = lastOccurrence/(double)documentLength; } public String toString() { return name + " (" + fullForm + "," + title + ")"; } /** * @param args */ public static void main(String[] args) { System.out.println("This is a method for creating candidate topics in a document"); } public String getIdAndTitle() { return name + ": " + title; } /** * If two candidates were disambiguated to the same topic, their values are merged. * @param previousCandidate */ public void mergeWith(Candidate previousCandidate) { // name stays the same // full form stays the same // title stays the same // frequency increments this.frequency += previousCandidate.frequency; // term frequency increments this.termFrequency += previousCandidate.termFrequency; // update first occurrence to the earliest one double previous = previousCandidate.firstOccurrence; if (previous < this.firstOccurrence) { this.firstOccurrence = previous; } // and the opposite with the last occurrence previous = previousCandidate.lastOccurrence; if (previous > this.lastOccurrence) { this.lastOccurrence = previous; } // increment wikip keyphr this.totalWikipKeyphraseness += previousCandidate.totalWikipKeyphraseness; this.wikipKeyphraseness += previousCandidate.wikipKeyphraseness; // anchor should be added to the list of anchors if (anchors == null) { anchors = new Vector<Anchor>(); anchors.add(this.anchor); } anchors.add(previousCandidate.anchor); // full forms should be added to the hash of full forms if (fullForms == null) { System.err.println("Is it ever empty??? "); fullForms = previousCandidate.fullForms; } HashMap<String,Counter> prevFullForms = previousCandidate.fullForms; for(String prevForm : prevFullForms.keySet()) { int count = prevFullForms.get(prevForm).value(); if (fullForms.containsKey(prevForm)) { fullForms.get(prevForm).increment(count); } else { fullForms.put(prevForm, new Counter(count)); } } } /** * Retrieves all recorded info about a candidate * @return info about a candidate formatted as a string */ public String getInfo() { String result = ""; String allFullForms = ""; for (String form : fullForms.keySet()) { allFullForms += form + " (" + fullForms.get(form) + "), "; } String allAnchors = ""; if (anchors != null) { for (Anchor anch : anchors) { allAnchors += anch + ", "; } } result += "\tName: " + this.name + "\n"; result += "\tFullForm: " + this.fullForm + "\n"; result += "\tArticle: " + this.article + "\n"; result += "\tAllFullForms: " + allFullForms + "\n"; result += "\tTitle: " + this.title + "\n"; result += "\tFreq " + this.frequency + "\n"; result += "\tTermFreq: " + this.termFrequency + "\n"; result += "\tFirstOcc: " + this.firstOccurrence + "\n"; result += "\tLastOcc: " + this.lastOccurrence + "\n"; result += "\tWikipKeyphr: " + this.wikipKeyphraseness + "\n"; result += "\tTotalWikipKeyphr: " + this.totalWikipKeyphraseness + "\n"; result += "\tAnchor: " + this.anchor + "\n"; result += "\tAnchors: " + allAnchors + "\n"; return result; } public Vector<Anchor> getAnchors() { return this.anchors; } private Article article; public void setArticle(Article article) { this.article = article; } public Article getArticle() { return this.article; } }