package org.wikipedia.miner.model; import org.wikipedia.miner.db.WEnvironment; import org.wikipedia.miner.db.struct.DbLabel; import org.wikipedia.miner.db.struct.DbSenseForLabel; import org.wikipedia.miner.util.WikipediaConfiguration; import org.wikipedia.miner.util.text.TextProcessor; /** * A term or phrase that has been used to refer to one or more {@link Article Articles} in Wikipedia. * * These provide your best way of searching for articles relating to or describing a particular term. */ public class Label { //properties ============================================================= private String text ; private TextProcessor textProcessor ; private long linkDocCount = 0 ; private long linkOccCount = 0 ; private long textDocCount = 0; private long textOccCount = 0; private Sense[] senses = null ; protected WEnvironment env ; private boolean detailsSet ; //constructor ============================================================= /** * Initialises a Label using the default {@link TextProcessor} specified in your {@link WikipediaConfiguration} * * @param env an active WEnvironment * @param text the term or phrase of interest */ public Label(WEnvironment env, String text) { this.env = env ; this.text = text ; this.textProcessor = env.getConfiguration().getDefaultTextProcessor() ; this.detailsSet = false ; } /** * Initialises a Label using the given {@link TextProcessor}. * * @param env an active WEnvironment * @param text the term or phrase of interest * @param tp a text processor to alter how the given text is matched. If this is null, then texts will be matched directly, without processing. */ public Label(WEnvironment env, String text, TextProcessor tp) { this.env = env ; this.text = text ; this.textProcessor = tp ; this.detailsSet = false ; } //public ================================================================== @Override public String toString() { return "\"" + text + "\"" ; } /** * @return the text used to refer to concepts. */ public String getText() { return text; } /** * @return true if this label has ever been used to refer to an article, otherwise false */ public boolean exists() { if (!detailsSet) setDetails() ; return (senses.length > 0) ; } /** * @return the number of articles that contain links with this label used as an anchor. */ public long getLinkDocCount() { if (!detailsSet) setDetails() ; return linkDocCount; } /** * @return the number of links that use this label as an anchor. */ public long getLinkOccCount() { if (!detailsSet) setDetails() ; return linkOccCount; } /** * @return the number of articles that mention this label (either as links or in plain text). */ public long getDocCount() { if (!detailsSet) setDetails() ; return textDocCount; } /** * @return the number of times this label is mentioned in articles (either as links or in plain text). */ public long getOccCount() { if (!detailsSet) setDetails() ; return textOccCount; } /** * @return the probability that this label is used as a link in Wikipedia ({@link #getLinkDocCount()}/{@link #getDocCount()}. */ public double getLinkProbability() { if (!detailsSet) setDetails() ; if (textDocCount == 0) return 0 ; double linkProb = (double) linkDocCount/textDocCount ; if (linkProb >1) linkProb = 1 ; return linkProb ; } /** * @return an array of {@link Sense Senses}, sorted by {@link Sense#getPriorProbability()}, that this label refers to. */ public Sense[] getSenses() { if (!detailsSet) setDetails() ; return senses ; } /** * A possible sense for a label */ public class Sense extends Article { private long sLinkDocCount ; private long sLinkOccCount ; private boolean fromTitle ; private boolean fromRedirect ; //constructor ============================================================= protected Sense(WEnvironment env, DbSenseForLabel s) { super(env, s.getId()) ; this.sLinkDocCount = s.getLinkDocCount() ; this.sLinkOccCount = s.getLinkOccCount() ; this.fromTitle = s.getFromTitle() ; this.fromRedirect = s.getFromRedirect() ; } //public ================================================================== /** * Returns the number of documents that contain links that use the surrounding label as anchor text, and point to this sense as the destination. * * @return the number of documents that contain links that use the surrounding label as anchor text, and point to this sense as the destination. */ public long getLinkDocCount() { return sLinkDocCount; } /** * Returns the number of links that use the surrounding label as anchor text, and point to this sense as the destination. * * @return the number of links that use the surrounding label as anchor text, and point to this sense as the destination. */ public long getLinkOccCount() { return sLinkOccCount; } /** * Returns true if the surrounding label is used as a title for this sense article, otherwise false * * @return true if the surrounding label is used as a title for this sense article, otherwise false */ public boolean isFromTitle() { return fromTitle; } /** * Returns true if the surrounding label is used as a redirect for this sense article, otherwise false * * @return true if the surrounding label is used as a redirect for this sense article, otherwise false */ public boolean isFromRedirect() { return fromRedirect; } /** * Returns the probability that the surrounding label goes to this destination * * @return the probability that the surrounding label goes to this destination */ public double getPriorProbability() { if (getSenses().length == 1) return 1 ; if (linkOccCount == 0) return 0 ; else return ((double)sLinkOccCount) / linkOccCount ; } /** * Returns true if this is the most likely sense for the surrounding label, otherwise false * * @return true if this is the most likely sense for the surrounding label, otherwise false */ public boolean isPrimary() { return (this == senses[0]) ; } } //protected and private ==================================================== private void setDetails() { try { DbLabel lbl = env.getDbLabel(textProcessor).retrieve(text) ; if (lbl == null) { throw new Exception() ; } else { setDetails(lbl) ; } } catch (Exception e) { this.senses = new Sense[0] ; detailsSet = true ; } } private void setDetails(DbLabel lbl) { this.linkDocCount = lbl.getLinkDocCount() ; this.linkOccCount = lbl.getLinkOccCount() ; this.textDocCount = lbl.getTextDocCount() ; this.textOccCount = lbl.getTextOccCount() ; this.senses = new Sense[lbl.getSenses().size()] ; int i = 0 ; for (DbSenseForLabel dbs:lbl.getSenses()) { this.senses[i] = new Sense(env, dbs) ; i++ ; } this.detailsSet = true ; } public static Label createLabel(WEnvironment env, String text, DbLabel dbLabel, TextProcessor tp) { Label l = new Label(env, text, tp) ; l.setDetails(dbLabel) ; return l ; } }