package org.wikipedia.miner.model; import org.wikipedia.miner.db.WEnvironment.StatisticName; import org.wikipedia.miner.db.struct.DbIntList; import org.wikipedia.miner.db.struct.DbPage; import org.wikipedia.miner.db.WEnvironment; import org.wikipedia.miner.util.MarkupStripper; /** * Represents pages of any type in Wikipedia */ public class Page implements Comparable<Page> { /** * Types that wikipedia pages can be. */ public enum PageType { /** * A page that provides informative text about a topic. */ article, /** * A page that hierarchically organises other pages */ category, /** * A page that exists only to connect an alternative title to an article */ redirect, /** * A page that lists possible senses of an ambiguous word */ disambiguation, /** * A page that can be transcluded into other pages */ template, /** * A type of page that we don't currently deal with (e.g templates) */ invalid } ; protected int id ; protected String title ; protected PageType type ; protected int depth ; protected Double weight = null ; protected WEnvironment env ; protected boolean detailsSet ; //constructor ============================================================= /** * Initialises a newly created Page so that it represents the page given by <em>id</em> and <em>DbPage</em>. * * This is the most efficient page constructor as no database lookup is required. * * @param env an active WikipediaEnvironment * @param id the unique identifier of the page * @param pd details (title, type, etc) of the page */ protected Page(WEnvironment env, int id, DbPage pd) { this.env = env ; this.id = id ; setDetails(pd) ; } /** * Initialises a newly created Page so that it represents the page given by <em>id</em>. This is also an efficient * constructor, since details (page title, type, etc) are only retrieved when requested. * * @param env an active WikipediaEnvironment * @param id the unique identifier of the Wikipedia page */ public Page(WEnvironment env, int id) { this.env = env ; this.id = id ; this.detailsSet = false ; } //public ================================================================== /** * @return the database environment */ public WEnvironment getEnvironment() { return env; } /** * @return true if a page with this id is defined in Wikipedia, otherwise false. */ public boolean exists() { if (!detailsSet) setDetails() ; return (type != PageType.invalid) ; } /** * Sets the weight by which this page will be compared to others. * * @param weight the weight by which this page will be compared to others. */ public void setWeight(Double weight) { this.weight = weight ; } /** * @return the weight by which this page is compared to others. (may be null, in which case the page is compared only via id) */ public Double getWeight() { return weight ; } /** * @param p the page to compare to * @return true if this page has the same id as the given one, otherwise false */ public boolean equals(Page p) { return p.id == id ; } /** * Compares this page to another. If weights are defined for both pages, then the page with the larger * weight will be considered smaller (and thus appear earlier in sorted lists). Otherwise, the comparison is made based on their ids. * * @param p the Page to be compared * @return see above. */ public int compareTo(Page p) { if (p.id == id) return 0 ; int cmp = 0 ; if (p.weight != null && weight != null && p.weight != weight) cmp = p.weight.compareTo(weight) ; if (cmp == 0) cmp = new Integer(id).compareTo(p.id) ; return cmp ; } /** * Returns a string representation of this page, in the format "<em>id</em>: <em>title</em>". * * @return a string representation of the page */ public String toString() { String s = getId() + ": " + getTitle() ; return s ; } /** * @return the unique identifier */ public int getId() { return id; } /** * @return the title */ public String getTitle() { if (!detailsSet) setDetails() ; return title; } /** * @return the type of the page */ public PageType getType() { if (!detailsSet) setDetails() ; return type; } /** * @return the length of the shortest path from this page to the root category, or null if no path exists. */ public Integer getDepth() { if (!detailsSet) setDetails() ; if (depth < 0) return null ; else return depth ; } /** * @return a number representing the height of this page in the category hierarchy, between {@value 0} (as far from the root category as possible) and 1 {the root category}, or null if no path exists */ public Float getGenerality() { Integer d = getDepth() ; if (d == null) return null ; int maxDepth = env.retrieveStatistic(StatisticName.maxCategoryDepth).intValue() ; return 1-((float)d/maxDepth) ; } /** * @return the content of this page, in mediawiki markup format */ public String getMarkup() { String markup = env.getDbMarkup().retrieve(id) ; return markup ; } /** * @return the character positions of sentence breaks within this page's content */ public Integer[] getSentenceSplits() { DbIntList splits = env.getDbSentenceSplits().retrieve(id) ; if (splits == null || splits.getValues() == null) return new Integer[0] ; return splits.getValues().toArray(new Integer[splits.getValues().size()]) ; } /** * @param index the index of the desired sentence * @return the content of the desired sentence, in mediawiki markup format */ public String getSentenceMarkup(int index) { String markup = getMarkup() ; Integer[] splits = getSentenceSplits() ; MarkupStripper s = new MarkupStripper() ; markup = s.stripAllButInternalLinksAndEmphasis(markup, ' ') ; markup = s.stripNonArticleInternalLinks(markup, ' ') ; String sentence ; if (splits.length == 0) sentence = markup ; else if (index == 0) sentence = markup.substring(0, splits[0]) ; else if (index < splits.length) sentence = markup.substring(splits[index-1], splits[index]) ; else if (index == splits.length) sentence = markup.substring(splits[index-1]) ; else sentence = "" ; sentence = sentence.replaceAll("\\s+", " ") ; return sentence.trim(); } /** * Returns the first paragraph from the content of this page, cleaned of all markup except links and * basic formating. * This generally serves as a more specific definition of the concept or concepts for which this * article, disambiguation page or category was written. * * @return the first paragraph on this page. */ public String getFirstParagraphMarkup() { MarkupStripper stripper = new MarkupStripper() ; String markup = getMarkup() ; markup = markup.replaceAll("={2,}(.+)={2,}", "\n") ; //clear section headings completely - not just formating, but content as well. markup = stripper.stripAllButInternalLinksAndEmphasis(markup, null) ; markup = stripper.stripNonArticleInternalLinks(markup, null) ; markup = stripper.stripExcessNewlines(markup) ; String fp = "" ; int pos = markup.indexOf("\n\n") ; while (pos>=0) { fp = markup.substring(0, pos) ; if (pos > 150) break ; pos = markup.indexOf("\n\n", pos+2) ; } fp = fp.replaceAll("\n", " ") ; fp = fp.replaceAll("\\s+", " ") ; //turn all whitespace into spaces, and collapse them. fp = fp.trim(); return fp ; } //public static ============================================================ /** * Instantiates the appropriate subclass of Page given the supplied parameters * * @param env an active Wikipedia environment * @param id the id of the page * @return the instantiated page, which can be safely cast as appropriate */ public static Page createPage(WEnvironment env, int id) { DbPage pd = env.getDbPage().retrieve(id) ; if (pd != null) return createPage(env, id, pd) ; else { pd = new DbPage("Invalid id or excluded via caching", PageType.invalid.ordinal(), -1) ; return new Page(env, id, pd) ; } } /** * Instantiates the appropriate subclass of Page given the supplied parameters * * @param env an active Wikipedia environment * @param id the id of the page * @param pd the details of the page * @return the instantiated page, which can be safely cast as appropriate */ public static Page createPage(WEnvironment env, int id, DbPage pd) { Page p = null ; PageType type = PageType.values()[pd.getType()] ; switch (type) { case article: p = new Article(env, id, pd) ; break ; case redirect: p = new Redirect(env, id, pd) ; break ; case disambiguation: p = new Disambiguation(env, id, pd) ; break ; case category: p = new Category(env, id, pd) ; break ; case template: p = new Template(env, id, pd) ; break ; default: p = new Page(env, id, pd) ; } return p ; } //protected and private ==================================================== private void setDetails() { try { DbPage pd = env.getDbPage().retrieve(id) ; if (pd == null) { throw new Exception() ; } else { setDetails(pd) ; } } catch (Exception e) { title = null ; type = PageType.invalid ; } } private void setDetails(DbPage pd) { title = pd.getTitle() ; type = PageType.values()[pd.getType()] ; depth = pd.getDepth() ; detailsSet = true ; } }