/* * Article.java * Copyright (C) 2007 David Milne, d.n.milne@gmail.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.wikipedia.miner.model; import java.util.* ; import org.wikipedia.miner.db.WEnvironment; import org.wikipedia.miner.db.struct.DbIntList; import org.wikipedia.miner.db.struct.DbLabelForPage; import org.wikipedia.miner.db.struct.DbLabelForPageList; import org.wikipedia.miner.db.struct.DbLinkLocation; import org.wikipedia.miner.db.struct.DbLinkLocationList; import org.wikipedia.miner.db.struct.DbPage; import org.wikipedia.miner.db.struct.DbPageLinkCounts; import org.wikipedia.miner.db.struct.DbTranslations; /** * Represents articles in Wikipedia; the pages that contain descriptive text regarding a particular topic. */ public class Article extends Page { /** * Initialises a newly created Article so that it represents the article given by <em>id</em>. * * @param env an active WEnvironment * @param id the unique identifier of the article */ public Article(WEnvironment env, int id) { super(env, id) ; } protected Article(WEnvironment env, int id, DbPage pd) { super(env, id, pd) ; } /** * Returns a array of {@link Redirect Redirects}, sorted by id, that point to this article. * * @return an array of Redirects, sorted by id */ public Redirect[] getRedirects() { DbIntList tmpRedirects = env.getDbRedirectSourcesByTarget().retrieve(id) ; if (tmpRedirects == null || tmpRedirects.getValues() == null) return new Redirect[0] ; Redirect[] redirects = new Redirect[tmpRedirects.getValues().size()] ; for (int i=0 ; i<tmpRedirects.getValues().size() ; i++) redirects[i] = new Redirect(env, tmpRedirects.getValues().get(i)) ; return redirects ; } /** * Returns an array of {@link Category Categories} that this article belongs to. These are the categories * that are linked to at the bottom of any Wikipedia article. Note that one of these will be the article's * equivalent category, if one exists. * * @return an array of Categories, sorted by id */ public Category[] getParentCategories() { DbIntList tmpParents = env.getDbArticleParents().retrieve(id) ; if (tmpParents == null || tmpParents.getValues() == null) return new Category[0] ; Category[] parentCategories = new Category[tmpParents.getValues().size()] ; int index = 0 ; for (int id:tmpParents.getValues()) { parentCategories[index] = new Category(env, id) ; index++ ; } return parentCategories ; } //TODO:equivalent categories /** * Returns the {@link Category} that relates to the same concept as this article. For instance, calling * this for "6678: Cat" returns the category "799717: Cats" * * Note that many articles do not have equivalent categories; they are only used when the article * describes a general topic for which there are other, more specific, articles. Consequently, * this method will often return null. * * @return the equivalent Category, or null *//* public Category getEquivalentCategory() { Category equivalentCategory = null ; /* Statement stmt = getWikipediaDatabase().createStatement() ; ResultSet rs = stmt.executeQuery("SELECT page_id, page_title FROM equivalence, page WHERE page_id=eq_cat AND eq_art=" + id) ; if (rs.first()) { try { equivalentCategory = new Category(database, rs.getInt(1), new String(rs.getBytes(2), "UTF-8")) ; } catch (Exception e) {} ; } rs.close() ; stmt.close() ; return equivalentCategory ; }*/ /** * Returns an array of {@link Article Articles} that link to this article. These * are defined by the internal hyperlinks within article text. If these hyperlinks came via * redirects, then they are resolved. * * @return the array of Articles that link to this article, sorted by id. */ public Article[] getLinksIn() { if (env.getDbPageLinkIn().isCached() && !env.getDbPageLinkInNoSentences().isCached()) { DbLinkLocationList tmpLinks = env.getDbPageLinkIn().retrieve(id) ; if (tmpLinks == null || tmpLinks.getLinkLocations() == null) return new Article[0] ; Article[] links = new Article[tmpLinks.getLinkLocations().size()] ; int index = 0 ; for (DbLinkLocation ll:tmpLinks.getLinkLocations()) { links[index] = new Article(env, ll.getLinkId()) ; index++ ; } return links ; } else { DbIntList tmpLinks = env.getDbPageLinkInNoSentences().retrieve(id) ; if (tmpLinks == null || tmpLinks.getValues() == null) return new Article[0] ; Article[] links = new Article[tmpLinks.getValues().size()] ; int index = 0 ; for (Integer linkId:tmpLinks.getValues()) { links[index] = new Article(env, linkId) ; index++ ; } return links ; } } /** * Returns an array of {@link Article}s, sorted by article id, that this article * links to. These are defined by the internal hyperlinks within article text. * If these hyperlinks point to redirects, then these are resolved. * * @return an array of Articles that this article links to, sorted by id */ public Article[] getLinksOut() { if (env.getDbPageLinkOut().isCached() && !env.getDbPageLinkOutNoSentences().isCached()) { DbLinkLocationList tmpLinks = env.getDbPageLinkOut().retrieve(id) ; if (tmpLinks == null || tmpLinks.getLinkLocations() == null) return new Article[0] ; Article[] links = new Article[tmpLinks.getLinkLocations().size()] ; int index = 0 ; for (DbLinkLocation ll:tmpLinks.getLinkLocations()) { links[index] = new Article(env, ll.getLinkId()) ; index++ ; } return links ; } else { DbIntList tmpLinks = env.getDbPageLinkOutNoSentences().retrieve(id) ; if (tmpLinks == null || tmpLinks.getValues() == null) return new Article[0] ; Article[] links = new Article[tmpLinks.getValues().size()] ; int index = 0 ; for (Integer linkId:tmpLinks.getValues()) { links[index] = new Article(env, linkId) ; index++ ; } return links ; } } /** * Returns the title of the article translated into the language given by <em>languageCode</em> * (i.e. fn, jp, de, etc) or null if translation is not available. * * @param languageCode the (generally 2 character) language code. * @return the translated title if it is available; otherwise null. */ public String getTranslation(String languageCode) { DbTranslations t = env.getDbTranslations().retrieve(id) ; if (t == null) return null ; if (t.getTranslationsByLangCode() == null) return null ; return t.getTranslationsByLangCode().get(languageCode.toLowerCase()) ; } /** * Returns a TreeMap associating language code with translated title for all available translations * * @return a TreeMap associating language code with translated title. */ public TreeMap<String,String> getTranslations() { DbTranslations t = env.getDbTranslations().retrieve(id) ; if (t == null) return new TreeMap<String,String>() ; else return t.getTranslationsByLangCode() ; } /** * @return the total number of links that are made to this article */ public int getTotalLinksInCount() { DbPageLinkCounts lc = env.getDbPageLinkCounts().retrieve(id) ; if (lc == null) return 0 ; else return lc.getTotalLinksIn() ; } /** * @return the number of distinct articles which contain a link to this article */ public int getDistinctLinksInCount() { DbPageLinkCounts lc = env.getDbPageLinkCounts().retrieve(id) ; if (lc == null) return 0 ; else return lc.getDistinctLinksIn() ; } /** * @return the total number links that this article makes to other articles */ public int getTotalLinksOutCount() { DbPageLinkCounts lc = env.getDbPageLinkCounts().retrieve(id) ; if (lc == null) return 0 ; else return lc.getTotalLinksOut() ; } /** * @return the number of distinct articles that this article links to */ public int getDistinctLinksOutCount() { DbPageLinkCounts lc = env.getDbPageLinkCounts().retrieve(id) ; if (lc == null) return 0 ; else return lc.getDistinctLinksOut() ; } /** * Returns an array of {@link Label Labels} that have been used to refer to this article. * They are sorted by the number of times each label is used. * * @return an array of {@link Label Labels} that have been used to refer to this article. */ public Label[] getLabels() { DbLabelForPageList tmpLabels = env.getDbLabelsForPage().retrieve(id) ; if (tmpLabels == null || tmpLabels.getLabels() == null) return new Label[0] ; Label[] labels = new Label[tmpLabels.getLabels().size()] ; int index = 0 ; for (DbLabelForPage ll:tmpLabels.getLabels()) { labels[index] = new Label(ll) ; index++ ; } return labels ; } /** * This efficiently identifies sentences within this article that contain links to the given target article. * The actual text of these sentences can be obtained using {@link Page#getSentenceMarkup(int)} * * @param art the article of interest. * @return an array of sentence indexes that contain links to the given article. */ public Integer[] getSentenceIndexesMentioning(Article art) { DbLinkLocationList tmpLinks = env.getDbPageLinkIn().retrieve(art.getId()) ; if (tmpLinks == null || tmpLinks.getLinkLocations() == null) return new Integer[0] ; DbLinkLocation key = new DbLinkLocation(id, null) ; int index = Collections.binarySearch(tmpLinks.getLinkLocations(), key, new Comparator<DbLinkLocation>(){ public int compare(DbLinkLocation a, DbLinkLocation b) { return new Integer(a.getLinkId()).compareTo(b.getLinkId()) ; } }) ; if (index < 0) return new Integer[0] ; ArrayList<Integer> sentenceIndexes = tmpLinks.getLinkLocations().get(index).getSentenceIndexes() ; return sentenceIndexes.toArray(new Integer[sentenceIndexes.size()]) ; } /** * This efficiently identifies sentences within this article that contain links to all of the given target articles. * The actual text of these sentences can be obtained using {@link Page#getSentenceMarkup(int)} * * @param arts the articles of interest. * @return an array of sentence indexes that contain links to the given article. */ public Integer[] getSentenceIndexesMentioning(ArrayList<Article> arts) { TreeMap<Integer, Integer> sentenceCounts = new TreeMap<Integer, Integer>() ; //associate sentence indexes with number of arts mentioned. for (Article art:arts) { //System.out.println(" - Checking art " + art) ; for (Integer sentenceIndex: getSentenceIndexesMentioning(art)) { //System.out.println(" - - Adding sentence " + sentenceIndex) ; Integer count = sentenceCounts.get(sentenceIndex) ; if (count == null) sentenceCounts.put(sentenceIndex, 1) ; else sentenceCounts.put(sentenceIndex, count + 1) ; } } //gather all sentences that mention all arts ArrayList<Integer> validSentences = new ArrayList<Integer>() ; Iterator<Map.Entry<Integer, Integer>> iter = sentenceCounts.entrySet().iterator() ; while (iter.hasNext()) { Map.Entry<Integer, Integer> e = iter.next(); //System.out.println(" - " + e.getKey() + ", " + e.getValue()) ; if (e.getValue() == arts.size()) validSentences.add(e.getKey()) ; } return validSentences.toArray(new Integer[validSentences.size()]) ; } /** * A label that has been used to refer to the enclosing {@link Article}. These are mined from the title of the article, the * titles of {@link Redirect redirects} that point to the article, and the anchors of links that point to the article. */ public class Label { private String text ; private long linkDocCount ; private long linkOccCount ; private boolean fromTitle ; private boolean fromRedirect ; private boolean isPrimary ; protected Label(DbLabelForPage l) { this.text = l.getText() ; this.linkDocCount = l.getLinkDocCount() ; this.linkOccCount = l.getLinkOccCount() ; this.fromTitle = l.getFromTitle() ; this.fromRedirect = l.getFromRedirect() ; this.isPrimary = l.getIsPrimary() ; } /** * @return the text of this label (the title of the article or redirect, or the anchor of the link */ public String getText() { return text ; } /** * @return the number of pages that contain links that associate this label with the enclosing {@link Article}. */ public long getLinkDocCount() { return linkDocCount; } /** * @return the number of times this label occurs as the anchor text in links that refer to the enclosing {@link Article}. */ public long getLinkOccCount() { return linkOccCount; } /** * @return true if this label matches the title of the enclosing {@link Article}, otherwise false. */ public boolean isFromTitle() { return fromTitle; } /** * @return true if there is a {@link Redirect} that associates this label with the enclosing {@link Article}, otherwise false. */ public boolean isFromRedirect() { return fromRedirect; } /** * @return true if the enclosing {@link Article} is the primary, most common sense for the given label, otherwise false. */ public boolean isPrimary() { return isPrimary; } } //public static ============================================================ /* public static void main(String[] args) throws Exception { File databaseDirectory = new File("/research/dmilne/wikipedia/db/en/20100130"); Wikipedia w = new Wikipedia(databaseDirectory) ; Article nzBirds = w.getMostLikelyArticle("Birds of New Zealand", null) ; //Article kiwi = w.getMostLikelyArticle("Kiwi", null) ; /* DbLinkLocationList ll = w.getEnvironment().getDbPageLinkOut().retrieve(kiwi.getId()) ; for (DbLinkLocation l:ll.getLinkLocations()) { System.out.print(" - " + l.getLinkId() + ":") ; for (Integer s:l.getSentenceIndexes()) System.out.print(" " + s) ; System.out.println() ; } */ //System.out.println(kiwi) ; /* Article nz = w.getMostLikelyArticle("New Zealand", null) ; for (Article art:kiwi.getLinksOut()){ if (art.equals(nz)) System.out.println(" - link: " + art) ; } for (Article art:nz.getLinksIn()) { if (art.equals(kiwi)) System.out.println(" - link in: " + art) ; } ArrayList<Article> arts = new ArrayList<Article>() ; arts.add(w.getMostLikelyArticle("Kiwi", null)) ; arts.add(w.getMostLikelyArticle("Takahe", null)) ; System.out.println(nzBirds.getMarkup()) ; for (Article art:arts) { System.out.println("retrieving sentences mentioning " + art) ; for (int si: nzBirds.getSentenceIndexesMentioning(art)){ System.out.println(nzBirds.getSentenceMarkup(si)) ; } } System.out.println("retrieving sentences mentioning all") ; for (int si: nzBirds.getSentenceIndexesMentioning(arts)){ System.out.println(nzBirds.getSentenceMarkup(si)) ; } }*/ /** * Provides a demo of functionality available to Articles * * @param args an array of arguments for connecting to a wikipedia database: server and database names at a minimum, and optionally a username and password * @throws Exception if there is a problem with the wikipedia database. *//* public static void main(String[] args) throws Exception { Wikipedia wikipedia = Wikipedia.getInstanceFromArguments(args) ; BufferedReader in = new BufferedReader( new InputStreamReader( System.in ) ); DecimalFormat df = new DecimalFormat("0") ; while (true) { System.out.println("Enter article title (or enter to quit): ") ; String title = in.readLine() ; if (title == null || title.equals("")) break ; Article article = wikipedia.getArticleByTitle(title) ; if (article == null) { System.out.println("Could not find exact match. Searching through anchors instead") ; article = wikipedia.getMostLikelyArticle(title, null) ; } if (article == null) { System.out.println("Could not find exact article. Try again") ; } else { System.out.println("\n" + article + "\n") ; if (wikipedia.getDatabase().isContentImported()) { System.out.println(" - first sentence:") ; System.out.println(" - " + article.getFirstSentence(null, null)) ; System.out.println(" - first paragraph:") ; System.out.println(" - " + article.getFirstParagraph()) ; } //Category eqCategory = article.getEquivalentCategory() ; //if (eqCategory != null) { // System.out.println("\n - equivalent category") ; // System.out.println(" - " + eqCategory) ; //} System.out.println("\n - redirects (synonyms or very small related topics that didn't deserve a seperate article):") ; for (Redirect r: article.getRedirects()) System.out.println(" - " + r); //System.out.println("\n - anchors (synonyms and hypernyms):") ; //for (AnchorText at:article.getAnchorTexts()) // System.out.println(" - \"" + at.getText() + "\" (used " + at.getCount() + " times)") ; System.out.println("\n - parent categories (hypernyms):") ; for (Category c: article.getParentCategories()) System.out.println(" - " + c); System.out.println("\n - language links (translations):") ; HashMap<String,String> translations = article.getTranslations() ; for (String lang:translations.keySet()) System.out.println(" - \"" + translations.get(lang) + "\" (" + lang + ")") ; //System.out.println("\n - pages that this links to (related concepts):") ; //for (Article a: article.getLinksOut()) { // System.out.println(" - " + a + " (" + df.format(article.getRelatednessTo(a)*100) + "% related)"); //} } System.out.println("") ; } }*/ }