package maui.main;

/*
 *    Examples.java
 *    Copyright (C) 2009 Olena Medelyan
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import gnu.trove.TIntHashSet;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;

import maui.stemmers.FrenchStemmer;
import maui.stemmers.PorterStemmer;
import maui.stemmers.Stemmer;
import maui.stopwords.Stopwords;
import maui.stopwords.StopwordsEnglish;
import maui.stopwords.StopwordsFrench;

import org.wikipedia.miner.model.Wikipedia;
import org.wikipedia.miner.util.ProgressNotifier;
import org.wikipedia.miner.util.text.CaseFolder;
import org.wikipedia.miner.util.text.TextProcessor;

/**
 * Demonstrates how to use Maui for three types of topic indexing <br>
 * 1. Keyphrase extraction - extracting significant phrases from
 * the document, also suitable for automatic tagging. <br>
 * 2. Term assignment - indexing documents with terms
 * from a controlled vocabulary in SKOS or text format. <br>
 * 3. Indexing with Wikipedia - indexing documents with
 * terms from Wikipedia, also suitable for
 * keyphrase extraction and tagging, or any case where there is no controlled
 * vocabulary available, but consistency is required.
 *
 * @author Olena Medelyan (olena@cs.waikato.ac.nz)
 *
 */
public class Examples {

	/** Command-line mode that runs {@link #testAutomaticTagging()}. */
	private static final String MODE_TAGGING = "tagging";

	/** Command-line mode that runs {@link #testTermAssignment()}. */
	private static final String MODE_TERM_ASSIGNMENT = "term_assignment";

	/** Command-line mode that runs {@link #testIndexingWithWikipedia()}. */
	private static final String MODE_INDEXING_WITH_WIKIPEDIA = "indexing_with_wikipedia";

	/** Message reported when the mode argument is missing or not recognised. */
	private static final String USAGE =
		"Choose one of the three modes: tagging, term_assignment or indexing_with_wikipedia";

	private MauiTopicExtractor topicExtractor;
	private MauiModelBuilder modelBuilder;

	/** Stays null when the no-argument constructor is used. */
	private Wikipedia wikipedia;

	private String server;
	private String database;
	private String dataDirectory;
	private boolean cache = false;

	/**
	 * Creates an example runner with access to Wikipedia; the connection is
	 * opened immediately via {@link #loadWikipedia()}.
	 *
	 * @param server        first argument passed to the Wikipedia constructor
	 * @param database      second argument passed to the Wikipedia constructor
	 * @param dataDirectory directory handed to the caching calls
	 * @param cache         whether to pre-cache the Wikipedia tables
	 * @throws Exception if loading Wikipedia fails
	 */
	public Examples (String server, String database, String dataDirectory, boolean cache) throws Exception {
		this.server = server;
		this.database = database;
		this.dataDirectory = dataDirectory;
		this.cache = cache;
		loadWikipedia();
	}

	/**
	 * Creates an example runner without Wikipedia access; the
	 * <code>wikipedia</code> field is left null.
	 */
	public Examples () { }

	/**
	 * Instantiates the Wikipedia object from the configured server and
	 * database and, if caching was requested, caches the tables that will be
	 * used extensively.
	 *
	 * @throws Exception if the Wikipedia object cannot be created or cached
	 */
	private void loadWikipedia() throws Exception {

		// NOTE(review): "root" and null are presumably the database user and
		// password - confirm against the Wikipedia constructor.
		wikipedia = new Wikipedia(server, database, "root", null);

		TextProcessor textProcessor = new CaseFolder();

		File dataDir = new File(dataDirectory);

		if (cache) {
			ProgressNotifier progress = new ProgressNotifier(5);
			// cache tables that will be used extensively
			TIntHashSet validPageIds = wikipedia.getDatabase().getValidPageIds(
					dataDir, 2, progress);
			wikipedia.getDatabase().cachePages(dataDir, validPageIds, progress);
			wikipedia.getDatabase().cacheAnchors(dataDir, textProcessor,
					validPageIds, 2, progress);
			wikipedia.getDatabase().cacheInLinks(dataDir, validPageIds, progress);
			wikipedia.getDatabase().cacheGenerality(dataDir, validPageIds, progress);
		}
	}

	/**
	 * Sets general parameters: debugging printout, language specific options
	 * like stemmer, stopwords.
	 * <p>
	 * Expects <code>modelBuilder</code> and <code>topicExtractor</code> to
	 * have been created already.
	 */
	private void setGeneralOptions()  {

		modelBuilder.debugMode = true;
		modelBuilder.wikipedia = wikipedia;

		/* language specific options
		Stemmer stemmer = new FrenchStemmer();
		Stopwords stopwords = new StopwordsFrench();
		String language = "fr";
		String encoding = "UTF-8";
		modelBuilder.stemmer = stemmer;
		modelBuilder.stopwords = stopwords;
		modelBuilder.documentLanguage = language;
		modelBuilder.documentEncoding = encoding;
		topicExtractor.stemmer = stemmer;
		topicExtractor.stopwords = stopwords;
		topicExtractor.documentLanguage = language;
		*/

		/* specificity options
		modelBuilder.minPhraseLength = 1;
		modelBuilder.maxPhraseLength = 5;
		*/

		topicExtractor.debugMode = true;
		topicExtractor.topicsPerDocument = 10;
		// topicExtractor.wikipedia = wikipedia;
	}

	/**
	 * Set which features to use: everything is switched on except the two
	 * groups of Wikipedia features.
	 */
	private void setFeatures() {
		modelBuilder.setBasicFeatures(true);
		modelBuilder.setKeyphrasenessFeature(true);
		modelBuilder.setFrequencyFeatures(true);
		modelBuilder.setPositionsFeatures(true);
		modelBuilder.setLengthFeature(true);
		modelBuilder.setNodeDegreeFeature(true);
		modelBuilder.setBasicWikipediaFeatures(false);
		modelBuilder.setAllWikipediaFeatures(false);
	}

	/**
	 * Demonstrates how to perform automatic tagging. Also applicable to
	 * keyphrase extraction.
	 * <p>
	 * Builds and saves a model from the training directory, then loads it
	 * again and extracts keyphrases for the test directory.
	 *
	 * @throws Exception if building the model or extracting topics fails
	 */
	public void testAutomaticTagging() throws Exception {
		topicExtractor = new MauiTopicExtractor();
		modelBuilder = new MauiModelBuilder();
		setGeneralOptions();
		setFeatures();

		// Directories with train & test data
		String trainDir = "data/automatic_tagging/train";
		String testDir = "data/automatic_tagging/test";

		// name of the file to save the model
		String modelName = "test";

		// Settings for the model builder
		modelBuilder.inputDirectoryName = trainDir;
		modelBuilder.modelName = modelName;

		// change to 1 for short documents
		modelBuilder.minNumOccur = 2;

		// Run model builder
		HashSet<String> fileNames = modelBuilder.collectStems();
		modelBuilder.buildModel(fileNames);
		modelBuilder.saveModel();

		// Settings for topic extractor
		topicExtractor.inputDirectoryName = testDir;
		topicExtractor.modelName = modelName;

		// Run topic extractor
		topicExtractor.loadModel();
		fileNames = topicExtractor.collectStems();
		topicExtractor.extractKeyphrases(fileNames);
	}

	/**
	 * Demonstrates how to perform term assignment. Applicable to any vocabulary
	 * in SKOS or text format.
	 * <p>
	 * Builds and saves a model from the training directory, then loads it
	 * again and extracts keyphrases for the test directory, using the same
	 * vocabulary for both steps.
	 *
	 * @throws Exception if building the model or extracting topics fails
	 */
	public void testTermAssignment() throws Exception {
		topicExtractor = new MauiTopicExtractor();
		modelBuilder = new MauiModelBuilder();
		setGeneralOptions();
		setFeatures();

		// Directories with train & test data
		String trainDir = "data/term_assignment/train";
		String testDir = "data/term_assignment/test";

		// Vocabulary
		String vocabulary = "agrovoc_sample";
		String format = "skos";

		// name of the file to save the model
		String modelName = "test";
		HashSet<String> fileNames;

		// Settings for the model builder
		modelBuilder.inputDirectoryName = trainDir;
		modelBuilder.modelName = modelName;
		modelBuilder.vocabularyFormat = format;
		modelBuilder.vocabularyName = vocabulary;

		// Run model builder
		fileNames = modelBuilder.collectStems();
		modelBuilder.buildModel(fileNames);
		modelBuilder.saveModel();

		// Settings for topic extractor
		topicExtractor.inputDirectoryName = testDir;
		topicExtractor.modelName = modelName;
		topicExtractor.vocabularyName = vocabulary;
		topicExtractor.vocabularyFormat = format;

		// Run topic extractor
		topicExtractor.loadModel();
		fileNames = topicExtractor.collectStems();
		topicExtractor.extractKeyphrases(fileNames);
	}

	/**
	 * Demonstrates how to perform topic indexing
	 * with Wikipedia.
	 * <p>
	 * Currently only the model-building half runs; the topic extraction step
	 * is commented out at the end of the method.
	 *
	 * @throws Exception if building or saving the model fails
	 */
	public void testIndexingWithWikipedia() throws Exception {
		topicExtractor = new MauiTopicExtractor();
		modelBuilder = new MauiModelBuilder();
		setGeneralOptions();
		setFeatures();

		// Directories with train & test data
		// NOTE(review): the model is trained on a directory named "test" -
		// confirm this is intended rather than ".../train".
		String trainDir = "data/wikipedia_indexing/test";
		// NOTE(review): machine-specific path, only referenced by the disabled
		// extraction step below.
		String testDir = "/Users/alyona/Documents/corpora/term_assignment/FAO_780/1doc2";

		// Vocabulary
		String vocabulary = "wikipedia";

		// name of the file to save the model
		String modelName = "test";
		HashSet<String> fileNames;

		// Settings for the model builder
		modelBuilder.inputDirectoryName = trainDir;
		modelBuilder.modelName = modelName;
		modelBuilder.vocabularyName = vocabulary;

		// Run model builder
		fileNames = modelBuilder.collectStems();
		modelBuilder.buildModel(fileNames);
		modelBuilder.saveModel();

		// Disabled extraction step. NOTE(review): it uses setter calls, while
		// the other examples in this class assign the extractor's fields
		// directly - it may be stale; check before re-enabling.
//		// Settings for topic extractor
//		topicExtractor.setDirName(testDir);
//		topicExtractor.setModelName(modelName);
//		topicExtractor.setVocabularyName(vocabulary);
//
//		// Run topic extractor
//		topicExtractor.loadModel();
//		fileNames = topicExtractor.collectStems();
//		topicExtractor.extractKeyphrases(fileNames);
	}

	/**
	 * Main method for running the three types of topic indexing. The mode is
	 * selected by the first command-line argument, which must be one of
	 * <code>tagging</code>, <code>term_assignment</code> or
	 * <code>indexing_with_wikipedia</code>. The start and end time of the run
	 * are printed to standard error when the example completes.
	 *
	 * @param args command-line arguments; <code>args[0]</code> is the mode
	 * @throws IllegalArgumentException if the mode is missing or unknown
	 * @throws Exception if the selected example fails
	 */
	public static void main(String[] args) throws Exception {

		// Fail with the usage message instead of an index error when no mode is given
		if (args == null || args.length == 0) {
			throw new IllegalArgumentException(USAGE);
		}

		String mode = args[0];
		if (!MODE_TAGGING.equals(mode)
				&& !MODE_TERM_ASSIGNMENT.equals(mode)
				&& !MODE_INDEXING_WITH_WIKIPEDIA.equals(mode)) {
			throw new IllegalArgumentException(USAGE);
		}

		SimpleDateFormat formatter = new SimpleDateFormat(
				"EEE, dd-MMM-yyyy HH:mm:ss");
		String formattedDate1 = formatter.format(new Date());

		Examples exampler;

		if (MODE_TAGGING.equals(mode)) {
			exampler = new Examples();
			exampler.testAutomaticTagging();
		} else if (MODE_TERM_ASSIGNMENT.equals(mode)) {
			exampler = new Examples();
			exampler.testTermAssignment();
		} else {
			// Only indexing_with_wikipedia can reach this branch after the validation above

			// Access to Wikipedia
			String server = "localhost";
			String database = "database";
			String dataDirectory = "path/to/data/directory";
			boolean cache = false;

			exampler = new Examples(server, database, dataDirectory, cache);
			exampler.testIndexingWithWikipedia();
		}

		String formattedDate2 = formatter.format(new Date());
		System.err.print("Run from " + formattedDate1);
		System.err.println(" to " + formattedDate2);
	}

}