package maui.main;
/*
* MauiTopicExtractor.java
* Copyright (C) 2001-2009 Eibe Frank, Olena Medelyan
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.sql.SQLException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
//import org.wikipedia.miner.model.Article;
//import org.wikipedia.miner.model.Wikipedia;
//import org.wikipedia.miner.util.text.CaseFolder;
import org.apache.commons.io.FileUtils;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import maui.filters.MauiFilter;
import maui.stemmers.*;
import maui.stopwords.*;
import maui.vocab.Vocabulary;
import maui.vocab.VocabularyH2;
import maui.vocab.VocabularyJena;
/**
* Extracts topics from the documents in a given directory.
* Assumes that the file names for the documents end with ".txt".
* Puts extracted topics into corresponding files ending with
* ".key" (if those are not already present). Optionally an encoding
* for the documents/keyphrases can be defined (e.g. for Chinese
* text). Documents for which ".key" exists are used for evaluation.
*
* Valid options are:<p>
*
* -l "directory name"<br>
* Specifies name of directory.<p>
*
* -m "model name"<br>
* Specifies name of model.<p>
*
* -v "vocabulary name"<br>
* Specifies name of vocabulary.<p>
*
* -f "vocabulary format"<br>
* Specifies format of vocabulary (text or skos).<p>
*
* -i "document language" <br>
* Specifies document language (en, es, de, fr).<p>
*
* -e "encoding"<br>
* Specifies encoding.<p>
*
* -w "WikipediaDatabase@WikipediaServer" <br>
* Specifies Wikipedia data.<p>
*
* -n <br>
* Specifies number of phrases to be output (default: 10).<p>
*
* -t "name of class implementing stemmer"<br>
* Sets stemmer to use (default: PorterStemmer).<p>
*
* -s "name of class implementing stopwords"<br>
* Sets stopwords class to use (default: StopwordsEnglish).<p>
*
* -d<br>
* Turns debugging mode on.<p>
*
* -b<br>
* Builds global dictionaries from the test set.<p>
*
* -p<br>
* Prints plain-text graph description of the topics for visual representation of the results.<p>
*
* -a<br>
* Also write stemmed phrase and score into ".key" file.<p>
*
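* Example command line (model, directory and vocabulary names are
* illustrative only):<p>
*
* <pre>
* java maui.main.MauiTopicExtractor -l data/documents/ -m test.model -v agrovoc -f skos
* </pre>
*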
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @version 1.0
*/
public class MauiTopicExtractor implements OptionHandler {
/** Name of directory */
public String inputDirectoryName = null;
/** Name of model */
public String modelName = null;
/** Vocabulary name */
public String vocabularyName = "none";
/** Format of the vocabulary */
public String vocabularyFormat = null;
/** Document language */
public String documentLanguage = "en";
/** Document encoding */
public String documentEncoding = "default";
/** Debugging mode? */
public boolean debugMode = false;
/** Maui filter object */
private MauiFilter mauiFilter = null;
/** Wikipedia object */
// public Wikipedia wikipedia = null;
/** Name of the server with the MySQL Wikipedia data */
private String wikipediaServer = "localhost";
/** Name of the database with Wikipedia data */
private String wikipediaDatabase = "database";
/** Name of the directory with Wikipedia data in files */
private String wikipediaDataDirectory = null;
/** Should Wikipedia data be cached first? */
private boolean cacheWikipediaData = false;
/** The number of phrases to extract. */
int topicsPerDocument = 10;
/** Directory where vocabularies are stored **/
public String vocabularyDirectory = "data/vocabularies";
/** Stemmer to be used */
public Stemmer stemmer = new PorterStemmer();
/** List of stopwords to be used */
public Stopwords stopwords = new StopwordsEnglish("data/stopwords/stopwords_en.txt");
/** Minimum number of occurrences for a phrase to be considered a candidate **/
private int minNumOccur;
private Vocabulary vocabulary = null;
/** Also write stemmed phrase and score into .key file. */
boolean additionalInfo = false;
/** Prints plain-text graph description of the topics into a .gv file. */
boolean printGraph = false;
/** Build global dictionaries from the test set. */
boolean buildGlobalDictionary = false;
public boolean getDebug() {
return debugMode;
}
/**
* Parses a given list of options controlling the behaviour of this object.
* Valid options are:<p>
*
* -l "directory name"<br>
* Specifies name of directory.<p>
*
* -m "model name"<br>
* Specifies name of model.<p>
*
* -v "vocabulary name"<br>
* Specifies vocabulary name.<p>
*
* -f "vocabulary format"<br>
* Specifies vocabulary format.<p>
*
* -i "document language" <br>
* Specifies document language.<p>
*
* -e "encoding"<br>
* Specifies encoding.<p>
*
* -w "WikipediaDatabase@WikipediaServer" <br>
* Specifies Wikipedia data.<p>
*
* -n<br>
* Specifies number of phrases to be output (default: 10).<p>
*
* -t "name of class implementing stemmer"<br>
* Sets stemmer to use (default: PorterStemmer).<p>
*
* -s "name of class implementing stopwords"<br>
* Sets stopwords class to use (default: StopwordsEnglish).<p>
*
* -d<br>
* Turns debugging mode on.<p>
*
* -b<br>
* Builds global dictionaries for computing TFxIDF from the test collection.<p>
*
* -p<br>
* Prints plain-text graph description of the topics for visual representation of the results.<p>
*
* -a<br>
* Also write stemmed phrase and score into ".key" file.<p>
*
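* A minimal sketch of programmatic use (directory, model and vocabulary
* names are illustrative only):<p>
*
* <pre>
* MauiTopicExtractor extractor = new MauiTopicExtractor();
* extractor.setOptions(new String[] {"-l", "data/documents", "-m", "test.model",
*                                    "-v", "agrovoc", "-f", "skos"});
* </pre>
*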
* @param options the list of options as an array of strings
* @exception Exception if an option is not supported
*/
public void setOptions(String[] options) throws Exception {
String dirName = Utils.getOption('l', options);
if (dirName.length() > 0) {
inputDirectoryName = dirName;
} else {
inputDirectoryName = null;
throw new Exception("Name of directory is a required argument.");
}
String modelName = Utils.getOption('m', options);
if (modelName.length() > 0) {
this.modelName = modelName;
} else {
this.modelName = null;
throw new Exception("Name of model is a required argument.");
}
String vocabularyName = Utils.getOption('v', options);
if (vocabularyName.length() > 0) {
this.vocabularyName = vocabularyName;
}
String vocabularyFormat = Utils.getOption('f', options);
if (!this.vocabularyName.equals("none") && !this.vocabularyName.equals("wikipedia")) {
if (vocabularyFormat.length() > 0) {
if (vocabularyFormat.equals("skos")
|| vocabularyFormat.equals("text")) {
this.vocabularyFormat = vocabularyFormat;
} else {
throw new Exception(
"Unsupported format of vocabulary. It should be either \"skos\" or \"text\".");
}
} else {
throw new Exception(
"If a controlled vocabulary is used, the vocabulary format is a required argument (skos or text).");
}
}
String encoding = Utils.getOption('e', options);
if (encoding.length() > 0)
this.documentEncoding = encoding;
String wikipediaConnection = Utils.getOption('w', options);
if (wikipediaConnection.length() > 0) {
int at = wikipediaConnection.indexOf("@");
wikipediaDatabase = wikipediaConnection.substring(0,at);
wikipediaServer = wikipediaConnection.substring(at+1);
}
String documentLanguage = Utils.getOption('i', options);
if (documentLanguage.length() > 0)
this.documentLanguage = documentLanguage;
String numPhrases = Utils.getOption('n', options);
if (numPhrases.length() > 0) {
this.topicsPerDocument = Integer.parseInt(numPhrases);
}
String stopwordsString = Utils.getOption('s', options);
if (stopwordsString.length() > 0) {
stopwordsString = "maui.stopwords.".concat(stopwordsString);
this.stopwords = (Stopwords) Class.forName(stopwordsString)
.newInstance();
}
String stemmerString = Utils.getOption('t', options);
if (stemmerString.length() > 0) {
stemmerString = "maui.stemmers.".concat(stemmerString);
this.stemmer = (Stemmer) Class.forName(stemmerString).newInstance();
}
debugMode = Utils.getFlag('d', options);
this.buildGlobalDictionary = Utils.getFlag('b', options);
this.printGraph = Utils.getFlag('p', options);
this.additionalInfo = Utils.getFlag('a', options);
Utils.checkForRemainingOptions(options);
}
/**
* Gets the current option settings.
*
* @return an array of strings suitable for passing to setOptions
*/
public String [] getOptions() {
String [] options = new String [22];
int current = 0;
options[current++] = "-l";
options[current++] = "" + (this.inputDirectoryName);
options[current++] = "-m";
options[current++] = "" + (this.modelName);
options[current++] = "-v";
options[current++] = "" + (this.vocabularyName);
options[current++] = "-f";
options[current++] = "" + (this.vocabularyFormat);
options[current++] = "-e";
options[current++] = "" + (this.documentEncoding);
options[current++] = "-i";
options[current++] = "" + (this.documentLanguage);
options[current++] = "-n";
options[current++] = "" + (this.topicsPerDocument);
options[current++] = "-t";
options[current++] = "" + (stemmer.getClass().getName());
options[current++] = "-s";
options[current++] = "" + (stopwords.getClass().getName());
if (getDebug()) {
options[current++] = "-d";
}
if (printGraph) {
options[current++] = "-p";
}
if (this.buildGlobalDictionary) {
options[current++] = "-b";
}
if (additionalInfo) {
options[current++] = "-a";
}
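// Fill the remaining slots with empty strings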
while (current < options.length) {
options[current++] = "";
}
return options;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options
*/
public Enumeration<Option> listOptions() {
Vector<Option> newVector = new Vector<Option>(15);
newVector.addElement(new Option(
"\tSpecifies name of directory.",
"l", 1, "-l <directory name>"));
newVector.addElement(new Option(
"\tSpecifies name of model.",
"m", 1, "-m <model name>"));
newVector.addElement(new Option(
"\tSpecifies vocabulary name.",
"v", 1, "-v <vocabulary name>"));
newVector.addElement(new Option(
"\tSpecifies vocabulary format.",
"f", 1, "-f <vocabulary format>"));
newVector.addElement(new Option(
"\tSpecifies encoding.",
"e", 1, "-e <encoding>"));
newVector.addElement(new Option("\tSpecifies Wikipedia database and server.", "w", 1,
"-w <wikipediaDatabase@wikipediaServer>"));
newVector.addElement(new Option(
"\tSpecifies document language (en (default), es, de, fr).",
"i", 1, "-i <document language>"));
newVector.addElement(new Option(
"\tSpecifies number of phrases to be output (default: 10).",
"n", 1, "-n <number of phrases>"));
newVector.addElement(new Option(
"\tSets the stemmer to use (default: PorterStemmer).",
"t", 1, "-t <name of stemmer class>"));
newVector.addElement(new Option(
"\tSets the stopwords class to use (default: StopwordsEnglish).",
"s", 1, "-s <name of stopwords class>"));
newVector.addElement(new Option(
"\tTurns debugging mode on.",
"d", 0, "-d"));
newVector.addElement(new Option(
"\tBuilds global dictionaries for computing TFIDF from the test collection.",
"b", 0, "-b"));
newVector.addElement(new Option(
"\tPrints graph description into a \".gv\" file, in GraphViz format.",
"p", 0, "-p"));
newVector.addElement(new Option(
"\tAlso write stemmed phrase and score into \".key\" file.",
"a", 0, "-a"));
return newVector.elements();
}
public void loadThesaurus(Stemmer st, Stopwords sw, String vocabularyDirectory) {
if (vocabulary != null)
return;
try {
if (debugMode) {
System.err.println("--- Loading the vocabulary...");
}
//vocabulary = new VocabularyJena(vocabularyName, vocabularyFormat, vocabularyDirectory);
vocabulary = new VocabularyH2(vocabularyName, vocabularyFormat, vocabularyDirectory);
vocabulary.setStemmer(st);
vocabulary.setStopwords(sw);
vocabulary.setDebug(debugMode);
vocabulary.setLanguage(documentLanguage);
vocabulary.initialize();
} catch (Exception e) {
System.err.println("Failed to load thesaurus!");
e.printStackTrace();
}
}
/**
* Collects the base names (stems) of all ".txt" files in the input directory.
*/
public HashSet<String> collectStems() throws Exception {
HashSet<String> stems = new HashSet<String>();
try {
File dir = new File(inputDirectoryName);
for (String file : dir.list()) {
if (file.endsWith(".txt")) {
String stem = file.substring(0, file.length() - 4);
if (!stems.contains(stem)) {
stems.add(stem);
}
}
}
} catch (Exception e) {
throw new Exception("Problem reading directory " + inputDirectoryName);
}
return stems;
}
/**
* Extracts keyphrases from the documents with the given base names and
* evaluates them against existing ".key" files, if present.
*/
public void extractKeyphrases(HashSet<String> fileNames) throws Exception {
// Check whether there is actually any data
if (fileNames.size() == 0) {
throw new Exception("Couldn't find any data in " + inputDirectoryName);
}
mauiFilter.setVocabularyName(vocabularyName);
mauiFilter.setVocabularyFormat(vocabularyFormat);
mauiFilter.setDocumentLanguage(documentLanguage);
mauiFilter.setStemmer(stemmer);
mauiFilter.setStopwords(stopwords);
/*
if (wikipedia != null) {
mauiFilter.setWikipedia(wikipedia);
} else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
mauiFilter.setWikipedia(wikipedia);
} else {
mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
}
*/
if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia") ) {
loadThesaurus(stemmer, stopwords, vocabularyDirectory);
mauiFilter.setVocabulary(vocabulary);
}
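// The filter consumes instances with three string attributes: the
// file name, the document text, and the manually assigned topics
// (missing if no ".key" file exists)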
FastVector atts = new FastVector(3);
atts.addElement(new Attribute("filename", (FastVector) null));
atts.addElement(new Attribute("doc", (FastVector) null));
atts.addElement(new Attribute("keyphrases", (FastVector) null));
Instances data = new Instances("keyphrase_training_data", atts, 0);
System.err.println("-- Extracting keyphrases... ");
Vector<Double> correctStatistics = new Vector<Double>();
Vector<Double> precisionStatistics = new Vector<Double>();
Vector<Double> recallStatistics = new Vector<Double>();
for (String fileName : fileNames) {
double[] newInst = new double[3];
newInst[0] = (double) data.attribute(0).addStringValue(fileName);
File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");
try {
String documentText;
if (!documentEncoding.equals("default")) {
documentText = FileUtils.readFileToString(documentTextFile, documentEncoding);
} else {
documentText = FileUtils.readFileToString(documentTextFile);
}
// Adding the text of the document to the instance
newInst[1] = (double) data.attribute(1).addStringValue(documentText);
} catch (Exception e) {
System.err.println("Problem with reading " + documentTextFile);
e.printStackTrace();
newInst[1] = Instance.missingValue();
}
try {
String documentTopics;
if (!documentEncoding.equals("default")) {
documentTopics = FileUtils.readFileToString(documentTopicsFile, documentEncoding);
} else {
documentTopics = FileUtils.readFileToString(documentTopicsFile);
}
// Adding the topics to the instance
newInst[2] = (double) data.attribute(2).addStringValue(documentTopics);
} catch (Exception e) {
if (debugMode) {
System.err.println("No existing topics for " + documentTextFile);
}
newInst[2] = Instance.missingValue();
}
data.add(new Instance(1.0, newInst));
mauiFilter.input(data.instance(0));
// When global dictionaries are built from the test collection,
// the filter is flushed after each document
if (buildGlobalDictionary)
mauiFilter.batchFinished();
data = data.stringFreeStructure();
if (debugMode) {
System.err.println("-- Processing document: " + fileName);
}
Instance[] topRankedInstances = new Instance[topicsPerDocument];
Instance inst;
// Iterating over all extracted keyphrases (inst)
while ((inst = mauiFilter.output()) != null) {
int index = (int)inst.value(mauiFilter.getRankIndex()) - 1;
if (index < topicsPerDocument) {
topRankedInstances[index] = inst;
}
}
if (debugMode) {
System.err.println("-- Keyphrases and feature values:");
}
FileOutputStream out = null;
PrintWriter printer = null;
if (!documentTopicsFile.exists()) {
out = new FileOutputStream(documentTopicsFile);
if (!documentEncoding.equals("default")) {
printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding));
} else {
printer = new PrintWriter(out);
}
}
double numExtracted = 0, numCorrect = 0;
/* wikipedia = mauiFilter.getWikipedia();
HashMap<Article, Integer> topics = null;
if (printGraph) {
topics = new HashMap<Article, Integer>();
}
*/
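// p and root are used only by the graph output code, which is
// currently commented out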
int p = 0;
String root = "";
for (int i = 0; i < topicsPerDocument; i++) {
if (topRankedInstances[i] != null) {
if (!topRankedInstances[i].
isMissing(topRankedInstances[i].numAttributes() - 1)) {
numExtracted += 1.0;
}
if ((int)topRankedInstances[i].
value(topRankedInstances[i].numAttributes() - 1) == 1) {
numCorrect += 1.0;
}
if (printer != null) {
String topic = topRankedInstances[i].
stringValue(mauiFilter.getOutputFormIndex());
printer.print(topic);
/* if (printGraph) {
Article article = wikipedia.getArticleByTitle(topic);
if (article == null) {
article = wikipedia.getMostLikelyArticle(topic,
new CaseFolder());
}
if (article != null) {
if (root.isEmpty()) {
root = article.getTitle();
}
topics.put(article, new Integer(p));
} else {
if (debugMode) {
System.err.println("Couldn't find article for " + topic + " in " + documentTopicsFile);
}
}
p++;
}
*/
if (additionalInfo) {
printer.print("\t");
String term = topRankedInstances[i].
stringValue(mauiFilter.getNormalizedFormIndex());
/*
List<SKOSConcept> concepts = searcher.searchConceptByKeyword(term);
if (concepts.size() > 0) {
term = concepts.get(0).getQName();
printer.print(term);
}
*/
printer.print(vocabulary.getIDFromPrefLabel(term));
//printer.print(topRankedInstances[i].
// stringValue(mauiFilter.getNormalizedFormIndex()));
printer.print("\t");
printer.print(Utils.
doubleToString(topRankedInstances[i].
value(mauiFilter.
getProbabilityIndex()), 4));
}
printer.println();
}
if (debugMode) {
System.err.println(topRankedInstances[i]);
}
}
}
/*
if (printGraph) {
String graphFile = documentTopicsFile.getAbsolutePath().replace(".key",".gv");
computeGraph(topics, root, graphFile);
}
*/
if (numExtracted > 0) {
if (debugMode) {
System.err.println("-- " + numCorrect + " correct");
}
double totalCorrect = mauiFilter.getTotalCorrect();
correctStatistics.addElement(new Double(numCorrect));
precisionStatistics.addElement(new Double(numCorrect/numExtracted));
recallStatistics.addElement(new Double(numCorrect/totalCorrect));
}
if (printer != null) {
printer.flush();
printer.close();
out.close();
}
}
if (correctStatistics.size() != 0) {
double[] st = new double[correctStatistics.size()];
for (int i = 0; i < correctStatistics.size(); i++) {
st[i] = correctStatistics.elementAt(i).doubleValue();
}
double avg = Utils.mean(st);
double stdDev = Math.sqrt(Utils.variance(st));
if (correctStatistics.size() == 1) {
System.err.println("\n-- Evaluation results based on 1 document:");
} else {
System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:");
}
System.err.println("Avg. number of correct keyphrases per document: " +
Utils.doubleToString(avg, 2) + " +/- " +
Utils.doubleToString(stdDev, 2));
st = new double[precisionStatistics.size()];
for (int i = 0; i < precisionStatistics.size(); i++) {
st[i] = precisionStatistics.elementAt(i).doubleValue();
}
double avgPrecision = Utils.mean(st);
double stdDevPrecision = Math.sqrt(Utils.variance(st));
System.err.println("Precision: " +
Utils.doubleToString(avgPrecision*100, 2) + " +/- " +
Utils.doubleToString(stdDevPrecision*100, 2));
st = new double[recallStatistics.size()];
for (int i = 0; i < recallStatistics.size(); i++) {
st[i] = recallStatistics.elementAt(i).doubleValue();
}
double avgRecall = Utils.mean(st);
double stdDevRecall = Math.sqrt(Utils.variance(st));
System.err.println("Recall: " +
Utils.doubleToString(avgRecall*100, 2) + " +/- " +
Utils.doubleToString(stdDevRecall*100, 2));
double fMeasure = 2*avgRecall*avgPrecision/(avgRecall + avgPrecision);
System.err.println("F-Measure: " + Utils.doubleToString(fMeasure*100, 2));
System.err.println("");
}
if (!buildGlobalDictionary)
mauiFilter.batchFinished();
}
/**
* Prints out a plain-text representation of a graph representing the main topics of the document.
* The nodes are the topics and the edges are relations between them as computed using the Wikipedia Miner.
* Only possible if Wikipedia data is provided.
*
* @param topics
* @param root
* @param outputFile
*/
/*
public void computeGraph(HashMap<Article, Integer> topics,
String root, String outputFile) {
FileOutputStream out;
PrintWriter printer;
try {
if (debugMode) {
System.err.println("Printing graph information into " + outputFile);
}
out = new FileOutputStream(outputFile);
printer = new PrintWriter(out);
printer.print("graph G {\n");
printer.print("graph [root=\"" + root
+ "\", outputorder=\"depthfirst\"];\n");
HashSet<String> done = new HashSet<String>();
double relatedness = 0;
for (Article a : topics.keySet()) {
int count = topics.get(a).intValue();
if (count < 1) {
printer.print("\"" + a.getTitle() + "\" [fontsize=22];\n");
} else if (count < 3) {
printer
.print("\"" + a.getTitle()
+ "\" [fontsize = 18];\n");
} else if (count < 6) {
printer
.print("\"" + a.getTitle()
+ "\" [fontsize = 14];\n");
} else {
printer
.print("\"" + a.getTitle()
+ "\" [fontsize = 12];\n");
}
for (Article c : topics.keySet()) {
if (!c.equals(a)) {
try {
relatedness = a.getRelatednessTo(c);
String relation = "\"" + a.getTitle() + "\" -- \""
+ c.getTitle();
String relation2 = "\"" + c.getTitle() + "\" -- \""
+ a.getTitle();
if (!done.contains(relation2)
&& !done.contains(relation)) {
done.add(relation2);
done.add(relation);
if (relatedness < 0.2) {
printer.print(relation
+ "\"[style=invis];\n");
} else {
printer.print(relation
+ "\" [penwidth = \""
+ (int) (relatedness * 10 - 0.2)
+ "\"];\n");
}
}
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
printer.print("}\n");
printer.close();
out.close();
} catch (Exception e1) {
e1.printStackTrace();
}
}
*/
/**
* Loads the extraction model from the file.
*/
public void loadModel() throws Exception {
BufferedInputStream inStream =
new BufferedInputStream(new FileInputStream(modelName));
ObjectInputStream in = new ObjectInputStream(inStream);
mauiFilter = (MauiFilter)in.readObject();
// If TFxIDF values are to be computed from the test corpus
if (buildGlobalDictionary) {
if (debugMode) {
System.err.println("-- The global dictionaries will be built from this test collection...");
}
mauiFilter.globalDictionary = null;
}
in.close();
}
/**
* The main method.
*/
public static void main(String[] ops) {
MauiTopicExtractor topicExtractor = new MauiTopicExtractor();
try {
// Checking and Setting Options selected by the user:
topicExtractor.setOptions(ops);
System.err.print("Extracting keyphrases with options: ");
// Reading Options, which were set above and output them:
String[] optionSettings = topicExtractor.getOptions();
for (int i = 0; i < optionSettings.length; i++) {
System.err.print(optionSettings[i] + " ");
}
System.err.println();
// Loading selected Model:
System.err.println("-- Loading the model... ");
topicExtractor.loadModel();
// Extracting Keyphrases from all files in the selected directory
topicExtractor.extractKeyphrases(topicExtractor.collectStems());
} catch (Exception e) {
// Output information on how to use this class
e.printStackTrace();
System.err.println(e.getMessage());
System.err.println("\nOptions:\n");
Enumeration<Option> en = topicExtractor.listOptions();
while (en.hasMoreElements()) {
Option option = (Option) en.nextElement();
System.err.println(option.synopsis());
System.err.println(option.description());
}
}
}
public String getInputDirectoryName() {
return inputDirectoryName;
}
public void setInputDirectoryName(String inputDirectoryName) {
this.inputDirectoryName = inputDirectoryName;
}
public String getModelName() {
return modelName;
}
public void setModelName(String modelName) {
this.modelName = modelName;
}
public String getVocabularyFormat() {
return vocabularyFormat;
}
public void setVocabularyFormat(String vocabularyFormat) {
this.vocabularyFormat = vocabularyFormat;
}
public String getDocumentLanguage() {
return documentLanguage;
}
public void setDocumentLanguage(String documentLanguage) {
this.documentLanguage = documentLanguage;
}
public String getDocumentEncoding() {
return documentEncoding;
}
public void setDocumentEncoding(String documentEncoding) {
this.documentEncoding = documentEncoding;
}
public Stemmer getStemmer() {
return stemmer;
}
public void setStemmer(Stemmer stemmer) {
this.stemmer = stemmer;
}
public Stopwords getStopwords() {
return stopwords;
}
public void setStopwords(Stopwords stopwords) {
this.stopwords = stopwords;
}
public Vocabulary getVocabulary() {
return vocabulary;
}
public void setVocabulary(Vocabulary vocabulary) {
this.vocabulary = vocabulary;
}
public boolean isAdditionalInfo() {
return additionalInfo;
}
public void setAdditionalInfo(boolean additionalInfo) {
this.additionalInfo = additionalInfo;
}
public boolean isBuildGlobalDictionary() {
return buildGlobalDictionary;
}
public void setBuildGlobalDictionary(boolean buildGlobalDictionary) {
this.buildGlobalDictionary = buildGlobalDictionary;
}
public String getVocabularyName() {
return vocabularyName;
}
public void setVocabularyName(String vocabularyName) {
this.vocabularyName = vocabularyName;
}
public String getVocabularyDirectory() {
return vocabularyDirectory;
}
public void setVocabularyDirectory(String vocabularyDirectory) {
this.vocabularyDirectory = vocabularyDirectory;
}
public int getTopicsPerDocument() {
return topicsPerDocument;
}
public void setTopicsPerDocument(int topicsPerDocument) {
this.topicsPerDocument = topicsPerDocument;
}
public int getMinNumOccur() {
return minNumOccur;
}
public void setMinNumOccur(int minNumOccur) {
this.minNumOccur = minNumOccur;
if (this.mauiFilter != null) {
this.mauiFilter.setMinNumOccur(minNumOccur);
}
}
}