package maui.main; import java.util.HashSet; import maui.stemmers.FrenchStemmer; import maui.stemmers.Stemmer; import maui.stopwords.Stopwords; import maui.stopwords.StopwordsFrench; import org.wikipedia.miner.model.Wikipedia; public class FrenchExample { /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { // location of the data String trainDir = "data/term_assignment/train_fr"; String testDir = "data/term_assignment/test_fr"; // name of the file for storing the model String modelName = "french_model"; // language specific settings Stemmer stemmer = new FrenchStemmer(); Stopwords stopwords = new StopwordsFrench("data/stopwords/stopwords_fr.txt"); String language = "fr"; String encoding = "UTF-8"; // vocabulary to use for term assignment String vocabulary = "agrovoc_fr"; String format = "skos"; // how many topics per document to extract int numTopicsToExtract = 8; // maui objects MauiModelBuilder modelBuilder = new MauiModelBuilder(); MauiTopicExtractor topicExtractor = new MauiTopicExtractor(); Wikipedia wikipedia = new Wikipedia("localhost", "enwiki_20090306", "root", null); // Settings for the model builder modelBuilder.inputDirectoryName = trainDir; modelBuilder.modelName = modelName; modelBuilder.vocabularyFormat = format; modelBuilder.vocabularyName = vocabulary; modelBuilder.stemmer = stemmer; modelBuilder.stopwords = stopwords; modelBuilder.documentLanguage = language; modelBuilder.documentEncoding = encoding; modelBuilder.debugMode = true; modelBuilder.wikipedia = wikipedia; // Which features to use? modelBuilder.setBasicFeatures(true); modelBuilder.setKeyphrasenessFeature(true); modelBuilder.setFrequencyFeatures(false); modelBuilder.setPositionsFeatures(true); modelBuilder.setLengthFeature(true); modelBuilder.setNodeDegreeFeature(true); modelBuilder.setBasicWikipediaFeatures(true); modelBuilder.setAllWikipediaFeatures(false); // Run model builder modelBuilder.buildModel(modelBuilder.collectStems()); modelBuilder.saveModel(); // Settings for the topic extractor topicExtractor.inputDirectoryName = testDir; topicExtractor.modelName = modelName; topicExtractor.vocabularyName = vocabulary; topicExtractor.vocabularyFormat = format; topicExtractor.stemmer = stemmer; topicExtractor.stopwords = stopwords; topicExtractor.documentLanguage = language; topicExtractor.debugMode = true; topicExtractor.topicsPerDocument = numTopicsToExtract; // topicExtractor.wikipedia = wikipedia; // Run topic extractor topicExtractor.loadModel(); topicExtractor.extractKeyphrases(topicExtractor.collectStems()); } }