package maui.main;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.ObjectInputStream;
import java.util.ArrayList;

import org.apache.commons.io.FileUtils;

import maui.filters.MauiFilter;
import maui.stemmers.PorterStemmer;
import maui.stemmers.Stemmer;
import maui.stopwords.Stopwords;
import maui.stopwords.StopwordsEnglish;
import maui.vocab.Vocabulary;
import maui.vocab.VocabularyJena;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

/**
 * This class shows how to use Maui on a single document
 * or just a string of text.
 *
 * @author alyona
 */
public class MauiWrapper {

	/** Maui filter object */
	private MauiFilter extractionModel = null;

	private Vocabulary vocabulary = null;
	private Stemmer stemmer;
	private Stopwords stopwords;
	private String language = "en";

	/**
	 * Constructor, which loads the data.
	 *
	 * @param dataDirectory - path to Maui's main directory (should contain a "data" subdirectory and end with a slash)
	 * @param vocabularyName - name of the SKOS vocabulary
	 * @param modelName - name of the trained model
	 */
	public MauiWrapper(String dataDirectory, String vocabularyName, String modelName) {
		stemmer = new PorterStemmer();
		String englishStopwords = dataDirectory + "data/stopwords/stopwords_en.txt";
		stopwords = new StopwordsEnglish(englishStopwords);
		String vocabularyDirectory = dataDirectory + "data/vocabularies/";
		String modelDirectory = dataDirectory + "data/models";
		loadVocabulary(vocabularyDirectory, vocabularyName);
		loadModel(modelDirectory, modelName, vocabularyName);
	}

	/**
	 * Loads a vocabulary from a given directory.
	 *
	 * @param vocabularyDirectory
	 * @param vocabularyName
	 */
	public void loadVocabulary(String vocabularyDirectory, String vocabularyName) {
		if (vocabulary != null)
			return;
		try {
			vocabulary = new VocabularyJena(vocabularyName, "skos", vocabularyDirectory);
			vocabulary.setStemmer(stemmer);
			vocabulary.setStopwords(stopwords);
			vocabulary.setLanguage(language);
			vocabulary.initialize();
		} catch (Exception e) {
			System.err.println("Failed to load vocabulary!");
			e.printStackTrace();
		}
	}

	/**
	 * Loads the model.
	 *
	 * @param modelDirectory
	 * @param modelName
	 * @param vocabularyName
	 */
	public void loadModel(String modelDirectory, String modelName, String vocabularyName) {
		try {
			BufferedInputStream inStream = new BufferedInputStream(
					new FileInputStream(modelDirectory + "/" + modelName));
			ObjectInputStream in = new ObjectInputStream(inStream);
			extractionModel = (MauiFilter) in.readObject();
			in.close();
		} catch (Exception e) {
			System.err.println("Failed to load model!");
			e.printStackTrace();
		}
		extractionModel.setVocabularyName(vocabularyName);
		extractionModel.setVocabularyFormat("skos");
		extractionModel.setDocumentLanguage(language);
		extractionModel.setStemmer(stemmer);
		extractionModel.setStopwords(stopwords);
		extractionModel.setVocabulary(vocabulary);
	}

	/**
	 * Main method to extract the main topics from a given text.
	 *
	 * @param text
	 * @param topicsPerDocument
	 * @return
	 * @throws Exception
	 */
	public ArrayList<String> extractTopicsFromText(String text, int topicsPerDocument)
			throws Exception {

		if (text.length() < 5) {
			throw new Exception("Text is too short!");
		}

		// extractionModel.setWikipedia(null);

		FastVector atts = new FastVector(3);
		atts.addElement(new Attribute("filename", (FastVector) null));
		atts.addElement(new Attribute("doc", (FastVector) null));
		atts.addElement(new Attribute("keyphrases", (FastVector) null));
		Instances data = new Instances("keyphrase_training_data", atts, 0);

		double[] newInst = new double[3];
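		// One row per document: string index of the (dummy) filename, string index of
		// the document text, and a missing value for the unknown keyphrases attribute.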
		newInst[0] = (double) data.attribute(0).addStringValue("inputFile");
		newInst[1] = (double) data.attribute(1).addStringValue(text);
		newInst[2] = Instance.missingValue();
		data.add(new Instance(1.0, newInst));

		extractionModel.input(data.instance(0));
		data = data.stringFreeStructure();

		Instance[] topRankedInstances = new Instance[topicsPerDocument];
		Instance inst;

		// Iterating over all extracted keyphrases (inst)
		while ((inst = extractionModel.output()) != null) {
			int index = (int) inst.value(extractionModel.getRankIndex()) - 1;
			if (index < topicsPerDocument) {
				topRankedInstances[index] = inst;
			}
		}

		ArrayList<String> topics = new ArrayList<String>();
		for (int i = 0; i < topicsPerDocument; i++) {
			if (topRankedInstances[i] != null) {
				String topic = topRankedInstances[i].stringValue(extractionModel
						.getOutputFormIndex());
				topics.add(topic);
			}
		}
		extractionModel.batchFinished();
		return topics;
	}

	/**
	 * Triggers topic extraction from a text file.
	 *
	 * @param filePath
	 * @param numberOfTopics
	 * @return
	 * @throws Exception
	 */
	public ArrayList<String> extractTopicsFromFile(String filePath, int numberOfTopics)
			throws Exception {
		File documentTextFile = new File(filePath);
		String documentText = FileUtils.readFileToString(documentTextFile);
		return extractTopicsFromText(documentText, numberOfTopics);
	}

	/**
	 * Main method for testing MauiWrapper.
	 * Add the path to a text file as a command line argument.
	 *
	 * @param args
	 */
	public static void main(String[] args) {

		String vocabularyName = "agrovoc_en";
		String modelName = "fao30";
		String dataDirectory = "../Maui1.2/";

		MauiWrapper wrapper = new MauiWrapper(dataDirectory, vocabularyName, modelName);

		String filePath = args[0];

		try {
			ArrayList<String> keywords = wrapper.extractTopicsFromFile(filePath, 15);
			for (String keyword : keywords) {
				System.out.println("Keyword: " + keyword);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}
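
/*
 * Usage sketch (not part of the original class): topics can also be extracted
 * straight from a string via extractTopicsFromText(), without writing a file
 * first. The class name and the sample sentence below are hypothetical; the
 * data directory, vocabulary and model names mirror the sample values used in
 * MauiWrapper.main() and must match a local Maui installation.
 */
class MauiWrapperStringExample {

	public static void main(String[] args) {
		// Assumes the same example setup as MauiWrapper.main().
		MauiWrapper wrapper = new MauiWrapper("../Maui1.2/", "agrovoc_en", "fao30");
		try {
			// Any string of at least five characters is accepted; ask for up to 5 topics.
			ArrayList<String> topics = wrapper.extractTopicsFromText(
					"Agriculture, fisheries and forestry in developing countries.", 5);
			for (String topic : topics) {
				System.out.println("Topic: " + topic);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}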