/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.kea.metadata;
import com.openkm.kea.filter.KEAFilter;
import java.util.List;
import java.util.ArrayList;
import java.util.Date;
import weka.core.Instances;
import weka.core.FastVector;
import weka.core.Attribute;
import weka.core.Instance;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* SubjectExtractor
*
* @author jllort
*
*/
/**
 * Extracts suggested subject keyphrases from plain document text using a
 * pre-trained KEA filter obtained from {@link KEAFilterBank}.
 * <p>
 * NOTE(review): not thread-safe — the shared {@code KEAFilter} instance is
 * driven through a stateful input/output cycle per call; confirm callers
 * do not invoke {@link #extractSuggestedSubjects(String)} concurrently.
 *
 * @author jllort
 */
public class SubjectExtractor {
    private static final Logger log = LoggerFactory.getLogger(SubjectExtractor.class);

    private String modelName = "model";
    private String vocabulary;
    private String vocabularyFormat = "skos";
    private String language = "en";
    private String encoding = "UTF-8";
    private boolean debug = true;
    // Maximum number of ranked subjects returned by extractSuggestedSubjects().
    private int subjectNumLimit = 12;
    private double subjectRelLimit = 1.2;
    private boolean additionalInfo = false;
    private KEAFilter filter = null;

    /**
     * Creates an extractor with the default subject limit, using the shared
     * filter from {@link KEAFilterBank}.
     *
     * @throws MetadataExtractionException if the filter cannot be obtained
     */
    public SubjectExtractor() throws MetadataExtractionException {
        filter = KEAFilterBank.getFilter();
    }

    /**
     * Creates an extractor returning at most {@code limit} subjects.
     *
     * @param limit maximum number of subjects to return
     * @throws MetadataExtractionException if the filter cannot be obtained
     */
    public SubjectExtractor(int limit) throws MetadataExtractionException {
        subjectNumLimit = limit;
        filter = KEAFilterBank.getFilter();
    }

    public String getModelName() {
        return modelName;
    }

    public void setModelName(String modelName) {
        this.modelName = modelName;
    }

    public String getVocabulary() {
        return vocabulary;
    }

    public void setVocabulary(String vocabulary) {
        this.vocabulary = vocabulary;
    }

    public String getVocabularyFormat() {
        return vocabularyFormat;
    }

    public void setVocabularyFormat(String vocabularyFormat) {
        this.vocabularyFormat = vocabularyFormat;
    }

    public String getLanguage() {
        return language;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public String getEncoding() {
        return encoding;
    }

    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    public boolean isDebug() {
        return debug;
    }

    public void setDebug(boolean debug) {
        this.debug = debug;
    }

    public int getSubjectNumLimit() {
        return subjectNumLimit;
    }

    public void setSubjectNumLimit(int subjectNumLimit) {
        this.subjectNumLimit = subjectNumLimit;
    }

    public double getSubjectRelLimit() {
        return subjectRelLimit;
    }

    public void setSubjectRelLimit(double subjectRelLimit) {
        this.subjectRelLimit = subjectRelLimit;
    }

    public boolean isAdditionalInfo() {
        return additionalInfo;
    }

    public void setAdditionalInfo(boolean additionalInfo) {
        this.additionalInfo = additionalInfo;
    }

    /**
     * Runs KEA keyphrase extraction over the given text and returns the
     * unstemmed phrases of the best-ranked subjects, ordered by rank.
     * Errors during extraction are logged and an empty (or partial) list is
     * returned rather than propagated.
     *
     * @param documentText the raw document text to analyze
     * @return up to {@code subjectNumLimit} suggested subjects, best first;
     *         never {@code null}
     */
    public List<String> extractSuggestedSubjects(String documentText) {
        long start = System.currentTimeMillis();
        List<String> subjects = new ArrayList<String>();

        // Build the instance structure the KEA filter expects (mirrors the
        // structure used in the original KEA training code).
        FastVector atts = new FastVector(3);
        atts.addElement(new Attribute("doc", (FastVector) null));
        atts.addElement(new Attribute("keyphrases", (FastVector) null));
        atts.addElement(new Attribute("filename", (String) null));
        Instances unknownDataStructure = new Instances("keyphrase_training_data", atts, 0);

        try {
            // Wrap the document text as a single weka instance; the
            // keyphrases slot is missing because we have no known subjects.
            // ("unknownDataStructure" is called "instances" in the original
            // KEA code.)
            double[] unknownStructure = new double[2];
            unknownStructure[0] = (double) unknownDataStructure.attribute(0).addStringValue(documentText);
            unknownStructure[1] = Instance.missingValue();
            unknownDataStructure.add(new Instance(1.0, unknownStructure));
            filter.input(unknownDataStructure.instance(0));
            // NOTE(review): return value intentionally discarded, as in the
            // original KEA code — presumably releases the string attribute
            // memory; confirm against the KEA sources.
            unknownDataStructure.stringFreeStructure();

            // Collect the filter's output instances into rank order. Ranks
            // are 1-based; guard both bounds so an unexpected rank value
            // cannot throw ArrayIndexOutOfBoundsException and abort the
            // whole extraction.
            Instance[] rankedSubjects = new Instance[this.subjectNumLimit];
            Instance subject;

            while ((subject = filter.output()) != null) {
                int index = (int) subject.value(filter.getRankIndex()) - 1;

                if (index >= 0 && index < subjectNumLimit) {
                    rankedSubjects[index] = subject;
                }
            }

            for (int i = 0; i < subjectNumLimit; i++) {
                if (rankedSubjects[i] != null) {
                    subjects.add(rankedSubjects[i].stringValue(filter.getUnstemmedPhraseIndex()));
                }
            }
        } catch (Exception e) {
            log.error("problem in subject extraction: ", e);
        } finally {
            long time = System.currentTimeMillis() - start;
            log.info("Subject extraction completed in {}ms", time);
        }

        return subjects;
    }
}