/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.output.labeling;
import java.io.File;
import java.io.IOException;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import com.martiansoftware.jsap.JSAPResult;
import at.tuwien.ifs.somtoolbox.apps.config.OptionFactory;
import at.tuwien.ifs.somtoolbox.data.InputData;
import at.tuwien.ifs.somtoolbox.data.InputDataFactory;
import at.tuwien.ifs.somtoolbox.data.InputDatum;
import at.tuwien.ifs.somtoolbox.input.SOMLibFormatInputReader;
import at.tuwien.ifs.somtoolbox.layers.Unit;
import at.tuwien.ifs.somtoolbox.models.GHSOM;
import at.tuwien.ifs.somtoolbox.models.GrowingSOM;
/**
* Implements the <code>Keyword selection</code> labelling method, as described in <i><b>Lagus, K. and Kaski,
* S.</b>:Keyword selection method for characterizing text document maps. Proceedings of ICANN99, 9th International
* Conference on Artificial Neural Networks, volume 1, pages 371-376, IEEE, London. </i><br/>
* This implementation is based on Lucene.<br/>
* FIXME: still incomplete, based on old/deprecated Lucene API
*
* @author Rudolf Mayer
* @version $Id: LagusKeywordLabeler.java 3883 2010-11-02 17:13:23Z frank $
*/
public class LagusKeywordLabeler extends AbstractLabeler {
String path;
public static void main(String[] args) {
JSAPResult config = OptionFactory.parseResults(args, OptionFactory.OPTIONS_LAGUS_KEYWORD_LABELER);
int numLabels = config.getInt("numberLabels", 5);
String inputVectorFilename = config.getString("inputVectorFile");
boolean denseData = config.getBoolean("denseData", false);
String templateVectorFilename = config.getString("templateVectorFile", null);
String unitDescriptionFilename = config.getString("unitDescriptionFile", null);
String weightVectorFilename = config.getString("weightVectorFile");
String mapDescriptionFilename = config.getString("mapDescriptionFile", null);
GrowingSOM gsom = null;
try {
gsom = new GrowingSOM(new SOMLibFormatInputReader(weightVectorFilename, unitDescriptionFilename,
mapDescriptionFilename));
} catch (Exception e) {
Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(e.getMessage() + " Aborting.");
e.printStackTrace();
System.exit(-1);
}
InputData data = InputDataFactory.open(inputVectorFilename, templateVectorFilename, !denseData, true, 1,
7);
Labeler labeler = new LagusKeywordLabeler(config.getString("inputDir"));
labeler.label(gsom, data, numLabels);
}
public LagusKeywordLabeler(String path) {
super();
if (!path.endsWith(File.separator)) {
path += File.separator;
}
this.path = path;
}
@Override
public void label(GHSOM ghsom, InputData data, int num) {
label(ghsom.topLayerMap(), data, num);
}
@Override
public void label(GrowingSOM gsom, InputData data, int num) {
label(gsom, data, num, false);
}
@Override
public void label(GrowingSOM gsom, InputData data, int num, boolean ignoreLabelsWithZero) {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_24);
Unit[] units = gsom.getLayer().getAllUnits();
try {
IndexWriter fullIndex = new IndexWriter(FSDirectory.open(new File("index")), analyzer, true,
MaxFieldLength.UNLIMITED);
IndexWriter[] unitIndices = new IndexWriter[units.length];
for (int i = 0; i < units.length; i++) { // do labeling for each unit
if (units[i].getNumberOfMappedInputs() != 0) {
InputDatum[] unitData = data.getInputDatum(units[i].getMappedInputNames());
String[] vectorNames = new String[unitData.length];
for (int j = 0; j < unitData.length; j++) {
vectorNames[j] = unitData[j].getLabel();
}
// Store the index in memory:
Directory directory = new RAMDirectory();
try {
unitIndices[i] = new IndexWriter(directory, analyzer, true, MaxFieldLength.UNLIMITED);
unitIndices[i].setMaxFieldLength(25000);
// FileIndexer indexer = new FileIndexer(unitIndices[i],
// (String[]) new ArrayList(FileIndexer.KNOWN_FILE_TYPES.keySet()).toArray(new
// String[FileIndexer.KNOWN_FILE_TYPES.size()]));
// indexer.indexDocs(path, vectorNames);
fullIndex.addIndexesNoOptimize(new Directory[] { unitIndices[i].getDirectory() });
IndexSearcher isearcher = new IndexSearcher(directory, true);
TermEnum terms = isearcher.getIndexReader().terms();
do {
System.out.println("Term: " + terms.term());
} while (terms.next());
} catch (IOException e) {
e.printStackTrace();
}
}
}
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
}
}