// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.

package marmot.morph.cmd;

import java.io.IOException;
import java.io.Writer;
import java.util.LinkedList;
import java.util.List;

import marmot.core.FeatureVector;
import marmot.core.Sequence;
import marmot.core.Token;
import marmot.morph.MorphModel;
import marmot.morph.MorphOptions;
import marmot.morph.MorphWeightVector;
import marmot.morph.Word;
import marmot.morph.io.SentenceReader;
import marmot.util.FileUtils;

/**
 * Command-line tool that dumps the state features extracted for every token
 * of the training and test files into the text files {@code trn.txt.feat}
 * and {@code tst.txt.feat}.
 */
public class FeaturePrinter {

    /**
     * Builds a model from the training file named in the options and writes
     * the feature files for the training and the test file.
     *
     * @param args option strings handed to
     *             {@link MorphOptions#setPropertiesFromStrings}
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *                          writing a feature file
     */
    public static void main(String[] args) {
        MorphOptions options = new MorphOptions();
        options.setPropertiesFromStrings(args);
        // Overrides whatever vector size was given on the command line.
        options.setProperty(MorphOptions.VECTOR_SIZE, "1");

        MorphModel model = new MorphModel();

        List<Sequence> sentences = new LinkedList<Sequence>();
        for (Sequence sentence : new SentenceReader(options.getTrainFile())) {
            sentences.add(sentence);
        }

        model.init(options, sentences);
        // Drop the reference to the training sentences once the model is
        // initialized.
        sentences = null;

        MorphWeightVector weights = new MorphWeightVector(options);
        // NOTE(review): 'sentences' is null at this point, so init receives
        // null as its second argument. Kept as in the original; confirm that
        // MorphWeightVector.init accepts null here.
        weights.init(model, sentences);

        try {
            // The feature set is extended while processing the training file
            // and frozen while processing the test file.
            weights.setExtendFeatureSet(true);
            printFeatures(model, options, weights, options.getTrainFile(),
                    "trn.txt.feat");
            weights.setExtendFeatureSet(false);
            printFeatures(model, options, weights, options.getTestFile(),
                    "tst.txt.feat");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Writes one line per token of {@code filename} to {@code out_filename}.
     * Each line starts with {@code pos=<pos tag>}, followed by
     * {@code |<morph tag>} when {@code options.getTagMorph()} is true, and
     * then one tab-separated entry per extracted state feature, encoded by
     * {@link #toString(int)}. Sentences are separated by an empty line.
     *
     * @param model        model used to index the words of each sentence
     * @param options      options; only {@code getTagMorph()} is read here
     * @param weights      weight vector used to extract the state features
     * @param filename     file the sentences are read from
     * @param out_filename file the feature lines are written to
     * @throws IOException if opening, writing or closing the output fails
     */
    private static void printFeatures(MorphModel model, MorphOptions options,
            MorphWeightVector weights, String filename, String out_filename)
            throws IOException {
        String separator = "\t";

        Writer writer = FileUtils.openFileWriter(out_filename);
        try {
            for (Sequence sentence : new SentenceReader(filename)) {
                // Index every word of the sentence before any features are
                // extracted from it.
                for (Token token : sentence) {
                    Word word = (Word) token;
                    model.addIndexes(word, false);
                }

                int index = 0;
                for (Token token : sentence) {
                    Word word = (Word) token;
                    FeatureVector vector = weights.extractStateFeatures(
                            sentence, index);

                    writer.write("pos=");
                    writer.write(word.getPosTag());

                    if (options.getTagMorph()) {
                        writer.write("|");
                        writer.write(word.getMorphTag());
                    }

                    for (int findex = 0; findex < vector.size(); findex++) {
                        int feature = vector.get(findex);
                        writer.write(separator);
                        writer.write(toString(feature));
                    }
                    writer.write('\n');
                    index++;
                }
                writer.write('\n');
            }
        } finally {
            // Close the writer even when reading, feature extraction or
            // writing fails, so the file handle is never leaked.
            writer.close();
        }
    }

    /**
     * Encodes a feature index as a string in radix
     * {@link Character#MAX_RADIX}.
     *
     * @param feature feature index to encode
     * @return the string form of {@code feature} in the maximum radix
     */
    private static String toString(int feature) {
        return Integer.toString(feature, Character.MAX_RADIX);
    }
}