/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.gc.cmdline.dictionary;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;

import org.cogroo.entities.impl.MorphologicalTag;
import org.cogroo.formats.ad.ADFeaturizerSampleStream;
import org.cogroo.interpreters.FlorestaTagInterpreter;
import org.cogroo.interpreters.JspellTagInterpreter;
import org.cogroo.interpreters.TagInterpreter;
import org.cogroo.tools.featurizer.FeatureSample;

import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.BasicCmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.postag.Triple;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.featuregen.StringPattern;

/**
 * Command-line tool that reads a lexical dictionary (one word per line,
 * followed by space-separated {@code lemma>tag} pairs), converts its tags with
 * a {@link JspellTagInterpreter} / {@link FlorestaTagInterpreter} pair, keeps
 * only the entries whose tags also occur in a corpus, optionally adds entries
 * taken from that corpus, and writes the result as tab separated lines.
 */
public class TabSeparatedPOSDictionaryBuilderTool extends BasicCmdLineTool {

  /** Command-line parameters accepted by this tool. */
  interface Params extends POSDictionaryBuilderParams {

    @ParameterDescription(valueName = "includeFeatures", description = "include features")
    @OptionalParameter(defaultValue = "false")
    Boolean getIsIncludeFeatures();

    @ParameterDescription(valueName = "includeFromCorpus", description = "include from corpus")
    @OptionalParameter(defaultValue = "false")
    Boolean getIncludeFromCorpus();

    // NOTE(review): this description duplicates the includeFromCorpus one and
    // looks like a copy-paste slip. The value is only forwarded to
    // ADFeaturizerSampleStream; confirm its meaning there before rewording.
    @ParameterDescription(valueName = "expandME", description = "include from corpus")
    @OptionalParameter(defaultValue = "false")
    Boolean getExpandME();
  }

  /** Encoding used to read the corpus file. */
  private static final String CORPUS_ENCODING = "ISO-8859-1";

  /** Encoding used to write the generated dictionary. */
  private static final String OUTPUT_ENCODING = "UTF-8";

  private static final char HT = '\t';
  private static final char NL = '\n';

  public String getShortDescription() {
    return "builds a new tab separated lexical dictionary to be used with FSA builder";
  }

  public String getHelp() {
    return getBasicHelp(Params.class);
  }

  /**
   * Runs the tool: collects the tags and features present in the corpus,
   * parses the input dictionary against them, optionally merges corpus tokens
   * into the dictionary, and writes the sorted entries to the output file.
   *
   * @param args the raw command-line arguments, parsed into {@link Params}
   * @throws TerminateToolException if any step fails; the original exception
   *         is attached as the cause
   */
  public void run(String[] args) {
    Params params = validateAndParseParams(args, Params.class);

    File dictInFile = params.getInputFile();
    File dictOutFile = params.getOutputFile();
    File corpusFile = params.getCorpus();
    Charset encoding = params.getEncoding();

    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
    CmdLineUtil.checkInputFile("corpus input file", corpusFile);

    try {
      boolean includeFeatures = params.getIsIncludeFeatures();
      boolean expandME = params.getExpandME();

      // First pass over the corpus: collect every feature and tag it uses.
      Set<String> knownFeats = new HashSet<String>();
      Set<String> knownPostags = new HashSet<String>();
      InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(corpusFile);
      ADFeaturizerSampleStream sentenceStream = new ADFeaturizerSampleStream(
          sampleDataIn, CORPUS_ENCODING, expandME);
      try {
        for (FeatureSample sample = sentenceStream.read(); sample != null;
            sample = sentenceStream.read()) {
          Collections.addAll(knownFeats, sample.getFeatures());
          Collections.addAll(knownPostags, sample.getTags());
        }
      } finally {
        sentenceStream.close();
      }

      // Parse the input dictionary; the sorted map gives a sorted output.
      SortedMap<String, Set<Triple>> entries = new TreeMap<String, Set<Triple>>();
      InputStreamReader in = new InputStreamReader(new FileInputStream(dictInFile), encoding);
      try {
        parseOneEntryPerLine(in, entries, new JspellTagInterpreter(),
            new FlorestaTagInterpreter(), knownFeats, knownPostags,
            params.getAllowInvalidFeats(), includeFeatures);
      } finally {
        closeQuietly(in);
      }

      if (params.getIncludeFromCorpus()) {
        // Second pass over the corpus: add tokens the dictionary is missing.
        Map<String, Set<String>> added = new TreeMap<String, Set<String>>();
        InputStreamFactory corpusDataIn = CmdLineUtil.createInputStreamFactory(corpusFile);
        sentenceStream = new ADFeaturizerSampleStream(corpusDataIn, CORPUS_ENCODING, expandME);
        try {
          for (FeatureSample sample = sentenceStream.read(); sample != null;
              sample = sentenceStream.read()) {
            String[] toks = sample.getSentence();
            String[] lemmas = sample.getLemmas();
            String[] tags = sample.getTags();
            String[] feats = sample.getFeatures();

            for (int i = 0; i < toks.length; i++) {
              // Tokens tagged "prop" keep their case; everything else is lower-cased.
              String tok = "prop".equals(tags[i]) ? toks[i] : toks[i].toLowerCase();

              if (!isValid(entries.get(tok), tok, tags[i], lemmas[i], feats[i],
                  includeFeatures)) {
                continue;
              }

              Triple t = asTriple(tags[i], lemmas[i], feats[i], includeFeatures);
              put(tok, t, entries);

              // Only non-"prop" additions are reported on stdout.
              if (!"prop".equals(t.getClazz())) {
                Set<String> addedForTok = added.get(tok);
                if (addedForTok == null) {
                  addedForTok = new HashSet<String>();
                  added.put(tok, addedForTok);
                }
                addedForTok.add(t.toString());
              }
            }
          }
        } finally {
          sentenceStream.close();
        }

        for (Map.Entry<String, Set<String>> addition : added.entrySet()) {
          for (String v : addition.getValue()) {
            System.out.println(addition.getKey() + " - " + v);
          }
        }
      }

      writeEntries(dictOutFile, entries);
    } catch (IOException e) {
      throw new TerminateToolException(-1,
          "IO error while reading training data or indexing data: " + e.getMessage(), e);
    } catch (Exception e) {
      throw new TerminateToolException(-1, "Exception: " + e.getMessage(), e);
    }
  }

  /**
   * Writes every entry of the dictionary to {@code dictOutFile}, one
   * tab separated line per (token, triple) pair, in the map's key order.
   *
   * @param dictOutFile the file to create or overwrite
   * @param entries the dictionary entries to serialize
   * @throws IOException if the file cannot be opened, written or flushed
   */
  private static void writeEntries(File dictOutFile, SortedMap<String, Set<Triple>> entries)
      throws IOException {
    OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(dictOutFile),
        OUTPUT_ENCODING);
    try {
      for (Map.Entry<String, Set<Triple>> entry : entries.entrySet()) {
        for (Triple triple : entry.getValue()) {
          out.append(toString(entry.getKey(), triple));
        }
      }
      // Close explicitly on the success path so a failed flush is reported.
      out.close();
    } finally {
      // Harmless after a successful close; releases the file on failure.
      closeQuietly(out);
    }
  }

  /**
   * Closes a resource, ignoring a {@code null} argument and any
   * {@link IOException}; meant for cleanup in {@code finally} blocks where a
   * close failure must not mask the primary exception.
   */
  private static void closeQuietly(Closeable resource) {
    if (resource == null) {
      return;
    }
    try {
      resource.close();
    } catch (IOException ignored) {
      // best-effort cleanup: nothing useful can be done here
    }
  }

  /** Adds {@code t} to the set stored under {@code tok}, creating the set on first use. */
  private static void put(String tok, Triple t, SortedMap<String, Set<Triple>> entries) {
    Set<Triple> triples = entries.get(tok);
    if (triples == null) {
      triples = new HashSet<Triple>();
      entries.put(tok, triples);
    }
    triples.add(t);
  }

  /**
   * Decides whether a corpus token may be added to the dictionary.
   *
   * @param knownTriples triples already stored for the token, may be {@code null}
   * @param tok the token text
   * @param clazz the token's class tag
   * @param lemma the token's lemma (ignored by the duplicate check)
   * @param feats the token's features
   * @param includeFeatures whether features take part in the duplicate check
   * @return {@code false} if the token contains a digit, if its class starts
   *         with {@code B-} or {@code I-}, or if an entry with the same class
   *         (and features, when included) already exists; {@code true} otherwise
   */
  private boolean isValid(Collection<Triple> knownTriples, String tok,
      String clazz, String lemma, String feats, boolean includeFeatures) {
    if (StringPattern.recognize(tok).containsDigit()) {
      return false; // no numbers...
    }

    // no B- I-
    if (clazz.startsWith("B-") || clazz.startsWith("I-")) {
      return false;
    }

    if (knownTriples == null || knownTriples.isEmpty()) {
      return true;
    }

    // check if we already have this entry ignoring the lemma...
    String candidate = clazz + "|" + (includeFeatures ? feats : null);
    for (Triple t : knownTriples) {
      String existing = t.getClazz() + "|" + (includeFeatures ? t.getFeats() : null);
      if (existing.equals(candidate)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Parses a dictionary with one word per line. Each line is split on single
   * spaces: the first token is the word and every following token is a
   * {@code lemma>tag} pair. The tag is parsed with {@code tago} and
   * re-serialized with {@code tagd}; the resulting entries are added to
   * {@code entries}. Blank lines are skipped. Diagnostics are written to
   * {@code System.err} (and one message to {@code System.out}).
   *
   * @param in the dictionary source; wrapped in a {@link BufferedReader}, not closed here
   * @param entries the map that receives the parsed entries
   * @param tago interpreter used to parse the tags found in the input
   * @param tagd interpreter used to serialize the class and feature tags
   * @param knownFeats features accepted unless {@code allowInvalidFeats} is set
   * @param knownPostags class tags accepted for regular entries
   * @param allowInvalidFeats accept entries whose features are not in {@code knownFeats}
   * @param includeFeatures store features in the triples; when {@code false},
   *        unknown features are always accepted
   * @throws IOException if reading from {@code in} fails
   */
  public static void parseOneEntryPerLine(Reader in,
      SortedMap<String, Set<Triple>> entries, TagInterpreter tago,
      TagInterpreter tagd, Set<String> knownFeats, Set<String> knownPostags,
      boolean allowInvalidFeats, boolean includeFeatures) throws IOException {

    // Sorted copy so the "Known tags" report below is printed in order.
    Set<String> sortedFeats = new TreeSet<String>(knownFeats);

    // force ignore unknown features
    boolean acceptAnyFeats = allowInvalidFeats || !includeFeatures;

    BufferedReader lineReader = new BufferedReader(in);
    Set<String> unknownTags = new TreeSet<String>();

    String line;
    while ((line = lineReader.readLine()) != null) {
      StringTokenizer whiteSpaceTokenizer = new StringTokenizer(line, " ");
      if (!whiteSpaceTokenizer.hasMoreTokens()) {
        // Blank line: nextToken() would throw NoSuchElementException.
        continue;
      }

      String word = whiteSpaceTokenizer.nextToken();

      while (whiteSpaceTokenizer.hasMoreTokens()) {
        String data = whiteSpaceTokenizer.nextToken();
        String[] lemmaTag = data.split(">");
        if (lemmaTag.length != 2) {
          System.err.println("** Invalid lemmatag. " + word + " -> " + data);
          continue;
        }

        // convert the jspell tag to floresta tag
        MorphologicalTag completeTag = tago.parseMorphologicalTag(lemmaTag[1]);
        if (completeTag == null || completeTag.getClazzE() == null) {
          System.err.println("-- Missing class tag. " + word + " -> " + data);
          continue;
        }

        MorphologicalTag classMT = new MorphologicalTag();
        classMT.setClazz(completeTag.getClazzE());
        String classString = tagd.serialize(classMT);
        if (classString == null) {
          // The class could not be serialized: report and skip the entry
          // instead of dereferencing null below.
          System.out.println("erro :(");
          continue;
        }

        MorphologicalTag featsMT = completeTag.clone();
        featsMT.setClazz(null);
        String featsString = null;
        if (!featsMT.isEmpty()) {
          featsString = tagd.serialize(featsMT);
        }
        if (featsString == null || featsString.length() == 0) {
          featsString = "-";
        }

        boolean featsAccepted = acceptAnyFeats || sortedFeats.contains(featsString);

        if (classString.startsWith("v-") && word.contains("-")) {
          // don't add
        } else if ("pron".equals(classString)) {
          // change to pron-det and pron-indp
          if (featsAccepted) {
            put(word, asTriple("pron-det", lemmaTag[0], featsString, includeFeatures), entries);
            put(word, asTriple("pron-indp", lemmaTag[0], featsString, includeFeatures), entries);
          }
        } else if (knownPostags.contains(classString) && featsAccepted) {
          put(word, asTriple(classString, lemmaTag[0], featsString, includeFeatures), entries);
        } else {
          if ("pnt".equals(classString) && knownPostags.contains(word)) {
            put(word, asTriple(word, word, null, includeFeatures), entries);
          } else if (!classString.startsWith("v-")) {
            System.err.println("unknown - " + word + " -> "
                + new Triple(classString, lemmaTag[0], classString + "_" + featsString));
          }
          // Recorded for every entry reaching this branch, as in the original code.
          unknownTags.add(classString + "_" + featsString);
        }
      }
    }

    if (sortedFeats.size() > 0) {
      System.err.print("Known tags:");
      for (String tag : sortedFeats) {
        System.err.print(" " + tag);
      }
      System.err.println();
    }

    if (unknownTags.size() > 0) {
      System.err.print("Found unknown tags:");
      for (String tag : unknownTags) {
        System.err.print(" " + tag);
      }
      System.err.println();
    }
  }

  /** Builds a triple, dropping the features when {@code includeFeatures} is false. */
  private static Triple asTriple(String clazz, String lemma, String feats,
      boolean includeFeatures) {
    return new Triple(clazz, lemma, includeFeatures ? feats : null);
  }

  /**
   * Formats one dictionary line: {@code word<TAB>lemma<TAB>class}, followed by
   * {@code #feats} when the triple has non-empty features, and a newline.
   */
  private static String toString(String word, Triple t) {
    StringBuilder sb = new StringBuilder();
    sb.append(word).append(HT).append(t.getLemma()).append(HT).append(t.getClazz());

    String feats = t.getFeats();
    if (feats != null && feats.length() > 0) {
      sb.append("#").append(feats);
    }

    sb.append(NL);
    return sb.toString();
  }
}