/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.postag;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;
import org.cogroo.entities.impl.MorphologicalTag;
import org.cogroo.interpreters.TagInterpreter;
/**
 * A {@link POSDictionary} that additionally records every tag registered
 * through {@link #addTags(String, String...)}. The recorded tags are used by
 * {@link #addTag(String, String)} to decide whether a tag may be appended to
 * the entry of a word that is already in the dictionary.
 *
 * <p>The class also hosts {@link #parseOneEntryPerLine}, a static parser that
 * builds an {@link ExtendedPOSDictionary} from a text source with one word
 * per line.
 */
public class MyPOSDictionary extends POSDictionary {

  /** Every tag that has been passed to {@link #addTags(String, String...)}. */
  private final Set<String> knownTags = new HashSet<String>();

  /**
   * Parses a dictionary with one entry per line. Each line holds a word
   * followed by space separated {@code lemma>tag} tokens. Every tag is parsed
   * with {@code tago} and split into a class part and a feature part, both
   * serialized with {@code tagd} (per the original note: jspell tags are
   * converted to floresta tags). A feature part that is empty is stored as
   * {@code "-"}.
   *
   * <p>Tokens that do not split into exactly two parts on {@code '>'}, and
   * tags for which no class can be determined, are silently ignored. Lines
   * that contain no token at all are skipped. The class {@code "pron"} is
   * expanded into the two classes {@code "pron-det"} and {@code "pron-indp"}.
   * Rejected class/feature combinations are collected and, together with the
   * known features, reported on {@code System.err} once parsing is done.
   *
   * @param in source of the dictionary; it is wrapped in a
   *        {@link BufferedReader} but not closed by this method
   * @param tago interpreter used to parse the tag found in the source
   * @param tagd interpreter used to serialize the class and feature parts
   * @param knownFeats accepted feature strings; copied, never modified
   * @param knownPostags accepted class strings
   * @param allowInvalidFeats if {@code true}, feature strings are accepted
   *        even when they are not in {@code knownFeats}
   * @return the dictionary holding all accepted triples
   * @throws IOException if reading from {@code in} fails
   */
  public static ExtendedPOSDictionary parseOneEntryPerLine(Reader in,
      TagInterpreter tago, TagInterpreter tagd, Set<String> knownFeats,
      Set<String> knownPostags, boolean allowInvalidFeats) throws IOException {

    // Sorted private copy: gives a stable order for the report printed below.
    knownFeats = new TreeSet<String>(knownFeats);

    BufferedReader lineReader = new BufferedReader(in);
    ExtendedPOSDictionary dictionary = new ExtendedPOSDictionary();
    Set<String> unknownTags = new TreeSet<String>();

    String line;
    while ((line = lineReader.readLine()) != null) {
      StringTokenizer tokens = new StringTokenizer(line, " ");
      if (!tokens.hasMoreTokens()) {
        // Empty or blank line: nextToken() would throw NoSuchElementException.
        continue;
      }
      String word = tokens.nextToken();

      while (tokens.hasMoreTokens()) {
        String data = tokens.nextToken();
        String[] lemmaTag = data.split(">");
        if (lemmaTag.length != 2) {
          // Not a lemma>tag pair; ignored.
          continue;
        }
        String lemma = lemmaTag[0];

        // convert the jspell tag to floresta tag
        MorphologicalTag completeTag = tago.parseMorphologicalTag(lemmaTag[1]);
        if (completeTag == null || completeTag.getClazzE() == null) {
          // No class could be determined for this tag; ignored.
          continue;
        }

        // Serialize the class on its own ...
        MorphologicalTag classMT = new MorphologicalTag();
        classMT.setClazz(completeTag.getClazzE());
        String classString = tagd.serialize(classMT);

        // ... and everything except the class as the feature string.
        MorphologicalTag featsMT = completeTag.clone();
        featsMT.setClazz(null);
        String featsString = featsMT.isEmpty() ? null : tagd.serialize(featsMT);
        if (featsString == null || featsString.isEmpty()) {
          featsString = "-";
        }

        boolean featsAccepted = allowInvalidFeats
            || knownFeats.contains(featsString);

        if ("pron".equals(classString)) {
          // change to pron-det and pron-indp
          if (featsAccepted) {
            dictionary.addTriple(word,
                new Triple("pron-det", lemma, featsString));
            dictionary.addTriple(word,
                new Triple("pron-indp", lemma, featsString));
          }
        } else if (classString != null
            && knownPostags.contains(classString) && featsAccepted) {
          dictionary.addTriple(word,
              new Triple(classString, lemma, featsString));
        } else {
          String unknownTag = classString + "_" + featsString;
          if (classString == null) {
            // The serializer produced no class. Log the raw token rather than
            // dereferencing null or building a Triple with a null class.
            System.err.println("unknown - " + word + " -> " + data);
          } else if (!classString.startsWith("v-")) {
            // Classes starting with "v-" are collected but not logged one by one.
            System.err.println("unknown - "
                + word
                + " -> "
                + new Triple(classString, lemma, unknownTag));
          }
          unknownTags.add(unknownTag);
        }
      }
    }

    if (knownFeats.size() > 0) {
      System.err.print("Known tags:");
      for (String tag : knownFeats) {
        System.err.print(" " + tag);
      }
      System.err.println();
    }

    if (unknownTags.size() > 0) {
      System.err.print("Found unknown tags:");
      for (String tag : unknownTags) {
        System.err.print(" " + tag);
      }
      System.err.println();
    }

    return dictionary;
  }

  // Disabled code, kept for reference:
  // private static MorphologicalTag preprocess(MorphologicalTag tag) {
  // if(Class.PROPER_NOUN.equals(tag.getClazzE())) {
  // if(tag.getNumberE() == null) {
  // tag.setNumber(Number.SINGULAR);
  // }
  // if(tag.getGenderE() == null) {
  // tag.setGender(Gender.MALE);
  // }
  // }
  // return tag;
  // }

  /**
   * Delegates to the superclass and then records each of the given tags as
   * known, so that {@link #addTag(String, String)} may later reuse them.
   *
   * @param word the word the tags belong to
   * @param tags the tags of the word
   */
  void addTags(String word, String... tags) {
    super.addTags(word, tags);
    knownTags.addAll(Arrays.asList(tags));
  }

  /**
   * Appends a single tag to the entry of a word. A leading {@code "B-"} or
   * {@code "I-"} prefix is removed from the tag first. Nothing happens unless
   * the word already has tags, the tag is not among them, and the tag has
   * been recorded by {@link #addTags(String, String...)} before. When the tag
   * is appended a message is written to {@code System.err}.
   *
   * @param word the word whose entry should be extended
   * @param tag the tag to append, optionally prefixed with {@code "B-"} or
   *        {@code "I-"}
   */
  public void addTag(String word, String tag) {
    String[] tags = getTags(word);

    String plainTag = tag;
    if (tag.startsWith("B-") || tag.startsWith("I-")) {
      plainTag = tag.substring(2);
    }

    if (tags == null) {
      // Words without an entry are left untouched.
      return;
    }

    if (!arrayContains(plainTag, tags) && knownTags.contains(plainTag)) {
      System.err.println("-- tag not found " + word + ":" + plainTag);
      String[] newTags = Arrays.copyOf(tags, tags.length + 1);
      newTags[tags.length] = plainTag;
      addTags(word, newTags);
    }
  }

  /**
   * Tells whether {@code tags} contains an element equal to {@code tag}.
   *
   * @param tag the tag to look for
   * @param tags the array to search
   * @return {@code true} if an equal element is found
   */
  private boolean arrayContains(String tag, String[] tags) {
    for (String t : tags) {
      if (tag.equals(t)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Creates a {@code MyPOSDictionary} holding the entries of the given
   * dictionary. The entries are copied through
   * {@link #addTags(String, String...)}, so every tag of the original is
   * recorded as known in the copy.
   *
   * @param original the dictionary to copy
   * @return a new dictionary with the same words and tags
   */
  public static MyPOSDictionary createCopy(POSDictionary original) {
    MyPOSDictionary newDict = new MyPOSDictionary();
    for (String word : original) {
      newDict.addTags(word, original.getTags(word));
    }
    return newDict;
  }
}