/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package opennlp.tools.postag; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.util.Cache; import opennlp.tools.util.StringList; /** * A context generator for the POS Tagger. */ public class DefaultPOSContextGenerator implements POSContextGenerator { protected final String SE = "*SE*"; protected final String SB = "*SB*"; private static final int PREFIX_LENGTH = 4; private static final int SUFFIX_LENGTH = 4; private static Pattern hasCap = Pattern.compile("[A-Z]"); private static Pattern hasNum = Pattern.compile("[0-9]"); private Cache<String, String[]> contextsCache; private Object wordsKey; private Dictionary dict; private String[] dictGram; /** * Initializes the current instance. * * @param dict */ public DefaultPOSContextGenerator(Dictionary dict) { this(0,dict); } /** * Initializes the current instance. * * @param cacheSize * @param dict */ public DefaultPOSContextGenerator(int cacheSize, Dictionary dict) { this.dict = dict; dictGram = new String[1]; if (cacheSize > 0) { contextsCache = new Cache<>(cacheSize); } } protected static String[] getPrefixes(String lex) { String[] prefs = new String[PREFIX_LENGTH]; for (int li = 0; li < PREFIX_LENGTH; li++) { prefs[li] = lex.substring(0, Math.min(li + 1, lex.length())); } return prefs; } protected static String[] getSuffixes(String lex) { String[] suffs = new String[SUFFIX_LENGTH]; for (int li = 0; li < SUFFIX_LENGTH; li++) { suffs[li] = lex.substring(Math.max(lex.length() - li - 1, 0)); } return suffs; } public String[] getContext(int index, String[] sequence, String[] priorDecisions, Object[] additionalContext) { return getContext(index,sequence,priorDecisions); } /** * Returns the context for making a pos tag decision at the specified token index * given the specified tokens and previous tags. * @param index The index of the token for which the context is provided. * @param tokens The tokens in the sentence. * @param tags The tags assigned to the previous words in the sentence. * @return The context for making a pos tag decision at the specified token index * given the specified tokens and previous tags. */ public String[] getContext(int index, Object[] tokens, String[] tags) { String next, nextnext = null, lex, prev, prevprev = null; String tagprev, tagprevprev; tagprev = tagprevprev = null; lex = tokens[index].toString(); if (tokens.length > index + 1) { next = tokens[index + 1].toString(); if (tokens.length > index + 2) nextnext = tokens[index + 2].toString(); else nextnext = SE; // Sentence End } else { next = SE; // Sentence End } if (index - 1 >= 0) { prev = tokens[index - 1].toString(); tagprev = tags[index - 1]; if (index - 2 >= 0) { prevprev = tokens[index - 2].toString(); tagprevprev = tags[index - 2]; } else { prevprev = SB; // Sentence Beginning } } else { prev = SB; // Sentence Beginning } String cacheKey = index + tagprev + tagprevprev; if (contextsCache != null) { if (wordsKey == tokens) { String[] cachedContexts = contextsCache.get(cacheKey); if (cachedContexts != null) { return cachedContexts; } } else { contextsCache.clear(); wordsKey = tokens; } } List<String> e = new ArrayList<>(); e.add("default"); // add the word itself e.add("w=" + lex); dictGram[0] = lex; if (dict == null || !dict.contains(new StringList(dictGram))) { // do some basic suffix analysis String[] suffs = getSuffixes(lex); for (int i = 0; i < suffs.length; i++) { e.add("suf=" + suffs[i]); } String[] prefs = getPrefixes(lex); for (int i = 0; i < prefs.length; i++) { e.add("pre=" + prefs[i]); } // see if the word has any special characters if (lex.indexOf('-') != -1) { e.add("h"); } if (hasCap.matcher(lex).find()) { e.add("c"); } if (hasNum.matcher(lex).find()) { e.add("d"); } } // add the words and pos's of the surrounding context if (prev != null) { e.add("p=" + prev); if (tagprev != null) { e.add("t=" + tagprev); } if (prevprev != null) { e.add("pp=" + prevprev); if (tagprevprev != null) { e.add("t2=" + tagprevprev + "," + tagprev); } } } if (next != null) { e.add("n=" + next); if (nextnext != null) { e.add("nn=" + nextnext); } } String[] contexts = e.toArray(new String[e.size()]); if (contextsCache != null) { contextsCache.put(cacheKey,contexts); } return contexts; } }