/*
 * Copyright 2014, Language Technologies Institute, Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package cmu.arktweetnlp.impl.features;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;

import cmu.arktweetnlp.Twokenize;
import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
import cmu.arktweetnlp.util.BasicFileIO;

/**
 * Feature extractors that emit features by looking tokens up in word lists:
 * the tag dictionary ({@code TagDictionary.WORD_TO_POS}) and plain-text
 * membership lists loaded from the {@code res/arktweetnlp/} resource folder.
 */
public class WordListFeatures {

    /**
     * Emits features from the tags that {@code TagDictionary.WORD_TO_POS}
     * lists for each token, falling back to normalized and "fuzzed" forms of
     * the token when the exact form is not in the dictionary.
     */
    public static class POSTagDict implements FeatureExtractorInterface {
        Pattern URL = Pattern.compile(Twokenize.url);
        /** Matches a run of at least three ASCII letters anywhere in a token. */
        Pattern letter = Pattern.compile("[A-Za-z]{3,}");

        /**
         * For every token with a dictionary entry, adds
         * {@code "POSTagDict|<tag>"} for the first listed tag at the token's
         * position, {@code "NextPOSTag|<tag>"} at the previous position and
         * {@code "PrevPOSTag|<tag>"} at the next position. Each further tag
         * {@code i} (1-based among the rest) is added at the token's position
         * as {@code "POSTagDict|<tag>"} with the value {@code (n - i) / n},
         * where {@code n} is the number of tags listed.
         *
         * @param tokens the tokens of one sentence/tweet
         * @param pairs  collector that receives (position, feature) pairs
         */
        public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
            for (int t = 0; t < tokens.size(); t++) {
                String tok = tokens.get(t);
                List<String> poses = lookup(tok);
                if (poses == null) {
                    continue;
                }

                String firstTag = poses.get(0);
                pairs.add(t, "POSTagDict|" + firstTag);
                if (t > 0) {
                    pairs.add(t - 1, "NextPOSTag|" + firstTag);
                }
                if (t < tokens.size() - 1) {
                    pairs.add(t + 1, "PrevPOSTag|" + firstTag);
                }
                for (int i = 1; i < poses.size(); i++) {
                    pairs.add(t, "POSTagDict|" + poses.get(i),
                            (poses.size() - (double) i) / poses.size());
                }
            }
        }

        /**
         * Looks a token up in the tag dictionary: exact form first, then (only
         * for tokens containing three or more consecutive ASCII letters that
         * are not entirely a URL match) the {@code normalizecap} form, then
         * the fuzzed variants in order, stopping at the first hit.
         *
         * @return the dictionary entry found, or {@code null} if none matched
         */
        private List<String> lookup(String tok) {
            List<String> poses = TagDictionary.WORD_TO_POS.get(tok);
            if (poses != null || !letter.matcher(tok).find() || URL.matcher(tok).matches()) {
                return poses;
            }

            String normtok = FeatureUtil.normalizecap(tok);
            poses = TagDictionary.WORD_TO_POS.get(normtok);
            if (poses != null) {
                return poses;
            }

            ArrayList<String> fuzz = new ArrayList<String>();
            fuzz.addAll(FeatureUtil.fuzztoken(normtok, true));
            fuzz.addAll(FeatureUtil.fuzztoken(FeatureUtil.normalize(tok), true));
            for (String f : fuzz) {
                poses = TagDictionary.WORD_TO_POS.get(f);
                if (poses != null) {
                    //System.err.println(tok+"->"+f);
                    break;
                }
            }
            return poses;
        }
    }

    /*
     * If you add a new list, make sure to run "mvn install -q" so it's copied
     * to the target folder or else you'll get a null pointer exception
     */
    /**
     * Emits a single feature, named after the list, for every token that is a
     * member of a word list loaded from {@code res/arktweetnlp/<list name>}.
     */
    public static class Listofnames implements FeatureExtractorInterface {
        // Compiled once instead of on every String.replaceAll call per token.
        private static final Pattern SINGLE_QUOTE_VARIANTS = Pattern.compile("[‘’´`]");
        private static final Pattern DOUBLE_QUOTE_VARIANTS = Pattern.compile("[“”]");

        String Listname="";
        HashSet<String> members;

        /**
         * @param str name of the list resource; also used as the feature name
         * @throws IOException if the list resource cannot be read
         */
        public Listofnames(String str) throws IOException {
            Listname = str;
            this.members = initDict(Listname);
        }

        /**
         * Adds the feature {@code Listname} at every position whose token,
         * lower-cased and with curly/accent quote characters mapped to
         * {@code '} and {@code "}, is contained in the list.
         *
         * @param tokens the tokens of one sentence/tweet
         * @param pairs  collector that receives (position, feature) pairs
         */
        public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
            for (int t = 0; t < tokens.size(); t++) {
                String tok = tokens.get(t);
                String straightSingles =
                        SINGLE_QUOTE_VARIANTS.matcher(tok.toLowerCase()).replaceAll("'");
                String normaltok =
                        DOUBLE_QUOTE_VARIANTS.matcher(straightSingles).replaceAll("\"");
                if (members.contains(normaltok)) {
                    pairs.add(t, Listname);
                }
            }
        }
    }

    /**
     * Emits tag features for tokens whose Metaphone encoding, prefixed with
     * {@code **MP**}, is a key of {@code TagDictionary.WORD_TO_POS}.
     */
    public static class MetaphonePOSDict implements FeatureExtractorInterface {

        /**
         * For every token longer than one character, adds
         * {@code "metaph_POSDict|<tag>"} at the token's position for each tag
         * listed under the token's {@code **MP**<metaphone>} key.
         *
         * @param tokens the tokens of one sentence/tweet
         * @param pairs  collector that receives (position, feature) pairs
         */
        public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
            for (int t = 0; t < tokens.size(); t++) {
                String tok = tokens.get(t);
                if (tok.length() <= 1) {
                    continue;
                }
                //tok=tok.replace("4", "four").replace("2", "two").replace("3","three");
                String metaphone_word = MetaphoneFeatures.getMetaphone().encode(tok);
                String key = String.format("**MP**%s", metaphone_word);
                // Single lookup with a null check, as POSTagDict does, instead of
                // containsKey followed by get.
                List<String> poses = TagDictionary.WORD_TO_POS.get(key);
                if (poses != null) {
                    for (String pos : poses) {
                        pairs.add(t, "metaph_POSDict|" + pos);
                    }
                }
            }
        }
    }

    /**
     * Reads the resource {@code res/arktweetnlp/<dict>} line by line and
     * returns the set of its lines, lower-cased. The reader is closed before
     * returning, whether or not reading succeeded.
     *
     * @param dict resource name relative to {@code res/arktweetnlp/}
     * @return the lower-cased lines of the resource
     * @throws IOException if the resource cannot be opened or closed
     */
    private static HashSet<String> initDict(String dict) throws IOException {
        BufferedReader bReader = BasicFileIO.getResourceReader("res/arktweetnlp/" + dict);
        try {
            HashSet<String> dictset = new HashSet<String>();
            String line = BasicFileIO.getLine(bReader);
            while (line != null) {
                dictset.add(line.toLowerCase());
                line = BasicFileIO.getLine(bReader);
            }
            return dictset;
        } finally {
            // Previously the reader was never closed, leaking the underlying stream.
            if (bReader != null) {
                bReader.close();
            }
        }
    }
}