/*Copyright 2014, Language Technologies Institute, Carnegie Mellon
University
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
*/
package cmu.arktweetnlp.impl.features;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import cmu.arktweetnlp.Twokenize;
import cmu.arktweetnlp.impl.features.FeatureExtractor.FeatureExtractorInterface;
import cmu.arktweetnlp.impl.features.FeatureExtractor.PositionFeaturePairs;
public class MiscFeatures {
/**
 * Emits features describing the token that follows each position.
 * <p>
 * For every position except the last, three features are added: the raw next
 * token and the {@code FeatureUtil.normalize}d next token (both under
 * {@code nextword|}, each passed with a third argument of .5 -- presumably a
 * feature value/weight), and a {@code currnext|} pair built from the
 * normalized current and next tokens. The last position gets
 * {@code <END>} markers instead.
 */
public static class NextWord implements FeatureExtractorInterface{
    /**
     * @param tokens tokens of one input sequence; an empty list yields no features
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        // With no tokens there is no "last position"; without this guard the
        // <END> features below would index element -1 and throw.
        if (tokens.isEmpty()) {
            return;
        }
        ArrayList<String> normtoks = FeatureUtil.normalize(tokens);
        final int last = tokens.size() - 1;
        for (int t = 0; t < last; t++) {
            pairs.add(t, "nextword|" + tokens.get(t + 1), .5);
            pairs.add(t, "nextword|" + normtoks.get(t + 1), .5);
            pairs.add(t, "currnext|" + normtoks.get(t) + "|" + normtoks.get(t + 1));
        }
        // The final token has no successor.
        pairs.add(last, "currnext|" + normtoks.get(last) + "|<END>");
        pairs.add(last, "nextword|<END>");
    }
}
/**
 * Emits {@code next2words|} features built from the two tokens that follow
 * each position, once from the raw tokens and once from the output of
 * {@code FeatureUtil.normalize}. The second-to-last position pairs the final
 * token with an {@code <END>} marker. Sequences with fewer than two tokens
 * produce nothing.
 */
public static class Next2Words implements FeatureExtractorInterface{
    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, feature, .5) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        if (tokens.size() <= 1) {
            return;
        }
        ArrayList<String> normtoks = FeatureUtil.normalize(tokens);
        final int finalIdx = tokens.size() - 1;
        final int penultimate = tokens.size() - 2;

        int pos = 0;
        while (pos < penultimate) {
            String rawPair = tokens.get(pos + 1) + "|" + tokens.get(pos + 2);
            String normPair = normtoks.get(pos + 1) + "|" + normtoks.get(pos + 2);
            pairs.add(pos, "next2words|" + rawPair, .5);
            pairs.add(pos, "next2words|" + normPair, .5);
            pos++;
        }

        // Only one real token remains after the penultimate position.
        pairs.add(penultimate, "next2words|" + normtoks.get(finalIdx) + "|<END>", .5);
        pairs.add(penultimate, "next2words|" + tokens.get(finalIdx) + "|<END>", .5);
    }
}
/**
 * Emits features describing the token that precedes each position.
 * <p>
 * Position 0 gets {@code <START>} markers. Every later position gets the raw
 * previous token and the {@code FeatureUtil.normalize}d previous token (both
 * under {@code prevword|}), plus a {@code prevcurr|} pair built from the
 * normalized previous and current tokens.
 */
public static class PrevWord implements FeatureExtractorInterface{
    /**
     * @param tokens tokens of one input sequence; an empty list yields no features
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        // With no tokens there is no position 0; without this guard the
        // <START> features below would read element 0 of an empty list and throw.
        if (tokens.isEmpty()) {
            return;
        }
        ArrayList<String> normtoks = FeatureUtil.normalize(tokens);
        // The first token has no predecessor.
        pairs.add(0, "prevword|<START>");
        pairs.add(0, "prevcurr|<START>|" + normtoks.get(0));
        for (int t = 1; t < tokens.size(); t++) {
            pairs.add(t, "prevword|" + tokens.get(t - 1));
            pairs.add(t, "prevword|" + normtoks.get(t - 1));
            pairs.add(t, "prevcurr|" + normtoks.get(t - 1) + "|" + normtoks.get(t));
        }
    }
}
/**
 * Emits {@code prev2words|} features. Position 1 gets a single feature made
 * of a {@code <START>} marker followed by the normalized tokens at positions
 * 0 and 1. Each position from 2 onward gets the two preceding tokens, once
 * raw and once as returned by {@code FeatureUtil.normalize}. Sequences with
 * fewer than two tokens produce nothing.
 */
public static class Prev2Words implements FeatureExtractorInterface{
    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        if (tokens.size() <= 1) {
            return;
        }
        ArrayList<String> normtoks = FeatureUtil.normalize(tokens);

        String startFeature = "prev2words|<START>|" + normtoks.get(0) + "|" + normtoks.get(1);
        pairs.add(1, startFeature);

        final int count = tokens.size();
        for (int pos = 2; pos < count; pos++) {
            String rawContext = tokens.get(pos - 2) + "|" + tokens.get(pos - 1);
            String normContext = normtoks.get(pos - 2) + "|" + normtoks.get(pos - 1);
            pairs.add(pos, "prev2words|" + rawContext);
            pairs.add(pos, "prev2words|" + normContext);
        }
    }
}
/**
 * Emits a {@code prevnext|} feature for each position, built from the
 * {@code FeatureUtil.normalize}d tokens on either side of it. The first
 * position uses {@code <START>} as its left neighbour and the last position
 * uses {@code <END>} as its right neighbour. Sequences with fewer than two
 * tokens produce nothing.
 */
public static class PrevNext implements FeatureExtractorInterface{
    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        ArrayList<String> normtoks = FeatureUtil.normalize(tokens);
        if (tokens.size() <= 1) {
            return;
        }

        // Left edge: no predecessor.
        pairs.add(0, "prevnext|<START>|" + normtoks.get(1));

        // Interior positions have both neighbours.
        final int lastPos = normtoks.size() - 1;
        int pos = 1;
        while (pos < lastPos) {
            String left = normtoks.get(pos - 1);
            String right = normtoks.get(pos + 1);
            pairs.add(pos, "prevnext|" + left + "|" + right);
            pos++;
        }

        // Right edge: no successor.
        String beforeLast = normtoks.get(tokens.size() - 2);
        pairs.add(lastPos, "prevnext|" + beforeLast + "|<END>");
    }
}
/**
 * Emits one capitalization-shape feature per token that contains at least
 * one letter. Tokens without letters receive no feature.
 * <p>
 * Base labels:
 * <pre>
 *   A     => shortcap
 *   HELLO => longcap
 *   Hello => initcap
 *   HeLLo => mixcap
 *   hello => nocap
 * </pre>
 * The label is prefixed with {@code pos-} when the token ends in {@code 's};
 * otherwise with {@code first-} when the token is at position 0.
 */
public static class CapitalizationFeatures implements FeatureExtractorInterface {
    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, label) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int pos = 0; pos < tokens.size(); pos++) {
            String word = tokens.get(pos);

            int letters = 0;
            int uppers = 0;
            for (char c : word.toCharArray()) {
                if (Character.isLetter(c)) {
                    letters++;
                }
                if (Character.isUpperCase(c)) {
                    uppers++;
                }
            }

            if (letters < 1) {
                continue;
            }

            String label = baseLabel(word, letters, uppers);
            if (word.endsWith("'s")) {
                label = "pos-" + label;
            } else if (pos == 0) {
                label = "first-" + label;
            }
            pairs.add(pos, label);
        }
    }

    /** Chooses the un-prefixed shape label from the letter and uppercase counts of a token. */
    private static String baseLabel(String word, int letters, int uppers) {
        if (letters == uppers) {
            // Every letter is uppercase: distinguish by how many letters there are.
            return letters <= 1 ? "shortcap" : "longcap";
        }
        if (letters < 2) {
            return "nocap";
        }
        // From here on the token has at least two letters, so it is non-empty.
        char first = word.charAt(0);
        if (uppers == 1 && Character.isUpperCase(first)) {
            return "initcap";
        }
        // Tokens starting with '@' or "http://" are never labelled mixcap.
        if (uppers >= 1 && first != '@' && !word.startsWith("http://")) {
            return "mixcap";
        }
        return "nocap";
    }
}
/**
 * Emits simple orthographic indicator features per token:
 * {@code HasDigit} (contains an ASCII digit), {@code InitAt} (starts with
 * '@'), {@code InitHash} (starts with '#'), {@code Emoticon} (whole token
 * matches the {@code Twokenize.emoticon} pattern) and {@code Hyphenated}
 * (contains '-'). Hyphenated tokens additionally get a {@code hyph|} feature
 * for each of the (at most two) pieces obtained by splitting the
 * {@code FeatureUtil.normalize}d token at its first hyphen.
 */
public static class SimpleOrthFeatures implements FeatureExtractorInterface {
    public Pattern hasDigit = Pattern.compile("[0-9]");
    /** TODO change to punctuation class, or better from Twokenize **/
    //Pattern allPunct = Pattern.compile("^[^a-zA-Z0-9]*$");
    // Not used by addFeatures below (the AllPunct feature is disabled).
    Pattern allPunct = Pattern.compile("^\\W*$");
    Pattern emoticon = Pattern.compile(Twokenize.emoticon);

    /**
     * @param tokens tokens of one input sequence; empty-string tokens are tolerated
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int t = 0; t < tokens.size(); t++) {
            String tok = tokens.get(t);
            if (hasDigit.matcher(tok).find()) {
                pairs.add(t, "HasDigit");
            }
            // startsWith is used rather than charAt(0) so that an empty token
            // simply gets no feature instead of throwing.
            if (tok.startsWith("@")) {
                pairs.add(t, "InitAt");
            }
            if (tok.startsWith("#")) {
                pairs.add(t, "InitHash");
            }
            /*if (allPunct.matcher(tok).matches()){
                pairs.add(t, "AllPunct");
            }*/
            if (emoticon.matcher(tok).matches()) {
                pairs.add(t, "Emoticon");
            }
            if (tok.contains("-")) {
                pairs.add(t, "Hyphenated");
                // Limit 2: split only at the first hyphen; an empty piece
                // (e.g. for a leading hyphen) is kept and emitted as "hyph|".
                String[] splithyph = FeatureUtil.normalize(tok).split("-", 2);
                for (String part : splithyph) {
                    pairs.add(t, "hyph|" + part);
                }
            }
        }
    }
}
/**
 * Emits a {@code validURL} feature for each token that fully matches the
 * {@code Twokenize.url} pattern, and again for each token that fully matches
 * the {@code Twokenize.Email} pattern.
 * <p>
 * NOTE(review): e-mail matches are reported under the same {@code validURL}
 * name as URL matches -- confirm this is intended before renaming.
 */
public static class URLFeatures implements FeatureExtractorInterface {
    Pattern validURL = Pattern.compile(Twokenize.url);
    Pattern validEmail = Pattern.compile(Twokenize.Email);

    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        // Checked in this order for every token; each match adds one entry.
        final Pattern[] checks = { validURL, validEmail };
        for (int pos = 0; pos < tokens.size(); pos++) {
            String word = tokens.get(pos);
            for (Pattern check : checks) {
                if (check.matcher(word).matches()) {
                    pairs.add(pos, "validURL");
                }
            }
        }
    }
}
/**
 * Emits word-identity and word-shape features per token:
 * <ul>
 *   <li>{@code Word|} -- the token with curly/back/acute quotes mapped to
 *       straight ASCII quotes;</li>
 *   <li>{@code Lower|} -- {@code FeatureUtil.normalize} applied to that
 *       quote-mapped token;</li>
 *   <li>{@code Xxdshape|} -- the quote-mapped token with ASCII lowercase,
 *       digits and uppercase replaced by {@code x}, {@code d}, {@code X}
 *       (third argument .5);</li>
 *   <li>{@code charclass|} -- comma-terminated {@code Character.getType}
 *       codes of the original token (third argument .5).</li>
 * </ul>
 */
public static class WordformFeatures implements FeatureExtractorInterface {
    private static final Pattern SINGLE_QUOTE_VARIANTS = Pattern.compile("[‘’´`]");
    private static final Pattern DOUBLE_QUOTE_VARIANTS = Pattern.compile("[“”]");

    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, feature[, .5]) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        final int count = tokens.size();
        for (int pos = 0; pos < count; pos++) {
            String raw = tokens.get(pos);
            String singlesFixed = SINGLE_QUOTE_VARIANTS.matcher(raw).replaceAll("'");
            String quotesFixed = DOUBLE_QUOTE_VARIANTS.matcher(singlesFixed).replaceAll("\"");

            pairs.add(pos, "Word|" + quotesFixed);
            pairs.add(pos, "Lower|" + FeatureUtil.normalize(quotesFixed));
            pairs.add(pos, "Xxdshape|" + xxdShape(quotesFixed), .5);
            // The character-class shape is computed on the untouched token.
            pairs.add(pos, "charclass|" + charClassShape(raw), .5);
        }
    }

    /** Maps ASCII a-z to 'x', 0-9 to 'd' and A-Z to 'X'; all other chars are kept. */
    private String xxdShape(String tok) {
        StringBuilder shape = new StringBuilder(tok.length());
        for (int i = 0; i < tok.length(); i++) {
            char c = tok.charAt(i);
            if (c >= 'a' && c <= 'z') {
                shape.append('x');
            } else if (c >= '0' && c <= '9') {
                shape.append('d');
            } else if (c >= 'A' && c <= 'Z') {
                shape.append('X');
            } else {
                shape.append(c);
            }
        }
        return shape.toString();
    }

    /**
     * Joins the {@code Character.getType} code at every char index, each
     * followed by a comma. The walk is per char index, not per code point,
     * so a surrogate pair contributes two entries.
     */
    private String charClassShape(String tok) {
        StringBuilder codes = new StringBuilder(3 * tok.length());
        int idx = 0;
        while (idx < tok.length()) {
            int type = Character.getType(tok.codePointAt(idx));
            codes.append(type);
            codes.append(',');
            idx++;
        }
        return codes.toString();
    }
}
/**
 * Emits character-prefix features for each token. The token first has its
 * curly/back/acute quotes mapped to straight ASCII quotes and is then passed
 * through {@code FeatureUtil.normalize}; for every length {@code i} from 1 up
 * to the configured maximum (or the token length, if shorter) a feature
 * {@code <i>gramPref|<prefix>} is added.
 */
public static class NgramPrefix implements FeatureExtractorInterface {
    int ngram=3;

    /** @param i maximum prefix length to emit */
    public NgramPrefix(int i) {
        ngram=i;
    }

    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int pos = 0; pos < tokens.size(); pos++) {
            String quotesFixed = tokens.get(pos)
                    .replaceAll("[‘’´`]", "'")
                    .replaceAll("[“”]", "\"");
            String word = FeatureUtil.normalize(quotesFixed);

            // Never ask for a prefix longer than the word itself.
            int longest = Math.min(ngram, word.length());
            for (int len = 1; len <= longest; len++) {
                pairs.add(pos, len + "gramPref|" + word.substring(0, len));
            }
        }
    }
}
/**
 * Emits character-suffix features for each token. The token first has its
 * curly/back/acute quotes mapped to straight ASCII quotes and is then passed
 * through {@code FeatureUtil.normalize}; for every length {@code i} from 1 up
 * to the configured maximum (or the token length, if shorter) a feature
 * {@code <i>gramSuff|<suffix>} is added.
 */
public static class NgramSuffix implements FeatureExtractorInterface {
    int ngram=3;

    /** @param i maximum suffix length to emit */
    public NgramSuffix(int i) {
        ngram=i;
    }

    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        for (int pos = 0; pos < tokens.size(); pos++) {
            String quotesFixed = tokens.get(pos)
                    .replaceAll("[‘’´`]", "'")
                    .replaceAll("[“”]", "\"");
            String word = FeatureUtil.normalize(quotesFixed);

            // Never ask for a suffix longer than the word itself.
            int wordLen = word.length();
            int longest = Math.min(ngram, wordLen);
            for (int len = 1; len <= longest; len++) {
                pairs.add(pos, len + "gramSuff|" + word.substring(wordLen - len));
            }
        }
    }
}
/**
 * Emits position-indicator features near both ends of the sequence:
 * {@code t=<index>} for up to the first four positions, and
 * {@code t=-<index>} for up to the last three positions.
 * <p>
 * NOTE(review): the tail feature embeds the absolute index (e.g. the last
 * token of a 10-token sequence gets {@code t=-9}), not the distance from the
 * end, and the tail loop covers three positions where the head loop covers
 * four. Confirm whether this is intended; changing it would alter emitted
 * feature names.
 */
public static class Positions implements FeatureExtractorInterface {
    /**
     * @param tokens tokens of one input sequence
     * @param pairs  sink that receives the (position, feature) entries
     */
    public void addFeatures(List<String> tokens, PositionFeaturePairs pairs) {
        final int size = tokens.size();

        // Head of the sequence, counting up.
        final int headEnd = Math.min(size, 4);
        int idx = 0;
        while (idx < headEnd) {
            pairs.add(idx, "t=" + idx);
            idx++;
        }

        // Tail of the sequence, counting down and stopping above the floor.
        final int tailFloor = Math.max(size - 4, -1);
        idx = size - 1;
        while (idx > tailFloor) {
            pairs.add(idx, "t=-" + idx);
            idx--;
        }
    }
}
}