/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** @author Wei Li <a href="mailto:weili@cs.umass.edu">weili@cs.umass.edu</a> */ package cc.mallet.share.weili.ner; import java.util.regex.*; import java.io.*; import cc.mallet.pipe.*; import cc.mallet.types.*; public class WordTransformation { static final String[] endings = new String[] {"ing", "ed", "ogy", "s", "ly", "ion", "tion", "ity", "ies"}; static Pattern[] endingPatterns = new Pattern[endings.length]; static final String[][][] endingNames = new String[2][3][endings.length]; { for (int i = 0; i < endings.length; i++) { endingPatterns[i] = Pattern.compile (".*"+endings[i]+"$"); for (int j = 0; j < 3; j++) { for (int k = 0; k < 2; k++) endingNames[k][j][i] = "W"+(k==1?"-":"")+j+"=<END"+endings[i]+">"; } } } boolean doSpelling; boolean doDigitCollapses; boolean doDowncasing; public WordTransformation () { this (false, true, false); } public WordTransformation (boolean doSpelling, boolean doDigitCollapses, boolean doDowncasing) { this.doSpelling = doSpelling; this.doDigitCollapses = doDigitCollapses; this.doDowncasing = doDowncasing; } public Token transformedToken (String original) { boolean [][] ending = new boolean[3][endings.length]; boolean [][] endingp1 = new boolean[3][endings.length]; boolean [][] endingp2 = new boolean[3][endings.length]; String word = original; if (doDigitCollapses) { if (word.matches ("19\\d\\d")) word = "<YEAR>"; else if (word.matches ("19\\d\\ds")) word = "<YEARDECADE>"; else if (word.matches ("19\\d\\d-\\d+")) word = "<YEARSPAN>"; else if (word.matches ("\\d+\\\\/\\d")) word = "<FRACTION>"; else if (word.matches ("\\d[\\d,\\.]*")) word = "<DIGITS>"; else if (word.matches ("19\\d\\d-\\d\\d-\\d--d")) word = "<DATELINEDATE>"; else if (word.matches ("19\\d\\d-\\d\\d-\\d\\d")) word = "<DATELINEDATE>"; else if (word.matches (".*-led")) word = "<LED>"; else if (word.matches (".*-sponsored")) word = "<LED>"; } if (doDowncasing) word = word.toLowerCase(); Token token = new Token (word); if (doSpelling) { for (int j = 0; j < endings.length; j++) { ending[2][j] = ending[1][j]; ending[1][j] = ending[0][j]; ending[0][j] = endingPatterns[j].matcher(word).matches(); if (ending[0][j]) token.setFeatureValue (endingNames[0][0][j], 1); } } return token; } }