/* Copyright (c) 2007 Arizona State University, Dept. of Computer Science and Dept. of Biomedical Informatics. This file is part of the BANNER Named Entity Recognition System, http://banner.sourceforge.net This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file 'LICENSE.txt' included with this distribution. */ package de.berlin.hu.banner.featuresets; import banner.types.Sentence; import banner.types.Token; import cc.mallet.pipe.Pipe; import cc.mallet.types.Instance; import cc.mallet.types.TokenSequence; import java.util.List; /** * Accesses the succeeding token and checks whether it is a whitespace character. * * @author Bob */ public class RWhitespace extends Pipe { private static final long serialVersionUID = 1L; private String prefix; public RWhitespace(String prefix) { this.prefix = prefix; } @Override public Instance pipe(Instance carrier) { // TODO Configure folding characters into their class & test variants Sentence sentence = (Sentence)carrier.getSource(); List<Token> tokens = sentence.getTokens(); TokenSequence ts = (TokenSequence)carrier.getData(); for (int i = 0; i < ts.size(); i++) { Token bannerToken = tokens.get(i); cc.mallet.types.Token token = ts.get(i); // Add features to token String sentenceText = bannerToken.getSentence().getText(); int end = bannerToken.getEnd(); if (end < sentenceText.length()) { char ch = sentenceText.substring(end, end + 1).charAt(0); if (Character.isWhitespace(ch)) token.setFeatureValue("RWHITESPACE", 1); } } return carrier; } }