/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; import static org.apache.uima.fit.util.JCasUtil.select; import java.util.Locale; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; /** * Change tokens to follow a specific casing: all upper case, all lower case, or 'normal case': * lowercase everything but the first character of a token and the characters immediately following * a hyphen. 
* <p>
* All case conversions use the locale derived from the document language of the input CAS.
*/
public class TokenCaseTransformer extends JCasTransformerChangeBased_ImplBase {
    private static final String HYPHEN = "-";

    /** The casing modes supported by this transformer. */
    public enum Case {
        UPPERCASE, LOWERCASE, NORMALCASE
    }

    /**
     * The case to convert tokens to:
     * <ul>
     * <li>UPPERCASE: uppercase everything.</li>
     * <li>LOWERCASE: lowercase everything.</li>
     * <li>NORMALCASE: retain first letter in word and after hyphens, lowercase everything else.</li>
     * </ul>
     */
    public static final String PARAM_CASE = "tokenCase";
    @ConfigurationParameter(name = PARAM_CASE, mandatory = true)
    private Case tokenCase;

    /**
     * Converts the covered text of every {@link Token} in the input to the configured case and
     * passes each token whose text actually changed to {@code replace(begin, end, text)}.
     *
     * @param aInput
     *            the CAS whose tokens and document language are read.
     * @param aOutput
     *            the target CAS; it is not accessed directly by this method.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method; not thrown directly here.
     * @throws IllegalStateException
     *             if the configured case is none of the known {@link Case} values.
     */
    @Override
    public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
        Locale locale = Locale.forLanguageTag(aInput.getDocumentLanguage());

        for (Token token : select(aInput, Token.class)) {
            String origTokenText = token.getCoveredText();
            if (origTokenText.isEmpty()) {
                // Nothing to convert; an empty token can never differ from its original text.
                continue;
            }

            String filteredToken;
            switch (tokenCase) {
            case UPPERCASE:
                filteredToken = origTokenText.toUpperCase(locale);
                break;
            case LOWERCASE:
                filteredToken = origTokenText.toLowerCase(locale);
                break;
            case NORMALCASE:
                filteredToken = toNormalCase(origTokenText, locale);
                break;
            default:
                throw new IllegalStateException("Unknown case parameter [" + tokenCase + "]");
            }

            // Only register a change when the conversion actually altered the token.
            if (!filteredToken.equals(origTokenText)) {
                replace(token.getBegin(), token.getEnd(), filteredToken);
            }
        }
    }

    /**
     * Lowercases {@code text} but keeps the original character at position 0 and at every
     * position immediately following a hyphen.
     *
     * @param text
     *            the token text; must not be empty.
     * @param locale
     *            the locale used for lowercasing, so that this mode behaves consistently with
     *            the UPPERCASE and LOWERCASE modes instead of depending on the JVM default.
     * @return the text in 'normal case'.
     */
    private static String toNormalCase(String text, Locale locale) {
        String lowered = text.toLowerCase(locale);
        if (lowered.length() != text.length()) {
            // Lowercasing may lengthen a string (e.g. U+0130 becomes two chars outside Turkic
            // locales). Indices of the original and the lowercased text then no longer line up,
            // so restoring characters by index would pick wrong characters or run out of bounds.
            return toNormalCaseBySegment(text, locale);
        }

        StringBuilder normalized = new StringBuilder(lowered);
        normalized.setCharAt(0, text.charAt(0));

        /* after hyphen, retain original case */
        int hyphenPosition = text.indexOf(HYPHEN);
        while (hyphenPosition != -1 && hyphenPosition + 1 < text.length()) {
            normalized.setCharAt(hyphenPosition + 1, text.charAt(hyphenPosition + 1));
            hyphenPosition = text.indexOf(HYPHEN, hyphenPosition + 1);
        }
        return normalized.toString();
    }

    /**
     * Fallback for texts whose lowercase form has a different length: processes the text in
     * segments that each end with a hyphen (or the end of the text), copying the first
     * character of a segment unchanged and lowercasing the remainder of the segment.
     *
     * @param text
     *            the token text.
     * @param locale
     *            the locale used for lowercasing.
     * @return the text in 'normal case'.
     */
    private static String toNormalCaseBySegment(String text, Locale locale) {
        StringBuilder normalized = new StringBuilder(text.length() + 2);
        int start = 0;
        while (start < text.length()) {
            int hyphenPosition = text.indexOf(HYPHEN, start);
            // A segment includes its trailing hyphen so the next segment starts right after it.
            int end = hyphenPosition == -1 ? text.length() : hyphenPosition + 1;
            normalized.append(text.charAt(start));
            normalized.append(text.substring(start + 1, end).toLowerCase(locale));
            start = end;
        }
        return normalized.toString();
    }
}