/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.textnormalizer.transformation; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; /** * Simple dictionary-based hyphenation remover. */ public class HyphenationRemover extends JCasTransformerChangeBased_ImplBase { /** * Expect at least one whitespace character behind the dash to avoid * conflating words which may be written with a dash or without, such as * "non-empty" and "nonempty". */ private static final Pattern HYPHEN_PATTERN = Pattern.compile( "\\b(\\p{L}+)-[\\p{Space}]+(\\p{L}+)\\b"); public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) private String modelLocation; public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING; @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8") private String modelEncoding; private Set<String> dict; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); try { URL url = ResourceUtils.resolveLocation(modelLocation); try (InputStream is = url.openStream()) { dict = new HashSet<>(IOUtils.readLines(is ,modelEncoding)); } } catch (IOException e) { throw new ResourceInitializationException(e); } } @Override public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException { StringBuilder c_new = new StringBuilder(); final Matcher m = HYPHEN_PATTERN.matcher(aInput.getDocumentText()); while (m.find()) { // The capturing groups count should be exactly 2. assert m.groupCount() == 2 : "Expected 2 groups but got " + m.groupCount(); c_new.setLength(0); c_new.append(m.group(1)); c_new.append(m.group(2)); if (dict.contains(c_new.toString())) { replace(m.start(1), m.end(2), c_new.toString()); // getLogger().info( // "Conflated: [" + aInput.getDocumentText().substring(m.start(1), m.end(2)) // + "] to [" + c_new + "]"); } } } }