/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.textnormalizer.frequency; import static org.apache.uima.fit.util.JCasUtil.select; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.exception.ExceptionUtils; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.dkpro.core.api.frequency.provider.FrequencyCountProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.transform.JCasTransformerChangeBased_ImplBase; /** * Takes a text and shortens extra long words */ @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) public class ExpressiveLengtheningNormalizer extends JCasTransformerChangeBased_ImplBase { public static final String FREQUENCY_PROVIDER = "FrequencyProvider"; @ExternalResource(key = FREQUENCY_PROVIDER, mandatory = true) protected FrequencyCountProvider frequencyProvider; @Override public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException { // Pattern for repetitions of one character more than 2 times Pattern moreThanTwo = Pattern.compile("([a-zA-ZäöüÄÖÜß])\\1{2,}"); for (Token token : select(aInput, Token.class)) { if (moreThanTwo.matcher(token.getCoveredText()).find()) { // baseline work: reducing any repetition to a maximum of three repetitions String tokenText = token.getCoveredText().replaceAll("([a-zA-ZäöüÄÖÜß])\\1{3,}", "$1$1$1"); String replacement; try { replacement = getBestReplacement(tokenText); if (replacement.equals("No Candidate has a score higher than 0")) replacement = tokenText; } catch (IOException e) { getLogger().error( "Unable to determine the best replacement, using original. Error: " + ExceptionUtils.getRootCauseMessage(e)); replacement = tokenText; } replace(token.getBegin(), token.getEnd(), replacement); } } } public String getBestReplacement(String token) throws IOException { Pattern pattern = Pattern.compile("([a-zA-ZäöüÄÖÜß])\\1{1,}"); Matcher matcher = pattern.matcher(token); // In case there are no abnormalities if (!matcher.find()) return token; // Collecting the start points of all abnormal parts List<Integer> abnormalities = new ArrayList<Integer>(); matcher.reset(); while (matcher.find()) { abnormalities.add(matcher.start()); } // splitting in parts starting with first character abnormalities List<String> parts = new ArrayList<String>(); for (int i = 0; i < abnormalities.size(); i++) { // in case the token has only one abnormality if (abnormalities.size() == 1) { parts.add(token); break; } // first abnormality if (i == 0) { parts.add(token.substring(0, abnormalities.get(i + 1))); continue; } // last abnormality if (i == abnormalities.size() - 1) { parts.add(token.substring(abnormalities.get(i))); continue; } if (i < abnormalities.size() - 1) { parts.add(token.substring(abnormalities.get(i), abnormalities.get(i + 1))); continue; } } // Fills big list of arrays with all parts and their versions List<String[]> bigList = new ArrayList<String[]>(); for (String part : parts) { String v1 = part.replaceFirst(pattern.pattern(), "$1"); String v2 = part.replaceFirst(pattern.pattern(), "$1$1"); String v3 = part.replaceFirst(pattern.pattern(), "$1$1$1"); bigList.add(new String[] { v1, v2, v3 }); } List<String> candidates = permute(bigList, 0, new ArrayList<String>(), ""); return getMostFrequentCandidate(candidates); } private String getMostFrequentCandidate(List<String> candidates) throws IOException { long bestScore = 0; String bestCandidate = "No Candidate has a score higher than 0"; for (String currentCandidate : candidates) { long currentScore = frequencyProvider.getFrequency(currentCandidate); //System.out.println(currentCandidate + " " + currentScore); if (currentScore > bestScore) { bestScore = currentScore; bestCandidate = currentCandidate; } } return bestCandidate; } private static List<String> permute(List<String[]> listOfArrays, int depth, ArrayList<String> output, String current) { if (depth == listOfArrays.size()) { output.add(current); return output; } for (int i = 0; i < listOfArrays.get(depth).length; ++i) { permute(listOfArrays, depth + 1, output, current + listOfArrays.get(depth)[i]); } return output; } }