/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.textnormalizer; import static de.tudarmstadt.ukp.dkpro.core.castransformation.ApplyChangesAnnotator.OP_REPLACE; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.FileUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.transform.alignment.AlignedString; import de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation; /** * Takes a text and replaces desired expressions This class should not work on tokens as some * expressions might span several tokens * */ @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation" }) @Deprecated public class ReplacementFileNormalizer extends Normalizer_ImplBase { /** * Location of a file which contains all replacing characters */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true) private String replacePath; public static final String PARAM_SRC_SURROUNDINGS = "srcExpressionSurroundings"; @ConfigurationParameter(name = PARAM_SRC_SURROUNDINGS, mandatory = true, defaultValue = "IRRELEVANT") private SrcSurroundings srcExpressionSurroundings; public static final String PARAM_TARGET_SURROUNDINGS = "targetExpressionSurroundings"; @ConfigurationParameter(name = PARAM_TARGET_SURROUNDINGS, mandatory = true, defaultValue = "NOTHING") private TargetSurroundings targetExpressionSurroundings; private String srcSurroundingsStart; private String srcSurroundingsEnd; private String targetSurroundings; public enum SrcSurroundings { ONLY_ALPHANIMERIC, IRRELEVANT } public enum TargetSurroundings { WHITESPACE, NOTHING } protected Map<String, String> replacementMap; @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); replacementMap = getReplacementMap(); switch (srcExpressionSurroundings) { case ONLY_ALPHANIMERIC: srcSurroundingsStart = "([^a-zA-Z0-9äöüß]|^)"; srcSurroundingsEnd = "([^a-zA-Z0-9äöüß]|$)"; break; case IRRELEVANT: srcSurroundingsStart = ""; srcSurroundingsEnd = ""; break; } switch (targetExpressionSurroundings) { case WHITESPACE: targetSurroundings = " "; break; case NOTHING: targetSurroundings = ""; break; } } @Override protected Map<Integer, List<SofaChangeAnnotation>> createSofaChangesMap(JCas jcas) { Map<Integer, List<SofaChangeAnnotation>> changesMap = new TreeMap<Integer, List<SofaChangeAnnotation>>(); int mapKey = 1; String coveredText = jcas.getDocumentText().toLowerCase(); List<SofaChangeAnnotation> scaChangesList = new ArrayList<SofaChangeAnnotation>(); for (Map.Entry<String, String> entry : replacementMap.entrySet()) { String replacementKey = entry.getKey().toLowerCase(); String replacementValue = targetSurroundings + entry.getValue() + targetSurroundings; String regex = srcSurroundingsStart + "(" + Pattern.quote(replacementKey) + ")" + srcSurroundingsEnd; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(coveredText); int groupNumberOfKey = (matcher.groupCount() == 1) ? 1 : 2; while (matcher.find()) { int start = matcher.start(groupNumberOfKey); int end = matcher.end(groupNumberOfKey); SofaChangeAnnotation sca = new SofaChangeAnnotation(jcas); sca.setBegin(start); sca.setEnd(end); sca.setOperation(OP_REPLACE); sca.setValue(replacementValue); scaChangesList.add(sca); System.out.println(matcher.group(0)); } } changesMap.put(mapKey++, scaChangesList); return changesMap; } @SuppressWarnings("serial") @Override protected Map<Integer, Boolean> createTokenReplaceMap(JCas jcas, AlignedString as) throws AnalysisEngineProcessException { return new TreeMap<Integer, Boolean>() { { put(1, true); } }; } private Map<String, String> getReplacementMap() throws ResourceInitializationException { Map<String, String> replacementMap = new HashMap<String, String>(); try { // Reads in all mappings of expressions(to be replaced expression, target expression) // and fills replacement map for (String line : FileUtils.readLines(new File(replacePath))) { if (!line.isEmpty()) { // Each line of source file contains mapping of "to replaced expression" and the // "target expressions" // those expressions are separated by tabs String[] entry = line.split("\t"); replacementMap.put(entry[0], entry[1]); } } } catch (IOException e) { throw new ResourceInitializationException(e); } return replacementMap; } }