/* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * <p> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.text; import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.PhraseSequenceGenerator; import de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator.StringSequenceGenerator; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import java.io.IOException; import java.io.OutputStream; /** * This class writes a set of pre-processed documents into a large text file containing one sentence * per line and tokens split by whitespaces. Optionally, annotations other than tokens (e.g. lemmas) * are written as specified by {@link #PARAM_FEATURE_PATH}. */ @MimeTypeCapability({MimeTypes.TEXT_PLAIN}) @TypeCapability( inputs={ "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"}) public class TokenizedTextWriter extends JCasFileWriter_ImplBase { private static final String TOKEN_SEPARATOR = " "; private static final String NUMBER_REPLACEMENT = "NUM"; private static final String STOPWORD_REPLACEMENT = "STOP"; private static final String DEFAULT_COVERING_TYPE = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"; /** * Encoding for the target file. Default is UTF-8. */ public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = "UTF-8") private String targetEncoding; /** * The feature path, e.g. * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token/lemma/value} for lemmas. Default: * {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token} (i.e. token texts). */ public static final String PARAM_FEATURE_PATH = "featurePath"; /** * All tokens that match this regex are replaced by {@code NUM}. Examples: * <ul> * <li>^[0-9]+$ * <li>^[0-9,\.]+$ * <li>^[0-9]+(\.[0-9]*)?$ * </ul> * <p> * Make sure that these regular expressions are fit to the segmentation, e.g. if your work on * tokens, your tokenizer might split prefixes such as + and - from the rest of the number. */ @ConfigurationParameter(name = PARAM_FEATURE_PATH, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token") private String featurePath; public static final String PARAM_NUMBER_REGEX = "numberRegex"; @ConfigurationParameter(name = PARAM_NUMBER_REGEX, mandatory = true, defaultValue = "") private String numberRegex; /** * All the tokens listed in this file (one token per line) are replaced by {@code STOP}. Empty * lines and lines starting with {@code #} are ignored. Casing is ignored. */ public static final String PARAM_STOPWORDS_FILE = "stopwordsFile"; @ConfigurationParameter(name = PARAM_STOPWORDS_FILE, mandatory = true, defaultValue = "") private String stopwordsFile; /** * Set the output file extension. Default: {@code .txt}. */ public static final String PARAM_EXTENSION = "extension"; @ConfigurationParameter(name = PARAM_EXTENSION, mandatory = true, defaultValue = ".txt") private String extension = ".txt"; /** * In the output file, each unit of the covering type is written into a separate line. The default * (set in {@link #DEFAULT_COVERING_TYPE}), is sentences so that each sentence is written to a line. * <p> * If no linebreaks within a document is desired, set this value to {@code null}. */ public static final String PARAM_COVERING_TYPE = "coveringType"; @ConfigurationParameter(name = PARAM_COVERING_TYPE, mandatory = true, defaultValue = DEFAULT_COVERING_TYPE) private String coveringType; private StringSequenceGenerator sequenceGenerator; @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); try { sequenceGenerator = new PhraseSequenceGenerator.Builder() .featurePath(featurePath) .filterRegex(numberRegex) .filterRegexReplacement(NUMBER_REPLACEMENT) .stopwordsFile(stopwordsFile) .stopwordsReplacement(STOPWORD_REPLACEMENT) .coveringType(coveringType) .buildStringSequenceGenerator(); } catch (IOException e) { throw new ResourceInitializationException(e); } } /* * (non-Javadoc) * * @see * org.apache.uima.analysis_component.JCasAnnotator_ImplBase#process(org.apache.uima.jcas.JCas) */ @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { try { OutputStream outputStream = getOutputStream(aJCas, extension); /* iterate over sentences */ for (String[] line : sequenceGenerator.tokenSequences(aJCas)) { if (line.length > 0) { /* write first token */ outputStream.write(line[0].getBytes(targetEncoding)); /* write remaining tokens with token separator */ for (int i = 1; i < line.length; i++) { outputStream.write((TOKEN_SEPARATOR + line[i]).getBytes(targetEncoding)); } } outputStream.write(System.lineSeparator().getBytes(targetEncoding)); } } catch (FeaturePathException | IOException e) { throw new AnalysisEngineProcessException(e); } } @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { if (getTargetLocation() == null) { getLogger().info("Output written to file <stdout>"); } else { getLogger().info("Output written to file " + getTargetLocation()); } super.collectionProcessComplete(); } }