/*
* Copyright 2011
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.web1t;
import java.io.IOException;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.io.web1t.util.Web1TConverter;
/**
 * Web1T n-gram index format writer.
 * <p>
 * Every processed CAS is handed to a {@link Web1TConverter}, which collects the n-grams. Once the
 * whole collection has been processed, the converter is asked to create the final index.
 */
@MimeTypeCapability({MimeTypes.TEXT_X_NGRAM})
@TypeCapability(
        inputs={
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"})
public class Web1TWriter
    extends JCasAnnotator_ImplBase
{
    /**
     * Types to generate n-grams from.
     *
     * Example: {@code Token.class.getName() + "/pos/PosValue"} for part-of-speech n-grams
     */
    public static final String PARAM_INPUT_TYPES = "inputTypes";
    @ConfigurationParameter(name = PARAM_INPUT_TYPES, mandatory = true)
    private Set<String> inputPaths;

    /**
     * Location to which the output is written.
     */
    public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION;
    @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true)
    private String outputPath;

    /**
     * Character encoding of the output data.
     *
     * Default: {@code UTF-8}
     */
    public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING;
    @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = false, defaultValue = "UTF-8")
    private String outputEncoding;

    /**
     * Minimum n-gram length. Must be at least 1.
     *
     * Default: {@code 1}
     */
    public static final String PARAM_MIN_NGRAM_LENGTH = "minNgramLength";
    @ConfigurationParameter(name = PARAM_MIN_NGRAM_LENGTH, mandatory = false, defaultValue = "1")
    private int minNgramLength;

    /**
     * Maximum n-gram length. Must not be smaller than the minimum n-gram length.
     *
     * Default: {@code 3}
     */
    public static final String PARAM_MAX_NGRAM_LENGTH = "maxNgramLength";
    @ConfigurationParameter(name = PARAM_MAX_NGRAM_LENGTH, mandatory = false, defaultValue = "3")
    private int maxNgramLength;

    /**
     * Create a lower case index.
     *
     * Default: {@code false}
     */
    public static final String PARAM_LOWERCASE = "lowercase";
    @ConfigurationParameter(name = PARAM_LOWERCASE, mandatory = false, defaultValue = "false")
    private boolean lowercase;

    /**
     * Create the indexes that jWeb1T needs to operate.
     *
     * Default: {@code true}
     */
    public static final String PARAM_CREATE_INDEXES = "createIndexes";
    @ConfigurationParameter(name = PARAM_CREATE_INDEXES, mandatory = false, defaultValue = "true")
    private boolean createIndexes;

    /**
     * Specifies the minimum frequency an n-gram must have to be written to the final index. The
     * value is inclusive. With the default of 1, every n-gram that occurs at least once is
     * written.
     *
     * Default: {@code 1}
     */
    public static final String PARAM_MIN_FREQUENCY = "minFreq";
    @ConfigurationParameter(name = PARAM_MIN_FREQUENCY, mandatory = false, defaultValue = "1")
    private int minFreq;

    /**
     * The input file(s) are split into smaller files for quick access. A separate file is created
     * if the first two starting letters (or the starting letter if the word has a length of one
     * character) account for at least x% of all starting letters in the input file(s). Words whose
     * starting characters do not reach the threshold are written together into a single file for
     * miscellaneous words. A high threshold leads to only a few, but large files and most likely
     * a very large miscellaneous file. A low threshold results in many small files. Use zero or a
     * negative value to write everything to one file.
     * <p>
     * Note: the misspelling in the constant name and in the parameter name is kept for backwards
     * compatibility with existing pipelines.
     *
     * Default: {@code 1.0} (percent)
     */
    public static final String PARAM_SPLIT_TRESHOLD = "splitFileTreshold";
    @ConfigurationParameter(name = PARAM_SPLIT_TRESHOLD, mandatory = false, defaultValue = "1.0")
    private float splitThreshold;

    /**
     * The type being used for segments, given as a fully qualified type name. The type must be
     * defined in the type system of the processed CAS.
     *
     * Default: {@code de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence}
     */
    public static final String PARAM_CONTEXT_TYPE = "contextType";
    @ConfigurationParameter(name = PARAM_CONTEXT_TYPE, mandatory = true, defaultValue="de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    protected String contextType;

    /** Collects the n-grams of all processed CASes and creates the index at the end. */
    private Web1TConverter converter;

    /**
     * Validates the n-gram length range, creates the {@link Web1TConverter} for the configured
     * target location and passes the remaining configuration parameters on to it.
     *
     * @param context
     *            the UIMA context providing the configuration parameters.
     * @throws ResourceInitializationException
     *             if the n-gram length range is invalid or the converter cannot be created.
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        // Reject an impossible range up front, before the converter touches the target location.
        if (minNgramLength < 1 || maxNgramLength < minNgramLength) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Invalid n-gram length range: " + PARAM_MIN_NGRAM_LENGTH + "="
                            + minNgramLength + ", " + PARAM_MAX_NGRAM_LENGTH + "="
                            + maxNgramLength + " (expected 1 <= min <= max)"));
        }

        try {
            this.converter = new Web1TConverter(outputPath, minNgramLength, maxNgramLength);
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }

        converter.setWriteIndexes(createIndexes);
        converter.setSplitThreshold(splitThreshold);
        converter.setMinFrequency(minFreq);
        converter.setToLowercase(lowercase);
        converter.setOutputEncoding(outputEncoding);
    }

    /**
     * Adds the n-grams of the given CAS to the converter, using the configured input paths and
     * context type.
     *
     * @param jcas
     *            the CAS to collect n-grams from.
     * @throws AnalysisEngineProcessException
     *             if the configured context type is not defined in the type system of the CAS or
     *             if the converter fails with an I/O error.
     */
    @Override
    public void process(JCas jcas)
        throws AnalysisEngineProcessException
    {
        // The type system lookup yields null for an unknown type name. Fail here with a
        // descriptive message instead of handing null to the converter.
        if (jcas.getCas().getTypeSystem().getType(contextType) == null) {
            throw new AnalysisEngineProcessException(new IllegalStateException(
                    "Context type [" + contextType + "] configured via " + PARAM_CONTEXT_TYPE
                            + " is not defined in the type system"));
        }

        try {
            converter.add(jcas, inputPaths, jcas.getCas().getTypeSystem().getType(contextType));
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Asks the converter to create the final index. The input files for each n-gram level are
     * read and split according to the frequency of the words' starting letters in the files; the
     * split files are individually sorted and consolidated.
     *
     * @throws AnalysisEngineProcessException
     *             if the converter fails with an I/O error while creating the index.
     */
    @Override
    public void collectionProcessComplete()
        throws AnalysisEngineProcessException
    {
        super.collectionProcessComplete();

        try {
            converter.createIndex();
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }
}