/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.api.io.sequencegenerator;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.uima.jcas.JCas;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Create String sequences from JCas annotations. Use
* {@link PhraseSequenceGenerator.Builder#buildStringSequenceGenerator()} to create instances of
* this class.
* <p>
* Either create a single token sequence from the whole document, or multiple sequences based on
* covering annotations, e.g. one sequence for each sentence.
* <p>
* By default, the sequences are created from {@link Token}s found in the input document. In order
* to use other annotations, e.g. lemmas, specify the feature path in
* {@link PhraseSequenceGenerator.Builder#featurePath(String)}.
*
* @since 1.9.0
*/
public class StringSequenceGenerator
{
    /** Underlying generator producing the phrase sequences; assigned once at construction. */
    private final PhraseSequenceGenerator psg;

    /**
     * Create a generator backed by the {@link PhraseSequenceGenerator} built from the given
     * builder.
     *
     * @param builder
     *            the configured {@link PhraseSequenceGenerator.Builder}.
     * @throws IOException
     *             if building the underlying {@link PhraseSequenceGenerator} fails.
     */
    protected StringSequenceGenerator(PhraseSequenceGenerator.Builder builder)
        throws IOException
    {
        psg = builder.build();
    }

    /**
     * Generate a list of String sequences.
     * <p>
     * Every phrase sequence delivered by the underlying {@link PhraseSequenceGenerator} is
     * converted into a string array; phrases without text (null or empty) are left out.
     *
     * @param aJCas
     *            the {@link JCas} to generate sequences from.
     * @return a list of string arrays.
     * @throws FeaturePathException
     *             if there was a problem creating the feature path.
     */
    public List<String[]> tokenSequences(JCas aJCas)
        throws FeaturePathException
    {
        return psg.tokenSequences(aJCas).stream()
                .map(this::phrases2String)
                .collect(Collectors.toList());
    }

    /**
     * Convert {@link LexicalPhrase} arrays to string arrays by extracting their texts.
     * <p>
     * Phrases whose text is {@code null} or empty are skipped, so the result may be shorter than
     * the input array.
     *
     * @param phrases
     *            an array of {@link LexicalPhrase}s.
     * @return an array of the non-empty phrase texts, in input order.
     */
    private String[] phrases2String(LexicalPhrase[] phrases)
    {
        return Stream.of(phrases)
                .map(LexicalPhrase::getText)
                // Guard against phrases without a text value: treat null like an empty text
                // instead of failing with a NullPointerException in the middle of the stream.
                .filter(text -> text != null && !text.isEmpty())
                .toArray(String[]::new);
    }
}