/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.gosen; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.LanguageCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import de.tudarmstadt.ukp.dkpro.core.api.parameter.Messages; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; import net.java.sen.SenFactory; import net.java.sen.StringTagger; import net.java.sen.dictionary.Token; /** * Segmenter for Japanese text based on GoSen. */ @LanguageCapability("ja") @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }) public class GosenSegmenter extends SegmenterBase { @Override protected void process(JCas aJCas, String text, int zoneBegin) throws AnalysisEngineProcessException { String language = getLanguage(aJCas); if (!"ja".equals(language)) { throw new AnalysisEngineProcessException(Messages.BUNDLE, Messages.ERR_UNSUPPORTED_LANGUAGE, new String[] { language }); } StringTagger tagger = SenFactory.getStringTagger(null); List<Token> tokens = new ArrayList<>(); try { tokens = tagger.analyze(text, tokens); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } int sentenceBegin = -1; for (Token t : tokens) { Annotation ut = createToken(aJCas, t.getStart() + zoneBegin, t.end() + zoneBegin); if (sentenceBegin == -1) { sentenceBegin = ut.getBegin(); } // End of sentence? if ("。".equals(ut.getCoveredText())) { createSentence(aJCas, sentenceBegin, ut.getEnd()); sentenceBegin = -1; } } } }