/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.mecab; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.LanguageCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; import org.apache.uima.util.Logger; import org.chasen.mecab.Tagger; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.resources.PlatformDetector; import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken; /** * Annotator for the MeCab Japanese POS Tagger. */ @LanguageCapability("ja") @TypeCapability( outputs={ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"} ) public class MeCabTagger extends SegmenterBase { private Logger logger; private Tagger tagger; /** * Loads MeCab library from system default paths. Throws and UnsatisfiedLinkError in case the * native code cannot be read. */ @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); logger = getContext().getLogger(); try { tagger = getMeCabJNI(); } catch (IOException e) { throw new ResourceInitializationException(e); } if (tagger == null) { throw new ResourceInitializationException(); } } private Tagger getMeCabJNI() throws ResourceInitializationException, IOException { PlatformDetector pd = new PlatformDetector(); Tagger tagger = null; try { String platform = pd.getPlatformId(); this.getLogger().log(Level.INFO, "Your platform is " + platform); if (platform.equalsIgnoreCase("linux-x86_64")) { tagger = initTagger(platform, "libmecab.so.2.0.0", "libMeCab.so"); } else if (platform.equalsIgnoreCase("linux-x86_32")) { tagger = initTagger(platform, "libmecab.so.2.0.0", "libMeCab.so"); } else if (platform.equalsIgnoreCase("osx-x86_64")) { tagger = initTagger(platform, "libmecab.2.dylib", "libMeCab.so"); } else { throw new ResourceInitializationException(new Throwable("MeCab native code for " + platform + " is not supported")); } } catch (UnsatisfiedLinkError e) { this.getLogger() .log(Level.SEVERE, "Cannot load the MeCab native code.\nMake sure that the system path (i.e. LD_LIBRARY_PATH) contains the library (i.e. libMeCab.so)\n"); throw new ResourceInitializationException(e); } return tagger; } private Tagger initTagger(String platform, String sysLib, String javaWrapper) throws IOException { String prefix = "lib/tagger/jp/bin-" + platform; String packagePrefix = getClass().getPackage().getName().replaceAll("\\.", "/"); File binFolder = ResourceUtils.getClasspathAsFolder("classpath*:" + packagePrefix + "/" + prefix, true); System.load(new File(binFolder, sysLib).getAbsolutePath()); System.load(new File(binFolder, javaWrapper).getAbsolutePath()); // Generate a dummy config file. Mecab does not really need any settings form it, but it // requires that the file is present. File dummyConfigFile = File.createTempFile("mecab", "rc"); dummyConfigFile.deleteOnExit(); String configFile = dummyConfigFile.getAbsolutePath(); // We force a temporary location because Mecab cannot deal with paths containing spaces // and it is quite unlikely that the temp folder has spaces in its path. (See comment // below as well). -- REC 2012-06-03 File dictFolder = ResourceUtils.getClasspathAsFolder("classpath*:" + packagePrefix + "/lib/tagger/jp/ipadic", true); getLogger().log(Level.INFO, "Native library folder: " + binFolder); getLogger().log(Level.INFO, "Dictionary folder: " + dictFolder); // FIXME Mecab tagger cannot deal with folders containing spaces because it uses spaces // to split the parameter string and there is no way implemented to quote parameters. // See param.cpp. There is a static create() method in C++ that acceptsma parameter // count and an array of parameter strings, but this is unusable as it is realized in JNI // at the moment. -- REC 2012-06-02 return new Tagger("-d " + dictFolder.getAbsolutePath() + " -r " + configFile); } @Override public void destroy() { super.destroy(); tagger.delete(); } @Override protected void process(JCas aJCas, String text, int zoneBegin) throws AnalysisEngineProcessException { tag(aJCas, text, zoneBegin); } protected void tag(JCas aJCas, String text, int begin) // , int end { DocumentMetaData docMeta = DocumentMetaData.get(aJCas); String documentId = docMeta.getDocumentId(); this.getLogger().log(Level.INFO, "Start tagging document with id: " + documentId); /* * First, read all morphemes and POS tags. * * The native library seems to have a problem with parseToNode(), parseToString() functions * For now, we have to parse the test from parse() function. */ // Node node = tagger.parseToNode(docText); // for (; node != null; node = node.getNext()) { // System.out.println(node.getSurface() + "\t" + node.getFeature()); // } // System.out.println("EOS\n"); List<String> morphList = new ArrayList<String>(); List<String> posList = new ArrayList<String>(); List<String> baseFormList = new ArrayList<String>(); List<String> readingFormList = new ArrayList<String>(); List<String> iboList = new ArrayList<String>(); List<String> keiList = new ArrayList<String>(); List<String> danList = new ArrayList<String>(); String taggedResult = tagger.parse(text.replaceAll("[\\s]+", " ")); BufferedReader taggedResultReader = new BufferedReader(new StringReader(taggedResult)); try { String line; while ((line = taggedResultReader.readLine()) != null) { String morph = null, pos = null, baseForm = null, readingForm = null, ibo = null, dan = null, kei = null; String[] tokens = line.split("[\\s]+"); morph = tokens[0]; if (tokens.length >= 2) { String[] features = tokens[1].split(","); pos = getPartOfSpeech(features); dan = getDan(features); kei = getKei(features); baseForm = getBaseForm(features, morph); readingForm = getReading(features, morph); ibo = getIBO(morph, features, iboList); } if ((morph == null) && (pos == null) && (baseForm == null)) { logger.log(Level.WARNING, "Morph and pos not found: " + line); continue; } morphList.add(morph); posList.add(pos); baseFormList.add(baseForm); readingFormList.add(readingForm); iboList.add(ibo); danList.add(dan); keiList.add(kei); } } catch (IOException e) { logger.log(Level.WARNING, "Reading results from tagger caused an exception: " + e.getMessage()); } /* * Using the list of morphemes and POS tags, we mark sentence boundaries, as well as * morpheme and POS boundaries. Japanese sentences end with full stop mark (。), exclamation * mark (!), or a question mark (?). Note that these are full-width characters. */ { int curSenBegin = 0; List<String> curMorphList = new ArrayList<String>(); List<String> curPOSList = new ArrayList<String>(); List<String> curBaseFormList = new ArrayList<String>(); List<String> curReadingFormList = new ArrayList<String>(); List<String> curIBOList = new ArrayList<String>(); List<String> curDanList = new ArrayList<String>(); List<String> curKeiList = new ArrayList<String>(); for (int i = 0; i < morphList.size(); i++) { String morph = morphList.get(i); String pos = posList.get(i); String baseForm = baseFormList.get(i); String readingForm = readingFormList.get(i); String ibo = iboList.get(i); String dan = danList.get(i); String kei = keiList.get(i); curMorphList.add(morph); curPOSList.add(pos); curBaseFormList.add(baseForm); curReadingFormList.add(readingForm); curIBOList.add(ibo); curDanList.add(dan); curKeiList.add(kei); if (morph.matches("[。!?]")) { curSenBegin = createSentence(aJCas, text, begin, curSenBegin, curMorphList, curPOSList, curBaseFormList, curReadingFormList, curIBOList, curDanList, curKeiList, begin, curSenBegin); } } // cut off mecab's 'EOS' and its entries in the various lists int morphs = curMorphList.size(); if (curMorphList.get(morphs - 1).equals("EOS")) { curMorphList.remove(morphs - 1); curPOSList.remove(morphs - 1); curBaseFormList.remove(morphs - 1); curReadingFormList.remove(morphs - 1); curIBOList.remove(morphs - 1); curDanList.remove(morphs - 1); curKeiList.remove(morphs - 1); } // process the remaining text if (curMorphList.size() > 0) { curSenBegin = createSentence(aJCas, text, begin, curSenBegin, curMorphList, curPOSList, curBaseFormList, curReadingFormList, curIBOList, curDanList, curKeiList, begin, curSenBegin); } } this.getLogger().log(Level.INFO, "Finished tagging document with id: " + documentId); } private String getReading(String[] features, String fallback) { String readingForm = (features.length > 7) ? features[7] : "*"; if (readingForm.equals("*")) { readingForm = fallback; } return readingForm; } private String getBaseForm(String[] features, String fallback) { String baseForm = features[6]; if (baseForm.equals("*")) { baseForm = fallback; } return baseForm; } private String getKei(String[] features) { String kei = features[5]; if (kei.equals("*")) { kei = ""; } return kei; } private String getDan(String[] features) { String dan = features[4]; if (dan.equals("*")) { dan = ""; } return dan; } private String getPartOfSpeech(String[] features) { StringBuffer posBuf = new StringBuffer(); int i = 0; while (!features[i].equals("*") && (i < features.length) && (i < 4)) { if (posBuf.length() > 0) { posBuf.append("-"); } posBuf.append(features[i]); i++; } return posBuf.toString(); } private int createSentence(JCas aJCas, String text, int begin, int curSenBegin, List<String> curMorphList, List<String> curPOSList, List<String> curBaseFormList, List<String> curReadingFormList, List<String> curIBOList, List<String> curDanList, List<String> curKeiList, int begin2, int curSenBegin2) { curSenBegin = skipBlanksAtBeginningOfString(text, begin, curSenBegin); int curMorphBegin = 0; curMorphBegin = createTokensAddToIndex(text, curMorphList, curPOSList, curBaseFormList, curReadingFormList, curIBOList, curDanList, curKeiList, curMorphBegin, curSenBegin, begin, aJCas); createSentenceAddToIndex(aJCas, begin, curSenBegin, curMorphBegin); curSenBegin += curMorphBegin; clearLists(curMorphList, curPOSList, curBaseFormList, curReadingFormList, curIBOList, curDanList, curKeiList); return curSenBegin; } private void clearLists(List<String> curMorphList, List<String> curPOSList, List<String> curBaseFormList, List<String> curReadingFormList, List<String> curIBOList, List<String> curDanList, List<String> curKeiList) { curMorphList.clear(); curPOSList.clear(); curBaseFormList.clear(); curReadingFormList.clear(); curIBOList.clear(); curDanList.clear(); curKeiList.clear(); } private int skipBlanksAtBeginningOfString(String text, int begin, int curSenBegin) { while ((text.length() > (begin + curSenBegin)) && Character.isWhitespace(text.charAt(begin + curSenBegin))) { curSenBegin++; } return curSenBegin; } private void createSentenceAddToIndex(JCas aJCas, int begin, int curSenBegin, int curMorphBegin) { Sentence curSentence = new Sentence(aJCas, begin + curSenBegin, begin + curSenBegin + curMorphBegin); curSentence.addToIndexes(); } private int createTokensAddToIndex(String text, List<String> curMorphList, List<String> curPOSList, List<String> curBaseFormList, List<String> curReadingFormList, List<String> curIBOList, List<String> curDanList, List<String> curKeiList, int curMorphBegin, int curSenBegin, int begin, JCas aJCas) { for (int j = 0; j < curMorphList.size(); j++) { String curMorph = trimWhitespaces(curMorphList.get(j)); if (!isValidMorph(curMorph)) { continue; } JapaneseToken jpyToken = new JapaneseToken(aJCas, begin + curSenBegin + curMorphBegin, begin + curSenBegin + curMorphBegin + curMorph.length()); jpyToken.setKana(curReadingFormList.get(j)); jpyToken.setIbo(curIBOList.get(j)); jpyToken.setDan(curDanList.get(j)); jpyToken.setKei(curKeiList.get(j)); jpyToken.addToIndexes(); POS curPOS = new POS(aJCas, begin + curSenBegin + curMorphBegin, begin + curSenBegin + curMorphBegin + curMorph.length()); curPOS.setPosValue(curPOSList.get(j)); curPOS.setCoarseValue(curPOS.getClass().equals(POS.class) ? null : curPOS.getType().getShortName().intern()); curPOS.addToIndexes(); String lemmaString = curBaseFormList.get(j); if (lemmaString == null) { lemmaString = jpyToken.getCoveredText(); } Lemma curLemma = new Lemma(aJCas, begin + curSenBegin + curMorphBegin, begin + curSenBegin + curMorphBegin + curMorph.length()); curLemma.setValue(lemmaString); curLemma.addToIndexes(); // set lemma and pos additionally for the token jpyToken.setPos(curPOS); jpyToken.setLemma(curLemma); curMorphBegin += curMorph.length(); // append whitespace after the morph while ((text.length() > (begin + curSenBegin + curMorphBegin)) && Character.isWhitespace(text.charAt(begin + curSenBegin + curMorphBegin))) { curMorphBegin++; } } return curMorphBegin; } private boolean isValidMorph(String curMorph) { if ((curMorph.length() == 1) && Character.isWhitespace(curMorph.charAt(0))) { return false; } if (containsOnlyWhitespacesAndTabs(curMorph)) { return false; } return true; } private boolean containsOnlyWhitespacesAndTabs(String curMorph) { for (int i = 0; i < curMorph.length(); i++) { char c = curMorph.charAt(i); if (!(Character.isWhitespace(c) || (c == '\t'))) { return false; } } return true; } private String trimWhitespaces(String morph) { if (morph.length() == 1) { return morph; } int i = 0; // forward to first non-blank character of morph while ((i < morph.length()) && Character.isWhitespace(morph.charAt(i))) { i++; } // step back until first non-blank character of morph int j = morph.length() - 1; while ((j >= 0) && Character.isWhitespace(morph.charAt(j))) { j--; } if (j < i) { return morph; } return morph.substring(i, j + 1); } /** * Based on a simple heuristic it is attempted to mark the morphemes with I-B-O tags if they * belong to the same word. O = 1-morpheme word B = morpheme marks the beginning of a word I = * morpheme is part of a word * * @param morph a morpheme. * @param features a set of features. * @param iboList a IBO list. * @return the IBO code. */ private String getIBO(String morph, String[] features, List<String> iboList) { String pos = features[0]; String pos_suffix_1 = features[1]; String kei = features[5]; String baseForm = features[6]; String OUTSIDE = "O"; String INSIDE = "I"; String BEGINNING = "B"; String ibo = OUTSIDE; if (isVerb(pos)) { if (isIndependent(pos_suffix_1) && !baseForm.equals(morph)) { if (isBeginning(iboList)) { ibo = BEGINNING; } else { ibo = INSIDE; } } else if (isSuffix(pos_suffix_1)) { ibo = INSIDE; } else if (isIncompleteVerbForm(kei)) { ibo = BEGINNING; } } else if (isAuxilaryVerb(pos)) { if (isBeginning(iboList)) { ibo = BEGINNING; } else { ibo = INSIDE; } } else if (isParticle(pos)) { if (isLinkingParticle(pos_suffix_1)) { ibo = INSIDE; } } else if (isAdjective(pos)) { if (endsOnInformalPastTense(morph, pos_suffix_1)) { ibo = BEGINNING; } else if (isPastTenseEndingSyllablePolite(morph, kei)) { ibo = INSIDE; } } return ibo; } private boolean endsOnInformalPastTense(String morph, String pos_suffix_1) { return (morph.charAt(morph.length() - 1) == 'っ') && pos_suffix_1.equals("自立"); } private boolean isPastTenseEndingSyllablePolite(String morph, String feature) { return morph.equals("た") && feature.equals("基本形"); } private boolean isAdjective(String pos) { return pos.equals("形容詞"); } private boolean isIncompleteVerbForm(String feature) { return feature.equals("未然形"); } private boolean isLinkingParticle(String pos_suffix_1) { return pos_suffix_1.startsWith("接続"); } private boolean isParticle(String pos) { return pos.equals("助詞"); } private boolean isBeginning(List<String> iboList) { int size = iboList.size(); return (size > 1) && iboList.get(size - 1).equals("O"); } private boolean isSuffix(String pos_suffix_1) { return pos_suffix_1.equals("接尾"); } private boolean isAuxilaryVerb(String pos) { return pos.equals("助動詞"); } private boolean isIndependent(String pos_suffix_1) { return pos_suffix_1.equals("自立"); } private boolean isVerb(String pos) { return pos.equals("動詞"); } }