/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.icu.segmentation; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Tokenizer; /** Test tokenizing Myanmar text into syllables */ public class TestMyanmarSyllable extends BaseTokenStreamTestCase { Analyzer a; @Override public void setUp() throws Exception { super.setUp(); a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, false)); return new TokenStreamComponents(tokenizer); } }; } @Override public void tearDown() throws Exception { a.close(); super.tearDown(); } /** as opposed to dictionary break of သက်ဝင်|လှုပ်ရှား|စေ|ပြီး */ public void testBasics() throws Exception { assertAnalyzesTo(a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] { "သက်", "ဝင်", "လှုပ်", "ရှား", "စေ", "ပြီး" }); } // simple tests from "A Rule-based Syllable Segmentation of Myanmar Text" // * http://www.aclweb.org/anthology/I08-3010 // (see also the presentation: http://gii2.nagaokaut.ac.jp/gii/media/share/20080901-ZMM%20Presentation.pdf) // The words are fake, we just test the categories. // note that currently our algorithm is not sophisticated enough to handle some of the special cases! /** constant */ public void testC() throws Exception { assertAnalyzesTo(a, "ကက", new String[] { "က", "က" }); } /** consonant + sign */ public void testCF() throws Exception { assertAnalyzesTo(a, "ကံကံ", new String[] { "ကံ", "ကံ" }); } /** consonant + consonant + asat */ public void testCCA() throws Exception { assertAnalyzesTo(a, "ကင်ကင်", new String[] { "ကင်", "ကင်" }); } /** consonant + consonant + asat + sign */ public void testCCAF() throws Exception { assertAnalyzesTo(a, "ကင်းကင်း", new String[] { "ကင်း", "ကင်း" }); } /** consonant + vowel */ public void testCV() throws Exception { assertAnalyzesTo(a, "ကာကာ", new String[] { "ကာ", "ကာ" }); } /** consonant + vowel + sign */ public void testCVF() throws Exception { assertAnalyzesTo(a, "ကားကား", new String[] { "ကား", "ကား" }); } /** consonant + vowel + vowel + asat */ public void testCVVA() throws Exception { assertAnalyzesTo(a, "ကော်ကော်", new String[] { "ကော်", "ကော်" }); } /** consonant + vowel + vowel + consonant + asat */ public void testCVVCA() throws Exception { assertAnalyzesTo(a, "ကောင်ကောင်", new String[] { "ကောင်", "ကောင်" }); } /** consonant + vowel + vowel + consonant + asat + sign */ public void testCVVCAF() throws Exception { assertAnalyzesTo(a, "ကောင်းကောင်း", new String[] { "ကောင်း", "ကောင်း" }); } /** consonant + medial */ public void testCM() throws Exception { assertAnalyzesTo(a, "ကျကျ", new String[] { "ကျ", "ကျ" }); } /** consonant + medial + sign */ public void testCMF() throws Exception { assertAnalyzesTo(a, "ကျံကျံ", new String[] { "ကျံ", "ကျံ" }); } /** consonant + medial + consonant + asat */ public void testCMCA() throws Exception { assertAnalyzesTo(a, "ကျင်ကျင်", new String[] { "ကျင်", "ကျင်" }); } /** consonant + medial + consonant + asat + sign */ public void testCMCAF() throws Exception { assertAnalyzesTo(a, "ကျင်းကျင်း", new String[] { "ကျင်း", "ကျင်း" }); } /** consonant + medial + vowel */ public void testCMV() throws Exception { assertAnalyzesTo(a, "ကျာကျာ", new String[] { "ကျာ", "ကျာ" }); } /** consonant + medial + vowel + sign */ public void testCMVF() throws Exception { assertAnalyzesTo(a, "ကျားကျား", new String[] { "ကျား", "ကျား" }); } /** consonant + medial + vowel + vowel + asat */ public void testCMVVA() throws Exception { assertAnalyzesTo(a, "ကျော်ကျော်", new String[] { "ကျော်", "ကျော်" }); } /** consonant + medial + vowel + vowel + consonant + asat */ public void testCMVVCA() throws Exception { assertAnalyzesTo(a, "ကြောင်ကြောင်", new String[] { "ကြောင်", "ကြောင်"}); } /** consonant + medial + vowel + vowel + consonant + asat + sign */ public void testCMVVCAF() throws Exception { assertAnalyzesTo(a, "ကြောင်းကြောင်း", new String[] { "ကြောင်း", "ကြောင်း"}); } /** independent vowel */ public void testI() throws Exception { assertAnalyzesTo(a, "ဪဪ", new String[] { "ဪ", "ဪ" }); } /** independent vowel */ public void testE() throws Exception { assertAnalyzesTo(a, "ဣဣ", new String[] { "ဣ", "ဣ" }); } }