/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.core.segmentation;
import static org.junit.Assert.assertEquals;
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;
import org.omegat.util.Language;
/**
* Tests for OmegaT segmentation.
*
* @author Maxym Mykhalchuk
*/
public class SegmenterTest {

    /** Full stop as it appears in the English source text. */
    private static final String EN_FULLSTOP = ".";

    /**
     * Ideographic full stop (code point U+3002) used by the pseudo-translation.
     * The literal must use a single backslash so that the compiler produces the
     * real character; a doubled backslash would yield a six-character ASCII
     * string and the CJK test would never see a CJK character.
     */
    private static final String JA_FULLSTOP = "\u3002";

    /** Segmenter under test, built from the default SRX rules. */
    private final Segmenter segmenter = new Segmenter(SRX.getDefault());

    /**
     * Test of segment method, of class org.omegat.core.segmentation.Segmenter.
     * <p>
     * Two tags and a word separated by blank lines are expected to come back as
     * three segments without the separating line breaks.
     */
    @Test
    public void testSegment() {
        List<StringBuilder> spaces = new ArrayList<StringBuilder>();
        List<Rule> brules = new ArrayList<Rule>();

        List<String> segments = segmenter.segment(new Language("en"), "<br7>\n\n<br5>\n\nother", spaces,
                brules);

        assertEquals(3, segments.size());
        assertEquals("<br7>", segments.get(0));
        assertEquals("<br5>", segments.get(1));
        assertEquals("other", segments.get(2));
    }

    /**
     * Test of glue method, of class org.omegat.core.segmentation.Segmenter.
     * <p>
     * Segmenting a string and gluing the untouched segments back together
     * (English source, French target) must reproduce the original string.
     */
    @Test
    public void testGlue() {
        List<StringBuilder> spaces = new ArrayList<StringBuilder>();
        List<Rule> brules = new ArrayList<Rule>();
        String oldString = "<br7>\n\n<br5>\n\nother";

        List<String> segments = segmenter.segment(new Language("en"), oldString, spaces, brules);
        String newString = segmenter.glue(new Language("en"), new Language("fr"), segments, spaces, brules);

        assertEquals(oldString, newString);
    }

    /**
     * Test of glue method for CJK, of class org.omegat.core.segmentation.Segmenter.
     * <p>
     * Each case pseudo-translates English text to Japanese (see
     * {@link #getPseudoTranslationFromEnToJa(String)}) and checks which
     * whitespace between sentences is present in the glued result.
     */
    @Test
    public void testGlueCJK() {
        // basic combination: plain spaces between sentences are expected to
        // disappear, while line breaks and tabs are expected to remain
        final String source = "Foo. Bar.\nHere.\n\nThere.\r\nThis.\tThat.\n\tOther.";
        final String expected = source.replace(" ", "").replace(EN_FULLSTOP, JA_FULLSTOP);
        assertEquals(expected, getPseudoTranslationFromEnToJa(source));

        // spaces after/before \n
        assertEquals("Foo" + JA_FULLSTOP + "\n Bar" + JA_FULLSTOP,
                getPseudoTranslationFromEnToJa("Foo. \n Bar."));

        // spaces after/before \t
        assertEquals("Foo" + JA_FULLSTOP + "\t Bar" + JA_FULLSTOP,
                getPseudoTranslationFromEnToJa("Foo. \t Bar."));
    }

    /**
     * Segments {@code source} as English, replaces every English full stop in
     * each segment with the ideographic full stop, and glues the segments back
     * together with English as source language and Japanese as target language.
     *
     * @param source
     *            English text to pseudo-translate
     * @return the glued pseudo-translation
     */
    private String getPseudoTranslationFromEnToJa(final String source) {
        List<StringBuilder> spaces = new ArrayList<StringBuilder>();
        List<Rule> brules = new ArrayList<Rule>();
        List<String> segments = segmenter.segment(new Language("en"), source, spaces, brules);

        // pseudo-translation (just replace full-stop char)
        for (int i = 0; i < segments.size(); i++) {
            segments.set(i, segments.get(i).replace(EN_FULLSTOP, JA_FULLSTOP));
        }

        return segmenter.glue(new Language("en"), new Language("ja"), segments, spaces, brules);
    }
}