/*
* Copyright (C) 2015, VistaTEC or third-party contributors as indicated
* by the @author tags or express copyright attribution statements applied by
* the authors. All third-party contributions are distributed under license by
* VistaTEC.
*
* This file is part of Ocelot.
*
* Ocelot is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Ocelot is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, write to:
*
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301
* USA
*
* Also, see the full LGPL text here: <http://www.gnu.org/copyleft/lesser.html>
*/
package com.vistatec.ocelot.xliff.okapi;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.junit.Test;
import com.vistatec.ocelot.segment.model.OcelotSegment;
import com.vistatec.ocelot.segment.model.SimpleSegment;
public class TestOkapiXliff20Parser {
/**
 * Parses the {@code XLIFF2.0_example.xlf} resource (looked up relative to
 * this class) and checks that the result is non-empty and matches the
 * segments from {@link #getGoalSegments()}, ignoring whitespace.
 */
@Test
public void testParser() throws URISyntaxException, IOException {
    final File exampleXliff = new File(
            TestOkapiXliff20Parser.class.getResource("XLIFF2.0_example.xlf").toURI());
    final List<OcelotSegment> parsed = new OkapiXLIFF20Parser().parse(exampleXliff);
    final List<OcelotSegment> expected = getGoalSegments();

    // Guard against a parser that silently yields nothing.
    assertFalse(parsed.isEmpty());
    compareSegmentsIgnoringWhitespace(parsed, expected);
}
/**
 * Parses the {@code LQE_xliff_2.0.xlf} resource (looked up relative to this
 * class) and compares the result, ignoring whitespace, against the segments
 * from {@link #getTagGoalSegments()}.
 */
@Test
public void testTagParser() throws URISyntaxException, IOException {
    final File taggedXliff = new File(
            TestOkapiXliff20Parser.class.getResource("LQE_xliff_2.0.xlf").toURI());
    final List<OcelotSegment> parsed = new OkapiXLIFF20Parser().parse(taggedXliff);
    compareSegmentsIgnoringWhitespace(parsed, getTagGoalSegments());
}
/**
 * Parses the {@code /xliff20/noTargets.xlf} classpath resource and expects
 * exactly six segments back.
 * NOTE(review): judging by the resource name, the file presumably contains
 * units without target elements; confirm against the fixture.
 */
@Test
public void testMissingTarget() throws Exception {
    final File noTargetsXliff = new File(
            getClass().getResource("/xliff20/noTargets.xlf").toURI());
    final OkapiXLIFF20Parser xliffParser = new OkapiXLIFF20Parser();
    final int segmentCount = xliffParser.parse(noTargetsXliff).size();
    assertEquals(6, segmentCount);
}
/**
 * Asserts that two segment lists have the same length and that, pairwise,
 * the source and target display texts are equal once all whitespace
 * characters have been removed.
 *
 * @param testSegs segments under test (e.g. produced by the parser)
 * @param goalSegs expected segments, in the same order
 */
public void compareSegmentsIgnoringWhitespace(List<OcelotSegment> testSegs, List<OcelotSegment> goalSegs) {
    final String whitespace = "\\s";
    Iterator<OcelotSegment> testIter = testSegs.iterator();
    Iterator<OcelotSegment> goalIter = goalSegs.iterator();
    int index = 0;
    while (testIter.hasNext()) {
        // Without this check, a surplus test segment would surface as a
        // NoSuchElementException from goalIter.next() rather than as a
        // readable assertion failure.
        assertTrue("Found more segments than the " + goalSegs.size() + " expected",
                goalIter.hasNext());
        OcelotSegment testSeg = testIter.next();
        OcelotSegment goalSeg = goalIter.next();
        assertEquals("Source text mismatch at segment index " + index,
                goalSeg.getSource().getDisplayText().replaceAll(whitespace, ""),
                testSeg.getSource().getDisplayText().replaceAll(whitespace, ""));
        assertEquals("Target text mismatch at segment index " + index,
                goalSeg.getTarget().getDisplayText().replaceAll(whitespace, ""),
                testSeg.getTarget().getDisplayText().replaceAll(whitespace, ""));
        index++;
    }
    // Any expected segment left over means the test list was too short.
    assertFalse("Found only " + index + " segments but expected " + goalSegs.size(),
            goalIter.hasNext());
}
/**
 * Builds the six plain-text segments, numbered 1 to 6, that
 * {@code testParser} compares against the parsed
 * {@code XLIFF2.0_example.xlf}. Segments 1 to 5 have an empty target;
 * segment 6 has identical source and target text.
 *
 * @return the expected segments in document order
 */
public List<OcelotSegment> getGoalSegments() {
    List<OcelotSegment> expected = new ArrayList<>();
    expected.add(plainSegment(1, "Sentence 1. Sentence 2.", ""));
    expected.add(plainSegment(2, "Sentence 3 (no-trans). Sentence 4 (no-trans).", ""));
    expected.add(plainSegment(3, "Sentence 5.", ""));
    expected.add(plainSegment(4, "Sentence 6 (no-trans).", ""));
    expected.add(plainSegment(5, "Sentence 7. Sentence 8. ", ""));
    expected.add(plainSegment(6,
            "Sentence with A. Sentence with <cp hex=\"0001\"/>. ",
            "Sentence with A. Sentence with <cp hex=\"0001\"/>. "));
    return expected;
}

/** Builds a numbered segment whose source and target are plain text only. */
private OcelotSegment plainSegment(int number, String sourceText, String targetText) {
    SimpleSegment.Builder builder = new SimpleSegment.Builder().segmentNumber(number);
    builder.source().text(sourceText);
    builder.target().text(targetText);
    return builder.build();
}
/**
 * Builds the eight expected segments that {@code testTagParser} compares
 * against the parsed {@code LQE_xliff_2.0.xlf}. Each segment mixes plain
 * text with inline codes; no segment number is set on these builders.
 * <p>
 * NOTE(review): the three {@code code(...)} arguments appear to be
 * (id, display form, original XLIFF markup); confirm against the builder.
 *
 * @return the expected segments in document order
 */
public List<OcelotSegment> getTagGoalSegments() {
    List<OcelotSegment> segs = new ArrayList<>();

    // <mrk> annotation with translate="no"; source and target differ in the
    // display tag and in the annotated text.
    SimpleSegment.Builder seg1 = new SimpleSegment.Builder();
    seg1.source().text("Sentence 1.").code("1", "<mrk1>", "<mrk id=\"1\" type=\"its:its\" translate=\"no\">")
            .text("LQI").code("1", "</mrk1>", "</mrk>").text(" Sentence 2.");
    seg1.target().text("Sentence 1.").code("1", "<mrk2>", "<mrk id=\"1\" type=\"its:its\" translate=\"no\">")
            .text("Prov").code("1", "</mrk2>", "</mrk>").text(" Sentence 2.");
    segs.add(seg1.build());

    // Plain text only; the <cp .../> here is literal text, not a code.
    SimpleSegment.Builder seg2 = new SimpleSegment.Builder();
    seg2.source().text("Sentence with A. Sentence with <cp hex=\"0001\"/>. ");
    seg2.target().text("Sentence with A. Sentence with <cp hex=\"0001\"/>. ");
    segs.add(seg2.build());

    // Standalone <ph> placeholder.
    SimpleSegment.Builder seg3 = new SimpleSegment.Builder();
    seg3.source().text("Ph element ").code("1", "<phph1/>", "<ph id=\"ph1\"/>").text(" #1.");
    seg3.target().text("Ph element ").code("1", "<phph1/>", "<ph id=\"ph1\"/>").text(" #1.");
    segs.add(seg3.build());

    // Paired <pc> code wrapping text.
    SimpleSegment.Builder seg4 = new SimpleSegment.Builder();
    seg4.source().text("Pc element ").code("1", "<pcpc1>", "<pc id=\"pc1\">").text("Important")
            .code("1", "</pcpc1>", "</pc>").text(" #1.");
    seg4.target().text("Pc element ").code("1", "<pcpc1>", "<pc id=\"pc1\">")
            .text("Important").code("1", "</pcpc1>", "</pc>").text(" #1.");
    segs.add(seg4.build());

    // Overlapping <sc>/<ec> spans: sc1 opens, sc2 opens, sc1 closes, sc2 closes.
    // The source's first code previously lacked the closing "/>" that the
    // target's matching code carries; both now use the same markup.
    SimpleSegment.Builder seg5 = new SimpleSegment.Builder();
    seg5.source().text("Text in ").code("1", "<scsc1/>", "<sc id=\"sc1\"/>")
            .text("bold ").code("1", "<pcsc2>", "<sc id=\"sc2\"/>")
            .text("and").code("1", "<ecsc1/>", "<ec startRef=\"sc1\"/>")
            .text(" italics").code("1", "</pcsc2>", "<ec startRef=\"sc2\"/>")
            .text(".");
    seg5.target().text("Text in ").code("1", "<scsc1/>", "<sc id=\"sc1\"/>")
            .text("bold ").code("1", "<pcsc2>", "<sc id=\"sc2\"/>")
            .text("and").code("1", "<ecsc1/>", "<ec startRef=\"sc1\"/>")
            .text(" italics").code("1", "</pcsc2>", "<ec startRef=\"sc2\"/>")
            .text(".");
    segs.add(seg5.build());

    // <mrk> annotation with translate="yes".
    SimpleSegment.Builder seg6 = new SimpleSegment.Builder();
    seg6.source().text("Mrk element ").code("1", "<mrkmrk1>", "<mrk id=\"mrk1\" translate=\"yes\">")
            .text("Important").code("1", "</mrkmrk1>", "</mrk>").text(" #1.");
    seg6.target().text("Mrk element ").code("1", "<mrkmrk1>", "<mrk id=\"mrk1\" translate=\"yes\">")
            .text("Important").code("1", "</mrkmrk1>", "</mrk>").text(" #1.");
    segs.add(seg6.build());

    // <sm> start marker with no end marker in this segment.
    SimpleSegment.Builder seg7 = new SimpleSegment.Builder();
    seg7.source().text("Sm split element ").code("1", "<smsm1/>", "<sm id=\"sm1\" translate=\"no\"/>")
            .text(" #1.");
    seg7.target().text("Sm split element ").code("1", "<smsm1/>", "<sm id=\"sm1\" translate=\"no\"/>")
            .text(" #1.");
    segs.add(seg7.build());

    // <em> end marker referencing sm1, which is opened in the previous segment.
    SimpleSegment.Builder seg8 = new SimpleSegment.Builder();
    seg8.source().text("Em split element ").code("1", "<emsm1/>", "<em startRef=\"sm1\"/>")
            .text(" #1.");
    seg8.target().text("Em split element ").code("1", "<emsm1/>", "<em startRef=\"sm1\"/>")
            .text(" #1.");
    segs.add(seg8.build());

    return segs;
}
}