package com.vistatec.ocelot.tm.okapi; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.List; import net.sf.okapi.common.Event; import net.sf.okapi.common.FileUtil; import net.sf.okapi.common.LocaleId; import net.sf.okapi.common.resource.ITextUnit; import net.sf.okapi.common.resource.RawDocument; import net.sf.okapi.common.resource.TextContainer; import net.sf.okapi.filters.tmx.TmxFilter; import net.sf.okapi.tm.pensieve.common.TranslationUnit; import net.sf.okapi.tm.pensieve.common.TranslationUnitVariant; import net.sf.okapi.tm.pensieve.writer.PensieveWriter; /** * Parse TMs in the TMX 1.4 file format and index the segments for use in * Pensieve. */ public class OkapiTmTmxImporter { private LocaleId sourceLocale, targetLocale; public void parse(File tmx, PensieveWriter writer) throws IOException { List<String> locales = FileUtil.guessLanguages(tmx.getAbsolutePath()); sourceLocale = (locales.size() >= 1) ? LocaleId.fromString(locales.get(0)) : LocaleId.EMPTY; targetLocale = (locales.size() >= 2) ? LocaleId.fromString(locales.get(1)) : LocaleId.EMPTY; RawDocument rawDoc = new RawDocument(new FileInputStream(tmx), "UTF-8", sourceLocale, targetLocale); TmxFilter filter = new TmxFilter(); filter.open(rawDoc); while (filter.hasNext()) { Event event = filter.next(); if (event.isTextUnit()) { ITextUnit tu = event.getTextUnit(); indexTranslationUnit(tu, writer); } } } private void indexTranslationUnit(ITextUnit tu, PensieveWriter writer) { TextContainer srcTu = tu.getSource(); TextContainer tgtTu = tu.getTarget(targetLocale); TranslationUnit pensieveTu = new TranslationUnit( new TranslationUnitVariant(sourceLocale, srcTu.getUnSegmentedContentCopy()), new TranslationUnitVariant(targetLocale, tgtTu.getUnSegmentedContentCopy())); writer.indexTranslationUnit(pensieveTu); } }