/* * Copyright (C) 2014-2015, VistaTEC or third-party contributors as indicated * by the @author tags or express copyright attribution statements applied by * the authors. All third-party contributions are distributed under license by * VistaTEC. * * This file is part of Ocelot. * * Ocelot is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Ocelot is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, write to: * * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 * USA * * Also, see the full LGPL text here: <http://www.gnu.org/copyleft/lesser.html> */ package com.vistatec.ocelot.xliff.okapi; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import net.sf.okapi.lib.xliff2.Const; import net.sf.okapi.lib.xliff2.changeTracking.ChangeTrack; import net.sf.okapi.lib.xliff2.changeTracking.Item; import net.sf.okapi.lib.xliff2.changeTracking.Revision; import net.sf.okapi.lib.xliff2.changeTracking.Revisions; import net.sf.okapi.lib.xliff2.core.Fragment; import net.sf.okapi.lib.xliff2.core.MTag; import net.sf.okapi.lib.xliff2.core.Note; import net.sf.okapi.lib.xliff2.core.Part; import net.sf.okapi.lib.xliff2.core.StartXliffData; import net.sf.okapi.lib.xliff2.core.Tag; import net.sf.okapi.lib.xliff2.core.TagType; import net.sf.okapi.lib.xliff2.core.Unit; import net.sf.okapi.lib.xliff2.its.IITSItem; import net.sf.okapi.lib.xliff2.its.LocQualityIssue; import net.sf.okapi.lib.xliff2.its.LocQualityIssues; import net.sf.okapi.lib.xliff2.its.Provenances; import net.sf.okapi.lib.xliff2.reader.Event; import net.sf.okapi.lib.xliff2.reader.XLIFFReader; import com.ibm.icu.text.SimpleDateFormat; import com.vistatec.ocelot.its.model.LanguageQualityIssue; import com.vistatec.ocelot.its.model.Provenance; import com.vistatec.ocelot.its.model.okapi.OkapiProvenance; import com.vistatec.ocelot.segment.model.BaseSegmentVariant; import com.vistatec.ocelot.segment.model.OcelotSegment; import com.vistatec.ocelot.segment.model.SegmentAtom; import com.vistatec.ocelot.segment.model.TextAtom; import com.vistatec.ocelot.segment.model.enrichment.Enrichment; import com.vistatec.ocelot.segment.model.okapi.FragmentVariant; import com.vistatec.ocelot.segment.model.okapi.Notes; import com.vistatec.ocelot.segment.model.okapi.OcelotRevision; import com.vistatec.ocelot.segment.model.okapi.OkapiSegment; import com.vistatec.ocelot.xliff.XLIFFParser; import com.vistatec.ocelot.xliff.freme.EnrichmentConverterXLIFF20; /** * Parse XLIFF 2.0 file for use in the workbench. */ public class OkapiXLIFF20Parser implements XLIFFParser { private static final String DATETIME_PATTERN = "yyyy-MM-dd'T'HH:mm:ssX"; private final SimpleDateFormat dateFormatter = new SimpleDateFormat( DATETIME_PATTERN); private List<Event> events; private List<net.sf.okapi.lib.xliff2.core.Segment> segmentUnitParts; private List<TargetVersion> targetVersions; private Map<Integer, Integer> segmentEventMapping; private int documentSegmentNum; private String sourceLang, targetLang; private EnrichmentConverterXLIFF20 enrichmentConverter; public List<Event> getEvents() { return this.events; } public Event getSegmentEvent(int segEventNumber) { return this.events.get(segmentEventMapping.get(segEventNumber)); } public net.sf.okapi.lib.xliff2.core.Segment getSegmentUnitPart( int segmentUnitPartIndex) { return this.segmentUnitParts.get(segmentUnitPartIndex); } public TargetVersion getTargetVersion(int segmentUnitPartIndex){ return this.targetVersions.get(segmentUnitPartIndex); } @Override public List<OcelotSegment> parse(File xliffFile) throws IOException { List<OcelotSegment> segments = new LinkedList<>(); segmentEventMapping = new HashMap<Integer, Integer>(); events = new LinkedList<Event>(); segmentUnitParts = new LinkedList<>(); targetVersions = new ArrayList<TargetVersion>(); this.documentSegmentNum = 1; int segmentUnitPartIndex = 0; XLIFFReader reader = new XLIFFReader(); reader.open(xliffFile); while (reader.hasNext()) { Event event = reader.next(); this.events.add(event); if (event.isStartXliff()) { StartXliffData xliffElement = event.getStartXliffData(); this.sourceLang = xliffElement.getSourceLanguage(); // optional unless document contains target elements underneath // <segment> or <ignorable> if (xliffElement.getTargetLanguage() != null) { this.targetLang = xliffElement.getTargetLanguage(); } enrichmentConverter = new EnrichmentConverterXLIFF20(sourceLang, targetLang); } else if (event.isUnit()) { Unit unit = event.getUnit(); for (Part unitPart : unit) { if (unitPart.isSegment()) { List<Enrichment> sourceEnrichments = enrichmentConverter .retrieveEnrichments(unit, unitPart.getSource(), sourceLang); List<Enrichment> targetEnrichments = enrichmentConverter .retrieveEnrichments(unit, unitPart.getTarget(), targetLang); net.sf.okapi.lib.xliff2.core.Segment okapiSegment = (net.sf.okapi.lib.xliff2.core.Segment) unitPart; OcelotSegment ocelotSegment = convertPartToSegment( okapiSegment, segmentUnitPartIndex++, sourceEnrichments, targetEnrichments, unit.getId()); if (ocelotSegment.getTarget() != null) { setTargetRevisions(unit, okapiSegment, ocelotSegment); } readNotes(unit, ocelotSegment); segments.add(ocelotSegment); this.segmentUnitParts.add(okapiSegment); } } } } reader.close(); return segments; } private void readNotes(Unit unit, OcelotSegment ocelotSegment) { if(unit.getNoteCount() > 0){ com.vistatec.ocelot.segment.model.okapi.Note ocelotNote = null; Notes ocelotNotes = new Notes(); for(Note okapiNote: unit.getNotes()){ ocelotNote = new com.vistatec.ocelot.segment.model.okapi.Note(); ocelotNote.setContent(okapiNote.getText()); ocelotNote.setId(okapiNote.getId()); ocelotNotes.add(ocelotNote); } ocelotSegment.setNotes(ocelotNotes); } } /** * Sets the revisions of the target for this segment if a * {@link ChangeTrack} object exists for the segment. * * @param unit the xliff unit * @param okapiSegment * the okapi segment * @param ocelotSegment * the Ocelot segment. */ private void setTargetRevisions(Unit unit, net.sf.okapi.lib.xliff2.core.Segment okapiSegment, OcelotSegment ocelotSegment) { if (unit.hasChangeTrack()) { List<OcelotRevision> ocelotRevisions = new ArrayList<OcelotRevision>(); Revisions targetRevisions = null; Iterator<Revisions> revsIt = unit.getChangeTrack().iterator(); Revisions revs = null; while (revsIt.hasNext()) { revs = revsIt.next(); if (revs.getAppliesTo().equals(Const.ELEM_TARGET)) { targetRevisions = revs; break; } } if (targetRevisions != null) { for (Revision rev : targetRevisions) { Iterator<Item> itemsIt = rev.iterator(); Item currItem = null; while (itemsIt.hasNext()) { currItem = itemsIt.next(); if (currItem.getProperty().equals( Item.PROPERTY_CONTENT_VALUE)) { ocelotRevisions.add(new OcelotRevision(rev, currItem)); } } } } else if (!ocelotSegment.getTarget().getDisplayText().isEmpty()) { targetRevisions = createRevisionsForTarget(okapiSegment .getTarget()); unit.getChangeTrack().add(targetRevisions); ocelotRevisions.add(new OcelotRevision(targetRevisions.get(0), targetRevisions.get(0).get(0))); } if (!ocelotRevisions.isEmpty()) { Collections.sort(ocelotRevisions, new OcelotRevisionComparator()); if (ocelotRevisions.size() > 1) { List<SegmentAtom> atoms = new ArrayList<SegmentAtom>(); TextAtom origTrgtAtom = new TextAtom(ocelotRevisions.get(ocelotRevisions.size() - 1).getText()); atoms.add(origTrgtAtom); FragmentVariant origTargetVar = new FragmentVariant(atoms, true); ocelotSegment.setOriginalTarget(origTargetVar); } int nextVersion = 1; for (OcelotRevision rev : ocelotRevisions) { if (rev.getVersion().startsWith(TargetVersion.VERSION_PREFIX)) { int revNum = Integer.parseInt(rev.getVersion() .substring(TargetVersion.VERSION_PREFIX.length())); if (revNum >= nextVersion) { nextVersion = revNum + 1; } } } //check currentVersion and target OcelotRevision currRev = null; for(OcelotRevision ocelotRev: ocelotRevisions){ if(ocelotRev.getVersion().equals(targetRevisions.getCurrentVersion())){ currRev = ocelotRev; break; } } if(!currRev.getText().equals(okapiSegment.getTarget().getPlainText())) { Item currTargetItem = new Item(Item.PROPERTY_CONTENT_VALUE); currTargetItem.setText(okapiSegment.getTarget().getPlainText()); Revision revision = new Revision(); revision.setVersion(TargetVersion.VERSION_PREFIX + nextVersion++); revision.setDatetime(dateFormatter.format(new Date())); revision.add(currTargetItem); targetRevisions.add(revision); targetRevisions.setCurrentVersion(revision.getVersion()); } targetVersions.add(new TargetVersion(TargetVersion.VERSION_PREFIX + nextVersion)); } else { targetVersions.add(new TargetVersion(TargetVersion.VERSION_PREFIX + "1")); } } else if (!ocelotSegment.getTarget().getDisplayText().isEmpty()) { ChangeTrack changeTrack = new ChangeTrack(); unit.setChangeTrack(changeTrack); changeTrack.add(createRevisionsForTarget(okapiSegment .getTarget())); targetVersions.add(new TargetVersion(TargetVersion.VERSION_PREFIX + "2")); } else { targetVersions.add(new TargetVersion(TargetVersion.VERSION_PREFIX + "1")); } } private Revisions createRevisionsForTarget(Fragment target){ Revisions revisions = new Revisions(); revisions.setAppliesTo(Const.ELEM_TARGET); revisions.setCurrentVersion(TargetVersion.VERSION_PREFIX + "1"); Revision revision = new Revision(); revision.setDatetime(dateFormatter.format(new Date())); revision.setVersion(TargetVersion.VERSION_PREFIX + "1"); revisions.add(revision); Item item = new Item(Item.PROPERTY_CONTENT_VALUE); item.setText(getFragmentPlainText(target)); revision.add(item); return revisions; } /** * Converts Okapi XLIFF 2.0 Unit Parts to the Ocelot Segment format. * * @param unitPart * <segment> or <ignorable> element. See {@link Part} for * more details. * @param segmentUnitPartIndex * - Index of the associated original Okapi XLIFF 2.0 Event from * which the Segment was derived. * @return Segment - Ocelot Segment * @throws MalformedURLException */ private OcelotSegment convertPartToSegment(net.sf.okapi.lib.xliff2.core.Segment unitPart, int segmentUnitPartIndex, List<Enrichment> sourceEnrichments, List<Enrichment> targetEnrichments, String unitId) throws MalformedURLException { segmentEventMapping.put(this.documentSegmentNum, this.events.size()-1); //TODO: load original target from file OkapiSegment seg = new OkapiSegment.Builder() .segmentNumber(documentSegmentNum++) .eventNumber(segmentUnitPartIndex) .source(new FragmentVariant(unitPart, false)) .target(new FragmentVariant(unitPart, true)) .tuId(unitId) .build(); seg.addAllLQI(parseLqiData(unitPart)); seg.addAllProvenance(parseProvData(unitPart)); if(sourceEnrichments != null && !sourceEnrichments.isEmpty() && seg.getSource() != null && seg.getSource() instanceof BaseSegmentVariant){ ((BaseSegmentVariant)seg.getSource()).setEnrichments(new HashSet<Enrichment>(sourceEnrichments)); ((BaseSegmentVariant)seg.getSource()).setEnriched(true); } if(targetEnrichments != null && !targetEnrichments.isEmpty() && seg.getTarget() != null && seg.getTarget() instanceof BaseSegmentVariant){ ((BaseSegmentVariant)seg.getTarget()).setEnrichments(new HashSet<Enrichment>(targetEnrichments)); ((BaseSegmentVariant)seg.getTarget()).setEnriched(true); } enrichmentConverter.convertEnrichments2ITSMetadata(seg); return seg; } private List<LanguageQualityIssue> parseLqiData(Part unitPart) throws MalformedURLException { List<LanguageQualityIssue> ocelotLqiList = new ArrayList<LanguageQualityIssue>(); List<Tag> sourceTags = unitPart.getSource().getOwnTags(); ocelotLqiList.addAll(convertOkapiToOcelotLqiData(sourceTags)); if (unitPart.getTarget() != null) { List<Tag> targetTags = unitPart.getTarget().getOwnTags(); ocelotLqiList.addAll(convertOkapiToOcelotLqiData(targetTags)); } return ocelotLqiList; } private List<LanguageQualityIssue> convertOkapiToOcelotLqiData( List<Tag> okapiXliff2Tags) throws MalformedURLException { List<LanguageQualityIssue> ocelotLqiList = new ArrayList<LanguageQualityIssue>(); for (Tag tag : okapiXliff2Tags) { // ITS XLIFF 2.0 LQI Mapping must be done using the <mrk> element if (tag.isMarker()) { MTag mtag = (MTag) tag; // Same Tag object is generated twice for paired elements; only // take the opening LQI if (mtag.hasITSItem() && (mtag.getTagType() == TagType.OPENING || mtag .getTagType() == TagType.STANDALONE)) { IITSItem itsLqiItem = mtag.getITSItems().get( LocQualityIssue.class); if (itsLqiItem != null) { if (itsLqiItem.isGroup()) { LocQualityIssues lqiGroup = (LocQualityIssues) itsLqiItem; for (LocQualityIssue lqi : lqiGroup.getList()) { ocelotLqiList.add(convertOkapiToOcelotLqi(lqi)); } } else { LocQualityIssue lqi = (LocQualityIssue) itsLqiItem; ocelotLqiList.add(convertOkapiToOcelotLqi(lqi)); } } } } } return ocelotLqiList; } /** * Convert from Okapi parsed version of an LQI * * @param lqi * - Okapi representation of an ITS Language Quality Issue * @return - Ocelot representation of an ITS Language Quality Issue * @throws MalformedURLException */ private LanguageQualityIssue convertOkapiToOcelotLqi(LocQualityIssue lqi) throws MalformedURLException { LanguageQualityIssue ocelotLQI = new LanguageQualityIssue(); ocelotLQI.setType(lqi.getType()); ocelotLQI.setComment(lqi.getComment()); ocelotLQI .setSeverity(lqi.getSeverity() != null ? lqi.getSeverity() : 0); URL profileRef = lqi.getProfileRef() != null ? new URL( lqi.getProfileRef()) : null; ocelotLQI.setProfileReference(profileRef); ocelotLQI.setEnabled(lqi.isEnabled()); return ocelotLQI; } private List<Provenance> parseProvData(Part unitPart) { List<Provenance> ocelotProvList = new ArrayList<Provenance>(); List<Tag> sourceTags = unitPart.getSource().getOwnTags(); Fragment target = unitPart.getTarget(); List<Tag> targetTags = target != null ? target.getOwnTags() : new ArrayList<Tag>(); ocelotProvList.addAll(convertOkapiToOcelotProvData(sourceTags)); ocelotProvList.addAll(convertOkapiToOcelotProvData(targetTags)); return ocelotProvList; } private List<Provenance> convertOkapiToOcelotProvData( List<Tag> okapiXliff2Tags) { List<Provenance> ocelotProvList = new ArrayList<Provenance>(); for (Tag tag : okapiXliff2Tags) { // ITS XLIFF 2.0 Provenance Mapping must be done using the <mrk> // element if (tag.isMarker()) { MTag mtag = (MTag) tag; if (mtag.hasITSItem() && (mtag.getTagType() == TagType.OPENING || mtag .getTagType() == TagType.STANDALONE)) { IITSItem itsProvItem = mtag.getITSItems().get( net.sf.okapi.lib.xliff2.its.Provenance.class); if (itsProvItem != null) { if (itsProvItem.isGroup()) { Provenances provMetadata = (Provenances) itsProvItem; for (net.sf.okapi.lib.xliff2.its.Provenance p : provMetadata .getList()) { ocelotProvList.add(new OkapiProvenance(p)); } } else { ocelotProvList .add(new OkapiProvenance( (net.sf.okapi.lib.xliff2.its.Provenance) itsProvItem)); } } } } } return ocelotProvList; } @Override public String getSourceLang() { return this.sourceLang; } @Override public String getTargetLang() { return this.targetLang; } public void updateTargetVersions(){ for(TargetVersion tVersion: targetVersions){ tVersion.nextVersion(); } } public SimpleDateFormat getRevisionDateFormatter(){ return dateFormatter; } public String getFragmentPlainText(Fragment fragment ){ StringBuilder plainText = new StringBuilder(); String ctext = fragment.getCodedText(); for ( int i=0; i<ctext.length(); i++ ) { char ch = ctext.charAt(i); switch (ch) { case Fragment.CODE_OPENING: case Fragment.CODE_CLOSING: case Fragment.CODE_STANDALONE: case Fragment.MARKER_OPENING: case Fragment.MARKER_CLOSING: case Fragment.PCONT_STANDALONE: i++; break; case '\r': plainText.append(" "); // Literal break; case '<': plainText.append("<"); break; case '&': plainText.append("&"); break; case '\n': case '\t': plainText.append(ch); break; default: if (( ch > 0x001F ) && ( ch < 0xD800 )) { // Valid char (most frequent) plainText.append(ch); } else if ( Character.isHighSurrogate(ch) ) { plainText.append(Character.toChars(ctext.codePointAt(i))); i++; } else if (( ch < 0x0020 ) || (( ch > 0xD7FF ) && ( ch < 0xE000 )) || ( ch == 0xFFFE ) || ( ch == 0xFFFF )) { // Invalid characters plainText.append(String.format("<cp hex=\"%04X\"/>", (int)ch)); } break; } } return plainText.toString(); } } //class RevisionComparator implements Comparator<Revision> { // // private SimpleDateFormat dateFormatter; // // public RevisionComparator(SimpleDateFormat dateFormatter) { // // this.dateFormatter = dateFormatter; // } // // @Override // public int compare(Revision o1, Revision o2) { // // int retValue = 0; // if(o1.getDatetime() == null || o1.getDatetime().isEmpty()){ // retValue = 1; // } else if(o2.getDatetime() == null || o2.getDatetime().isEmpty()){ // retValue = -1; // } else { // try { // Long dateTime1 = Long.valueOf(dateFormatter.parse(o1.getDatetime()).getTime()); // Long dateTime2 = Long.valueOf(dateFormatter.parse(o2.getDatetime()).getTime()); // retValue = (-1) * dateTime1.compareTo(dateTime2); // } catch (ParseException e) { // retValue = 0; // } //// retValue = (-1) * // } // return retValue; // } // //} class OcelotRevisionComparator implements Comparator<OcelotRevision> { @Override public int compare(OcelotRevision o1, OcelotRevision o2) { return (-1) * Long.valueOf(o1.getDatetimeAsDate().getTime()).compareTo( Long.valueOf(o2.getDatetimeAsDate().getTime())); } } class TargetVersion { static final String VERSION_PREFIX = "Rev"; private String version; private boolean updated; public TargetVersion(String version) { this.version = version; } public String getVersion() { return version; } public void setVersion(String version) { this.version = version; } public boolean isUpdated() { return updated; } public void setUpdated(boolean updated) { this.updated = updated; } public void nextVersion() { if (updated) { int versionNum = Integer .parseInt(version.substring(VERSION_PREFIX.length())); version = VERSION_PREFIX + (versionNum + 1); updated = false; } } }