package eu.project.ttc.termino.export;

import java.io.IOException;
import java.io.Writer;
import java.math.RoundingMode;
import java.text.NumberFormat;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Lists;

import eu.project.ttc.api.TermSuiteException;
import eu.project.ttc.api.Traverser;
import eu.project.ttc.models.CompoundType;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.TermVariation;
import eu.project.ttc.utils.TermSuiteUtils;

public class TbxExporter {

	private static final Logger LOGGER = LoggerFactory.getLogger(TbxExporter.class);

	/** Formats floating-point numbers with exactly four fraction digits */
	private static final NumberFormat NUMBER_FORMATTER = NumberFormat.getNumberInstance(Locale.US);
	static {
		NUMBER_FORMATTER.setMaximumFractionDigits(4);
		NUMBER_FORMATTER.setMinimumFractionDigits(4);
		NUMBER_FORMATTER.setRoundingMode(RoundingMode.UP);
		NUMBER_FORMATTER.setGroupingUsed(false);
	}

	/** Prefix used in langSet ids */
	private static final String LANGSET_ID_PREFIX = "langset-";

	/** Prefix used in termEntry ids */
	private static final String TERMENTRY_ID_PREFIX = "entry-";

	/** Prefix used in tig ids */
	private static final String TIG_ID_PREFIX = "term-";

	/** The TBX document under construction */
	private Document document;

	private TermIndex termIndex;
	private Traverser traverser;
	private Writer writer;

	private TbxExporter(TermIndex termIndex, Writer writer, Traverser traverser) {
		this.writer = writer;
		this.termIndex = termIndex;
		this.traverser = traverser;
	}

	private void doExport() {
		try {
			prepareTBXDocument();
			try {
				for (Term t : traverser.toList(termIndex)) {
					addTermEntry(t, false);
					for (TermVariation v : t.getVariations())
						addTermEntry(v.getVariant(), true);
				}
				exportTBXDocument();
			} catch (TransformerException | IOException e) {
				LOGGER.error("An error occurred when exporting term index to file", e);
				throw new TermSuiteException(e);
			}
		} catch (ParserConfigurationException e) {
			throw new TermSuiteException(e);
		}
	}

	public static void export(TermIndex termIndex, Writer writer) {
		export(termIndex, writer, Traverser.create());
	}

	public static void export(TermIndex termIndex, Writer writer, Traverser traverser) {
		new TbxExporter(termIndex, writer, traverser).doExport();
	}

	/**
	 * Prepares the TBX document that will contain the terms.
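	 * <p>
	 * The empty skeleton built here serializes roughly as follows (a sketch
	 * derived from the DOM calls below; indentation and attribute order may
	 * differ in the actual output):
	 * <pre>{@code
	 * <martif type="TBX">
	 *   <martifHeader>
	 *     <fileDesc>
	 *       <sourceDesc><p/></sourceDesc>
	 *     </fileDesc>
	 *     <encodingDesc>
	 *       <p type="XCSURI">http://ttc-project.googlecode.com/files/ttctbx.xcs</p>
	 *     </encodingDesc>
	 *   </martifHeader>
	 *   <text>
	 *     <body/>
	 *   </text>
	 * </martif>
	 * }</pre>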
	 */
	private void prepareTBXDocument() throws ParserConfigurationException {
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		DocumentBuilder builder = factory.newDocumentBuilder();
		this.document = builder.newDocument();

		Element martif = document.createElement("martif");
		martif.setAttribute("type", "TBX");
		document.appendChild(martif);

		Element header = document.createElement("martifHeader");
		martif.appendChild(header);

		Element fileDesc = document.createElement("fileDesc");
		header.appendChild(fileDesc);

		Element encodingDesc = document.createElement("encodingDesc");
		header.appendChild(encodingDesc);

		Element encodingP = document.createElement("p");
		encodingP.setAttribute("type", "XCSURI");
		encodingP.setTextContent("http://ttc-project.googlecode.com/files/ttctbx.xcs");
		encodingDesc.appendChild(encodingP);

		Element sourceDesc = document.createElement("sourceDesc");
		Element p = document.createElement("p");
		// p.setTextContent(workingDir.getAbsolutePath());
		sourceDesc.appendChild(p);
		fileDesc.appendChild(sourceDesc);

		Element text = document.createElement("text");
		martif.appendChild(text);

		Element body = document.createElement("body");
		text.appendChild(body);
	}

	/**
	 * Serializes the TBX document to the output writer.
	 *
	 * @throws TransformerException
	 */
	private void exportTBXDocument() throws TransformerException {
		// Prepare the transformer that persists the document
		TransformerFactory transformerFactory = TransformerFactory.newInstance();
		Transformer transformer = transformerFactory.newTransformer();
		transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
		transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM,
				"http://ttc-project.googlecode.com/files/tbxcore.dtd");
		transformer.setOutputProperty(OutputKeys.STANDALONE, "yes");
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
		try {
			transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
		} catch (IllegalArgumentException e) {
			// Ignore: the indent-amount key is Xalan-specific and only cosmetic
		}

		// Actually persist the document
		DOMSource source = new DOMSource(document);
		StreamResult result = new StreamResult(this.writer);
		transformer.transform(source, result);
	}

	/** Caches the occurrence list of recently exported terms */
	private LoadingCache<Term, Collection<TermOccurrence>> allOccurrencesCaches = CacheBuilder.newBuilder()
			.maximumSize(100)
			.build(new CacheLoader<Term, Collection<TermOccurrence>>() {
				@Override
				public Collection<TermOccurrence> load(Term term) throws Exception {
					return term.getOccurrences();
				}
			});

	/** Caches, for each term, its distinct surface forms with their counts */
	private LoadingCache<Term, LinkedHashMap<String, Integer>> termForms = CacheBuilder.newBuilder()
			.maximumSize(100)
			.build(new CacheLoader<Term, LinkedHashMap<String, Integer>>() {
				@Override
				public LinkedHashMap<String, Integer> load(Term term) throws Exception {
					Collection<TermOccurrence> allOccurrences = allOccurrencesCaches.getUnchecked(term);
					List<String> forms = Lists.newArrayListWithCapacity(allOccurrences.size());
					for (TermOccurrence o : allOccurrences)
						forms.add(TermSuiteUtils.trimInside(o.getCoveredText()));
					return TermSuiteUtils.getCounters(forms);
				}
			});

	/**
	 * Adds a term to the TBX document.
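	 * <p>
	 * For a term with id 42 in an English term index, the generated entry has
	 * roughly the following shape (ids, the grouping key and all values are
	 * illustrative, and most termNote/descrip children are elided):
	 * <pre>{@code
	 * <termEntry xml:id="entry-42">
	 *   <langSet xml:id="langset-42" xml:lang="en">
	 *     <descrip type="nbOccurrences">3</descrip>
	 *     <tig xml:id="term-42">
	 *       <term>nn: wind turbine</term>
	 *       <termNote type="termPilot">wind turbine</termNote>
	 *       ...
	 *     </tig>
	 *   </langSet>
	 * </termEntry>
	 * }</pre>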
	 *
	 * @param term
	 *            the term to export
	 * @param isVariant
	 *            true if the term is exported as a variant of another term
	 * @throws IOException
	 */
	private void addTermEntry(Term term, boolean isVariant) throws IOException {
		String langsetId = LANGSET_ID_PREFIX + term.getId();
		Node body = document.getElementsByTagName("body").item(0);

		Element termEntry = document.createElement("termEntry");
		termEntry.setAttribute("xml:id", TERMENTRY_ID_PREFIX + term.getId());
		body.appendChild(termEntry);

		Element langSet = document.createElement("langSet");
		langSet.setAttribute("xml:id", langsetId);
		langSet.setAttribute("xml:lang", this.termIndex.getLang().getCode());
		termEntry.appendChild(langSet);

		for (TermVariation variation : term.getBases())
			this.addTermBase(langSet, variation.getBase().getGroupingKey(), null);
		for (TermVariation variation : term.getVariations())
			this.addTermVariant(langSet,
					LANGSET_ID_PREFIX + variation.getVariant().getId(),
					variation.getVariant().getGroupingKey());

		Collection<TermOccurrence> allOccurrences = allOccurrencesCaches.getUnchecked(term);
		this.addDescrip(langSet, "nbOccurrences", allOccurrences.size());

		Element tig = document.createElement("tig");
		tig.setAttribute("xml:id", TIG_ID_PREFIX + term.getId());
		langSet.appendChild(tig);

		Element termElmt = document.createElement("term");
		termElmt.setTextContent(term.getGroupingKey());
		tig.appendChild(termElmt);

		LinkedHashMap<String, Integer> formCounters = termForms.getUnchecked(term);
		// The first form in the counter map serves as the term pilot
		addNote(tig, "termPilot", formCounters.entrySet().iterator().next().getKey());
		this.addNote(tig, "termType", isVariant ? "variant" : "termEntry");
		this.addNote(tig, "partOfSpeech",
				term.isMultiWord() ? "noun" : term.firstWord().getSyntacticLabel());
		this.addNote(tig, "termPattern", term.firstWord().getSyntacticLabel());
		this.addNote(tig, "termComplexity", this.getComplexity(term));
		this.addDescrip(tig, "termSpecificity",
				NUMBER_FORMATTER.format(termIndex.getWRMeasure().getValue(term)));
		this.addDescrip(tig, "nbOccurrences", term.getFrequency());
		this.addDescrip(tig, "relativeFrequency",
				NUMBER_FORMATTER.format(term.getFrequency()));
		addDescrip(tig, "formList", buildFormListJSON(term));
		this.addDescrip(tig, "domainSpecificity",
				termIndex.getWRMeasure().getValue(term));
	}

	private void addDescrip(Element element, String type, Object value) {
		Element descrip = document.createElement("descrip");
		element.appendChild(descrip);
		descrip.setAttribute("type", type);
		descrip.setTextContent(value.toString());
	}

	private void addTermBase(Element lang, String target, Object value) {
		Element descrip = document.createElement("descrip");
		lang.appendChild(descrip);
		descrip.setAttribute("type", "termBase");
		descrip.setAttribute("target", "#" + target);
		if (value != null)
			descrip.setTextContent(value.toString());
	}

	private void addTermVariant(Element lang, String target, Object value) {
		Element descrip = document.createElement("descrip");
		lang.appendChild(descrip);
		descrip.setAttribute("type", "termVariant");
		descrip.setAttribute("target", "#" + target);
		if (value != null)
			descrip.setTextContent(value.toString());
	}

	private void addNote(Element element, String type, Object value) {
		Element termNote = document.createElement("termNote");
		element.appendChild(termNote);
		termNote.setAttribute("type", type);
		termNote.setTextContent(value.toString());
	}

	private String buildFormListJSON(Term term) {
		StringBuilder sb = new StringBuilder("[");
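		// Builds a JSON-like list of the term's surface forms with their counts,
		// e.g. [{term="wind turbine", count=3}, {term="wind turbines", count=1}]
		// (illustrative values). Note the '=' separator: the output is JSON-like
		// rather than strict JSON.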
		LinkedHashMap<String, Integer> formCounts = termForms.getUnchecked(term);
		int i = 0;
		for (String form : formCounts.keySet()) {
			if (i > 0)
				sb.append(", ");
			sb.append("{term=\"").append(form);
			sb.append("\", count=").append(formCounts.get(form)).append("}");
			i++;
		}
		sb.append("]");
		return sb.toString();
	}

	/**
	 * Returns the complexity class of a term: "single-word", "compound",
	 * "neoclassical-compound" or "multi-word".
	 */
	private String getComplexity(Term term) {
		if (term.isSingleWord()) {
			if (term.isCompound()) {
				if (term.firstWord().getWord().getCompoundType() == CompoundType.NEOCLASSICAL)
					return "neoclassical-compound";
				else
					return "compound";
			} else
				return "single-word";
		}
		return "multi-word";
	}
}