package tbx2rdf; //JAVA import java.io.FileInputStream; import java.io.Reader; import java.io.StringReader; import java.io.StringWriter; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Scanner; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.EntityResolver; import org.xml.sax.SAXException; //JENA import org.openjena.riot.Lang; import org.apache.jena.riot.RDFDataMgr; import org.apache.jena.riot.RDFFormat; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.vocabulary.DCTerms; import com.hp.hpl.jena.vocabulary.RDF; //TBX2RDF import java.io.PrintStream; import java.util.Collections; import org.apache.log4j.Logger; import tbx2rdf.datasets.iate.SubjectFields; import tbx2rdf.vocab.ONTOLEX; import tbx2rdf.vocab.SKOS; import tbx2rdf.vocab.TBX; import tbx2rdf.types.LexicalEntry; import tbx2rdf.types.Describable; import tbx2rdf.types.MartifHeader; import tbx2rdf.types.TBX_Terminology; import tbx2rdf.types.Descrip; import tbx2rdf.types.XReference; import tbx2rdf.types.Term; import tbx2rdf.types.AdminGrp; import tbx2rdf.types.AdminInfo; import tbx2rdf.types.DescripGrp; import tbx2rdf.types.DescripNote; import tbx2rdf.types.MartifHeader.*; import tbx2rdf.types.Note; import tbx2rdf.types.NoteLinkInfo; import tbx2rdf.types.Reference; import tbx2rdf.types.TermComp; import tbx2rdf.types.TermCompGrp; import tbx2rdf.types.TermCompList; import tbx2rdf.types.TermNote; import tbx2rdf.types.TermNoteGrp; import tbx2rdf.types.TransacGrp; import tbx2rdf.types.TransacNote; import tbx2rdf.types.Transaction; import tbx2rdf.types.abs.impID; import tbx2rdf.types.abs.impIDLangTypeTgtDtyp; import tbx2rdf.utils.XMLUtils; import tbx2rdf.vocab.DC; import tbx2rdf.vocab.IATE; /** * Entry point of the TBX2RDF converter * * TBX: framework consisting of a core structure, and a formalism (eXtensible * Constraint Specification) for identifying a set of data-categories and their * constraints, both expressed in XML * * Several of the remaining data categories, including definition, context, part * of speech, and subject field are very important and should be included in a * terminology whenever possible. The most important non-mandatory data category * is part of speech. * * * A very nice reference for the basic model can be found here: * http://www.terminorgs.net/downloads/TBX_Basic_Version_3.pdf * * @author Philipp Cimiano - Universität Bielefeld * @author Victor Rodriguez - Universidad Politécnica de Madrid */ public class TBX2RDF_Converter { private final static Logger logger = Logger.getLogger(TBX2RDF_Converter.class); /** * Do not construct */ public TBX2RDF_Converter() { } /** * Converts a TBX string into a RDF. Parses the XML searching for termEntry * elements. * * Then, Serializes Terms and Lexicons * * @param str The TBX XML as a String. * @return str A Turtle string with the equivalent information */ public String convert(String str, Mappings mappings, String resourceURI) throws Exception { TBX_Terminology result = convert(new StringReader(str), mappings); StringWriter sw = new StringWriter(); RDFDataMgr.write(sw, result.getModel(resourceURI), RDFFormat.TURTLE_PRETTY); return sw.toString(); } /** * Makes the conversion given a certain input and a set of mappings. This is done with a * @param input Input * @param mappings Mappings */ public TBX_Terminology convert(Reader input, Mappings mappings) throws IOException, ParserConfigurationException, TBXFormatException, SAXException { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); TransacNote.mapAgents.clear(); db.setEntityResolver(new EntityResolver() { @Override public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { if (systemId.endsWith(".dtd")) { return new InputSource(new StringReader("")); } else { return null; } } }); // parse the input document Document doc = db.parse(new InputSource(input)); // extract here martif metadata Element root = doc.getDocumentElement(); return createTerminology(root, mappings); } /** * Processes the whole TBX file from the root XML element (once built the DOM model) * @param root The root element */ TBX_Terminology createTerminology(Element root, Mappings mappings) throws IOException, SAXException { MartifHeader header = processMartifHeader(XMLUtils.child(root, "martifHeader"), mappings); final TBX_Terminology terminology = new TBX_Terminology(root.getAttribute("type"), header); mappings.defaultLanguage = "en"; for (Element e : XMLUtils.children(root)) { if (e.getTagName().equalsIgnoreCase("text")) { for (Term t : processText(e, mappings)) { terminology.addTerm(t); } } else if (!e.getTagName().equalsIgnoreCase("martifHeader")) { unexpected(root); } } return terminology; } /** * Given a XML root element, processes the Martif Header * @param root XML root element * @param mappings Mappings */ MartifHeader processMartifHeader(Element root, Mappings mappings) throws IOException, SAXException { final MartifHeader header = new MartifHeader(processFileDescrip(XMLUtils.child(root, "fileDesc"), mappings)); processID(header, root); for (Element e : XMLUtils.children(root)) { if (e.getTagName().equalsIgnoreCase("encodingDesc")) { header.encodingDesc = e.getChildNodes(); } else if (e.getTagName().equalsIgnoreCase("revisionDesc")) { header.revisionDesc = e.getChildNodes(); } else if (!e.getTagName().equalsIgnoreCase("fileDesc")) { unexpected(e); } } return header; } /** * Obtains a FileDesc object by parsing a XML element. * <filedesc>: A nesting element containing child elements that describe the TBX document instance. */ public FileDesc processFileDescrip(Element root, Mappings mappings) throws IOException, SAXException { final FileDesc fileDesc = new FileDesc(); for (Element e : XMLUtils.children(root)) { if (e.getTagName().equalsIgnoreCase("titleStmt")) { fileDesc.titleStmt = processTitleStmt(e, mappings); } else if (e.getTagName().equalsIgnoreCase("publicationStmt")) { fileDesc.publicationStmt = e; } else if (e.getTagName().equalsIgnoreCase("sourceDesc")) { fileDesc.sourceDesc.add(e); } else { unexpected(e); } } return fileDesc; } /** * Processes some metadata elements from the root element */ private TitleStmt processTitleStmt(Element root, Mappings mappings) { final TitleStmt titleStmt = new TitleStmt(XMLUtils.child(root, "title").getTextContent()); if (root.hasAttribute("xml:lang")) { titleStmt.lang = root.getAttribute("xml:lang"); } if (root.hasAttribute("id")) { titleStmt.id = root.getAttribute("id"); } final Element title = XMLUtils.child(root, "title"); if (title.hasAttribute("xml:lang")) { titleStmt.title_lang = title.getAttribute("xml:lang"); } if (root.hasAttribute("id")) { titleStmt.title_id = title.getAttribute("id"); } for (Element e : XMLUtils.children(root)) { if (e.getTagName().equalsIgnoreCase("note")) { titleStmt.notes.add(e); } } return titleStmt; } /** * Processes the body element and the back element. * We arrive here with a <text> element. */ Collection<Term> processText(Element root, Mappings mappings) throws IOException, SAXException { final Collection<Term> terms = new HashSet<Term>(); for (Element e : XMLUtils.children(root)) { if (e.getTagName().equalsIgnoreCase("body")) { terms.addAll(processBody(e, mappings)); } else if (e.getTagName().equalsIgnoreCase("back")) { terms.addAll(processBack(e, mappings)); } else { unexpected(e); } } return terms; } /** * Processes the collection of terms * We arreive here with a <body> element */ private Collection<? extends Term> processBody(Element root, Mappings mappings) { final Collection<Term> terms = new HashSet<Term>(); for (Element e : XMLUtils.children(root)) { if (e.getTagName().equalsIgnoreCase("termEntry")) { terms.add(processTermEntry(e, mappings)); } else { unexpected(e); } } return terms; } private Collection<? extends Term> processBack(Element root, Mappings mappings) { // TODO: This should do something right? return Collections.EMPTY_LIST; } /** * Processes, from a node, a termEntry * @return A Term */ Term processTermEntry(Element node, Mappings mappings) { // create new Term // add subjectField // add ID // <!ELEMENT termEntry ((%auxInfo;),(langSet+)) > // <!ATTLIST termEntry // id ID #IMPLIED > // <!ENTITY % auxInfo '(descrip | descripGrp | admin | adminGrp | transacGrp | note | ref | xref)*' > Term term = new Term(); int langsetcount = 0; String sid=node.getAttribute("id"); term.setID(sid); for (Element sub : XMLUtils.children(node)) { final String name = sub.getTagName(); if (name.equalsIgnoreCase("langSet")) { langsetcount++; this.processLangSet(term, sub, mappings); } else { processAuxInfo(term, sub, mappings); } } if (langsetcount == 0) { logger.warn("No langSet element in termEntry"); // throw new TBXFormatException("No langSet element in termEntry"); } return term; } void processReference(NoteLinkInfo descr, Element sub, Mappings mappings) { // <!ELEMENT ref (#PCDATA) > // <!ATTLIST ref // %impIDLangTypTgtDtyp; // > //<!ENTITY % impIDLangTypTgtDtyp ' id ID #IMPLIED //xml:lang CDATA #IMPLIED // type CDATA #REQUIRED // target IDREF #IMPLIED // datatype CDATA #IMPLIED //'> final Reference ref = new Reference(processType(sub, mappings, true), sub.getAttribute("xml:lang"), mappings, sub.getChildNodes()); if (sub.hasAttribute("id")) { ref.setID(sub.getAttribute("id")); } if (sub.hasAttribute("target")) { ref.target = sub.getAttribute("target"); } if (sub.hasAttribute("datatype")) { ref.datatype = sub.getAttribute("datatype"); } descr.References.add(ref); } void processAdminGrp(NoteLinkInfo descr, Element node, Mappings mappings) { // <!ELEMENT adminGrp (admin, (adminNote|note|ref|xref)*) > // <!ATTLIST adminGrp // id ID #IMPLIED > processID((impID) descr, node); int i = 0; for (Element tig_child : XMLUtils.children(node)) { String name = tig_child.getNodeName(); if (i == 0 && !name.equals("admin")) { throw new TBXFormatException("First element of TIG is not term !\n"); } if (name.equals("admin")) { processAdmin(descr, tig_child, mappings); } else if (name.equals("adminNote")) { processAdminGrp(descr, tig_child, mappings); } else if (name.equals("note")) { processNote(descr, tig_child, mappings); } else if (name.equals("ref")) { this.processReference(descr, tig_child, mappings); } else if (name.equals("xref")) { this.processXReference(descr, tig_child, mappings); } else { throw new TBXFormatException("Element " + name + "not defined by TBX standard"); } i++; } } /** * Processes the langset (xml:lang) * * @return a LexicalEntry */ Term processLangSet(Term term, Element langSet, Mappings mappings) { // <!ELEMENT langSet ((%auxInfo;), (tig | ntig)+) > // <!ATTLIST langSet // id ID #IMPLIED // xml:lang CDATA #REQUIRED > LexicalEntry entry; String language = XMLUtils.getValueOfAttribute(langSet, "xml:lang"); if (language == null) { throw new TBXFormatException("Language not specified for langSet!"); } int termCount = 0; processID(term, langSet); for (Element sub : XMLUtils.children(langSet)) { final String name = sub.getNodeName(); if (name.equals("ntig")) { termCount++; entry = new LexicalEntry(language, mappings); this.processNTIG(entry, sub, mappings); term.Lex_entries.add(entry); } else if (name.equals("tig")) { termCount++; entry = new LexicalEntry(language, mappings); this.processTIG(entry, sub, mappings); term.Lex_entries.add(entry); } else { processAuxInfo(term, sub, mappings); } } if (termCount == 0) { throw new TBXFormatException("No TIG nor NTIG in langSet !"); } return term; } void processTIG(LexicalEntry entry, Element tig, Mappings mappings) { // <!ELEMENT tig (term, (termNote)*, %auxInfo;) > // <!ATTLIST tig // id ID #IMPLIED > int i = 0; processID(entry, tig); Iterable<Element> children = XMLUtils.children(tig); for (Element tig_child : children) { String name = tig_child.getNodeName(); if (i == 0 && !name.equals("term")) { throw new TBXFormatException("First element of TIG is not term !\n"); } if (name.equals("term")) { this.processTerm(entry, tig_child, mappings); } else if (name.equals("termNote")) { entry.TermNotes.add(new TermNoteGrp(this.processTermNote(tig_child, mappings), mappings.defaultLanguage, mappings)); } else { processAuxInfo(entry, tig_child, mappings); } i++; } } /** * Processes a term within a termEntry */ void processTerm(LexicalEntry entry, Element node, Mappings mappings) { // <!ELEMENT term %basicText; > // <!ATTLIST term // id ID #IMPLIED > entry.Lemma = node.getTextContent(); } TermNote processTermNote(Element tig_child, Mappings mappings) { // <!ELEMENT termNote %noteText; > // <!ATTLIST termNote // %impIDLangTypTgtDtyp; // > // <!ENTITY % impIDLangTypTgtDtyp ' id ID #IMPLIED // xml:lang CDATA #IMPLIED type CDATA #REQUIRED target IDREF #IMPLIED datatype CDATA #IMPLIED // '> final TermNote note = new TermNote(tig_child.getChildNodes(), processType(tig_child, mappings, true), tig_child.getAttribute("xml:lang"), mappings); processImpIDLangTypeTgtDType(note, tig_child, mappings); return note; } void processNTIG(LexicalEntry entry, Node ntig, Mappings mappings) { // <!ELEMENT ntig (termGrp, %auxInfo;) > // <!ATTLIST ntig // id ID #IMPLIED // > int i = 0; for (Element ntig_child : XMLUtils.children(ntig)) { String name = ntig_child.getNodeName(); if (i == 0 && !name.equals("termGrp")) { if (Main.lenient==false) throw new TBXFormatException("First element of NTIG is not termGrp !\n"); } if (name.equals("termGrp")) { this.processTermGroup(entry, ntig_child, mappings); } else { processAuxInfo(entry, ntig_child, mappings); } i++; } } void processXReference(NoteLinkInfo descr, Element node, Mappings mappings) { // <!ELEMENT xref (#PCDATA) > // <!ATTLIST xref // %impIDType; // target CDATA #REQUIRED > XReference xref = new XReference(XMLUtils.getValueOfAttribute(node, "target"), node.getTextContent()); processID(xref, node); xref.type = processType(node, mappings, false); descr.Xreferences.add(xref); } void processDescripGroup(Describable descr, Element node, Mappings mappings) { // The DTD for a DescripGroup is as follows // <!ELEMENT descripGrp (descrip, (descripNote|admin|adminGrp|transacGrp|note|ref|xref)*) // > // <!ATTLIST descripGrp // id ID #IMPLIED > DescripGrp descrip = new DescripGrp(processDescrip(XMLUtils.firstChild("descrip", node), mappings)); processID(descrip, node); // get first child that needs to be a descrip // process other XMLUtils.children that can be: descripNote, admin, adminGroup, transacGrp, note, ref and xref for (Element sub : XMLUtils.children(node)) { final String name = sub.getTagName(); if (name.equalsIgnoreCase("descrip")) { // ignore } else if (name.equalsIgnoreCase("descripNote")) { processDescripNote(descrip, sub, mappings); } else if (name.equalsIgnoreCase("admin")) { this.processAdmin(descrip, sub, mappings); } else if (name.equalsIgnoreCase("adminGrp")) { this.processAdminGrp(descrip, sub, mappings); } else if (name.equalsIgnoreCase("transacGrp")) { this.processTransactionGroup(descrip, sub, mappings); } else if (name.equalsIgnoreCase("note")) { this.processTransactionGroup(descrip, sub, mappings); } else if (name.equalsIgnoreCase("ref")) { this.processReference(descrip, sub, mappings); } else if (name.equalsIgnoreCase("xref")) { this.processXReference(descrip, sub, mappings); } else { throw new TBXFormatException("Unexpected subnode " + node.getTagName()); } } descr.Descriptions.add(descrip); } void processAdmin(NoteLinkInfo descr, Element node, Mappings mappings) { // <!ELEMENT admin %noteText; > // <!ATTLIST admin // %impIDLangTypTgtDtyp; //> final AdminInfo admin = new AdminInfo(node.getChildNodes(), processType(node, mappings, true), node.getAttribute("xml:lang"), mappings); processImpIDLangTypeTgtDType(admin, node, mappings); descr.AdminInfos.add(new AdminGrp(admin)); } /** * Processes a Transaction Group www.isocat.org/datcat/DC-162 A transacGrp * element can contain either one transacNote element, or one date element, * or both. Example: * <transacGrp> * <transac type="transactionType">creation</transac> * <transacNote type="responsibility" target="CA5365">John * Harris</transacNote> * <date>2008‐05‐12</date> * </transacGrp> * * @param transacGroup A Transaction group in XML // According to the TBX * DTD, a transacGroup looks as follows: // <!ELEMENT transacGrp (transac, * (transacNote|date|note|ref|xref)* ) > * // <!ATTLIST transacGrp // id ID #IMPLIED > * // Transaction transaction = new Transaction(lex); */ void processTransactionGroup(NoteLinkInfo descr, Element elem, Mappings mappings) { // <!ELEMENT transacGrp (transac, (transacNote|date|note|ref|xref)* ) > // <!ATTLIST transacGrp // id ID #IMPLIED > Element elemTransac = null; try { elemTransac = XMLUtils.firstChild("transac", elem); } catch (Exception e) { return; } final TransacGrp transacGrp = new TransacGrp(processTransac(elemTransac, mappings)); int i = 0; for (Element child : XMLUtils.children(elem)) { String name = child.getNodeName(); if (i == 0 && !name.equals("transac")) { throw new TBXFormatException("First element of transacGrp is not termGrp !\n"); } if (name.equals("transac")) { //processTransac(transacGrp, child, mappings); } else if (name.equals("transacNote")) { processTransacNote(transacGrp, child, mappings); } else if (name.equals("date")) { processDate(transacGrp, child, mappings); } else if (name.equals("note")) { processNote(transacGrp, child, mappings); } else if (name.equals("xref")) { processXReference(transacGrp, child, mappings); } else if (name.equals("ref")) { this.processReference(transacGrp, child, mappings); } else { throw new TBXFormatException("Element " + name + " not defined by TBX standard\n"); } i++; } descr.Transactions.add(transacGrp); } void processTermGroup(LexicalEntry entry, Element node, Mappings mappings) { // <!ELEMENT termGrp (term, (termNote|termNoteGrp)*, (termCompList)* ) > // <!ATTLIST termGrp // id ID #IMPLIED //> for (Element elem : XMLUtils.children(node)) { final String name = elem.getTagName(); if (name.equalsIgnoreCase("term")) { processTerm(entry, elem, mappings); } else if (name.equalsIgnoreCase("termNote")) { entry.TermNotes.add(new TermNoteGrp(processTermNote(elem, mappings), mappings.defaultLanguage, mappings)); } else if (name.equalsIgnoreCase("termNoteGrp")) { entry.TermNotes.add(processTermNoteGrp(elem, mappings)); } else if (name.equalsIgnoreCase("termCompList")) { processTermCompList(entry, elem, mappings); } } } void processNote(NoteLinkInfo descr, Element elem, Mappings mappings) { //<!ELEMENT note %noteText; > //<!ATTLIST note %impIDLang; //> final Note note = new Note(elem.getChildNodes(), elem.getAttribute("xml:lang"), mappings); processID(note, elem); descr.notes.add(note); } Descrip processDescrip(Element elem, Mappings mappings) { //<!ELEMENT descrip %noteText; > //<!ATTLIST descrip //%impIDLangTypTgtDtyp; //> final Descrip descrip = new Descrip(elem.getChildNodes(), processType(elem, mappings, true), elem.getAttribute("xml:lang"), mappings); processImpIDLangTypeTgtDType(descrip, elem, mappings); return descrip; } void processDescripNote(DescripGrp descrip, Element sub, Mappings mappings) { // <!ELEMENT descripNote (#PCDATA) > //<!ATTLIST descripNote //%impIDLangTypTgtDtyp; //> final DescripNote descripNote = new DescripNote(sub.getChildNodes(), processType(sub, mappings, true), sub.getAttribute("xml:lang"), mappings); processImpIDLangTypeTgtDType(descripNote, sub, mappings); descrip.descripNote.add(descripNote); } Transaction processTransac(Element child, Mappings mappings) { // <!ELEMENT transac (#PCDATA) > //<!ATTLIST transac //%impIDLangTypTgtDtyp; //> final Transaction transaction = new Transaction(child.getChildNodes(), processType(child, mappings, true), child.getAttribute("xml:lang"), mappings); processImpIDLangTypeTgtDType(transaction, child, mappings); return transaction; } void processTransacNote(TransacGrp transacGrp, Element child, Mappings mappings) { //<!ELEMENT transacNote (#PCDATA) > //<!ATTLIST transacNote //%impIDLangTypTgtDtyp; //> final TransacNote transacNote = new TransacNote(child.getChildNodes(), processType(child, mappings, true), child.getAttribute("xml:lang"), mappings); processImpIDLangTypeTgtDType(transacNote, child, mappings); transacGrp.transacNotes.add(transacNote); } void processDate(TransacGrp transacGrp, Element child, Mappings mappings) { // <!ELEMENT date (#PCDATA) > //<!ATTLIST date //id ID #IMPLIED //> transacGrp.date = child.getTextContent(); } TermNoteGrp processTermNoteGrp(Element elem, Mappings mappings) { // <!ELEMENT termNoteGrp (termNote, %noteLinkInfo;) > //<!ATTLIST termNoteGrp //id ID #IMPLIED //> final TermNoteGrp termNoteGrp = new TermNoteGrp(processTermNote(XMLUtils.firstChild("termNote", elem), mappings), elem.getAttribute("xml:lang"), mappings); for (Element e : XMLUtils.children(elem)) { final String name = e.getTagName(); if (name.equalsIgnoreCase("termNote")) { // Do nothing } else if (name.equalsIgnoreCase("admin")) { processAdmin(termNoteGrp, e, mappings); } else if (name.equalsIgnoreCase("adminGrp")) { processAdminGrp(termNoteGrp, e, mappings); } else if (name.equalsIgnoreCase("transacGrp")) { processTransactionGroup(termNoteGrp, e, mappings); } else if (name.equalsIgnoreCase("note")) { processNote(termNoteGrp, e, mappings); } else if (name.equalsIgnoreCase("ref")) { processReference(termNoteGrp, e, mappings); } else if (name.equalsIgnoreCase("xref")) { processXReference(termNoteGrp, e, mappings); } } return termNoteGrp; } void processTermCompList(LexicalEntry entry, Element elem, Mappings mappings) { // <!ELEMENT termCompList ((%auxInfo;), (termComp | termCompGrp)+) > //<!ATTLIST termCompList //id ID #IMPLIED //type CDATA #REQUIRED //> final TermCompList termCompList = new TermCompList(mappings.getMapping("termCompList", "type", elem.getAttribute("type"))); processID(termCompList, elem); for (Element e : XMLUtils.children(elem)) { final String name = e.getTagName(); if (name.equalsIgnoreCase("termComp")) { final TermComp termComp = processTermComp(e, mappings); termCompList.termComp.add(new TermCompGrp(termComp, null, mappings)); } else if (name.equalsIgnoreCase("termCompGrp")) { processTermCompGrp(termCompList, e, mappings); } else if (name.equalsIgnoreCase("admin")) { processAdmin(termCompList, e, mappings); } else if (name.equalsIgnoreCase("adminGrp")) { processAdminGrp(termCompList, e, mappings); } else if (name.equalsIgnoreCase("transacGrp")) { processTransactionGroup(termCompList, e, mappings); } else if (name.equalsIgnoreCase("note")) { processNote(termCompList, e, mappings); } else if (name.equalsIgnoreCase("ref")) { processReference(termCompList, e, mappings); } else if (name.equalsIgnoreCase("xref")) { processXReference(termCompList, e, mappings); } } entry.Decomposition.add(termCompList); } TermComp processTermComp(Element e, Mappings mappings) { //<!ELEMENT termComp (#PCDATA) > //<!ATTLIST termComp // %impIDLang; //> final TermComp termComp = new TermComp(e.getTextContent(), e.getAttribute("xml:lang"), mappings); processID(termComp, e); return termComp; } void processTermCompGrp(TermCompList termCompList, Element elem, Mappings mappings) { //<!ELEMENT termCompGrp (termComp, (termNote|termNoteGrp)*, %noteLinkInfo;) > //<!ATTLIST termCompGrp //id ID #IMPLIED //> final TermCompGrp termCompGrp = new TermCompGrp(processTermComp(XMLUtils.firstChild("termComp", elem), mappings), null, mappings); for (Element e : XMLUtils.children(elem)) { final String name = e.getTagName(); if (name.equalsIgnoreCase("termNote")) { termCompGrp.termNoteGrps.add(new TermNoteGrp(processTermNote(e, mappings), null, mappings)); } else if (name.equalsIgnoreCase("termNoteGrp")) { termCompGrp.termNoteGrps.add(processTermNoteGrp(e, mappings)); } else if (name.equalsIgnoreCase("admin")) { processAdmin(termCompList, e, mappings); } else if (name.equalsIgnoreCase("adminGrp")) { processAdminGrp(termCompList, e, mappings); } else if (name.equalsIgnoreCase("transacGrp")) { processTransactionGroup(termCompList, e, mappings); } else if (name.equalsIgnoreCase("note")) { processNote(termCompList, e, mappings); } else if (name.equalsIgnoreCase("ref")) { processReference(termCompList, e, mappings); } else if (name.equalsIgnoreCase("xref")) { processXReference(termCompList, e, mappings); } } termCompList.termComp.add(termCompGrp); } /** * */ private void unexpected(Node n) { if (n instanceof Element) { throw new TBXFormatException("Unexpected " + ((Element) n).getTagName()); } else { throw new TBXFormatException("Unexpected"); } } private void processID(impID elem, Element node) { if (node.hasAttribute("id")) { elem.setID(node.getAttribute("id")); } } /** * */ private void processImpIDLangTypeTgtDType(impIDLangTypeTgtDtyp ref, Element sub, Mappings mappings) { // <!ENTITY % impIDLangTypTgtDtyp ' // id ID #IMPLIED // xml:lang CDATA #IMPLIED // type CDATA #REQUIRED // target IDREF #IMPLIED // datatype CDATA #IMPLIED // '> if (sub.hasAttribute("id")) { ref.setID(sub.getAttribute("id")); } if (sub.hasAttribute("target")) { ref.target = sub.getAttribute("target"); } if (sub.hasAttribute("datatype")) { ref.datatype = sub.getAttribute("datatype"); } if (sub.hasAttribute("subjectField")) { // System.out.println("uy"); } } private void processAuxInfo(Describable term, Element sub, Mappings mappings) { // <!ENTITY % auxInfo '(descrip | descripGrp | admin | adminGrp | transacGrp | note | ref // | xref)*' > final String name = sub.getTagName(); if (name.equalsIgnoreCase("descrip")) { term.Descriptions.add(new DescripGrp(processDescrip(sub, mappings))); } else if (name.equalsIgnoreCase("descripGrp")) { this.processDescripGroup(term, sub, mappings); } else if (name.equalsIgnoreCase("admin")) { this.processAdmin(term, sub, mappings); } else if (name.equalsIgnoreCase("adminGrp")) { this.processAdminGrp(term, sub, mappings); } else if (name.equalsIgnoreCase("transacGrp")) { this.processTransactionGroup(term, sub, mappings); } else if (name.equalsIgnoreCase("note")) { this.processNote(term, sub, mappings); } else if (name.equalsIgnoreCase("ref")) { this.processReference(term, sub, mappings); } else if (name.equalsIgnoreCase("xref")) { this.processXReference(term, sub, mappings); } else { throw new TBXFormatException("Element " + name + " not defined by TBX standard"); } } /** * */ private Mapping processType(Element sub, Mappings mappings, boolean required) { if (sub.hasAttribute("type")) { final Mapping m = mappings.getMapping(sub.getTagName(), "type", sub.getAttribute("type")); if (m == null && required) { logger.warn("Unrecognised mapping for <" + sub.getTagName() + " type=\"" + sub.getAttribute("type") + "\">"); } return m; } else if (required) { throw new TBXFormatException("type expected"); } else { System.err.println("Null type on " + sub.getTagName()); return null; } } /** * Converts a XML TBX file (handling large files...) * It does not hold in memory the whole dataset, but parses it as it comes. * * A TBX file root element is called "martif". It has two childre: marthifHeader and text * * * @param file Path to the input file * @param mappings Mappings * @return The TBX terminology */ public TBX_Terminology convertAndSerializeLargeFile(String file, PrintStream fos, Mappings mappings) { String resourceURI = new String(Main.DATA_NAMESPACE); FileInputStream inputStream = null; Scanner sc = null; int count = 0; int errors = 0; //We first count the lexicons we have SAXHandler handler = null; HashMap<String, Resource> lexicons = new HashMap(); try { InputStream xmlInput = new FileInputStream(file); SAXParserFactory factory = SAXParserFactory.newInstance(); SAXParser saxParser = factory.newSAXParser(); handler = new SAXHandler(mappings); saxParser.parse(xmlInput, handler); lexicons = handler.getLexicons(); xmlInput.close(); } catch (Exception e) { logger.warn(e.getMessage()); } //WE PROCESS HERE THE MARTIF HEADER MartifHeader martifheader = extractAndReadMartifHeader(file, mappings); if (martifheader==null) return null; //First we serialize the header Model mdataset = ModelFactory.createDefaultModel(); //The whole dataset! final Resource rdataset = mdataset.createResource(resourceURI); rdataset.addProperty(DCTerms.type, handler.getMartifType()); //This should be generalized rdataset.addProperty(RDF.type, mdataset.createResource("http://www.w3.org/ns/dcat#Dataset")); rdataset.addProperty(DC.rights, IATE.rights); rdataset.addProperty(DC.source, IATE.iate); rdataset.addProperty(DC.attribution, "Download IATE, European Union, 2014"); martifheader.toRDF(mdataset, rdataset); RDFDataMgr.write(fos, mdataset, Lang.NTRIPLES); Model msubjectFields = SubjectFields.generateSubjectFields(); RDFDataMgr.write(fos, msubjectFields, Lang.NTRIPLES); //We declare that every lexicon belongs to Iterator it = lexicons.entrySet().iterator(); Property prootresource=mdataset.createProperty("http://www.w3.org/TR/void/rootResource"); while (it.hasNext()) { Map.Entry e = (Map.Entry) it.next(); Resource rlexicon = (Resource) e.getValue(); rlexicon.addProperty(prootresource, rdataset); } boolean dentro = false; try { inputStream = new FileInputStream(file); sc = new Scanner(inputStream, "UTF-8"); String xml = ""; while (sc.hasNextLine()) { String line = sc.nextLine(); //We identify the terms by scanning the strings. Not a very nice practice, though. int index = line.indexOf("<termEntry"); if (index != -1) { dentro = true; xml = line.substring(index) + "\n"; } if (dentro == true && index == -1) { xml = xml + line + "\n"; } index = line.indexOf("</termEntry>"); if (index != -1) { xml = xml + line.substring(0, index) + "\n"; count++; //We do a partial parsing of this XML fragment Document doc = loadXMLFromString(xml); if (doc == null) { continue; } Element root = doc.getDocumentElement(); if (root != null) { try { Term term = processTermEntry(root, mappings); Model model = ModelFactory.createDefaultModel(); TBX.addPrefixesToModel(model); model.setNsPrefix("", Main.DATA_NAMESPACE); final Resource rterm = term.getRes(model); rterm.addProperty(RDF.type, SKOS.Concept); term.toRDF(model, rterm); for (LexicalEntry le : term.Lex_entries) { final Resource lexicon = lexicons.get(term.lang); lexicon.addProperty(ONTOLEX.entry, le.getRes(model)); le.toRDF(model, rterm); } RDFDataMgr.write(fos, model, Lang.NTRIPLES); } catch (Exception e) { errors++; System.err.println("Error " + e.getMessage()); } if (count % 1000 == 0) { System.err.println("Total: " + count + " Errors: " + errors); } } xml = ""; } } //end of while //Now we serialize the lexicons RDFDataMgr.write(fos, handler.getLexiconsModel(), Lang.NTRIPLES); // note that Scanner suppresses exceptions if (sc.ioException() != null) { throw sc.ioException(); } } catch (Exception e) { e.printStackTrace(); } finally { if (sc != null) { sc.close(); } } return null; } /** * Gently loads a DOM XML document from a XML fragment. * If it fails, it returns null; */ private static Document loadXMLFromString(String xml) throws Exception { try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); builder.setEntityResolver(new EntityResolver() { @Override public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { if (systemId.endsWith(".dtd")) { return new InputSource(new StringReader("")); } else { return null; } } }); InputSource is = new InputSource(new StringReader(xml)); return builder.parse(is); } catch (Exception e) { return null; } } /** * Parses the text manually, extracting as text the fragment where the MartifHeader is and then parses it as XML. */ public MartifHeader extractAndReadMartifHeader(String file, Mappings mappings) { MartifHeader martifheader = null; boolean dentro = false; try { FileInputStream inputStream = new FileInputStream(file); Scanner sc = new Scanner(inputStream, "UTF-8"); String xml = ""; while (sc.hasNextLine()) { String line = sc.nextLine(); //We identify the terms by scanning the strings. Not a very nice practice, though. int index = line.indexOf("<martifHeader"); if (index != -1) { dentro = true; xml = line.substring(index) + "\n"; } if (dentro == true && index == -1) { xml = xml + line + "\n"; } index = line.indexOf("</martifHeader>"); if (index != -1) { xml = xml + line.substring(0, index) + "\n"; //We do a partial parsing of this XML fragment Document doc = loadXMLFromString(xml); Element root = doc.getDocumentElement(); martifheader = this.processMartifHeader(root, mappings); break; } } inputStream.close(); } catch (Exception e) { logger.warn("Could not parse well the general metadata (MartifHeader)" + e.getMessage()); } return martifheader; } }