TBX2RDF_Converter.java example

Explorer
tbx2rdf-master
package tbx2rdf;

//JAVA
import java.io.FileInputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.EntityResolver;
import org.xml.sax.SAXException;

//JENA
import org.openjena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.RDFFormat;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.vocabulary.DCTerms;
import com.hp.hpl.jena.vocabulary.RDF;

//TBX2RDF
import java.io.PrintStream;
import java.util.Collections;
import org.apache.log4j.Logger;
import tbx2rdf.datasets.iate.SubjectFields;
import tbx2rdf.vocab.ONTOLEX;
import tbx2rdf.vocab.SKOS;
import tbx2rdf.vocab.TBX;
import tbx2rdf.types.LexicalEntry;
import tbx2rdf.types.Describable;
import tbx2rdf.types.MartifHeader;
import tbx2rdf.types.TBX_Terminology;
import tbx2rdf.types.Descrip;
import tbx2rdf.types.XReference;
import tbx2rdf.types.Term;
import tbx2rdf.types.AdminGrp;
import tbx2rdf.types.AdminInfo;
import tbx2rdf.types.DescripGrp;
import tbx2rdf.types.DescripNote;
import tbx2rdf.types.MartifHeader.*;
import tbx2rdf.types.Note;
import tbx2rdf.types.NoteLinkInfo;
import tbx2rdf.types.Reference;
import tbx2rdf.types.TermComp;
import tbx2rdf.types.TermCompGrp;
import tbx2rdf.types.TermCompList;
import tbx2rdf.types.TermNote;
import tbx2rdf.types.TermNoteGrp;
import tbx2rdf.types.TransacGrp;
import tbx2rdf.types.TransacNote;
import tbx2rdf.types.Transaction;
import tbx2rdf.types.abs.impID;
import tbx2rdf.types.abs.impIDLangTypeTgtDtyp;
import tbx2rdf.utils.XMLUtils;
import tbx2rdf.vocab.DC;
import tbx2rdf.vocab.IATE;


/**
 * Entry point of the TBX2RDF converter
 *
 * TBX: framework consisting of a core structure, and a formalism (eXtensible
 * Constraint Specification) for identifying a set of data-categories and their
 * constraints, both expressed in XML
 *
 * Several of the remaining data categories, including definition, context, part
 * of speech, and subject field are very important and should be included in a
 * terminology whenever possible. The most important non-mandatory data category
 * is part of speech.
 *
 *
 * A very nice reference for the basic model can be found here:
 * http://www.terminorgs.net/downloads/TBX_Basic_Version_3.pdf
 *
 * @author Philipp Cimiano - Universität Bielefeld
 * @author Victor Rodriguez - Universidad Politécnica de Madrid
 */
public class TBX2RDF_Converter {

    // Class-wide log4j logger, keyed on this class; the methods below use it for non-fatal warnings.
    private final static Logger logger = Logger.getLogger(TBX2RDF_Converter.class);
    
    
    /**
     * Creates a converter. The constructor body is empty, so nothing is
     * initialised here.
     *
     * The conversion methods of this class are instance methods, so callers
     * need an instance to use them.
     * NOTE(review): this constructor was previously documented as
     * "Do not construct" although it is public - confirm the intended usage.
     */
    public TBX2RDF_Converter() {
    }

    /**
     * Converts a TBX document held in a string into a Turtle string.
     *
     * The string is parsed through {@link #convert(Reader, Mappings)}; the RDF
     * model obtained from the resulting terminology is then written out as
     * pretty-printed Turtle.
     *
     * @param str The TBX XML as a String.
     * @param mappings Mappings handed on to the parsing step
     * @param resourceURI Value passed to the terminology when its RDF model is requested
     * @return The Turtle serialisation of the model
     * @throws Exception if parsing or serialisation fails
     */
    public String convert(String str, Mappings mappings, String resourceURI) throws Exception {
        final StringWriter turtle = new StringWriter();
        final TBX_Terminology terminology = convert(new StringReader(str), mappings);
        RDFDataMgr.write(turtle, terminology.getModel(resourceURI), RDFFormat.TURTLE_PRETTY);
        return turtle.toString();
    }

    /**
     * Parses a TBX document into a DOM tree and builds the terminology from
     * its root element.
     *
     * References to system identifiers ending in ".dtd" are answered with an
     * empty document; all other entities are left to the parser.
     *
     * @param input Input to read the TBX XML from
     * @param mappings Mappings
     * @return The terminology created from the document's root element
     */
    public TBX_Terminology convert(Reader input, Mappings mappings) throws IOException, ParserConfigurationException, TBXFormatException, SAXException {
        final DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        // Empty the statically accessed agent map before parsing
        // (presumably so entries of an earlier conversion do not carry over - confirm).
        TransacNote.mapAgents.clear();
        builder.setEntityResolver(new EntityResolver() {
            @Override
            public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
                // An empty source stands in for any DTD; null hands every
                // other entity back to the parser's default resolution.
                return systemId.endsWith(".dtd") ? new InputSource(new StringReader("")) : null;
            }
        });

        // Build the DOM and continue from its root element (the martif metadata is read from there)
        final Element root = builder.parse(new InputSource(input)).getDocumentElement();
        return createTerminology(root, mappings);
    }

    /**
     * Processes the whole TBX file from the root XML element (once the DOM model is built).
     *
     * The header is read from the {@code martifHeader} child, every term found
     * in a {@code text} child is added to the terminology, and any other child
     * element is rejected. The default language of the mappings is set to "en".
     *
     * @param root The root element
     * @param mappings Mappings; its {@code defaultLanguage} field is overwritten here
     * @return The terminology holding the header and all terms found
     * @throws TBXFormatException (via {@code unexpected}) if a child other than
     *         {@code text} or {@code martifHeader} is found
     */
    TBX_Terminology createTerminology(Element root, Mappings mappings) throws IOException, SAXException {

        MartifHeader header = processMartifHeader(XMLUtils.child(root, "martifHeader"), mappings);
        final TBX_Terminology terminology = new TBX_Terminology(root.getAttribute("type"), header);
        mappings.defaultLanguage = "en";

        for (Element e : XMLUtils.children(root)) {
            final String tag = e.getTagName();
            if (tag.equalsIgnoreCase("text")) {
                for (Term t : processText(e, mappings)) {
                    terminology.addTerm(t);
                }
            } else if (!tag.equalsIgnoreCase("martifHeader")) {
                // Report the offending child itself; reporting the root would
                // hide which element was actually unexpected.
                unexpected(e);
            }
        }
        return terminology;
    }

    
        
    /**
     * Builds the Martif header from the given header element.
     *
     * The {@code fileDesc} child is parsed into the header's file description;
     * the child nodes of {@code encodingDesc} and {@code revisionDesc} are
     * stored on the header as they are. Any other child element is rejected.
     *
     * @param root The martifHeader XML element
     * @param mappings Mappings
     * @return The populated header
     */
    MartifHeader processMartifHeader(Element root, Mappings mappings) throws IOException, SAXException {
        final FileDesc fileDesc = processFileDescrip(XMLUtils.child(root, "fileDesc"), mappings);
        final MartifHeader header = new MartifHeader(fileDesc);
        processID(header, root);
        for (Element child : XMLUtils.children(root)) {
            final String tag = child.getTagName();
            if (tag.equalsIgnoreCase("encodingDesc")) {
                header.encodingDesc = child.getChildNodes();
                continue;
            }
            if (tag.equalsIgnoreCase("revisionDesc")) {
                header.revisionDesc = child.getChildNodes();
                continue;
            }
            if (tag.equalsIgnoreCase("fileDesc")) {
                // consumed above when the header was created
                continue;
            }
            unexpected(child);
        }
        return header;
    }

    /**
     * Obtains a FileDesc object by parsing a {@code fileDesc} XML element,
     * the nesting element whose children describe the TBX document instance.
     *
     * {@code titleStmt} is parsed, {@code publicationStmt} is stored as an
     * element, and each {@code sourceDesc} is appended to the list of source
     * descriptions. Any other child element is rejected.
     *
     * @param root The fileDesc element
     * @param mappings Mappings
     * @return The populated file description
     */
    public FileDesc processFileDescrip(Element root, Mappings mappings) throws IOException, SAXException {
        final FileDesc result = new FileDesc();
        for (Element child : XMLUtils.children(root)) {
            final String tag = child.getTagName();
            if (tag.equalsIgnoreCase("titleStmt")) {
                result.titleStmt = processTitleStmt(child, mappings);
            } else if (tag.equalsIgnoreCase("publicationStmt")) {
                result.publicationStmt = child;
            } else if (tag.equalsIgnoreCase("sourceDesc")) {
                result.sourceDesc.add(child);
            } else {
                unexpected(child);
            }
        }
        return result;
    }

    /**
     * Builds a TitleStmt from a {@code titleStmt} element.
     *
     * The text of the {@code title} child becomes the title; {@code xml:lang}
     * and {@code id} are copied from the titleStmt element and, separately,
     * from the title element when present. Every {@code note} child is
     * collected on the statement.
     *
     * @param root The titleStmt element
     * @param mappings Mappings (not used by this method)
     * @return The populated title statement
     */
    private TitleStmt processTitleStmt(Element root, Mappings mappings) {
        // Look the title child up once and reuse it for text and attributes
        final Element title = XMLUtils.child(root, "title");
        final TitleStmt titleStmt = new TitleStmt(title.getTextContent());
        if (root.hasAttribute("xml:lang")) {
            titleStmt.lang = root.getAttribute("xml:lang");
        }
        if (root.hasAttribute("id")) {
            titleStmt.id = root.getAttribute("id");
        }
        if (title.hasAttribute("xml:lang")) {
            titleStmt.title_lang = title.getAttribute("xml:lang");
        }
        // The title's id must be guarded by the title's own attribute, not the
        // parent's: otherwise a title id is lost whenever the parent has none.
        if (title.hasAttribute("id")) {
            titleStmt.title_id = title.getAttribute("id");
        }
        for (Element e : XMLUtils.children(root)) {
            if (e.getTagName().equalsIgnoreCase("note")) {
                titleStmt.notes.add(e);
            }
        }

        return titleStmt;
    }

    /**
     * Collects the terms of a {@code text} element by processing its
     * {@code body} and {@code back} children. Any other child element is
     * rejected.
     *
     * @param root The text element
     * @param mappings Mappings
     * @return The terms gathered from all body and back children
     */
    Collection<Term> processText(Element root, Mappings mappings) throws IOException, SAXException {
        final Collection<Term> collected = new HashSet<Term>();
        for (Element child : XMLUtils.children(root)) {
            final String tag = child.getTagName();
            if (tag.equalsIgnoreCase("body")) {
                collected.addAll(processBody(child, mappings));
                continue;
            }
            if (tag.equalsIgnoreCase("back")) {
                collected.addAll(processBack(child, mappings));
                continue;
            }
            unexpected(child);
        }
        return collected;
    }

    /**
     * Processes the collection of terms of a {@code body} element: one Term
     * per {@code termEntry} child. Any other child element is rejected.
     *
     * @param root The body element
     * @param mappings Mappings
     * @return The terms created from the termEntry children
     */
    private Collection<? extends Term> processBody(Element root, Mappings mappings) {
        final Collection<Term> collected = new HashSet<Term>();
        for (Element child : XMLUtils.children(root)) {
            if (!child.getTagName().equalsIgnoreCase("termEntry")) {
                unexpected(child);
            } else {
                collected.add(processTermEntry(child, mappings));
            }
        }
        return collected;
    }

    /**
     * Placeholder for processing a {@code back} element; currently nothing is
     * extracted from it.
     *
     * @param root The back element (ignored)
     * @param mappings Mappings (ignored)
     * @return Always an empty, immutable collection
     */
    private Collection<? extends Term> processBack(Element root, Mappings mappings) {
        // TODO: This should do something right?
        // Typed factory instead of the raw EMPTY_LIST constant, so no unchecked conversion is needed.
        return Collections.<Term>emptyList();
    }

    /**
     * Creates a Term from a {@code termEntry} element.
     *
     * The element's {@code id} attribute becomes the term ID. Each
     * {@code langSet} child is processed as a language set; every other child
     * is treated as auxiliary information. A missing langSet only produces a
     * warning in the log.
     *
     * @param node The termEntry element
     * @param mappings Mappings
     * @return A Term
     */
    Term processTermEntry(Element node, Mappings mappings) {
        // <!ELEMENT termEntry  ((%auxInfo;),(langSet+)) >
        // <!ATTLIST termEntry
        // id ID #IMPLIED >
        // <!ENTITY % auxInfo '(descrip | descripGrp | admin | adminGrp | transacGrp | note | ref | xref)*' >
        final Term term = new Term();
        term.setID(node.getAttribute("id"));

        boolean sawLangSet = false;
        for (Element child : XMLUtils.children(node)) {
            if (child.getTagName().equalsIgnoreCase("langSet")) {
                sawLangSet = true;
                processLangSet(term, child, mappings);
            } else {
                processAuxInfo(term, child, mappings);
            }
        }

        if (!sawLangSet) {
            // The DTD requires at least one langSet, but this is tolerated
            logger.warn("No langSet element in termEntry");
        }

        return term;
    }

    /**
     * Builds a Reference from a {@code ref} element and appends it to the
     * references of the given holder. The {@code id}, {@code target} and
     * {@code datatype} attributes are copied only when present.
     *
     * @param descr Holder that receives the reference
     * @param sub The ref element
     * @param mappings Mappings used to resolve the required type attribute
     */
    void processReference(NoteLinkInfo descr, Element sub, Mappings mappings) {
        // <!ELEMENT ref (#PCDATA) >
        // <!ATTLIST ref
        //    %impIDLangTypTgtDtyp;
        // >
        //
        // <!ENTITY % impIDLangTypTgtDtyp ' id ID #IMPLIED
        //   xml:lang CDATA #IMPLIED
        //   type CDATA #REQUIRED
        //   target IDREF #IMPLIED
        //   datatype CDATA #IMPLIED
        // '>
        final Mapping type = processType(sub, mappings, true);
        final String lang = sub.getAttribute("xml:lang");
        final Reference reference = new Reference(type, lang, mappings, sub.getChildNodes());

        if (sub.hasAttribute("id")) {
            reference.setID(sub.getAttribute("id"));
        }
        if (sub.hasAttribute("target")) {
            reference.target = sub.getAttribute("target");
        }
        if (sub.hasAttribute("datatype")) {
            reference.datatype = sub.getAttribute("datatype");
        }
        descr.References.add(reference);
    }

    /**
     * Processes an {@code adminGrp} element, attaching its contents to the
     * given holder. The group's {@code id} (when present) is set on the holder.
     *
     * @param descr Holder that receives the admin info, notes and references
     * @param node The adminGrp element
     * @param mappings Mappings
     * @throws TBXFormatException if the first child is not {@code admin}, or if
     *         a child is none of admin, adminNote, note, ref, xref
     */
    void processAdminGrp(NoteLinkInfo descr, Element node, Mappings mappings) {
        // <!ELEMENT adminGrp (admin, (adminNote|note|ref|xref)*) >
        // <!ATTLIST adminGrp
        // id ID #IMPLIED >

        processID((impID) descr, node);

        boolean first = true;
        for (Element child : XMLUtils.children(node)) {

            final String name = child.getNodeName();

            if (first && !name.equals("admin")) {
                // Message now names the element actually being checked
                throw new TBXFormatException("First element of adminGrp is not admin !\n");
            }
            first = false;

            if (name.equals("admin")) {
                processAdmin(descr, child, mappings);
            } else if (name.equals("adminNote")) {
                // NOTE(review): an adminNote is fed back into this method, so only its
                // id is applied to the holder and its element children (if any) are
                // validated as if it were an adminGrp - confirm this is intended.
                processAdminGrp(descr, child, mappings);
            } else if (name.equals("note")) {
                processNote(descr, child, mappings);
            } else if (name.equals("ref")) {
                this.processReference(descr, child, mappings);
            } else if (name.equals("xref")) {
                this.processXReference(descr, child, mappings);
            } else {
                throw new TBXFormatException("Element " + name + " not defined by TBX standard");
            }
        }

    }

    /**
     * Processes a {@code langSet} element of a term entry.
     *
     * For every {@code tig} or {@code ntig} child a new LexicalEntry in the
     * langSet's language is created, filled and added to the term; every other
     * child is treated as auxiliary information of the term. The langSet's
     * {@code id} (when present) is set on the term.
     *
     * @param term The term the lexical entries are added to
     * @param langSet The langSet element
     * @param mappings Mappings
     * @return The same term that was passed in
     * @throws TBXFormatException if no language is specified, or if the
     *         langSet contains neither a tig nor an ntig
     */
    Term processLangSet(Term term, Element langSet, Mappings mappings) {

        // <!ELEMENT langSet ((%auxInfo;), (tig | ntig)+) >
        // <!ATTLIST langSet
        // id ID #IMPLIED
        // xml:lang CDATA #REQUIRED >

        final String language = XMLUtils.getValueOfAttribute(langSet, "xml:lang");
        if (language == null) {
            throw new TBXFormatException("Language not specified for langSet!");
        }

        processID(term, langSet);

        boolean foundTerm = false;
        for (Element child : XMLUtils.children(langSet)) {
            final String name = child.getNodeName();
            final boolean isNtig = name.equals("ntig");

            if (!isNtig && !name.equals("tig")) {
                processAuxInfo(term, child, mappings);
                continue;
            }

            foundTerm = true;
            final LexicalEntry entry = new LexicalEntry(language, mappings);
            if (isNtig) {
                processNTIG(entry, child, mappings);
            } else {
                processTIG(entry, child, mappings);
            }
            term.Lex_entries.add(entry);
        }

        if (!foundTerm) {
            throw new TBXFormatException("No TIG nor NTIG in langSet !");
        }

        return term;
    }

    /**
     * Fills a lexical entry from a {@code tig} element. The tig's {@code id}
     * (when present) is set on the entry; the {@code term} child provides the
     * lemma, each {@code termNote} is wrapped in a group using the default
     * language of the mappings, and every other child is treated as auxiliary
     * information.
     *
     * @param entry The lexical entry to fill
     * @param tig The tig element
     * @param mappings Mappings
     * @throws TBXFormatException if the first child is not {@code term}
     */
    void processTIG(LexicalEntry entry, Element tig, Mappings mappings) {

        // <!ELEMENT tig (term, (termNote)*, %auxInfo;) >
        // <!ATTLIST tig
        // id ID #IMPLIED >
        processID(entry, tig);

        boolean first = true;
        for (Element child : XMLUtils.children(tig)) {
            final String name = child.getNodeName();
            final boolean isTerm = name.equals("term");

            if (first && !isTerm) {
                throw new TBXFormatException("First element of TIG is not term !\n");
            }
            first = false;

            if (isTerm) {
                processTerm(entry, child, mappings);
            } else if (name.equals("termNote")) {
                entry.TermNotes.add(new TermNoteGrp(processTermNote(child, mappings), mappings.defaultLanguage, mappings));
            } else {
                processAuxInfo(entry, child, mappings);
            }
        }

    }

    /**
     * Stores the text content of a {@code term} element as the lemma of the
     * given lexical entry.
     *
     * @param entry The lexical entry whose lemma is set
     * @param node The term element
     * @param mappings Mappings (not used by this method)
     */
    void processTerm(LexicalEntry entry, Element node, Mappings mappings) {
        // <!ELEMENT term %basicText; >
        // <!ATTLIST term
        // id ID #IMPLIED >
        final String lemma = node.getTextContent();
        entry.Lemma = lemma;
    }

    /**
     * Creates a TermNote from a {@code termNote} element, using its child
     * nodes as value, its required type and its {@code xml:lang} attribute.
     *
     * @param tig_child The termNote element
     * @param mappings Mappings used to resolve the type
     * @return The term note, with id/target/datatype copied when present
     */
    TermNote processTermNote(Element tig_child, Mappings mappings) {
        // <!ELEMENT termNote %noteText; >
        // <!ATTLIST termNote
        //    %impIDLangTypTgtDtyp;
        // >
        // <!ENTITY % impIDLangTypTgtDtyp ' id ID #IMPLIED
        // xml:lang CDATA #IMPLIED type CDATA #REQUIRED target IDREF #IMPLIED datatype CDATA #IMPLIED
        // '>
        final NodeList value = tig_child.getChildNodes();
        final Mapping type = processType(tig_child, mappings, true);
        final String lang = tig_child.getAttribute("xml:lang");

        final TermNote termNote = new TermNote(value, type, lang, mappings);
        processImpIDLangTypeTgtDType(termNote, tig_child, mappings);
        return termNote;
    }

    /**
     * Fills a lexical entry from an {@code ntig} element: the {@code termGrp}
     * child is processed as a term group, every other child as auxiliary
     * information.
     *
     * @param entry The lexical entry to fill
     * @param ntig The ntig node
     * @param mappings Mappings
     * @throws TBXFormatException if the first child is not {@code termGrp} and
     *         {@code Main.lenient} is false
     */
    void processNTIG(LexicalEntry entry, Node ntig, Mappings mappings) {

        // <!ELEMENT ntig (termGrp, %auxInfo;) >
        // <!ATTLIST ntig
        // id ID #IMPLIED
        // >
        boolean first = true;
        for (Element child : XMLUtils.children(ntig)) {
            final String name = child.getNodeName();
            final boolean isTermGrp = name.equals("termGrp");

            // A wrong leading element is tolerated when running leniently
            if (first && !isTermGrp && !Main.lenient) {
                throw new TBXFormatException("First element of NTIG is not termGrp !\n");
            }
            first = false;

            if (isTermGrp) {
                processTermGroup(entry, child, mappings);
            } else {
                processAuxInfo(entry, child, mappings);
            }
        }
    }

    /**
     * Builds an XReference from an {@code xref} element (target attribute and
     * text content) and appends it to the cross references of the holder. The
     * type is optional here.
     *
     * @param descr Holder that receives the cross reference
     * @param node The xref element
     * @param mappings Mappings used to resolve the type
     */
    void processXReference(NoteLinkInfo descr, Element node, Mappings mappings) {
        // <!ELEMENT xref (#PCDATA) >
        // <!ATTLIST xref
        // %impIDType;
        // target CDATA #REQUIRED >
        final String target = XMLUtils.getValueOfAttribute(node, "target");
        final XReference crossRef = new XReference(target, node.getTextContent());
        processID(crossRef, node);
        crossRef.type = processType(node, mappings, false);
        descr.Xreferences.add(crossRef);
    }

    /**
     * Processes a {@code descripGrp} element and appends the resulting group
     * to the descriptions of the given holder.
     *
     * The group is created from its {@code descrip} child; the remaining
     * children (descripNote, admin, adminGrp, transacGrp, note, ref, xref) are
     * attached to the group.
     *
     * @param descr Holder that receives the description group
     * @param node The descripGrp element
     * @param mappings Mappings
     * @throws TBXFormatException if a child is none of the elements listed above
     */
    void processDescripGroup(Describable descr, Element node, Mappings mappings) {

        // The DTD for a DescripGroup is as follows
        // <!ELEMENT descripGrp (descrip, (descripNote|admin|adminGrp|transacGrp|note|ref|xref)*)
        // >
        // <!ATTLIST descripGrp
        //  id ID #IMPLIED >

        DescripGrp descrip = new DescripGrp(processDescrip(XMLUtils.firstChild("descrip", node), mappings));
        processID(descrip, node);

        for (Element sub : XMLUtils.children(node)) {
            final String name = sub.getTagName();
            if (name.equalsIgnoreCase("descrip")) {
                // already consumed when the group was created above
            } else if (name.equalsIgnoreCase("descripNote")) {
                processDescripNote(descrip, sub, mappings);
            } else if (name.equalsIgnoreCase("admin")) {
                this.processAdmin(descrip, sub, mappings);
            } else if (name.equalsIgnoreCase("adminGrp")) {
                this.processAdminGrp(descrip, sub, mappings);
            } else if (name.equalsIgnoreCase("transacGrp")) {
                this.processTransactionGroup(descrip, sub, mappings);
            } else if (name.equalsIgnoreCase("note")) {
                // A note is a note, not a transaction group: route it to the
                // note handler so it ends up in the group's notes.
                this.processNote(descrip, sub, mappings);
            } else if (name.equalsIgnoreCase("ref")) {
                this.processReference(descrip, sub, mappings);
            } else if (name.equalsIgnoreCase("xref")) {
                this.processXReference(descrip, sub, mappings);
            } else {
                // Name the unexpected child, not the enclosing descripGrp
                throw new TBXFormatException("Unexpected subnode " + name);
            }
        }

        descr.Descriptions.add(descrip);
    }

    /**
     * Creates an AdminInfo from an {@code admin} element, wraps it in an
     * AdminGrp and appends that group to the admin infos of the holder.
     *
     * @param descr Holder that receives the admin group
     * @param node The admin element
     * @param mappings Mappings used to resolve the required type
     */
    void processAdmin(NoteLinkInfo descr, Element node, Mappings mappings) {
        // <!ELEMENT admin %noteText; >
        // <!ATTLIST admin
        //  %impIDLangTypTgtDtyp;
        //>
        final NodeList value = node.getChildNodes();
        final Mapping type = processType(node, mappings, true);
        final String lang = node.getAttribute("xml:lang");

        final AdminInfo info = new AdminInfo(value, type, lang, mappings);
        processImpIDLangTypeTgtDType(info, node, mappings);
        descr.AdminInfos.add(new AdminGrp(info));
    }

    /**
     * Processes a transaction group (www.isocat.org/datcat/DC-162) and appends
     * it to the transactions of the given holder.
     *
     * According to the TBX DTD a transacGrp starts with a {@code transac}
     * element, followed by any number of transacNote, date, note, ref and
     * xref elements. Example:
     * <pre>
     * &lt;transacGrp&gt;
     *   &lt;transac type="transactionType"&gt;creation&lt;/transac&gt;
     *   &lt;transacNote type="responsibility" target="CA5365"&gt;John Harris&lt;/transacNote&gt;
     *   &lt;date&gt;2008-05-12&lt;/date&gt;
     * &lt;/transacGrp&gt;
     * </pre>
     *
     * If looking up the {@code transac} child raises an exception, the group
     * is skipped (best effort) and a warning is logged.
     *
     * @param descr Holder that receives the transaction group
     * @param elem The transacGrp element
     * @param mappings Mappings
     * @throws TBXFormatException if the first child is not {@code transac}, or
     *         if a child is not defined for transacGrp
     */
    void processTransactionGroup(NoteLinkInfo descr, Element elem, Mappings mappings) {

        // <!ELEMENT transacGrp (transac, (transacNote|date|note|ref|xref)* ) >
        // <!ATTLIST transacGrp
        // id ID #IMPLIED >
        Element elemTransac = null;
        try {
            elemTransac = XMLUtils.firstChild("transac", elem);
        } catch (Exception e) {
            // Still best effort, but leave a trace instead of dropping the group silently
            logger.warn("Ignoring <" + elem.getTagName() + ">: could not locate its <transac> child", e);
            return;
        }
        final TransacGrp transacGrp = new TransacGrp(processTransac(elemTransac, mappings));

        boolean first = true;
        for (Element child : XMLUtils.children(elem)) {

            final String name = child.getNodeName();

            if (first && !name.equals("transac")) {
                // Message now names the element actually being checked
                throw new TBXFormatException("First element of transacGrp is not transac !\n");
            }
            first = false;

            if (name.equals("transac")) {
                // already consumed when the group was created above
            } else if (name.equals("transacNote")) {
                processTransacNote(transacGrp, child, mappings);
            } else if (name.equals("date")) {
                processDate(transacGrp, child, mappings);
            } else if (name.equals("note")) {
                processNote(transacGrp, child, mappings);
            } else if (name.equals("xref")) {
                processXReference(transacGrp, child, mappings);
            } else if (name.equals("ref")) {
                this.processReference(transacGrp, child, mappings);
            } else {
                throw new TBXFormatException("Element " + name + " not defined by TBX standard\n");
            }
        }
        descr.Transactions.add(transacGrp);
    }

    /**
     * Fills a lexical entry from a {@code termGrp} element: the term provides
     * the lemma, term notes and term note groups are added to the entry's term
     * notes, and term component lists are processed as decomposition. Other
     * children are ignored.
     *
     * @param entry The lexical entry to fill
     * @param node The termGrp element
     * @param mappings Mappings
     */
    void processTermGroup(LexicalEntry entry, Element node, Mappings mappings) {
        // <!ELEMENT termGrp (term, (termNote|termNoteGrp)*, (termCompList)* ) >
        // <!ATTLIST termGrp
        //  id ID #IMPLIED
        //>
        for (Element child : XMLUtils.children(node)) {
            final String tag = child.getTagName();
            if (tag.equalsIgnoreCase("term")) {
                processTerm(entry, child, mappings);
                continue;
            }
            if (tag.equalsIgnoreCase("termNote")) {
                // A bare note is wrapped in a group using the default language
                entry.TermNotes.add(new TermNoteGrp(processTermNote(child, mappings), mappings.defaultLanguage, mappings));
                continue;
            }
            if (tag.equalsIgnoreCase("termNoteGrp")) {
                entry.TermNotes.add(processTermNoteGrp(child, mappings));
                continue;
            }
            if (tag.equalsIgnoreCase("termCompList")) {
                processTermCompList(entry, child, mappings);
            }
        }
    }

    /**
     * Creates a Note from a {@code note} element (child nodes and
     * {@code xml:lang}) and appends it to the notes of the holder.
     *
     * @param descr Holder that receives the note
     * @param elem The note element
     * @param mappings Mappings
     */
    void processNote(NoteLinkInfo descr, Element elem, Mappings mappings) {
        //<!ELEMENT note %noteText; >
        //<!ATTLIST note %impIDLang;
        //>
        final NodeList value = elem.getChildNodes();
        final String lang = elem.getAttribute("xml:lang");
        final Note created = new Note(value, lang, mappings);
        processID(created, elem);
        descr.notes.add(created);
    }

    /**
     * Creates a Descrip from a {@code descrip} element, using its child nodes
     * as value, its required type and its {@code xml:lang} attribute.
     *
     * @param elem The descrip element
     * @param mappings Mappings used to resolve the type
     * @return The description, with id/target/datatype copied when present
     */
    Descrip processDescrip(Element elem, Mappings mappings) {
        //<!ELEMENT descrip %noteText; >
        //<!ATTLIST descrip
        //%impIDLangTypTgtDtyp;
        //>
        final NodeList value = elem.getChildNodes();
        final Mapping type = processType(elem, mappings, true);
        final String lang = elem.getAttribute("xml:lang");

        final Descrip created = new Descrip(value, type, lang, mappings);
        processImpIDLangTypeTgtDType(created, elem, mappings);
        return created;
    }

    /**
     * Creates a DescripNote from a {@code descripNote} element and appends it
     * to the notes of the given description group.
     *
     * @param descrip The description group that receives the note
     * @param sub The descripNote element
     * @param mappings Mappings used to resolve the required type
     */
    void processDescripNote(DescripGrp descrip, Element sub, Mappings mappings) {
        // <!ELEMENT descripNote (#PCDATA) >
        //<!ATTLIST descripNote
        //%impIDLangTypTgtDtyp;
        //>
        final NodeList value = sub.getChildNodes();
        final Mapping type = processType(sub, mappings, true);
        final String lang = sub.getAttribute("xml:lang");

        final DescripNote created = new DescripNote(value, type, lang, mappings);
        processImpIDLangTypeTgtDType(created, sub, mappings);
        descrip.descripNote.add(created);
    }

    /**
     * Creates a Transaction from a {@code transac} element, using its child
     * nodes as value, its required type and its {@code xml:lang} attribute.
     *
     * @param child The transac element
     * @param mappings Mappings used to resolve the type
     * @return The transaction, with id/target/datatype copied when present
     */
    Transaction processTransac(Element child, Mappings mappings) {
        //  <!ELEMENT transac (#PCDATA) >
        //<!ATTLIST transac
        //%impIDLangTypTgtDtyp;
        //>
        final NodeList value = child.getChildNodes();
        final Mapping type = processType(child, mappings, true);
        final String lang = child.getAttribute("xml:lang");

        final Transaction created = new Transaction(value, type, lang, mappings);
        processImpIDLangTypeTgtDType(created, child, mappings);
        return created;
    }

    /**
     * Creates a TransacNote from a {@code transacNote} element and appends it
     * to the notes of the given transaction group.
     *
     * @param transacGrp The transaction group that receives the note
     * @param child The transacNote element
     * @param mappings Mappings used to resolve the required type
     */
    void processTransacNote(TransacGrp transacGrp, Element child, Mappings mappings) {
        //<!ELEMENT transacNote (#PCDATA) >
        //<!ATTLIST transacNote
        //%impIDLangTypTgtDtyp;
        //>
        final NodeList value = child.getChildNodes();
        final Mapping type = processType(child, mappings, true);
        final String lang = child.getAttribute("xml:lang");

        final TransacNote created = new TransacNote(value, type, lang, mappings);
        processImpIDLangTypeTgtDType(created, child, mappings);
        transacGrp.transacNotes.add(created);
    }

    /**
     * Stores the text content of a {@code date} element as the date of the
     * given transaction group. The text is taken over without parsing.
     *
     * @param transacGrp The transaction group whose date is set
     * @param child The date element
     * @param mappings Mappings (not used by this method)
     */
    void processDate(TransacGrp transacGrp, Element child, Mappings mappings) {
        //  <!ELEMENT date (#PCDATA) >
        //<!ATTLIST date
        //id ID #IMPLIED
        //>
        final String dateText = child.getTextContent();
        transacGrp.date = dateText;
    }

    /**
     * Creates a TermNoteGrp from a {@code termNoteGrp} element. The group is
     * built from its {@code termNote} child; admin, adminGrp, transacGrp,
     * note, ref and xref children are attached to the group. Other children
     * are ignored.
     *
     * @param elem The termNoteGrp element
     * @param mappings Mappings
     * @return The populated term note group
     */
    TermNoteGrp processTermNoteGrp(Element elem, Mappings mappings) {
        //  <!ELEMENT termNoteGrp (termNote, %noteLinkInfo;) >
        //<!ATTLIST termNoteGrp
        //id ID #IMPLIED
        //>
        final TermNote leadingNote = processTermNote(XMLUtils.firstChild("termNote", elem), mappings);
        final TermNoteGrp group = new TermNoteGrp(leadingNote, elem.getAttribute("xml:lang"), mappings);

        for (Element child : XMLUtils.children(elem)) {
            final String tag = child.getTagName();
            if (tag.equalsIgnoreCase("termNote")) {
                // consumed above when the group was created
                continue;
            }
            if (tag.equalsIgnoreCase("admin")) {
                processAdmin(group, child, mappings);
            } else if (tag.equalsIgnoreCase("adminGrp")) {
                processAdminGrp(group, child, mappings);
            } else if (tag.equalsIgnoreCase("transacGrp")) {
                processTransactionGroup(group, child, mappings);
            } else if (tag.equalsIgnoreCase("note")) {
                processNote(group, child, mappings);
            } else if (tag.equalsIgnoreCase("ref")) {
                processReference(group, child, mappings);
            } else if (tag.equalsIgnoreCase("xref")) {
                processXReference(group, child, mappings);
            }
        }
        return group;
    }

    /**
     * Processes a {@code termCompList} element and appends the resulting list
     * to the decomposition of the lexical entry.
     *
     * Bare {@code termComp} children are wrapped in a group; termCompGrp,
     * admin, adminGrp, transacGrp, note, ref and xref children are attached to
     * the list. Other children are ignored.
     *
     * @param entry The lexical entry that receives the component list
     * @param elem The termCompList element
     * @param mappings Mappings used to resolve the list's type attribute
     */
    void processTermCompList(LexicalEntry entry, Element elem, Mappings mappings) {
        // <!ELEMENT termCompList ((%auxInfo;), (termComp | termCompGrp)+) >
        //<!ATTLIST termCompList
        //id ID #IMPLIED
        //type CDATA #REQUIRED
        //>
        final TermCompList compList = new TermCompList(mappings.getMapping("termCompList", "type", elem.getAttribute("type")));
        processID(compList, elem);

        for (Element child : XMLUtils.children(elem)) {
            final String tag = child.getTagName();
            if (tag.equalsIgnoreCase("termComp")) {
                final TermComp component = processTermComp(child, mappings);
                compList.termComp.add(new TermCompGrp(component, null, mappings));
                continue;
            }
            if (tag.equalsIgnoreCase("termCompGrp")) {
                processTermCompGrp(compList, child, mappings);
                continue;
            }
            if (tag.equalsIgnoreCase("admin")) {
                processAdmin(compList, child, mappings);
            } else if (tag.equalsIgnoreCase("adminGrp")) {
                processAdminGrp(compList, child, mappings);
            } else if (tag.equalsIgnoreCase("transacGrp")) {
                processTransactionGroup(compList, child, mappings);
            } else if (tag.equalsIgnoreCase("note")) {
                processNote(compList, child, mappings);
            } else if (tag.equalsIgnoreCase("ref")) {
                processReference(compList, child, mappings);
            } else if (tag.equalsIgnoreCase("xref")) {
                processXReference(compList, child, mappings);
            }
        }
        entry.Decomposition.add(compList);

    }

    /**
     * Creates a TermComp from a {@code termComp} element, using its text
     * content and {@code xml:lang} attribute; the {@code id} is copied when
     * present.
     *
     * @param e The termComp element
     * @param mappings Mappings
     * @return The term component
     */
    TermComp processTermComp(Element e, Mappings mappings) {
        //<!ELEMENT termComp (#PCDATA) >
        //<!ATTLIST termComp
        // %impIDLang;
        //>
        final String text = e.getTextContent();
        final String lang = e.getAttribute("xml:lang");
        final TermComp component = new TermComp(text, lang, mappings);
        processID(component, e);
        return component;
    }

    /**
     * Processes a {@code termCompGrp} element and appends the resulting group
     * to the given component list.
     *
     * The group is built from its {@code termComp} child; term notes and term
     * note groups are added to the group. admin, adminGrp, transacGrp, note,
     * ref and xref children are attached to the enclosing list. Other children
     * are ignored.
     *
     * @param termCompList The list that receives the group
     * @param elem The termCompGrp element
     * @param mappings Mappings
     */
    void processTermCompGrp(TermCompList termCompList, Element elem, Mappings mappings) {
        //<!ELEMENT termCompGrp (termComp, (termNote|termNoteGrp)*, %noteLinkInfo;) >
        //<!ATTLIST termCompGrp
        //id ID #IMPLIED
        //>
        final TermComp component = processTermComp(XMLUtils.firstChild("termComp", elem), mappings);
        final TermCompGrp group = new TermCompGrp(component, null, mappings);

        for (Element child : XMLUtils.children(elem)) {
            final String tag = child.getTagName();
            if (tag.equalsIgnoreCase("termNote")) {
                group.termNoteGrps.add(new TermNoteGrp(processTermNote(child, mappings), null, mappings));
                continue;
            }
            if (tag.equalsIgnoreCase("termNoteGrp")) {
                group.termNoteGrps.add(processTermNoteGrp(child, mappings));
                continue;
            }
            // NOTE(review): the auxiliary elements below are attached to the enclosing
            // list rather than to this group - confirm that is the intended target.
            if (tag.equalsIgnoreCase("admin")) {
                processAdmin(termCompList, child, mappings);
            } else if (tag.equalsIgnoreCase("adminGrp")) {
                processAdminGrp(termCompList, child, mappings);
            } else if (tag.equalsIgnoreCase("transacGrp")) {
                processTransactionGroup(termCompList, child, mappings);
            } else if (tag.equalsIgnoreCase("note")) {
                processNote(termCompList, child, mappings);
            } else if (tag.equalsIgnoreCase("ref")) {
                processReference(termCompList, child, mappings);
            } else if (tag.equalsIgnoreCase("xref")) {
                processXReference(termCompList, child, mappings);
            }
        }
        termCompList.termComp.add(group);
    }





    /**
     * Rejects a node that is not allowed at its position. This method never
     * returns normally.
     *
     * @param n The offending node
     * @throws TBXFormatException always; the message contains the tag name
     *         when the node is an element
     */
    private void unexpected(Node n) {
        if (!(n instanceof Element)) {
            throw new TBXFormatException("Unexpected");
        }
        final String tag = ((Element) n).getTagName();
        throw new TBXFormatException("Unexpected " + tag);
    }

    /**
     * Copies the {@code id} attribute of an XML element to the given object,
     * if the element has one; otherwise the object is left untouched.
     *
     * @param elem The object whose ID is set
     * @param node The XML element to read the id from
     */
    private void processID(impID elem, Element node) {
        if (!node.hasAttribute("id")) {
            return;
        }
        elem.setID(node.getAttribute("id"));
    }

    /**
     * Copies the optional {@code id}, {@code target} and {@code datatype}
     * attributes of an XML element to the given object. Attributes that are
     * absent leave the corresponding value untouched.
     *
     * @param ref The object that receives the attribute values
     * @param sub The XML element to read the attributes from
     * @param mappings Mappings (not used by this method)
     */
    private void processImpIDLangTypeTgtDType(impIDLangTypeTgtDtyp ref, Element sub, Mappings mappings) {
        // <!ENTITY % impIDLangTypTgtDtyp '
        //  id ID #IMPLIED
        //  xml:lang CDATA #IMPLIED
        //  type CDATA #REQUIRED
        //  target IDREF #IMPLIED
        //  datatype CDATA #IMPLIED
        // '>
        if (sub.hasAttribute("id")) {
            ref.setID(sub.getAttribute("id"));
        }
        if (sub.hasAttribute("target")) {
            ref.target = sub.getAttribute("target");
        }
        if (sub.hasAttribute("datatype")) {
            ref.datatype = sub.getAttribute("datatype");
        }
        // The former empty "subjectField" check (left-over debug code) had no effect and was dropped.
    }

    /**
     * Dispatches one auxiliary-information child element to the processor matching its tag name.
     *
     * The TBX content model being handled is:
     * <pre>
     *   &lt;!ENTITY % auxInfo '(descrip | descripGrp | admin | adminGrp | transacGrp | note | ref | xref)*' &gt;
     * </pre>
     * Tag names are compared case-insensitively.
     *
     * @param term object the information is attached to
     * @param sub the child element to process
     * @param mappings mappings handed on to the individual processors
     * @throws TBXFormatException if the tag name is none of the eight listed above
     */
    private void processAuxInfo(Describable term, Element sub, Mappings mappings) {
        final String tag = sub.getTagName();
        if (tag.equalsIgnoreCase("descrip")) {
            // A bare descrip is wrapped in a group so that it is stored like a descripGrp.
            final DescripGrp wrapped = new DescripGrp(processDescrip(sub, mappings));
            term.Descriptions.add(wrapped);
            return;
        }
        if (tag.equalsIgnoreCase("descripGrp")) {
            processDescripGroup(term, sub, mappings);
            return;
        }
        if (tag.equalsIgnoreCase("admin")) {
            processAdmin(term, sub, mappings);
            return;
        }
        if (tag.equalsIgnoreCase("adminGrp")) {
            processAdminGrp(term, sub, mappings);
            return;
        }
        if (tag.equalsIgnoreCase("transacGrp")) {
            processTransactionGroup(term, sub, mappings);
            return;
        }
        if (tag.equalsIgnoreCase("note")) {
            processNote(term, sub, mappings);
            return;
        }
        if (tag.equalsIgnoreCase("ref")) {
            processReference(term, sub, mappings);
            return;
        }
        if (tag.equalsIgnoreCase("xref")) {
            processXReference(term, sub, mappings);
            return;
        }
        throw new TBXFormatException("Element " + tag + " not defined by TBX standard");
    }


    /**
     * Looks up the mapping registered for the "type" attribute of an element.
     *
     * @param sub element whose "type" attribute is inspected
     * @param mappings mappings queried with the tag name, the literal "type" and the attribute value
     * @param required whether the "type" attribute is mandatory on this element
     * @return the mapping found, or null when there is none or when an optional attribute is absent
     * @throws TBXFormatException if the attribute is absent although required
     */
    private Mapping processType(Element sub, Mappings mappings, boolean required) {
        if (!sub.hasAttribute("type")) {
            if (required) {
                throw new TBXFormatException("type expected");
            }
            System.err.println("Null type on " + sub.getTagName());
            return null;
        }

        final String tag = sub.getTagName();
        final String typeValue = sub.getAttribute("type");
        final Mapping mapping = mappings.getMapping(tag, "type", typeValue);
        if (required && mapping == null) {
            // An unknown value is only reported; the caller receives null.
            logger.warn("Unrecognised mapping for <" + tag + " type=\"" + typeValue + "\">");
        }
        return mapping;
    }
    
    
    /**
     * Converts a TBX XML file to RDF and writes the result as N-Triples, without
     * building a DOM for the whole file.
     *
     * The file is read three times:
     * <ol>
     *   <li>a SAX pass that lets a {@code SAXHandler} collect the lexicons,</li>
     *   <li>a scan that extracts and parses the {@code martifHeader} element,</li>
     *   <li>a line-by-line scan that cuts out each {@code termEntry} element as text,
     *       parses that fragment on its own and serializes it immediately.</li>
     * </ol>
     * Term entries are located by plain string search for {@code <termEntry} and
     * {@code </termEntry>}; this is not a real XML parse of the document structure.
     * Several entries on one line, and an entry that opens and closes on the same line,
     * are supported.
     *
     * Output order: dataset/header triples, subject-field triples, one batch per term
     * entry, and finally the lexicons model.
     *
     * Entries that cannot be parsed or converted are counted as errors and skipped;
     * a progress line is printed to standard error every 1000 entries.
     *
     * @param file path to the input file
     * @param fos stream the N-Triples are written to
     * @param mappings mappings used while interpreting the TBX elements
     * @return always {@code null}; the result is only written to {@code fos}
     */
    public TBX_Terminology convertAndSerializeLargeFile(String file, PrintStream fos, Mappings mappings) {
        final String resourceURI = Main.DATA_NAMESPACE;

        // First pass: let the SAX handler collect the lexicons present in the file.
        SAXHandler handler = null;
        HashMap<String, Resource> lexicons = new HashMap<String, Resource>();
        InputStream xmlInput = null;
        try {
            xmlInput = new FileInputStream(file);
            final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
            handler = new SAXHandler(mappings);
            saxParser.parse(xmlInput, handler);
            final HashMap<String, Resource> found = handler.getLexicons();
            if (found != null) {
                lexicons = found;
            }
        } catch (Exception e) {
            logger.warn(e.getMessage());
        } finally {
            if (xmlInput != null) {
                try {
                    xmlInput.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        if (handler == null) {
            // Without a handler neither the martif type nor the lexicons model is available.
            logger.error("Could not set up the lexicon pass over " + file + "; nothing was converted");
            return null;
        }

        // Second pass: the martif header.
        final MartifHeader martifheader = extractAndReadMartifHeader(file, mappings);
        if (martifheader == null) {
            return null;
        }

        // The dataset description and the header are serialized first.
        final Model mdataset = ModelFactory.createDefaultModel();
        final Resource rdataset = mdataset.createResource(resourceURI);
        rdataset.addProperty(DCTerms.type, handler.getMartifType());
        //This should be generalized
        rdataset.addProperty(RDF.type, mdataset.createResource("http://www.w3.org/ns/dcat#Dataset"));
        rdataset.addProperty(DC.rights, IATE.rights);
        rdataset.addProperty(DC.source, IATE.iate);
        rdataset.addProperty(DC.attribution, "Download IATE, European Union, 2014");
        martifheader.toRDF(mdataset, rdataset);
        RDFDataMgr.write(fos, mdataset, Lang.NTRIPLES);

        final Model msubjectFields = SubjectFields.generateSubjectFields();
        RDFDataMgr.write(fos, msubjectFields, Lang.NTRIPLES);

        // Every lexicon is declared to have the dataset as its root resource.
        final Property prootresource = mdataset.createProperty("http://www.w3.org/TR/void/rootResource");
        for (Resource rlexicon : lexicons.values()) {
            rlexicon.addProperty(prootresource, rdataset);
        }

        // Third pass: cut out the term entries one by one and serialize each immediately.
        final String openTag = "<termEntry";
        final String closeTag = "</termEntry>";
        int count = 0;
        int errors = 0;
        FileInputStream inputStream = null;
        Scanner sc = null;
        try {
            inputStream = new FileInputStream(file);
            sc = new Scanner(inputStream, "UTF-8");
            final StringBuilder fragment = new StringBuilder();
            boolean inside = false;

            while (sc.hasNextLine()) {
                final String line = sc.nextLine();
                int pos = 0;
                // A single line may hold the end of one entry and the start of the next,
                // so the line is consumed piece by piece.
                while (true) {
                    if (!inside) {
                        final int start = line.indexOf(openTag, pos);
                        if (start == -1) {
                            break;
                        }
                        inside = true;
                        fragment.setLength(0);
                        pos = start;
                    }
                    final int end = line.indexOf(closeTag, pos);
                    if (end == -1) {
                        fragment.append(line, pos, line.length()).append('\n');
                        break;
                    }
                    // Keep the closing tag, drop whatever follows it.
                    final int stop = end + closeTag.length();
                    fragment.append(line, pos, stop).append('\n');
                    pos = stop;
                    inside = false;

                    count++;
                    if (!serializeTermEntryFragment(fragment.toString(), lexicons, fos, mappings)) {
                        errors++;
                    }
                    if (count % 1000 == 0) {
                        System.err.println("Total: " + count + " Errors: " + errors);
                    }
                }
            }

            //Now we serialize the lexicons
            RDFDataMgr.write(fos, handler.getLexiconsModel(), Lang.NTRIPLES);

            // note that Scanner suppresses exceptions
            if (sc.ioException() != null) {
                throw sc.ioException();
            }
        } catch (Exception e) {
            logger.error("Conversion of " + file + " was aborted", e);
        } finally {
            if (sc != null) {
                // Closing the scanner also closes the underlying stream.
                sc.close();
            } else if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        return null;
    }

    /**
     * Parses one termEntry fragment, converts it to RDF and writes it as N-Triples.
     *
     * @param xml text of a single termEntry element, including its closing tag
     * @param lexicons lexicon resources keyed by language
     * @param fos stream the N-Triples are written to
     * @param mappings mappings used while interpreting the TBX elements
     * @return true if the entry was written, false if it could not be parsed or converted
     */
    private boolean serializeTermEntryFragment(String xml, Map<String, Resource> lexicons,
            PrintStream fos, Mappings mappings) {
        try {
            final Document doc = loadXMLFromString(xml);
            final Element root = (doc == null) ? null : doc.getDocumentElement();
            if (root == null) {
                logger.warn("Skipping a term entry that is not well-formed XML");
                return false;
            }
            final Term term = processTermEntry(root, mappings);
            final Model model = ModelFactory.createDefaultModel();
            TBX.addPrefixesToModel(model);
            model.setNsPrefix("", Main.DATA_NAMESPACE);
            final Resource rterm = term.getRes(model);
            rterm.addProperty(RDF.type, SKOS.Concept);
            term.toRDF(model, rterm);
            for (LexicalEntry le : term.Lex_entries) {
                // The lexicon is selected by the language of the lexical entry itself.
                final Resource lexicon = lexicons.get(le.lang);
                if (lexicon != null) {
                    lexicon.addProperty(ONTOLEX.entry, le.getRes(model));
                } else {
                    logger.warn("No lexicon registered for language " + le.lang
                            + "; the entry is not linked to a lexicon");
                }
                le.toRDF(model, rterm);
            }
            RDFDataMgr.write(fos, model, Lang.NTRIPLES);
            return true;
        } catch (Exception e) {
            System.err.println("Error " + e.getMessage());
            return false;
        }
    }
    /**
     * Leniently parses an XML fragment into a DOM document.
     * External entities whose system identifier ends in ".dtd" are replaced by empty
     * content; every other entity is left to the parser's default resolution.
     *
     * @param xml the XML text to parse
     * @return the parsed document, or null if anything goes wrong while parsing
     */
    private static Document loadXMLFromString(String xml) throws Exception {
        // Resolver that substitutes an empty document for any referenced DTD.
        final EntityResolver dtdSkippingResolver = new EntityResolver() {
            @Override
            public InputSource resolveEntity(String publicId, String systemId)
                    throws SAXException, IOException {
                if (!systemId.endsWith(".dtd")) {
                    return null;
                }
                return new InputSource(new StringReader(""));
            }
        };

        Document parsed;
        try {
            final DocumentBuilder domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            domBuilder.setEntityResolver(dtdSkippingResolver);
            parsed = domBuilder.parse(new InputSource(new StringReader(xml)));
        } catch (Exception e) {
            parsed = null;
        }
        return parsed;
    }
    

    /**
     * Extracts the martifHeader element of a TBX file as text and parses only that fragment.
     *
     * The header is located by plain string search for {@code <martifHeader} and
     * {@code </martifHeader>}; reading stops as soon as the closing tag is found.
     * Text following the closing tag on the same line is not part of the fragment.
     *
     * @param file path to the input file
     * @param mappings mappings used while interpreting the header
     * @return the header, or null if it was not found or could not be read
     */
    public MartifHeader extractAndReadMartifHeader(String file, Mappings mappings)
    {
        final String openTag = "<martifHeader";
        final String closeTag = "</martifHeader>";
        MartifHeader martifheader = null;
        FileInputStream inputStream = null;
        Scanner sc = null;
        try {
            inputStream = new FileInputStream(file);
            sc = new Scanner(inputStream, "UTF-8");
            final StringBuilder xml = new StringBuilder();
            boolean inside = false;
            while (sc.hasNextLine()) {
                String line = sc.nextLine();
                if (!inside) {
                    final int start = line.indexOf(openTag);
                    if (start == -1) {
                        continue;
                    }
                    inside = true;
                    // Drop anything that precedes the opening tag on this line.
                    line = line.substring(start);
                }
                final int end = line.indexOf(closeTag);
                if (end == -1) {
                    xml.append(line).append('\n');
                    continue;
                }
                // Keep the closing tag, drop whatever follows it.
                xml.append(line, 0, end + closeTag.length()).append('\n');

                //We do a partial parsing of this XML fragment
                final Document doc = loadXMLFromString(xml.toString());
                if (doc == null) {
                    logger.warn("Could not parse well the general metadata (MartifHeader): "
                            + "the header fragment is not well-formed XML");
                } else {
                    martifheader = this.processMartifHeader(doc.getDocumentElement(), mappings);
                }
                break;
            }
        } catch (Exception e) {
            logger.warn("Could not parse well the general metadata (MartifHeader): " + e.getMessage());
        } finally {
            if (sc != null) {
                // Closing the scanner also closes the underlying stream.
                sc.close();
            } else if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        return martifheader;
    }
    
}