/******************************************************************************* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.lmf.transform.ontowiktionary; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TreeMap; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import de.tudarmstadt.ukp.jwktl.api.RelationType; import de.tudarmstadt.ukp.jwktl.api.util.ILanguage; public class OntoWiktionary { protected File directory; protected ILanguage language; protected Map<String, List<OntoWiktionarySemanticRelation>> semanticRelations; protected List<OntoWiktionaryConcept> concepts; public OntoWiktionary(final File directory, final ILanguage language) { this.directory = directory; this.language = language; } public List<OntoWiktionarySemanticRelation> getSemanticRelations( final String senseId) throws IOException { if (semanticRelations != null) return semanticRelations.get(senseId); File dataFile = new File(directory, "semantic_relations_" + language.getISO639_1() + ".tsv"); semanticRelations = new TreeMap<String, List<OntoWiktionarySemanticRelation>>(); BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(dataFile), StandardCharsets.UTF_8)); try { int lineNo = 1; String line = reader.readLine(); // skip first line while ((line = reader.readLine()) != null) { int idx = line.indexOf('\t'); if (idx < 0) throw new RuntimeException("Invalid file format at " + dataFile.getName() + " line " + lineNo++); String sourceId = line.substring(0, idx); line = line.substring(idx + 1); idx = line.indexOf('\t'); if (idx < 0) throw new RuntimeException("Invalid file format at " + dataFile.getName() + " line " + lineNo++); line = line.substring(idx + 1); idx = line.indexOf('\t'); if (idx < 0) throw new RuntimeException("Invalid file format at " + dataFile.getName() + " line " + lineNo++); String relationTypeStr = line.substring(0, idx); line = line.substring(idx + 1); idx = line.indexOf('\t'); if (idx < 0) throw new RuntimeException("Invalid file format at " + dataFile.getName() + " line " + lineNo++); String targetId = line.substring(0, idx); String target = line.substring(idx + 1); List<OntoWiktionarySemanticRelation> relations = semanticRelations.get(sourceId); if (relations == null) { relations = new ArrayList<OntoWiktionarySemanticRelation>(); semanticRelations.put(sourceId, relations); } RelationType relationType = null; if ("HAS_SYN".equals(relationTypeStr)) relationType = RelationType.SYNONYM; else if ("HAS_ANT".equals(relationTypeStr)) relationType = RelationType.ANTONYM; else if ("HAS_HYPER".equals(relationTypeStr)) relationType = RelationType.HYPERNYM; else if ("HAS_HYPO".equals(relationTypeStr)) relationType = RelationType.HYPONYM; else if ("HAS_HOLO".equals(relationTypeStr)) relationType = RelationType.HOLONYM; else if ("HAS_MERO".equals(relationTypeStr)) relationType = RelationType.MERONYM; else if ("CHARACTERISTIC_WORD".equals(relationTypeStr)) relationType = RelationType.CHARACTERISTIC_WORD_COMBINATION; else if ("DERIVED_TERM".equals(relationTypeStr)) relationType = RelationType.DERIVED_TERM; else if ("ETYMOLOGICALLY_RELA".equals(relationTypeStr)) relationType = RelationType.ETYMOLOGICALLY_RELATED_TERM; else if ("COORDINATE_TERM".equals(relationTypeStr)) relationType = RelationType.COORDINATE_TERM; else if ("TROPONYM".equals(relationTypeStr)) relationType = RelationType.TROPONYM; else if ("DESCENDANT".equals(relationTypeStr)) relationType = RelationType.DESCENDANT; else if ("SEE_ALSO".equals(relationTypeStr)) relationType = RelationType.SEE_ALSO; else System.out.println("UNKNOWN RELATION TYPE: " + relationTypeStr); relations.add(new OntoWiktionarySemanticRelation(sourceId, relationType, targetId, target)); } } finally { reader.close(); } return semanticRelations.get(senseId); } public void freeSemanticRelations() { if (semanticRelations != null) { semanticRelations.clear(); semanticRelations = null; } } public List<OntoWiktionaryConcept> getConcepts() throws IOException, ParserConfigurationException, SAXException { if (concepts != null) return concepts; concepts = new ArrayList<OntoWiktionaryConcept>(); InputStream in = new FileInputStream(new File(directory, "OntoWiktionary_" + language.getISO639_1().toUpperCase() + ".xml")); try { // Run the SAX parser. SAXParserFactory factory = SAXParserFactory.newInstance(); SAXParser saxParser = factory.newSAXParser(); saxParser.parse(in, new DefaultHandler(){ protected OntoWiktionaryConcept currentConcept; @Override public void startElement(final String uri, final String localName, final String qName, final Attributes attributes) throws SAXException { if ("Concept".equals(qName)) { currentConcept = new OntoWiktionaryConcept(attributes.getValue("id")); concepts.add(currentConcept); } else if ("Lexicalization".equals(qName)) currentConcept.addLexicalization(attributes.getValue("id")); else if ("Subsumes".equals(qName)) currentConcept.addSubsumesRelation(attributes.getValue("target_id")); else if ("SubsumedBy".equals(qName)) currentConcept.addSubsumedByRelation(attributes.getValue("target_id")); else if ("RelatedConcept".equals(qName)) currentConcept.addRelatedConcept(attributes.getValue("target_id")); } }); } finally { in.close(); } return concepts; } public Iterable<OntoWiktionaryConcept> getStreamedConcepts() throws IOException, ParserConfigurationException, SAXException { return new Iterable<OntoWiktionaryConcept>() { @Override public Iterator<OntoWiktionaryConcept> iterator() { try { final InputStream in = new FileInputStream(new File(directory, "OntoWiktionary_" + language.getISO639_1().toUpperCase() + ".xml")); final XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(in); return new Iterator<OntoWiktionaryConcept>() { protected boolean atStartPosition = true; @Override public boolean hasNext() { try { return parser.hasNext(); } catch (XMLStreamException e) { throw new RuntimeException(e); } } @Override public OntoWiktionaryConcept next() { try { // Find the first concept node. if (atStartPosition) { while (parser.hasNext() && (parser.getEventType() != XMLStreamConstants.START_ELEMENT || !"Concept".equals(parser.getLocalName()))) parser.next(); atStartPosition = false; } if (!parser.hasNext()) { in.close(); return null; } OntoWiktionaryConcept currentConcept = new OntoWiktionaryConcept(parser.getAttributeValue(null, "id")); // Iterate over all elements before the concept node's end element. while (parser.hasNext() && (parser.getEventType() != XMLStreamConstants.END_ELEMENT || !"Concept".equals(parser.getLocalName()))) { if (parser.getEventType() == XMLStreamConstants.START_ELEMENT) { String qName = parser.getLocalName(); if ("Lexicalization".equals(qName)) currentConcept.addLexicalization(parser.getAttributeValue(null, "id")); else if ("Subsumes".equals(qName)) currentConcept.addSubsumesRelation(parser.getAttributeValue(null, "target_id")); else if ("SubsumedBy".equals(qName)) currentConcept.addSubsumedByRelation(parser.getAttributeValue(null, "target_id")); else if ("RelatedConcept".equals(qName)) currentConcept.addRelatedConcept(parser.getAttributeValue(null, "target_id")); } parser.next(); } // Find the next concept node. while (parser.hasNext() && (parser.getEventType() != XMLStreamConstants.START_ELEMENT || !"Concept".equals(parser.getLocalName()))) parser.next(); return currentConcept; } catch (Exception e) { throw new RuntimeException(e); } } @Override public void remove() { throw new UnsupportedOperationException("Iterator<OntoWiktionaryConcept>.remove()"); } }; } catch (Exception e) { throw new RuntimeException(e); } } }; } public void freeConcepts() { if (concepts != null) { concepts.clear(); concepts = null; } } }