package edu.emory.clir.clearnlp.lexicon.dbpedia; import java.io.FileInputStream; import java.io.InputStream; import java.io.PrintStream; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.kohsuke.args4j.IllegalAnnotationError; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NodeList; import edu.emory.clir.clearnlp.util.IOUtils; import edu.emory.clir.clearnlp.util.XmlUtils; import edu.emory.clir.clearnlp.util.constant.StringConst; public class DBPediaOntologyExtractor implements DBPediaXML { public void extract(InputStream in, PrintStream out) throws Exception { DocumentBuilderFactory dFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = dFactory.newDocumentBuilder(); Document doc = builder.parse(in); NodeList classes = doc.getElementsByTagName(OWL_CLASS); int i, j, len, size = classes.getLength(); String rdfAbout, rdfResource; Element owlClass, subClass; NodeList subClasses; StringBuilder sb = new StringBuilder(); for (i=0; i<size; i++) { sb.setLength(0);; owlClass = (Element)classes.item(i); subClasses = owlClass.getElementsByTagName(RDFS_SUBCLASS_OF); len = subClasses.getLength(); rdfAbout = extractType(XmlUtils.getTrimmedAttribute(owlClass, RDF_ABOUT)); sb.append(extractType(rdfAbout)); for (j=0; j<len; j++) { subClass = (Element)subClasses.item(j); rdfResource = XmlUtils.getTrimmedAttribute(subClass, RDF_RESOURCE); if (rdfResource.startsWith(DBPEDIA_ORG_ONTOLOGY)) { sb.append(StringConst.TAB); sb.append(extractType(rdfResource)); } } if (rdfAbout.equals("Mayor")) sb.append("\tPolitician"); out.println(sb.toString()); } out.close(); } private String extractType(String url) { int idx = url.lastIndexOf(StringConst.FW_SLASH) + 1; if (idx >= url.length()) throw new IllegalAnnotationError(url); return url.substring(idx); } static public void main(String[] args) throws Exception { // args[0] = "dbpedia.owl"; new DBPediaOntologyExtractor().extract(new FileInputStream(args[0]), new PrintStream(IOUtils.createXZBufferedOutputStream(args[1]))); } }