/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package edu.toronto.cs.xcurator.eval; import edu.toronto.cs.xcurator.utils.StrUtils; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.commons.io.IOUtils; import org.semanticweb.yars.nx.parser.NxParser; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** * * @author Amir */ public class GoldStandardGenerator { private static DocumentBuilder builder; Map<String, String> xmlTags; public GoldStandardGenerator() { xmlTags = new HashMap<>(); setupDocumentBuilder(); } static String xmlfile = "D:\\workspace\\bio2rdf-data\\data\\download\\drugbank\\drugbank.org.xml"; static String xmlout = "D:\\workspace\\bio2rdf-data\\data\\download\\drugbank\\drugbank.xml"; // static String xmlout = "D:\\workspace\\bio2rdf-data\\data\\download\\drugbank\\old\\drugbank.xml"; static String rdf = "D:\\workspace\\bio2rdf-data\\data\\rdf\\drugbank\\drugbank.nq"; // static String rdf = "D:\\workspace\\bio2rdf-data\\data\\rdf\\drugbank\\original\\drugbank.nq"; public static void generateXML() { GoldStandardGenerator gs = new GoldStandardGenerator(); gs.generateXMLUniqueValueFile(xmlfile, xmlout); } public static void main(String[] args) { GoldStandardGenerator gs = new GoldStandardGenerator(); // List<RDFEnt> result = gs.readRDFFile(rdf); // System.out.println(result.size()); // System.out.println(result); gs.generateMappingFromXMLAndRDF(xmlout, rdf); } public void generateMappingFromXMLAndRDF(String xmlfile, String rdffile) { InputStream inputStream = null; try { inputStream = new FileInputStream(new File(xmlfile)); String xml = IOUtils.toString(inputStream); final InputStream xmlInputStream = IOUtils.toInputStream(xml); Document dataDocument = createDocument(xmlInputStream); removeWhiteSpaceTextNodes(dataDocument); List<RDFEnt> rdfents = readRDFFile(rdffile); travelesXML(dataDocument.getDocumentElement()); System.out.println(xmlTags); final Map<String, Boolean> results = matchTags(xmlTags, rdfents); for (String key : results.keySet()) { System.out.println(key + "\t" + results.get(key)); } } catch (FileNotFoundException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } finally { try { inputStream.close(); } catch (IOException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } } } public void travelesXML(Node node) { // do something with the current node instead of System.out // System.out.println(node); // System.out.println("NodeName:" + node.getNodeName()); // System.out.println("NodeValue:" + node.getNodeValue()); // System.out.println("NodeType:" + node.getNodeType()); if (node.getNodeType() == Node.TEXT_NODE) { // System.out.println("TextContent:" + node.getTextContent()); // node.setTextContent(StrUtils.nextRandString()); // System.out.println(node.getNodeName() + " " + node.getNodeValue() + " " + node.getPrefix()); // System.out.println(node.getParentNode().getNodeName()); xmlTags.put(node.getTextContent(), node.getParentNode().getNodeName()); } NamedNodeMap attrs = node.getAttributes(); if (attrs != null) { for (int i = 0; i < attrs.getLength(); i++) { final Node attr = attrs.item(i); // attr.setTextContent(StrUtils.nextRandString()); // System.out.println(attr.getFirstChild()); // System.out.println(node.getNodeName()); xmlTags.put(attr.getTextContent(), attr.getNodeName()); } } NodeList nodeList = node.getChildNodes(); for (int i = 0; i < nodeList.getLength(); i++) { Node currentNode = nodeList.item(i); // if (currentNode.getNodeType() == Node.ELEMENT_NODE) { //calls this method for all the children which is Element travelesXML(currentNode); // } } } public void generateXMLUniqueValueFile(String xmlfile, String xmlout) { InputStream inputStream = null; try { inputStream = new FileInputStream(new File(xmlfile)); String xml = IOUtils.toString(inputStream); final InputStream xmlInputStream = IOUtils.toInputStream(xml); Document dataDocument = createDocument(xmlInputStream); removeWhiteSpaceTextNodes(dataDocument); modifyValues(dataDocument.getDocumentElement()); Transformer tf = TransformerFactory.newInstance().newTransformer(); tf.setOutputProperty(OutputKeys.INDENT, "yes"); tf.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); // tf.setOutputProperty(OutputKeys.INDENT, "yes"); // tf.setOutputProperty(OutputKeys.METHOD, "xml"); // tf.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4"); DOMSource domSource = new DOMSource(dataDocument); StreamResult sr = new StreamResult(new File(xmlout)); tf.transform(domSource, sr); } catch (FileNotFoundException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException | TransformerException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } finally { try { inputStream.close(); } catch (IOException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } } } public void modifyValues(Node node) { // do something with the current node instead of System.out // System.out.println(node); // System.out.println("NodeName:" + node.getNodeName()); // System.out.println("NodeValue:" + node.getNodeValue()); // System.out.println("NodeType:" + node.getNodeType()); if (node.getNodeType() == Node.TEXT_NODE) { // System.out.println("TextContent:" + node.getTextContent()); node.setTextContent(StrUtils.nextRandString()); } NamedNodeMap attrs = node.getAttributes(); if (attrs != null) { for (int i = 0; i < attrs.getLength(); i++) { final Node attr = attrs.item(i); attr.setTextContent(StrUtils.nextRandString()); } } NodeList nodeList = node.getChildNodes(); for (int i = 0; i < nodeList.getLength(); i++) { Node currentNode = nodeList.item(i); // if (currentNode.getNodeType() == Node.ELEMENT_NODE) { //calls this method for all the children which is Element modifyValues(currentNode); // } } } private Document createDocument(InputStream inputStream) { try { return builder.parse(inputStream); } catch (SAXException | IOException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } return null; } private void setupDocumentBuilder() { DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); builderFactory.setNamespaceAware(true); try { builder = builderFactory.newDocumentBuilder(); } catch (ParserConfigurationException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } } public void removeWhiteSpaceTextNodes(Document doc) { try { XPathFactory xpathFactory = XPathFactory.newInstance(); // XPath to find empty text nodes. XPathExpression xpathExp = xpathFactory.newXPath().compile( "//text()[normalize-space(.) = '']"); NodeList emptyTextNodes = (NodeList) xpathExp.evaluate(doc, XPathConstants.NODESET); // Remove each empty text node from document. for (int i = 0; i < emptyTextNodes.getLength(); i++) { Node emptyTextNode = emptyTextNodes.item(i); emptyTextNode.getParentNode().removeChild(emptyTextNode); } } catch (XPathExpressionException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } } private List<RDFEnt> readRDFFile(String rdffile) { List<RDFEnt> results = new ArrayList<>(); try { NxParser nxp = new NxParser(new FileInputStream(rdffile)); while (nxp.hasNext()) { final org.semanticweb.yars.nx.Node[] ns = nxp.next(); final RDFEnt rdfent = pruneRDFEnt(ns[0], ns[1], ns[2]); if (rdfent != null) { results.add(rdfent); } } } catch (FileNotFoundException ex) { Logger.getLogger(GoldStandardGenerator.class.getName()).log(Level.SEVERE, null, ex); } return results; } // List<String> exclude = Arrays.asList("http://bio2rdf.org/drugbank_resource:", "http://bio2rdf.org/bio2rdf_vocabulary:", "http://bio2rdf.org/drugbank_vocabulary:", "http://bio2rdf.org/drugbank:", "http://bio2rdf.org/cas:", "http://bio2rdf.org/cas_vocabulary:"); private String pruneSubject(String subject) { // for (String str : exclude) { // if (subject.startsWith(str)) { // subject = subject.replace(str, ""); // break; // } // } if (subject.startsWith("http://")) { subject = subject.replaceFirst("http://", ""); } if (subject.contains(":")) { final int pos = subject.indexOf(":"); return subject.substring(pos + 1); } else { return subject; } } // private boolean isValidGStr(String str) { // if (str.length() == 32) { // return true; // } // return false; // } private Map<String, Boolean> matchTags(Map<String, String> xmlTags, List<RDFEnt> rdfents) { Map<String, Boolean> results = new HashMap<String, Boolean>(); for (RDFEnt ent : rdfents) { String subjectType = findMatch(ent.subject, xmlTags); String objectType = findMatch(ent.object, xmlTags); if (subjectType != null && objectType != null && !subjectType.equals(objectType)) { // System.out.println(ent.subject + "\t" + ent.object); // System.out.println(subjectType + "\t" + objectType + "\t" + ent.isAttr); results.put(subjectType + "\t" + objectType, ent.isAttr); // System.out.println(); } } return results; } private String findMatch(String subject, Map<String, String> xmlTags) { for (String key : xmlTags.keySet()) { if (subject.contains(key)) { return xmlTags.get(key); } } return null; } class RDFEnt { // String key; String subject; String object; boolean isAttr; } private RDFEnt pruneRDFEnt(org.semanticweb.yars.nx.Node subject, org.semanticweb.yars.nx.Node predicate, org.semanticweb.yars.nx.Node object) { RDFEnt ent = new RDFEnt(); String subjectLabel = subject.getLabel(); subjectLabel = pruneSubject(subjectLabel); String objectStr = object + ""; String objectLabel = object.getLabel(); ent.subject = subjectLabel; ent.object = objectLabel; ent.isAttr = false; if (objectStr.endsWith("@en") || objectStr.endsWith("#string>")) { ent.isAttr = true; } // System.out.println(subject + "\t" + predicate + "\t" + object + "\t"); // // System.out.println(subjectLabel + "\t" + predicate + "\t" + objectLabel + "\t"); // System.out.println(); // if (isValidGStr(subjectLabel) && isValidGStr(objectLabel)) { // ent.key = subjectLabel + "_" + objectLabel; // } return ent; } }