/**
 * Copyright 2014 Marco Cornolti
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.unipi.di.acube.batframework.datasetPlugins;

import java.io.*;
import java.util.*;

import javax.xml.parsers.*;
import javax.xml.xpath.XPathExpressionException;

import org.w3c.dom.*;
import org.xml.sax.SAXException;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaApiInterface;

/**
 * The SMAPH dataset: web-search queries annotated with entity mentions. Each
 * query is an {@code <instance>} element whose {@code <annotation>} children
 * mark mention spans and carry ranked candidate Wikipedia titles in
 * {@code rank_N_title} attributes; text nodes between annotations are plain
 * query text.
 */
public class SMAPHDataset implements A2WDataset {
	private List<String> queries = new Vector<String>();
	private List<HashSet<Tag>> tags = new Vector<HashSet<Tag>>();
	private List<HashSet<Annotation>> annotations = new Vector<HashSet<Annotation>>();

	public SMAPHDataset(String xmlFile, WikipediaApiInterface api) {
		// Parse the dataset XML file into a DOM tree.
		File fXmlFile = new File(xmlFile);
		DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
		DocumentBuilder dBuilder;
		Document doc;
		try {
			dBuilder = dbFactory.newDocumentBuilder();
			doc = dBuilder.parse(fXmlFile);
		} catch (SAXException | IOException | ParserConfigurationException e) {
			throw new RuntimeException(e);
		}
		doc.getDocumentElement().normalize();

		// First pass: reconstruct each query string and record, for each
		// mention, the ranked list of candidate Wikipedia titles.
		List<HashMap<Mention, Vector<String>>> queryMenToTitles = new Vector<HashMap<Mention, Vector<String>>>();
		NodeList nList = doc.getElementsByTagName("instance");
		for (int i = 0; i < nList.getLength(); i++) {
			HashMap<Mention, Vector<String>> mentionToTitles = new HashMap<>();
			String query = "";
			Node instanceNode = nList.item(i);
			Element eElement = (Element) instanceNode;
			NodeList instElemList = eElement.getChildNodes();
			for (int j = 0; j < instElemList.getLength(); j++) {
				Node instElemNode = instElemList.item(j);
				if (instElemNode.getNodeType() == Node.ELEMENT_NODE) {
					if (!instElemNode.getNodeName().equals("annotation"))
						throw new RuntimeException(
								"Found internal node that is not an annotation.");
					// The annotation's text is part of the query; its offset
					// and length within the query define the mention.
					int pos = query.length();
					query += instElemNode.getTextContent();
					int len = query.length() - pos;
					Mention men = new Mention(pos, len);
					mentionToTitles.put(men, new Vector<String>());
					// Collect the candidate titles stored in the attributes
					// rank_0_title, rank_1_title, ... until one is missing.
					NamedNodeMap attrs = instElemNode.getAttributes();
					int h = 0;
					Node n = null;
					while ((n = attrs.getNamedItem(String.format(
							"rank_%d_title", h))) != null) {
						mentionToTitles.get(men).add(n.getTextContent());
						h++;
					}
				} else if (instElemNode.getNodeType() == Node.TEXT_NODE)
					// Plain text between annotations is part of the query.
					query += instElemNode.getTextContent();
			}
			queries.add(query);
			queryMenToTitles.add(mentionToTitles);
		}

		// Prefetch all referenced titles in a single batch to limit the
		// number of calls to the Wikipedia API.
		List<String> titlesToPrefetch = new Vector<String>();
		for (HashMap<Mention, Vector<String>> setS : queryMenToTitles)
			for (Vector<String> titles : setS.values())
				titlesToPrefetch.addAll(titles);
		try {
			api.prefetchTitles(titlesToPrefetch);
		} catch (XPathExpressionException | IOException
				| ParserConfigurationException | SAXException e) {
			throw new RuntimeException(e);
		}

		// Second pass: resolve titles to Wikipedia IDs. The top-ranked title
		// of each mention yields a gold annotation; all candidate titles of
		// all mentions yield gold tags.
		try {
			for (int i = 0; i < queryMenToTitles.size(); i++) {
				HashSet<Tag> qTags = new HashSet<Tag>();
				HashSet<Annotation> qAnns = new HashSet<Annotation>();
				HashMap<Mention, Vector<String>> menToTitles = queryMenToTitles
						.get(i);
				for (Mention m : menToTitles.keySet()) {
					String title = menToTitles.get(m).get(0);
					int id = api.getIdByTitle(title);
					if (id == -1)
						System.err.println("Error in dataset " + this.getName()
								+ ": Could not find wikipedia title: " + title);
					else
						qAnns.add(new Annotation(m.getPosition(),
								m.getLength(), id));
				}
				for (Vector<String> menTitles : queryMenToTitles.get(i)
						.values()) {
					for (String title : menTitles) {
						int id = api.getIdByTitle(title);
						if (id == -1)
							System.err.println("Error in dataset "
									+ this.getName()
									+ ": Could not find wikipedia title: "
									+ title);
						else
							qTags.add(new Tag(id));
					}
				}
				annotations.add(qAnns);
				tags.add(qTags);
			}
		} catch (DOMException | IOException e) {
			throw new RuntimeException(e);
		}

		// Sanity check: every query must have exactly one tag set and one
		// annotation set.
		if (queries.size() != tags.size() || tags.size() != annotations.size())
			throw new RuntimeException("Parsing error");
	}

	@Override
	public int getSize() {
		return queries.size();
	}

	@Override
	public String getName() {
		return "SMAPH";
	}

	@Override
	public List<String> getTextInstanceList() {
		return queries;
	}

	@Override
	public int getTagsCount() {
		int count = 0;
		for (HashSet<Tag> tagSet : tags)
			count += tagSet.size();
		return count;
	}

	@Override
	public List<HashSet<Tag>> getC2WGoldStandardList() {
		return tags;
	}

	@Override
	public List<HashSet<Mention>> getMentionsInstanceList() {
		return ProblemReduction.A2WToD2WMentionsInstance(this
				.getA2WGoldStandardList());
	}

	@Override
	public List<HashSet<Annotation>> getD2WGoldStandardList() {
		return annotations;
	}

	@Override
	public List<HashSet<Annotation>> getA2WGoldStandardList() {
		return annotations;
	}
}
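
/*
 * Usage sketch (not part of the original source): how this dataset plugin is
 * typically loaded. The XML path below is hypothetical, and the way a
 * WikipediaApiInterface instance is obtained varies across BAT-framework
 * versions, so it is left as a placeholder.
 *
 *   WikipediaApiInterface api = ...; // obtain/construct an API instance
 *   A2WDataset ds = new SMAPHDataset("path/to/smaph.xml", api);
 *   System.out.println(ds.getName() + ": " + ds.getSize() + " queries, "
 *           + ds.getTagsCount() + " gold tags.");
 */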