/**
 * Copyright 2014 Marco Cornolti
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.unipi.di.acube.batframework.datasetPlugins;

import java.io.*;
import java.util.*;

import javax.xml.parsers.*;
import javax.xml.xpath.XPathExpressionException;

import org.w3c.dom.*;
import org.xml.sax.SAXException;

import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaApiInterface;

/**
 * The SMAPH dataset: web-search queries annotated with entity mentions. Each
 * query is an {@code <instance>} element whose {@code <annotation>} children
 * mark mention spans and carry ranked candidate Wikipedia titles in
 * {@code rank_N_title} attributes; text nodes between annotations are plain
 * query text.
 */
public class SMAPHDataset implements A2WDataset {
	private List<String> queries = new Vector<String>();
	private List<HashSet<Tag>> tags = new Vector<HashSet<Tag>>();
	private List<HashSet<Annotation>> annotations = new Vector<HashSet<Annotation>>();

	public SMAPHDataset(String xmlFile, WikipediaApiInterface api) {
		// Parse the dataset XML file into a DOM tree.
		File fXmlFile = new File(xmlFile);
		DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
		DocumentBuilder dBuilder;
		Document doc;
		try {
			dBuilder = dbFactory.newDocumentBuilder();
			doc = dBuilder.parse(fXmlFile);
		} catch (SAXException | IOException | ParserConfigurationException e) {
			throw new RuntimeException(e);
		}
		doc.getDocumentElement().normalize();

		// First pass: reconstruct each query string and record, for each
		// mention, the ranked list of candidate Wikipedia titles.
		List<HashMap<Mention, Vector<String>>> queryMenToTitles = new Vector<HashMap<Mention, Vector<String>>>();
		NodeList nList = doc.getElementsByTagName("instance");
		for (int i = 0; i < nList.getLength(); i++) {
			HashMap<Mention, Vector<String>> mentionToTitles = new HashMap<>();
			String query = "";
			Node instanceNode = nList.item(i);
			Element eElement = (Element) instanceNode;
			NodeList instElemList = eElement.getChildNodes();
			for (int j = 0; j < instElemList.getLength(); j++) {
				Node instElemNode = instElemList.item(j);
				if (instElemNode.getNodeType() == Node.ELEMENT_NODE) {
					if (!instElemNode.getNodeName().equals("annotation"))
						throw new RuntimeException(
								"Found internal node that is not an annotation.");
					// The annotation's text is part of the query; its offset
					// and length within the query define the mention.
					int pos = query.length();
					query += instElemNode.getTextContent();
					int len = query.length() - pos;
					Mention men = new Mention(pos, len);
					mentionToTitles.put(men, new Vector<String>());
					// Collect the candidate titles stored in the attributes
					// rank_0_title, rank_1_title, ... until one is missing.
					NamedNodeMap attrs = instElemNode.getAttributes();
					int h = 0;
					Node n = null;
					while ((n = attrs.getNamedItem(String.format(
							"rank_%d_title", h))) != null) {
						mentionToTitles.get(men).add(n.getTextContent());
						h++;
					}
				} else if (instElemNode.getNodeType() == Node.TEXT_NODE)
					// Plain text between annotations is part of the query.
					query += instElemNode.getTextContent();
			}
			queries.add(query);
			queryMenToTitles.add(mentionToTitles);
		}

		// Prefetch all referenced titles in a single batch to limit the
		// number of calls to the Wikipedia API.
		List<String> titlesToPrefetch = new Vector<String>();
		for (HashMap<Mention, Vector<String>> setS : queryMenToTitles)
			for (Vector<String> titles : setS.values())
				titlesToPrefetch.addAll(titles);
		try {
			api.prefetchTitles(titlesToPrefetch);
		} catch (XPathExpressionException | IOException
				| ParserConfigurationException | SAXException e) {
			throw new RuntimeException(e);
		}

		// Second pass: resolve titles to Wikipedia IDs. The top-ranked title
		// of each mention yields a gold annotation; all candidate titles of
		// all mentions yield gold tags.
		try {
			for (int i = 0; i < queryMenToTitles.size(); i++) {
				HashSet<Tag> qTags = new HashSet<Tag>();
				HashSet<Annotation> qAnns = new HashSet<Annotation>();
				HashMap<Mention, Vector<String>> menToTitles = queryMenToTitles
						.get(i);
				for (Mention m : menToTitles.keySet()) {
					String title = menToTitles.get(m).get(0);
					int id = api.getIdByTitle(title);
					if (id == -1)
						System.err.println("Error in dataset " + this.getName()
								+ ": Could not find wikipedia title: " + title);
					else
						qAnns.add(new Annotation(m.getPosition(),
								m.getLength(), id));
				}
				for (Vector<String> menTitles : queryMenToTitles.get(i)
						.values()) {
					for (String title : menTitles) {
						int id = api.getIdByTitle(title);
						if (id == -1)
							System.err.println("Error in dataset "
									+ this.getName()
									+ ": Could not find wikipedia title: "
									+ title);
						else
							qTags.add(new Tag(id));
					}
				}
				annotations.add(qAnns);
				tags.add(qTags);
			}
		} catch (DOMException | IOException e) {
			throw new RuntimeException(e);
		}

		// Sanity check: every query must have exactly one tag set and one
		// annotation set.
		if (queries.size() != tags.size() || tags.size() != annotations.size())
			throw new RuntimeException("Parsing error");
	}

	@Override
	public int getSize() {
		return queries.size();
	}

	@Override
	public String getName() {
		return "SMAPH";
	}

	@Override
	public List<String> getTextInstanceList() {
		return queries;
	}

	@Override
	public int getTagsCount() {
		int count = 0;
		for (HashSet<Tag> tagSet : tags)
			count += tagSet.size();
		return count;
	}

	@Override
	public List<HashSet<Tag>> getC2WGoldStandardList() {
		return tags;
	}

	@Override
	public List<HashSet<Mention>> getMentionsInstanceList() {
		return ProblemReduction.A2WToD2WMentionsInstance(this
				.getA2WGoldStandardList());
	}

	@Override
	public List<HashSet<Annotation>> getD2WGoldStandardList() {
		return annotations;
	}

	@Override
	public List<HashSet<Annotation>> getA2WGoldStandardList() {
		return annotations;
	}
}
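
/*
 * Usage sketch (not part of the original source): how this dataset plugin is
 * typically loaded. The XML path below is hypothetical, and the way a
 * WikipediaApiInterface instance is obtained varies across BAT-framework
 * versions, so it is left as a placeholder.
 *
 *   WikipediaApiInterface api = ...; // obtain/construct an API instance
 *   A2WDataset ds = new SMAPHDataset("path/to/smaph.xml", api);
 *   System.out.println(ds.getName() + ": " + ds.getSize() + " queries, "
 *           + ds.getTagsCount() + " gold tags.");
 */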