/**
* Copyright (c) 2014, the LESK-WSD-DSM AUTHORS.
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* Neither the name of the University of Bari nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
*
*/
package di.uniba.it.wsd;
import di.uniba.it.wsd.data.TextReader;
import di.uniba.it.wsd.data.POSenum;
import di.uniba.it.wsd.data.Token;
import java.io.File;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Text reader for the old SemEval XML format.
 * <p>
 * The input file is parsed into a DOM document by {@link #openTextReader()}
 * and its <code>sentence</code> elements are converted into lists of
 * {@link Token}. How sentences are grouped into passages depends on the
 * reader mode:
 * <ul>
 * <li>{@link TextReader#SENTENCE_MODE}: one passage per <code>sentence</code>
 * element;</li>
 * <li>{@link TextReader#TEXT_MODE}: one passage per <code>text</code> element,
 * built from its direct <code>sentence</code> children;</li>
 * <li>{@link TextReader#DOC_MODE}: a single passage holding every sentence of
 * the document.</li>
 * </ul>
 * Passages are then handed out one at a time by {@link #getTokenList()}.
 *
 * @author Pierpaolo Basile pierpaolo.basile@gmail.com
 */
public class OldXMLTextReader implements TextReader {

    private final File inputFile;

    private Document doc;

    private List<List<Token>> passages;

    private int pointer = 0;

    private int mode = TextReader.SENTENCE_MODE;

    /**
     * Creates a reader over the given file. Nothing is read until
     * {@link #openTextReader()} is called.
     *
     * @param inputFile the XML file to read
     * @param mode one of {@link TextReader#SENTENCE_MODE},
     * {@link TextReader#TEXT_MODE} or {@link TextReader#DOC_MODE}; any other
     * value makes {@link #openTextReader()} fail
     */
    public OldXMLTextReader(File inputFile, int mode) {
        this.inputFile = inputFile;
        this.mode = mode;
    }

    /**
     * Maps the value of the <code>pos</code> attribute to a {@link POSenum}.
     *
     * @param posTag the raw attribute value
     * @return the matching part of speech, or {@link POSenum#OTHER} when the
     * tag is not one of "n", "v", "a", "r"
     */
    private static POSenum toPos(String posTag) {
        switch (posTag) {
            case "n":
                return POSenum.NOUN;
            case "v":
                return POSenum.VERB;
            case "a":
                return POSenum.ADJ;
            case "r":
                return POSenum.ADV;
            default:
                return POSenum.OTHER;
        }
    }

    /**
     * Converts the children of a <code>sentence</code> node into tokens.
     * <p>
     * Each <code>instance</code> child yields one token built from its text
     * content and its <code>lemma</code>, <code>pos</code> and <code>id</code>
     * attributes. Each text node is split on newlines and every non-empty,
     * trimmed line yields one token whose lemma equals its surface form and
     * whose part of speech is {@link POSenum#OTHER}. Any other child is
     * ignored. Positions are 1-based and restart for every sentence.
     *
     * @param sentence the <code>sentence</code> node
     * @return the tokens of the sentence in document order
     * @throws Exception if a node cannot be read (note: an
     * <code>instance</code> lacking one of the attributes above currently
     * fails with a NullPointerException)
     */
    private List<Token> loadTokenFromSentence(Node sentence) throws Exception {
        List<Token> list = new ArrayList<>();
        int position = 0;
        NodeList children = sentence.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node item = children.item(i);
            if (item.getNodeName().equals("instance")) {
                String token = item.getTextContent();
                String lemma = item.getAttributes().getNamedItem("lemma").getNodeValue();
                String posTag = item.getAttributes().getNamedItem("pos").getNodeValue();
                String id = item.getAttributes().getNamedItem("id").getNodeValue();
                position++;
                // NOTE(review): the trailing boolean presumably marks the token as a
                // disambiguation target - confirm against the Token constructor.
                list.add(new Token(token, lemma, toPos(posTag), position, id, true));
            } else if (item.getNodeType() == Node.TEXT_NODE) {
                // Plain text between instances: one token per non-empty line.
                String[] lines = item.getTextContent().split("\n+");
                for (String rawLine : lines) {
                    String line = rawLine.trim();
                    if (line.length() > 0) {
                        position++;
                        list.add(new Token(line, line, POSenum.OTHER, position, false));
                    }
                }
            }
        }
        return list;
    }

    /**
     * Parses the input file and builds the passages according to the reader
     * mode. Calling this method again re-reads the file and restarts the
     * iteration from the first passage.
     *
     * @throws Exception if the file cannot be parsed or the reader mode is
     * not one of the supported values
     */
    @Override
    public void openTextReader() throws Exception {
        DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        doc = db.parse(inputFile);
        passages = new ArrayList<>();
        pointer = 0;
        if (mode == TextReader.SENTENCE_MODE) {
            NodeList sentences = doc.getElementsByTagName("sentence");
            for (int i = 0; i < sentences.getLength(); i++) {
                passages.add(loadTokenFromSentence(sentences.item(i)));
            }
        } else if (mode == TextReader.TEXT_MODE) {
            NodeList textNodes = doc.getElementsByTagName("text");
            for (int i = 0; i < textNodes.getLength(); i++) {
                List<Token> passage = new ArrayList<>();
                passages.add(passage);
                // Only direct <sentence> children of the <text> element are used.
                NodeList childNodes = textNodes.item(i).getChildNodes();
                for (int j = 0; j < childNodes.getLength(); j++) {
                    Node child = childNodes.item(j);
                    if (child.getNodeType() == Node.ELEMENT_NODE
                            && child.getNodeName().equals("sentence")) {
                        passage.addAll(loadTokenFromSentence(child));
                    }
                }
            }
        } else if (mode == TextReader.DOC_MODE) {
            List<Token> passage = new ArrayList<>();
            passages.add(passage);
            NodeList sentences = doc.getElementsByTagName("sentence");
            for (int i = 0; i < sentences.getLength(); i++) {
                passage.addAll(loadTokenFromSentence(sentences.item(i)));
            }
        } else {
            throw new Exception("No valid reader mode");
        }
    }

    /**
     * Does nothing: the whole document is loaded in memory by
     * {@link #openTextReader()} and no resource is kept open.
     *
     * @throws Exception never thrown by this implementation
     */
    @Override
    public void closeTextReader() throws Exception {
    }

    /**
     * Returns the next passage and advances the internal pointer.
     *
     * @return the tokens of the next passage, or <code>null</code> when all
     * passages have been returned
     * @throws Exception if the reader has not been opened yet
     */
    @Override
    public List<Token> getTokenList() throws Exception {
        if (passages == null) {
            throw new IllegalStateException("Reader not opened: call openTextReader() first");
        }
        List<Token> tokens = null;
        if (pointer < passages.size()) {
            tokens = passages.get(pointer);
            pointer++;
        }
        return tokens;
    }

    /**
     * Writes the next passage to the given writer, one token per line, using
     * {@link Token#print()}. The passage is consumed, exactly as if
     * {@link #getTokenList()} had been called once. Nothing is written when
     * no passage is left.
     *
     * @param writer the destination writer
     * @throws Exception if the reader has not been opened or writing fails
     */
    public void write(Writer writer) throws Exception {
        // Fetch the passage once: every getTokenList() call moves to the next passage.
        List<Token> tokens = this.getTokenList();
        if (tokens == null) {
            return;
        }
        for (Token token : tokens) {
            writer.write(token.print());
            writer.write("\n");
        }
    }
}