/**
* Copyright 2013 MIR@MU Project
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package cz.muni.fi.mir.mathmlcanonicalization.modules;
import cz.muni.fi.mir.mathmlcanonicalization.Settings;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;
/**
* Remove useless elements and attributes from MathML.
*
* <div class="simpleTagLabel">Input:</div><ul>
* <li>Well-formed MathML, first module</li>
* <li>Property file(s) with names of elements and attributes for removal or
* preservation</li></ul>
* <div class="simpleTagLabel">Output:</div>
* The original code with:<ul>
* <li>removed elements insignificant for the formula searching and indexing
* purpose (e.g. spacing and appearance altering tags) including the content
* between open and close tag or preserving it (depending on the tag)</li>
* <li>removed useless attributes but preserved those that are used in other
* modules, e.g. separator attribute in mfenced element</li>
* <li>removed attributes with default values?</li>
* <li>removed redundant spaces?</li></ul>
*
* @author Maros Kucbel
*/
public class ElementMinimizer extends AbstractModule implements StreamModule {

    /** Encoding of the produced bytes; the XML declaration must announce the same one. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /** XML version declared in the output when the input has no XML declaration. */
    private static final String FALLBACK_XML_VERSION = "1.0";

    /** Local names of elements dropped together with everything inside them ("remove_all"). */
    private Set<String> removeWithChildren;

    /** Local names of elements whose tags are dropped but whose content is kept ("remove"). */
    private Set<String> removeKeepChildren;

    /**
     * Declares the properties this module reads: the two removal lists and the
     * global and per-element attribute whitelists.
     */
    public ElementMinimizer() {
        declareProperty("remove_all");
        declareProperty("remove");
        declareProperty("keepAttributes");
        declareProperty("keepAttributes.mfrac");
        declareProperty("keepAttributes.cn");
        declareProperty("keepAttributes.ci");
        declareProperty("keepAttributes.set");
        declareProperty("keepAttributes.tendsto");
        declareProperty("keepAttributes.interval");
        declareProperty("keepAttributes.declare");
        declareProperty("keepAttributes.mfenced");
    }

    /**
     * Copies the XML document from {@code input} into a new in-memory stream,
     * leaving out the elements and attributes selected by the module properties.
     * <p>
     * The removal lists are re-read from the properties and stored in instance
     * fields on every call.
     *
     * @param input the XML document to minimize
     * @return the minimized document, encoded in UTF-8
     * @throws NullPointerException if {@code input} is null
     * @throws ModuleException if the XML stream cannot be read or written
     */
    @Override
    public ByteArrayOutputStream execute(final InputStream input) throws ModuleException {
        if (input == null) {
            throw new NullPointerException("input");
        }
        removeWithChildren = getPropertySet("remove_all");
        removeKeepChildren = getPropertySet("remove");
        final ByteArrayOutputStream output = new ByteArrayOutputStream();
        try {
            minimizeElements(input, output);
        } catch (XMLStreamException ex) {
            Logger.getLogger(this.getClass().getName()).log(
                    Level.SEVERE, "error while parsing the input file. ", ex);
            throw new ModuleException("Error while parsing the input file", ex);
        }
        return output;
    }

    /**
     * Builds the space-separated attribute whitelist that applies to the given
     * element: the global "keepAttributes" property plus, when it is set, the
     * element specific "keepAttributes.&lt;name&gt;" property.
     */
    private String[] attributeWhitelist(final String name) {
        String property = getProperty("keepAttributes");
        final String elementPropertyName = "keepAttributes." + name;
        if (isProperty(elementPropertyName)) {
            property += " " + getProperty(elementPropertyName);
        }
        return property.split(" ");
    }

    /**
     * Decides which attributes to keep based on keepAttributes properties.
     *
     * @param name local name of the element carrying the attribute
     * @param attributeName local name of the attribute
     * @param attributeValue value of the attribute
     * @return true if the attribute is whitelisted for the element
     */
    private boolean keepAttribute(final String name, final String attributeName,
            final String attributeValue) {
        assert name != null && attributeName != null && attributeValue != null;
        assert !name.isEmpty() && !attributeName.isEmpty();
        return isWhitelisted(attributeWhitelist(name), attributeName, attributeValue);
    }

    /**
     * Checks an attribute against whitelist entries. An entry is either a bare
     * attribute name (any value is kept) or {@code name=value} (kept only with
     * exactly that value).
     */
    private static boolean isWhitelisted(final String[] whitelist, final String attributeName,
            final String attributeValue) {
        for (final String entry : whitelist) {
            if (attributeName.equals(entry)) {
                return true;
            }
            // An attribute name cannot contain '=', so the first '=' separates
            // the name from the value; the value itself may contain further '='.
            final int separator = entry.indexOf('=');
            if (separator >= 0
                    && attributeName.equals(entry.substring(0, separator))
                    && attributeValue.equals(entry.substring(separator + 1))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Streams the document from {@code input} to {@code outputStream}, filtering
     * elements and attributes on the way. The StAX reader and writer are always
     * closed; closing them does not close the underlying streams.
     */
    private void minimizeElements(final InputStream input, final OutputStream outputStream)
            throws XMLStreamException {
        assert input != null && outputStream != null;
        // TODO: add logging
        final XMLInputFactory inputFactory = Settings.setupXMLInputFactory();
        final XMLOutputFactory outputFactory = Settings.xmlOutputFactory();
        final XMLStreamReader reader = inputFactory.createXMLStreamReader(input);
        try {
            final XMLStreamWriter writer
                    = outputFactory.createXMLStreamWriter(outputStream, OUTPUT_ENCODING);
            try {
                // The bytes are always written in OUTPUT_ENCODING, so that is what
                // the declaration has to say, whatever encoding the input used.
                // The version is null when the input has no XML declaration.
                final String version = reader.getVersion();
                writer.writeStartDocument(OUTPUT_ENCODING,
                        version == null ? FALLBACK_XML_VERSION : version);
                copyFiltered(reader, writer);
                writer.flush();
            } finally {
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Reads all events from {@code reader} and writes the kept ones to
     * {@code writer}. Filtering applies only inside {@code math} elements;
     * everything outside of them is copied with all attributes.
     */
    private void copyFiltered(final XMLStreamReader reader, final XMLStreamWriter writer)
            throws XMLStreamException {
        // number of currently open elements that are removed with their children
        int removedDepth = 0;
        // number of currently open math elements; a counter (not a flag) so that
        // a nested math element does not end the filtering of the outer one
        int mathDepth = 0;
        while (reader.hasNext()) {
            final int event = reader.next();
            switch (event) {
                case XMLStreamConstants.START_ELEMENT: {
                    final String name = reader.getLocalName();
                    if (name.equals(MATH)) {
                        mathDepth++;
                    }
                    final boolean insideMath = mathDepth > 0;
                    if (insideMath) {
                        if (removeKeepChildren.contains(name)) {
                            // drop the tag only, children are processed normally
                            break;
                        }
                        if (removeWithChildren.contains(name)) {
                            removedDepth++;
                        }
                        if (removedDepth > 0) {
                            // the element is removed or lies inside a removed one
                            break;
                        }
                    }
                    writeStartElement(reader, writer, name, insideMath);
                    break;
                }
                case XMLStreamConstants.END_ELEMENT: {
                    final String name = reader.getLocalName();
                    // the closing math tag itself still counts as inside math
                    final boolean insideMath = mathDepth > 0;
                    if (insideMath) {
                        if (name.equals(MATH)) {
                            mathDepth--;
                        }
                        if (removeKeepChildren.contains(name)) {
                            break;
                        }
                        if (removedDepth > 0) {
                            if (removeWithChildren.contains(name)) {
                                removedDepth--;
                            }
                            break;
                        }
                    }
                    writer.writeEndElement();
                    break;
                }
                // CDATA (when the parser reports it as a separate event) is text
                // content as well; it is written as escaped character data.
                case XMLStreamConstants.CDATA:
                case XMLStreamConstants.CHARACTERS: {
                    // warning: white space is counted as CHARACTER event (new line after element)
                    if (removedDepth == 0) {
                        writer.writeCharacters(reader.getText());
                    }
                    break;
                }
                case XMLStreamConstants.END_DOCUMENT: {
                    writer.writeEndDocument();
                    break;
                }
                case XMLStreamConstants.DTD: {
                    writer.writeDTD(reader.getText());
                    break;
                }
                default: {
                    // other events (comments, processing instructions, ...) are not copied
                    break;
                }
            }
        }
    }

    /**
     * Writes the start tag of the element the reader is positioned on, followed
     * by its attributes and namespace declarations. Inside math only whitelisted
     * attributes are written; outside math all of them are.
     */
    private void writeStartElement(final XMLStreamReader reader, final XMLStreamWriter writer,
            final String name, final boolean insideMath) throws XMLStreamException {
        writer.writeStartElement(reader.getName().getPrefix(), name,
                reader.getName().getNamespaceURI());
        final int attributeCount = reader.getAttributeCount();
        // the whitelist depends on the element only, resolve it once per element
        final String[] whitelist
                = insideMath && attributeCount > 0 ? attributeWhitelist(name) : null;
        for (int index = 0; index < attributeCount; ++index) {
            final String attributeName = reader.getAttributeLocalName(index);
            final String attributeValue = reader.getAttributeValue(index);
            if (insideMath && !isWhitelisted(whitelist, attributeName, attributeValue)) {
                continue;
            }
            final String attributeNamespace = reader.getAttributeNamespace(index);
            // parsers may report "no namespace" either as null or as an empty string
            if (attributeNamespace == null || attributeNamespace.isEmpty()) {
                writer.writeAttribute(attributeName, attributeValue);
            } else {
                writer.writeAttribute(reader.getAttributePrefix(index), attributeNamespace,
                        attributeName, attributeValue);
            }
        }
        for (int index = 0; index < reader.getNamespaceCount(); ++index) {
            writer.writeNamespace(reader.getNamespacePrefix(index), reader.getNamespaceURI(index));
        }
    }
}