package org.exist.storage.statistics;
import org.exist.Namespaces;
import org.exist.dom.QName;
import org.exist.dom.SymbolTable;
import org.exist.storage.ElementValue;
import org.exist.storage.NodePath;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
/**
* Collects statistics about the distribution of elements in a document or
* even the entire database. The class creates a graph structure which describes
* all possible element paths and their frequency. For example, for a TEI document, a typical
* path could be:
*
* <pre>{@code TEI[44,63330] -> text[44,62757] -> body[44,44206] -> div[300,5584] -> p[5336,820]}</pre>
*
* which means there are 44 TEI, text and body elements in the db with 300 div children and
* 5336 paragraphs below them. The second number indicates the size of the largest element,
* expressed as the number of descendant elements below the node. The largest p node in this
* distribution has 820 elements below it.
*/
public class DataGuide {
private final static int BYTES_PER_NODE = 16;
// the (virtual) root of the tree whose name will always be null.
private NodeStats root = new NodeStatsRoot();
public DataGuide() {
}
public int getSize() {
return root.getSize();
}
/**
* Add the given node path (a path like /root/childA/childB) to the data guide.
* The frequency for the target element (i.e. the last component in the path)
* is incremented by one.
*/
public NodeStats add(NodePath path) {
return add(path, null);
}
/**
* Add the given node path using the frequency and size information
* given in the second argument. Used to merge two DataGuides.
*/
protected NodeStats add(NodePath path, NodeStats mergeWith) {
NodeStats current = root;
for (int i = 0; i < path.length(); i++) {
QName qn = path.getComponent(i);
if (qn.getNameType() != ElementValue.ELEMENT) {
return null;
}
current = current.addChild(qn);
}
if (mergeWith != null) {
current.mergeStats(mergeWith);
} else
current.addOccurrence();
return current;
}
/**
* Merge paths and statistics from this instance into the
* other instance.
*
* @param other
* @return the other instance containing the merged graphs
*/
public DataGuide mergeInto(DataGuide other) {
root.mergeInto(other, new NodePath());
return other;
}
public int getMaxParentDepth(QName qname) {
NodeStats temp = new NodeStats(qname);
root.getMaxParentDepth(qname, temp);
return temp.getMaxDepth();
}
public String toString() {
List paths = new ArrayList();
root.dump(new StringBuilder(), paths);
StringBuilder buf = new StringBuilder();
for (int i = 0; i < paths.size(); i++) {
buf.append(paths.get(i));
buf.append('\n');
}
return buf.toString();
}
public void toSAX(ContentHandler handler) throws SAXException {
root.toSAX(handler);
}
public void write(FileChannel fc, SymbolTable symbols) throws IOException {
int nodeCount = root.getSize();
ByteBuffer buffer = ByteBuffer.allocate(nodeCount * BYTES_PER_NODE + 4);
root.write(buffer, symbols);
buffer.flip();
fc.write(buffer);
}
public void read(FileChannel fc, SymbolTable symbols) throws IOException {
ByteBuffer buffer = ByteBuffer.allocate((int) fc.size());
fc.read(buffer);
buffer.flip();
root.read(buffer, symbols);
}
private static class NodeStatsRoot extends NodeStats {
private NodeStatsRoot() {
super(null);
}
protected void write(ByteBuffer buffer, SymbolTable symbols) {
if (children == null)
buffer.putInt(0);
else {
buffer.putInt(children.length);
for (int i = 0; i < children.length; i++) {
children[i].write(buffer, symbols);
}
}
}
protected void read(ByteBuffer buffer, SymbolTable symbols) {
int childCount = buffer.getInt();
if (childCount > 0) {
children = new NodeStats[childCount];
for (int i = 0; i < childCount; i++) {
children[i] = new NodeStats(null);
children[i].read(buffer, symbols);
}
}
}
public void toSAX(ContentHandler handler) throws SAXException {
handler.startElement(Namespaces.EXIST_NS, "distribution", "distribution", new AttributesImpl());
if (children != null) {
for (int i = 0; i < children.length; i++) {
children[i].toSAX(handler);
}
}
handler.endElement(Namespaces.EXIST_NS, "distribution", "distribution");
}
}
}