import java.io.File;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Set;
import java.util.Stack;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
 * DTDGenerator<BR>
 * Generates a possible DTD from an XML document instance.
 * Pure SAX version of the Saxon DTDGenerator.
 * The program has no remaining dependencies on Saxon; all it needs is:
 * <ul>
 * <li>JAXP 1.1</li>
 * <li>SAX2</li>
 * <li>A JAXP 1.1 conformant XML parser</li>
 * <li>Java 5 (generics and code-point aware string handling are used)</li>
 * </ul>
 * The class is a SAX2 ContentHandler, created for convenience by extending the
 * default handler that comes with SAX2. While the document is parsed it collects
 * statistics about every element type and attribute; once parsing is complete the
 * statistics are turned into element and attribute-list declarations on standard output.
 *
 * @author M.H.Kay
 * @version 7.0: separated from Saxon source, now works with any JAXP 1.1 XML parser
 */
public class DTDGenerator extends org.xml.sax.helpers.DefaultHandler {

    /**
     * Minimum number of appearances of an attribute for it to be considered
     * a candidate for an enumeration type.
     */
    protected static int MIN_ENUMERATION_INSTANCES = 10;

    /** Maximum number of distinct attribute values to be included in an enumeration. */
    protected static int MAX_ENUMERATION_VALUES = 20;

    /**
     * An attribute will be regarded as an enumeration attribute only if the number of
     * instances divided by the number of distinct values is &gt;= this ratio.
     */
    protected static int MIN_ENUMERATION_RATIO = 3;

    /**
     * Minimum number of attributes that must appear, with the same value each time,
     * for the value to be regarded as FIXED.
     */
    protected static int MIN_FIXED = 5;

    /**
     * Minimum number of attribute values that must appear for the attribute
     * to be regarded as an ID value.
     */
    protected static int MIN_ID_VALUES = 10;

    /**
     * Historical limit on the number of attribute values saved while checking for
     * uniqueness. Every distinct value is currently retained regardless of this limit
     * (that was also the effective behaviour of earlier versions); the field is kept
     * so that existing code referring to it continues to compile.
     */
    protected static int MAX_ID_VALUES = 100000;

    /**
     * Alphabetical list of element types appearing in the document; each has the
     * element name as a key and an ElementDetails object as the value.
     */
    TreeMap<String, ElementDetails> elementList;

    /** Stack of elements currently open; each entry is a StackEntry object. */
    Stack<StackEntry> elementStack;

    /**
     * Entry point.
     * Usage: java DTDGenerator input-file &gt;output-file
     * <p>
     * Exits with status 1 when the argument count is wrong and with status 2 when
     * the input file cannot be found or parsed.
     *
     * @param args the command-line arguments; exactly one input file name is expected
     * @throws java.lang.Exception declared for compatibility with earlier versions
     */
    public static void main (String args[]) throws java.lang.Exception
    {
        // Check the command-line arguments.
        if (args.length != 1) {
            System.err.println("Usage: java DTDGenerator input-file >output-file");
            System.exit(1);
        }

        // Instantiate and run the application
        DTDGenerator app = new DTDGenerator();
        app.run(args[0]);
        app.printDTD();
    }

    /**
     * Create a generator with empty statistics.
     */
    public DTDGenerator ()
    {
        elementList = new TreeMap<String, ElementDetails>();
        elementStack = new Stack<StackEntry>();
    }

    /**
     * Parse the named file, feeding the SAX events to this handler.
     * Any failure is reported on standard error and terminates the JVM with status 2.
     *
     * @param filename name of the XML file to analyse
     */
    private void run(String filename) {
        try {
            // Go through toURI() so that characters that are illegal in a URL
            // (spaces, '#', '%', ...) are escaped; File.toURL() does not do this.
            InputSource is = new InputSource(new File(filename).toURI().toURL().toString());
            // NOTE(review): the parser is used with its default configuration, so a
            // DOCTYPE in the input may cause external entities to be fetched.
            XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
            parser.setContentHandler(this);
            parser.parse(is);
        } catch (java.io.FileNotFoundException nf) {
            System.err.println("File " + filename + " not found");
            // Do not go on to print an empty DTD and report success.
            System.exit(2);
        } catch (Exception err) {
            System.err.println("Failed while parsing source file");
            System.err.println(err.getMessage());
            err.printStackTrace();
            System.exit(2);
        }
    }

    /**
     * Test whether a string is an XML name.
     * TODO: This is currently an incomplete test, it treats (almost) all non-ASCII
     * characters as being valid in names.
     *
     * @param s the string to test
     * @return true if s is an NMTOKEN that does not start with a digit, '.' or '-'
     */
    private static boolean isValidName(String s) {
        if (!isValidNMTOKEN(s)) {
            return false;
        }
        char first = s.charAt(0);
        return !((first >= '0' && first <= '9') || first == '.' || first == '-');
    }

    /**
     * Test whether a string is an XML NMTOKEN.
     * TODO: This is currently an incomplete test, it treats all characters above
     * code 128 as being valid in NMTOKENs.
     *
     * @param s the string to test
     * @return true if s is non-empty and consists only of accepted characters
     */
    private static boolean isValidNMTOKEN(String s) {
        if (s.length() == 0) {
            return false;
        }
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            boolean accepted =
                    (c >= 'A' && c <= 'Z') ||
                    (c >= 'a' && c <= 'z') ||
                    (c >= '0' && c <= '9') ||
                    c == '.' ||
                    c == '_' ||
                    c == '-' ||
                    c == ':' ||
                    c > 128;
            if (!accepted) {
                return false;
            }
        }
        return true;
    }

    /**
     * When the whole document has been analysed, construct the DTD and write it to
     * standard output: for each element type (in alphabetical order) one ELEMENT
     * declaration, one ATTLIST declaration per attribute, and a blank line.
     */
    private void printDTD ()
    {
        for (ElementDetails ed : elementList.values()) {
            StringBuilder decl = new StringBuilder();
            appendElementDecl(decl, ed);
            appendAttlistDecls(decl, ed);
            decl.append("\n");
            System.out.print(decl.toString());
        }
    }

    /**
     * Append the ELEMENT declaration for one element type.
     * The content model is EMPTY, #PCDATA, a sequence, a repeated choice, or mixed
     * content, depending on what was observed in the document.
     */
    private static void appendElementDecl(StringBuilder sb, ElementDetails ed) {
        Set<String> childKeys = ed.children.keySet();
        sb.append("<!ELEMENT ").append(ed.name);

        if (childKeys.isEmpty()) {
            // EMPTY content or CHARACTER content
            sb.append(ed.hasCharacterContent ? " ( #PCDATA ) >\n" : " EMPTY >\n");
            return;
        }

        if (ed.hasCharacterContent) {
            // MIXED content
            sb.append(" ( #PCDATA");
            for (String child : childKeys) {
                sb.append(" | ").append(child);
            }
            sb.append(" )* >\n");
            return;
        }

        // ELEMENT content
        sb.append(" ( ");
        boolean first = true;
        if (ed.sequenced) {
            // all elements of this type have the same child elements
            // in the same sequence, retained in the childseq vector
            for (ChildDetails ch : ed.childseq) {
                if (!first) {
                    sb.append(", ");
                }
                first = false;
                sb.append(ch.name);
                if (ch.repeatable) {
                    sb.append(ch.optional ? "*" : "+");
                } else if (ch.optional) {
                    sb.append("?");
                }
            }
            sb.append(" ) >\n");
        } else {
            // the children don't always appear in the same sequence; so
            // list them alphabetically and allow them to be in any order
            for (String child : childKeys) {
                if (!first) {
                    sb.append(" | ");
                }
                first = false;
                sb.append(child);
            }
            sb.append(" )* >\n");
        }
    }

    /**
     * Append one ATTLIST declaration for each attribute seen on the element type,
     * classifying each attribute as ID, FIXED, an enumeration, NMTOKEN or CDATA.
     */
    private static void appendAttlistDecls(StringBuilder sb, ElementDetails ed) {
        boolean doneID = false;     // to ensure we have at most one ID attribute per element

        for (AttributeDetails ad : ed.attributes.values()) {
            int distinct = ad.values.size();

            // If the attribute is present on every instance of the element, treat it as required
            boolean required = (ad.occurrences == ed.occurrences);

            // If every value of the attribute is distinct,
            // and there are at least MIN_ID_VALUES, treat it as an ID
            // TODO: this may give the wrong answer, we should check whether the value sets of two
            // candidate-ID attributes overlap, in which case they can't both be IDs !!)
            boolean isid = ad.allNames &&           // ID values must be Names
                    !doneID &&                      // Only allowed one ID attribute per element type
                    ad.unique &&
                    ad.occurrences >= MIN_ID_VALUES;

            // if there is only one attribute value, and at least MIN_FIXED occurrences of it,
            // treat it as FIXED
            boolean isfixed = required && distinct == 1 && ad.occurrences >= MIN_FIXED;

            // if the number of distinct values is small compared with the number of occurrences,
            // treat it as an enumeration
            boolean isenum = ad.allNMTOKENs &&      // Enumeration values must be NMTOKENs
                    ad.occurrences >= MIN_ENUMERATION_INSTANCES &&
                    distinct <= ad.occurrences / MIN_ENUMERATION_RATIO &&
                    distinct <= MAX_ENUMERATION_VALUES;

            String tokentype = (ad.allNMTOKENs ? "NMTOKEN" : "CDATA");

            sb.append("<!ATTLIST ").append(ed.name).append(' ').append(ad.name).append(' ');

            if (isid) {
                sb.append("ID");
                doneID = true;
            } else if (isfixed) {
                // A #FIXED declaration carries its own default; nothing more to add.
                sb.append(tokentype).append(" #FIXED \"")
                  .append(escape(ad.values.first())).append("\" >\n");
                continue;
            } else if (isenum) {
                sb.append("( ");
                boolean first = true;
                for (String value : ad.values) {
                    if (!first) {
                        sb.append(" | ");
                    }
                    first = false;
                    sb.append(value);
                }
                sb.append(" )");
            } else {
                sb.append(tokentype);
            }

            // Every branch that reaches this point still needs its default declaration,
            // so the ATTLIST is always properly terminated.
            sb.append(required ? " #REQUIRED >\n" : " #IMPLIED >\n");
        }
    }

    /**
     * Escape special characters in a String value.
     * This static method converts a Unicode string to a string containing
     * only ASCII characters, in which markup characters and non-ASCII characters are
     * represented by the usual XML escape conventions (for example, {@code <} becomes
     * {@code &lt;} and a double quote becomes {@code &#34;}).
     * Characters outside the Basic Multilingual Plane are written as a single
     * numeric character reference for the whole code point, not one per surrogate.
     *
     * @param in The input string
     * @return The XML representation of the string
     */
    private static String escape(String in)
    {
        StringBuilder out = new StringBuilder(in.length() + 16);
        int i = 0;
        while (i < in.length()) {
            int cp = in.codePointAt(i);
            i += Character.charCount(cp);
            switch (cp) {
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '&':
                    out.append("&amp;");
                    break;
                case '"':
                    out.append("&#34;");
                    break;
                case '\'':
                    out.append("&#39;");
                    break;
                default:
                    if (cp <= 0x7f) {
                        out.append((char) cp);
                    } else {
                        out.append("&#").append(cp).append(';');
                    }
                    break;
            }
        }
        return out.toString();
    }

    /**
     * Handle the start of an element. Record information about the position of this
     * element relative to its parent, and about the attributes of the element.
     */
    @Override
    public void startElement (String uri, String localName, String name, Attributes attributes)
    throws SAXException
    {
        // create an entry in the Element List, or locate the existing entry
        ElementDetails ed = elementList.get(name);
        if (ed == null) {
            ed = new ElementDetails(name);
            elementList.put(name, ed);
        }

        // count occurrences of this element type; this must happen before the child
        // bookkeeping below because an element may be nested inside its own type
        ed.occurrences++;

        // Merge the new attribute list into the existing list for the element
        recordAttributes(ed, attributes);

        // now keep track of the nesting and sequencing of child elements
        if (!elementStack.isEmpty()) {
            recordChild(elementStack.peek(), name);
        }

        // the new entry retains the element details and starts with no child groups
        elementStack.push(new StackEntry(ed));
    }

    /**
     * Update the attribute statistics of an element type with the attributes of
     * one element instance.
     */
    private static void recordAttributes(ElementDetails ed, Attributes attributes) {
        for (int a = 0; a < attributes.getLength(); a++) {
            String attName = attributes.getQName(a);
            String val = attributes.getValue(a);

            AttributeDetails ad = ed.attributes.get(attName);
            if (ad == null) {
                ad = new AttributeDetails(attName);
                ed.attributes.put(attName, ad);
            }

            // Every distinct value is retained: the set is needed to detect duplicates
            // (ID candidates) and to list enumeration / FIXED values.
            if (ad.values.add(val)) {
                // We haven't seen this attribute value before

                // Check if attribute value is a valid name
                if (ad.allNames && !isValidName(val)) {
                    ad.allNames = false;
                }

                // Check if attribute value is a valid NMTOKEN
                if (ad.allNMTOKENs && !isValidNMTOKEN(val)) {
                    ad.allNMTOKENs = false;
                }
            } else {
                // We've seen this attribute value before
                ad.unique = false;
            }

            ad.occurrences++;
        }
    }

    /**
     * Record that a child element called name has started inside the element
     * described by parent, updating the parent's child list and its
     * sequenced / optional / repeatable flags.
     */
    private static void recordChild(StackEntry parent, String name) {
        ElementDetails parentDetails = parent.elementDetails;

        // for sequencing, we're interested in consecutive groups of the same child element type
        boolean isFirstInGroup = (parent.latestChild == null || !parent.latestChild.equals(name));
        if (isFirstInGroup) {
            parent.sequenceNumber++;
        }
        int seq = parent.sequenceNumber;
        parent.latestChild = name;

        // if we've seen this child of this parent before, get the details
        ChildDetails c = parentDetails.children.get(name);
        if (c == null) {
            // this is the first time we've seen this child belonging to this parent
            c = new ChildDetails(name, seq);
            parentDetails.children.put(name, c);
            parentDetails.childseq.addElement(c);

            // if the first time we see this child is not on the first instance of the parent,
            // then we allow it as an optional element
            if (parentDetails.occurrences != 1) {
                c.optional = true;
            }

            // The new child has just been appended to childseq. If that index differs
            // from the group position at which it actually occurred, childseq no longer
            // describes the observed order (e.g. "a b" followed by "a c"), so the
            // sequence model would reject the document; fall back to the choice model.
            if (seq != parentDetails.childseq.size() - 1) {
                parentDetails.sequenced = false;
            }
        } else {
            // if it's the first occurrence of the parent element, and we've seen this
            // child before, and it's the first of a new group, then the child occurrences are
            // not consecutive
            if (parentDetails.occurrences == 1 && isFirstInGroup) {
                parentDetails.sequenced = false;
            }

            // check whether the position of this group of children in this parent element is
            // the same as its position in previous instances of the parent.
            if (parentDetails.childseq.size() <= seq ||
                    !parentDetails.childseq.elementAt(seq).name.equals(name)) {
                parentDetails.sequenced = false;
            }
        }

        // if there's more than one child element, mark it as repeatable
        if (!isFirstInGroup) {
            c.repeatable = true;
        }
    }

    /**
     * End of element. If sequenced, check that all expected children are accounted for.
     */
    @Override
    public void endElement (String uri, String localName, String name)
    throws SAXException
    {
        StackEntry se = elementStack.pop();
        ElementDetails ed = se.elementDetails;

        // If the number of child element groups in this parent element is less than the
        // number in previous elements, then the absent children are marked as optional
        if (ed.sequenced) {
            for (int i = se.sequenceNumber + 1; i < ed.childseq.size(); i++) {
                ed.childseq.elementAt(i).optional = true;
            }
        }
    }

    /**
     * Handle character data.
     * Make a note whether significant character data is found in the element;
     * characters with a code of 0x20 or below are not regarded as significant.
     */
    @Override
    public void characters (char ch[], int start, int length)
    throws SAXException
    {
        ElementDetails ed = elementStack.peek().elementDetails;
        if (ed.hasCharacterContent) {
            return;     // already known; no need to scan again
        }
        for (int i = start; i < start + length; i++) {
            if (ch[i] > 0x20) {
                ed.hasCharacterContent = true;
                return;
            }
        }
    }

    /**
     * ElementDetails is a data structure to keep information about element types
     */
    private static final class ElementDetails {
        final String name;                              // name of the element type
        int occurrences;                                // number of instances seen so far
        boolean hasCharacterContent;                    // true once significant text was seen
        boolean sequenced;                              // true while children always appear in childseq order
        final TreeMap<String, ChildDetails> children;   // child details keyed by child element name
        final Vector<ChildDetails> childseq;            // children in order of first appearance
        final TreeMap<String, AttributeDetails> attributes; // attribute details keyed by attribute name

        ElementDetails ( String name ) {
            this.name = name;
            this.occurrences = 0;
            this.hasCharacterContent = false;
            this.sequenced = true;
            this.children = new TreeMap<String, ChildDetails>();
            this.childseq = new Vector<ChildDetails>();
            this.attributes = new TreeMap<String, AttributeDetails>();
        }
    }

    /**
     * ChildDetails records information about the presence of a child element within its
     * parent element. If the parent element is sequenced, then the child elements always
     * occur in sequence with the given frequency.
     */
    private static final class ChildDetails {
        final String name;      // name of the child element type
        final int position;     // group position at which the child was first seen
        boolean repeatable;     // true if the child occurred more than once in a row
        boolean optional;       // true if some parent instance lacked this child

        ChildDetails ( String name, int position ) {
            this.name = name;
            this.position = position;
            this.repeatable = false;
            this.optional = false;
        }
    }

    /**
     * AttributeDetails is a data structure to keep information about attribute types
     */
    private static final class AttributeDetails {
        final String name;              // name of the attribute
        int occurrences;                // number of occurrences of the attribute
        boolean unique;                 // true if no duplicate values encountered
        final TreeSet<String> values;   // set of all distinct values encountered for this attribute
        boolean allNames;               // true if all the attribute values are valid names
        boolean allNMTOKENs;            // true if all the attribute values are valid NMTOKENs

        AttributeDetails ( String name ) {
            this.name = name;
            this.occurrences = 0;
            this.unique = true;
            this.values = new TreeSet<String>();
            this.allNames = true;
            this.allNMTOKENs = true;
        }
    }

    /**
     * StackEntry is a data structure we put on the stack for each nested element
     */
    private static final class StackEntry {
        final ElementDetails elementDetails;    // details of the open element's type
        int sequenceNumber;                     // index of the latest child group, -1 if none yet
        String latestChild;                     // name of the most recent child, null if none yet

        StackEntry ( ElementDetails elementDetails ) {
            this.elementDetails = elementDetails;
            this.sequenceNumber = -1;
            this.latestChild = null;
        }
    }

} // end of outer class DTDGenerator