XMLIngest.java example

Explorer
VUE-master
- VUE2
/*
* Copyright 2003-2010 Tufts University  Licensed under the
 * Educational Community License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 * 
 * http://www.osedu.org/licenses/ECL-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS IS"
 * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package tufts.vue.ds;

import tufts.Util;
import tufts.vue.DEBUG;
import tufts.vue.MetaMap;
import tufts.vue.MetaMap.*;

import java.util.*;
import java.io.*;
import java.net.*;

import javax.xml.xpath.*;
import javax.xml.parsers.*;

import org.w3c.dom.Node;
import org.w3c.dom.*;
import org.xml.sax.*;


// TODO: just forget handling depth (e.g., jira comments) for now -- can tackle later.
// The keep in mind w/respect to how we handle data-set interation, so could
// add this under the hood if we like later.

// As for our data-model, we could literally use the XML DOM, tho that's got
// way more than we need in it and isn't very convenient.  I guess we
// just need our own API that nicely abstracts everything, so under the
// hood we could use anything from Jackrabbit to DOM to Mutlimaps to SQL
// or whatever.

// NEED TO GENERICALLY HANDLE KEY MANAGEMENT, AND DATA-CHANGE DETECTION.

// Big question: do we persist original raw XML streams, or digest
// the data first then persist it?  First case is safer for
// ultimate data integrity -- can fix parsing / data coalesecing
// bugs or make enhancements more easily.  We could persist the
// mashed data, but but then we'd just need another format / persist
// schema anyway.


/**
 * @version $Revision: 1.16 $ / $Date: 2010-02-03 19:13:16 $ / $Author: mike $
 * @author Scott Fraize
 */

public class XMLIngest {

    private static final org.apache.log4j.Logger Log = org.apache.log4j.Logger.getLogger(XMLIngest.class);

    private static final boolean XML_DEBUG = false;
    private static final boolean XML_OUTPUT = false;

    // REPLACE WITH A WRAPPER: XMLIngestor / XML-SCHEMA-LOADER (or just split out as XMLIngest methods)
    // ALL WE NEED are the track methods, and a public final schema for dumpSchema debug & isXMLKeyFold(),
    // which we can probably pull out of generic Schema

    public static class XmlSchema extends tufts.vue.ds.Schema
    {
        final String itemPath;
        final int itemPathLen;

        DataRow curRow;

        /** castor peristance only */
        // todo: see if we can get rid of this class entirely and have Schema be a final class
        // so we don't run into all the persistance complications with castor.
        public XmlSchema() {
            itemPath = "<unknown>";
            itemPathLen = 0;
        }

        public XmlSchema(tufts.vue.Resource source, String itemPath) 
        {
            super.setResource(source);
            this.itemPath = itemPath;
            if (itemPath == null || itemPath.length() == 0)
                itemPathLen = 0;
            else
                itemPathLen = itemPath.length() + 1; // add one for dot
            setXMLKeyFold(itemPath != null && itemPath.startsWith("plist."));

            Log.debug("Constructed XmlSchema " + this);
            
            //itemPathLen = itemPath.length() + (itemPath.endsWith(".") ? 0 : 1);
        }

        @Override public void dumpSchema(PrintWriter ps) {
            if (itemPath != null) ps.println("ItemPath: " + itemPath);
            super.dumpSchema(ps);
        }
        
        void trackFieldValuePair(String name, String value) {

            //errout("TRACK " + name + "=" + value);

            if (itemPath != null && name.startsWith(itemPath) && name.length() > itemPathLen)
                name = name.substring(itemPathLen);

//             Field field = mFields.get(name);
//             if (field == null) {
//                 field = new Field(name, this);
// //                 if (name.equals(getKeyNode()))
// //                     keyField = field;
//                 mFields.put(name, field);
//                 if (name.length() > mLongestFieldName)
//                     mLongestFieldName = name.length();
//             }
            Field field = getField(name);
            if (field == null) {
                field = addField(name);
                if (name.length() > mLongestFieldName)
                    mLongestFieldName = name.length();
            }
            if (curRow != null)
                curRow.addValue(field, value);
            else
                field.trackValue(value);
        }

        void trackNodeOpen(String name) {
            if (name.equals(getRowStartNode())) {
                //errout("OPEN " + name);
                // curRow = new VRow(fields.size()); // fields includes non-row-extraction values
                curRow = new DataRow(this);
                addRow(curRow);
            }
        }

        void trackNodeClose(String name) {
            if (name.equals(getRowStartNode())) {
                //errout(String.format("CLOSE %s with %2d fields, key %s", name, curRow.size(), curRow.getValue(keyField)));
                curRow = null;
            }
        }

        private String getRowStartNode() { return itemPath; }
        //private String getKeyNode() { return null; }
        
    }

//     static class RssSchema extends XmlSchema {
        
//         @Override
//         final String getRowStartNode() {
//             //return "item";
//             return "rss.channel.item";
//         }

//         @Override
//         final String getKeyNode() {
//             //return "rss.channel.item.key";
//             //return "item.key";
//             return "key";
//         }
        
//     }

    
    static int depth = 0;


    static void XPathExtract(XmlSchema schema, Document document)
    {
    
        try {
      
            XPath xpath = XPathFactory.newInstance().newXPath();

            String expression = "/rss/channel/item";
            //String expression = "rss/channel/item/title";

            errout("Extracting " + expression);
            
            // First, obtain the element as a node.

            //tufts.DocDump.dump(document);
                        
            Node nodeValue = (Node) xpath.evaluate(expression, document, XPathConstants.NODE);
            errout("   Node: " + nodeValue);

            // Next, obtain the element as a String.

            String stringValue = (String) xpath.evaluate(expression, document, XPathConstants.STRING);
            System.out.println(" String: " + stringValue);

            NodeList nodeSet = (NodeList) xpath.evaluate(expression, document, XPathConstants.NODESET);
            errout("NodeSet: " + Util.tag(nodeSet) + "; size=" + nodeSet.getLength());

            for (int i = 0; i < nodeSet.getLength(); i++) {
                scanNode(schema, nodeSet.item(i), null, null);
            }
            

//             // Finally, obtain the element as a Number (Double).

//             Double birthdateDouble = (Double) xpath.evaluate(expression, document, XPathConstants.NUMBER);
   
//             System.out.println("Double is: " + birthdateDouble);
   
        } catch (XPathExpressionException e) {
            System.err.println("XPathExpressionException caught...");
            e.printStackTrace();
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }

    public static Schema ingestXML(XmlSchema schema, org.xml.sax.InputSource input, String itemKey)
    {
        final org.w3c.dom.Document doc = parseXML(input, false);

        //doc.normalizeDocument();
        if (DEBUG.DR) {
            try {
                errout("XML parsed, document built:");
                errout("org.w3c.dom.Document: " + Util.tags(doc));
                final org.w3c.dom.DocumentType type = doc.getDoctype();
                //errout("InputEncoding: " + doc.getInputEncoding()); // AbstractMethodError ?
                //errout("xmlEncoding: " + doc.getXmlEncoding()); // AbstractMethodError
                //errout("xmlVersion: " + doc.getXmlVersion()); // AbstractMethodError
                errout("docType: " + Util.tags(type));
                if (type != null) {
                    errout("docType.name: " + Util.tags(type.getName()));
                    errout("docType.entities: " + Util.tags(type.getEntities()));
                    errout("docType.notations: " + Util.tags(type.getNotations()));
                    errout("docType.publicId: " + Util.tags(type.getPublicId()));
                    errout("docType.systemId: " + Util.tags(type.getSystemId()));
                }
                errout("impl: " + Util.tags(doc.getImplementation().getClass()));
                errout("docElement: " + Util.tags(doc.getDocumentElement().getClass())); // toString() can dump whole document!
            } catch (Throwable t) {
                Log.error("debug failure", t);
            }
        }
        //out("element: " + Util.tags(doc.getDocumentElement()));

        //outln("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
        //outln("<!-- created by RSSTest " + new Date() + " from " + src + " -->");
   
        
        if (schema == null)
        	schema = new XmlSchema(tufts.vue.Resource.instance(input), itemKey);
        else
        	schema.flushData();
        
        if (false)
            XPathExtract(schema, doc);
        else
            scanNode(schema, doc.getDocumentElement(), null, null);

        if (DEBUG.DR || DEBUG.SCHEMA) schema.dumpSchema(System.err);
        return schema;
    }


    private static boolean isText(int type) {
        return type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE;
    }
    private static boolean isText(Node node) {
        return isText(node.getNodeType());
    }

    private static final String getNodeType(Node n) {
        return getNodeType(n.getNodeType());
    }
    private static final String getNodeType(int t) {
        if (t == Node.ATTRIBUTE_NODE) return "attr";
        if (t == Node.CDATA_SECTION_NODE) return "cdata";
        if (t == Node.COMMENT_NODE) return "comment";
        if (t == Node.DOCUMENT_NODE) return "document";
        if (t == Node.ELEMENT_NODE) return "element";
        if (t == Node.ENTITY_NODE) return "entity";
        if (t == Node.TEXT_NODE) return "text";
        return "" + t;
        
    }
        
    
    // parentPath is the fully-qualified parent name

    private static void scanNode(XmlSchema schema, org.w3c.dom.Node n, String parentPath, String parentName) {

        final int type = n.getNodeType();
        final String value = n.getNodeValue();
        final boolean isAttribute = (type == Node.ATTRIBUTE_NODE);        
        String name = n.getNodeName();

        scanNode(schema, n, type, parentPath, parentName, name, value);
    }

    private static void scanNode(final XmlSchema schema,
                                final org.w3c.dom.Node node,
                                final int type,
                                final String parentPath,
                                final String parentName,
                                final String nodeName,
                                final String value)
    {
        final boolean isAttribute = (type == Node.ATTRIBUTE_NODE);
        final boolean isMergedText = FOLD_TEXT && isText(type);
        final boolean hasAttributes = (!isAttribute && node != null && node.hasAttributes());
        Node firstChild = null, lastChild = null;

        if (node != null) {
            firstChild = node.getFirstChild();
            lastChild = node.getLastChild();
        }

        final String XMLName;

        if (isAttribute)
            XMLName = parentName + ATTR_SEPARATOR + nodeName;
        else
            XMLName = nodeName;
        
        final String fullName;

        if (parentPath != null) { // should only be null first time in at the top root
            if (isMergedText)
                fullName = parentPath;
            else if (isAttribute)
                fullName = parentPath + ATTR_SEPARATOR + nodeName;
            else 
                fullName = parentPath + '.' + nodeName;
        } else {
            fullName = nodeName;
        }

        if (type == Node.ELEMENT_NODE)
            schema.trackNodeOpen(fullName);

        if (depth < REPORT_THRESH) {
            if (depth < REPORT_THRESH - 1) {
                if (type == Node.TEXT_NODE)
                    eoutln(String.format("node(%s) {%s} (len=%d)", getNodeType(type), fullName, value.length()));
                else
                    eoutln(String.format("NODE(%s) {%s} %.192s", getNodeType(type), fullName, node, Util.tags(firstChild)));
            }
            //eoutln("NODE: " + type + " name=" + name + " " + Util.tags(n) + " firstChild=" + Util.tags(firstChild));
                //System.err.println(name);
            else if (XML_DEBUG)
                System.err.print(".");
        }


        if (hasAttributes && ATTRIBUTES_IMMEDIATE)
            scanAttributes(schema, fullName, nodeName, node.getAttributes());

        String outputValue = null;

        if (value != null) {
            outputValue = value.trim();
            if (outputValue.length() > 0) {
                schema.trackFieldValuePair(fullName, outputValue);
            } else
                outputValue = null;
        }

        final NodeList children = node == null ? null : node.getChildNodes();
        final boolean DO_TAG;

        if (isMergedText) {
            DO_TAG = false;
        }
        else if (outputValue == null && node != null) {
            if (!node.hasChildNodes()) {
                DO_TAG = false;
            }
            else if (children.getLength() == 1 && isText(firstChild) && firstChild.getNodeValue().trim().length() == 0) {
                DO_TAG = false;
            }
            else
                DO_TAG = true;
            
            // if (!DO_TAG) ioutln("<!-- empty: " + nodeName + " -->");
        }
        else
            DO_TAG = true;
        
        boolean closeOnSameLine = false;

        if (DO_TAG) {
            
            iout("<");
            out(XMLName);
            //if (node.hasChildNodes()) out(" children=" + node.getChildNodes().getLength() + " first=" + node.getFirstChild());
            out(">");
            
            if (firstChild == null || (isText(firstChild) && firstChild == lastChild)) {
//                 if (firstChild != null && firstChild.getNodeType() == Node.CDATA_SECTION_NODE)
//                     ;
//                 else
                    closeOnSameLine = true;
            } else if (XML_OUTPUT)
                System.out.print('\n');
            
            if (FOLD_TEXT && (type != Node.ELEMENT_NODE && type != Node.ATTRIBUTE_NODE)) {
                final String err = "UNHANDLED TYPE=" + type + "; " + nodeName;
                outln("<" + err + ">");
                errout(err);
            }
        }

        if (outputValue != null) {
            if (type == Node.CDATA_SECTION_NODE) {
                out("<![CDATA[");
                out(outputValue);
                out("]]>");
            } else {
                out(XMLEntityEncode(outputValue));
            }
        }
            
        if (!isAttribute && node != null) {

            // god knows why, but attributes have themselves as children?  (or is that
            // the #text entry?)  Anyway, if we allow this for an attribute dump, the
            // value of the attribute will literally appear twice in the output,
            // back-to-back as one string.
            
            depth++;

            if (FOLD_KEYS || schema.isXMLKeyFold()) {

                scanFoldedChildren(schema, children, fullName, nodeName);

            } else {
                
                for (int i = 0; i < children.getLength(); i++)
                    scanNode(schema, children.item(i), fullName, nodeName);
            }
            
            depth--;
            
        }

        if (DO_TAG) {

            if (closeOnSameLine)
                outln("</" + XMLName + ">");
            else
                ioutln("</" + XMLName + ">");
        }

        if (type == Node.ELEMENT_NODE)
            schema.trackNodeClose(fullName);
            
        if (hasAttributes && ! ATTRIBUTES_IMMEDIATE)
            scanAttributes(schema, fullName, nodeName, node.getAttributes());
        
        //iout("children: " + Util.tags(n.getChildNodes()));
    }

    private static void scanAttributes(XmlSchema schema, String fullName, String nodeName, NamedNodeMap attr) {

        if (attr != null && attr.getLength() > 0) {
            //depth++;
            for (int i = 0; i < attr.getLength(); i++) {
                final Node a = attr.item(i);
                scanNode(schema, a, fullName, nodeName);
            }
            //depth--;
        }
    }

    private static void scanFoldedChildren(XmlSchema schema, final NodeList children, final String fullName, final String nodeName)
    {
        // Test code for folding Apple plist style <dict> pairs (<key>UserKey</key><string>UserValue</string>)
        // using iTunes Music Library.xml as test case.
                
        for (int i = 0; i < children.getLength(); i++) {
            final Node item = children.item(i);
            final Node next = children.item(i+1);

            if (next != null) {
                final String nextName = next.getNodeName();
                //errout("checking pair: " + item.getNodeName() + "/" + nextName); 
                //if ("key".equals(item.getNodeName()) && !"dict".equals(nextName)) {
                if ("key".equals(item.getNodeName())) {
                    //final String newNodeName = item.getNodeValue();
                    //final String newNodeValue = next.getNodeValue();
                            
                    // must extract through one more layer of indirection

                    String newNodeName = item.getChildNodes().item(0).getNodeValue();
                            
                    if (newNodeName != null)
                        newNodeName = newNodeName.replace(' ', '_');

                    final String newNodeValue;

                    if ("true".equals(nextName)) {
                        //newNodeValue = next.getNodeValue()
                        newNodeValue = "true"; // is a simle "<true/>" self-terminating value with/NO CHILDREN
                    }
                    else if ("false".equals(nextName)) {
                        // almost never see this in iTunes Music Library.xml
                        //errout("GOT FALSE");
                        newNodeValue = "false";
                    }
                    else if ("dict".equals(nextName) || "array".equals(nextName)) {

                        continue;
                                
                        //                                 //newNodeValue = "(todo: pull-up under: " + nextName + ")";
                        //                                 newNodeValue = nextName;
                        //                                 i--;  // we're not extracting this yet, so don't pull it out below
                    }
                    else {
                        newNodeValue = next.getChildNodes().item(0).getNodeValue();
                    }

                    //if ("Visible".equals(newNodeName)) errout("VALUE: " + newNodeValue);
                            
                    //errout(String.format("\t%s=[%s]", newNodeName, newNodeValue));
                    //errout("value children: " + item.getChildNodes());
                    // extract the current node value as a new node name, and the next node value as the new node value
                    scanNode(schema, null, Node.ELEMENT_NODE, fullName, nodeName, newNodeName, newNodeValue);
                    i++;
                    continue;
                }
            }

            scanNode(schema, item, fullName, nodeName);
        }
    }
    
    
    /*
    public static void dumpElement(Element e) {
        out("\tElement: " + Util.tags(e));
        out("\tElement tag: " + e.getTagName());
        out("\tElement SchemaTypeInfo: " + Util.tags(e.getSchemaTypeInfo()));
    }
    */
    
    // Parses an XML file and returns a DOM document.
    // If validating is true, the contents is validated against the DTD
    // specified in the file.
    private static org.w3c.dom.Document parseXML(Object input, boolean validating) {
        try {
            // Create a builder factory
            javax.xml.parsers.DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setIgnoringElementContentWhitespace(true);
            factory.setIgnoringComments(true);
            //factory.setCoalescing(true);
            factory.setValidating(validating);
    
            // Create the builder and parse the file
            final org.w3c.dom.Document doc;
            if (input instanceof String) {
                doc = factory.newDocumentBuilder().parse(new File((String)input));
            } else if (input instanceof InputSource) {
                doc = factory.newDocumentBuilder().parse((InputSource)input);
            } else if (input instanceof InputStream) {
//                 InputSource encoded = new InputSource();
//                 encoded.setByteStream((InputStream)input);
//                 encoded.setEncoding("ISO-8859-1"); // TODO: get from url stream
//                 doc = factory.newDocumentBuilder().parse(encoded);
//                 //doc = factory.newDocumentBuilder().parse(new InputStreamReader((InputStream) input, "ISO-8859-1"));
                doc = factory.newDocumentBuilder().parse((InputStream) input);
            }
            else
                throw new Error("Unhandled input type: " + Util.tags(input));
            return doc;
        } catch (Throwable t) {
            t.printStackTrace();
        }
        /*catch (SAXException e) {
            // A parsing error occurred; the xml input is not valid
        } catch (ParserConfigurationException e) {
        } catch (IOException e) {
        }
        */
        return null;
    }

   public static String XMLEntityEncode(final String text)
   {
       // todo: if the result of this is simply destined for a writer, would
       // be more efficient to pass the writer in, and skip constructing new
       // StringBuffers.  Apache Commons has methods for this, presumably
       // for this reason -- eventually go ahead and use that:

       // will NOT introduce " uneeded for us, possibly problematic, in
       // that this text may ultimately be handled by an HTML component which
       // won't handle """ (todo: test w/JLabel <html>)
       //return org.apache.commons.lang.StringEscapeUtils.escapeHtml(s);
       
       // will introduce "
       //return org.apache.commons.lang.StringEscapeUtils.escapeXml(s);
       
       StringBuilder buf = null;
       final int len = (text == null ? -1 : text.length());

       for ( int i = 0; i < len; i++ )
       {
           final char c = text.charAt(i);
           String entity = null;

           switch (c) {
               // These are the five basic XML entities:
               // See http://commons.apache.org/lang/api/org/apache/commons/lang/StringEscapeUtils.html
               
           case '&':  entity = "&";     break;
           case '<':  entity = "<";      break;
           case '>':  entity = ">";      break;
           case '"':  entity = """;    break;
         //case '\'': entity = "'";    break; // not a legal HTML entity, even tho is a legal XML entity
         //case '\r': entity = "";     break; // test
           default:
               if (buf != null)
                   buf.append(c);
               continue;
           }

           // We've encountered something to encode: entity has been set:
           
           if (buf == null) {
               buf = new StringBuilder(len + 12);
               buf.append(text, 0, i);
           }
           buf.append(entity);
           
       }
       return buf == null ? text : buf.toString();

//        for ( int i = 0; i < len; i++ ) {
//            final char c = s.charAt( i );
//            if (c >= 'a' && c <= 'z' || c >='A' && c <= 'Z' || c >= '0' && c <= '9') {
//                buf.append( c );
//            } else {
//                final String entity;
//                switch (c) {
//                case '&':  entity = "&";     break;
//                case '<':  entity = "<";      break;
//                case '>':  entity = ">";      break;
//                case '"':  entity = """;    break;
//              //case '\'': entity = "'";    break; // apparently, not actually a legal entity
//              //case '\r': entity = "";     break;
//                default:   entity = null;
//                }
//                if (entity != null)
//                    buf.append(entity);
//                else
//                    //buf.append( "&#" + (int)c + ";" );
//                    buf.append(c);
//            }
//        }
//       return buf.toString();
       
   }    


    public static void iout(String s) {
        iout(depth, s);
    }

    public static void ioutln(String s) {
        ioutln(depth, s);
    }

    final static String TAB = "    ";

    public static void iout(int _depth, String s) {
        if (XML_OUTPUT) {
            for (int x = 0; x < _depth; x++) System.out.print(TAB);
            System.out.print(s);
        }
    }
    
    public static void ioutln(int _depth, String s) {
        if (XML_OUTPUT) {
            for (int x = 0; x < _depth; x++) System.out.print(TAB);
            System.out.println(s);
        }
    }

    public static void eoutln(int _depth, String s) {
        if (XML_OUTPUT) {
            for (int x = 0; x < _depth; x++) System.err.print(TAB);
            System.err.println(s);
        }
    }

    public static void eoutln(String s) {
        eoutln(depth, s);
    }
    
    

    public static void out(String s) {
        if (XML_OUTPUT)
            System.out.print(s == null ? "null" : s);
    }

    public static void outln(String s) {
        if (XML_OUTPUT)
            System.out.println(s == null ? "null" : s);
    }
    public static void errout(String s) {
        Log.debug(s == null ? "null" : s);
        //System.err.println("XMLIngest: " + s);
    }




    final static boolean ATTRIBUTES_IMMEDIATE = false; // false better for clearer XML output, true better for schema output (e.g., rss.version 1st, not last)
    final static boolean FOLD_TEXT = true; // default true: fold Node.TEXT_NODE(#text) and CDATA items into parent node

    final static boolean FOLD_KEYS = false; // auto-enabled if top-level item is "plist" (current breaks on JIRA XML if true)

    //final static int REPORT_THRESH = FOLD_KEYS ? 4 : 3;
    final static int REPORT_THRESH = 4;
    //final static int REPORT_THRESH = 1;

    final static char ATTR_SEPARATOR = '@';


    private static final String JIRA_VUE_URL = "http://bugs.atech.tufts.edu/secure/IssueNavigator.jspa?view=rss&pid=10001&tempMax=9999&reset=true&decorator=none";
    private static final String JIRA_SFRAIZE_COOKIE = "seraph.os.cookie=LkPlQkOlJlHkHiEpGiOiGjJjFi";

    private static InputStream getTestXMLStream()
        throws IOException
    {

//         // SMF 2008-10-02: E.g. Craigslist XML streams use ISO-8859-1, which is provided in
//         // HTML headers as "Content-Type: application/rss+xml; charset=ISO-8859-1", (tho not
//         // in a special content-encoding header), and our current XML parser fails unless
//         // the stream is read with this set: e.g.: [org.xml.sax.SAXParseException: Character
//         // conversion error: "Unconvertible UTF-8 character beginning with 0x95" (line
//         // number may be too low).]  Actually, in this case it turns out that providing a
//         // default InputStreamReader (encoding not specified) as opposed to a direct
//         // InputStream from the URLConnection works, and the XML parser is presumably then
//         // finding and handling the "<?xml version="1.0" encoding="ISO-8859-1"?>" line at
//         // the top of the XML stream
//         final XmlSchema schema = new XmlSchema(conn.getURL(), itemKey);
//         InputStream is = null;
//         try {
//             is = conn.getInputStream();
//             errout("GOT INPUT STREAM: " + Util.tags(is));
//         } catch (IOException e) {
//             e.printStackTrace();
//             return null;
//         }
//         final Document doc = parseXML(is, false);

        // Could also use a ROME API XmlReader(URLConnection) for handling
        // the input, which does it's own magic to figure out the encoding.
        // For more on the complexity of this issue, see:
        // http://diveintomark.org/archives/2004/02/13/xml-media-types

        
        URL url = new URL(JIRA_VUE_URL);
        URLConnection conn = url.openConnection();
        conn.setRequestProperty("Cookie", JIRA_SFRAIZE_COOKIE);
        errout("Opening connection to " + url);
        conn.connect();
        
        errout("Getting InputStream...");
        InputStream in = conn.getInputStream();
        errout("Got " + Util.tags(in));
        
        errout("Getting headers...");
        Map<String,List<String>> headers = conn.getHeaderFields();

        errout("HEADERS:");
        for (Map.Entry<String,List<String>> e : headers.entrySet()) {
            errout(e.getKey() + ": " + e.getValue());
        }

        return in;
    }
    
    public static void main(String[] args)
        throws IOException
    {
        DEBUG.Enabled = DEBUG.DR = DEBUG.IO = DEBUG.SCHEMA = true;

        tufts.vue.VUE.parseArgs(args);

        org.apache.log4j.Logger.getRootLogger().removeAllAppenders(); // need to do this or we get everything twice
        org.apache.log4j.Logger.getRootLogger().addAppender
            (new org.apache.log4j.ConsoleAppender(tufts.vue.VUE.MasterLogPattern, "System.err"));
        
        //final XmlSchema schema = new RssSchema();
        
        errout("Max mem: " + Util.abbrevBytes(Runtime.getRuntime().maxMemory()));
        //getXMLStream();System.exit(0);

        final String file = args[0];
        final String key = args[1];

        Log.debug("File: " + file);
        Log.debug("Key: " + key);

        final InputSource is = new InputSource(file);
        is.setCharacterStream(new FileReader(file));
        

        //XMLIngest.XML_DEBUG = true;

        Schema schema = ingestXML(null,is, key);
        
        //schema.dumpSchema(System.err);

        System.err.println("\n");
        Log.debug("done");
    }

//     public static void main(String[] args)
//         throws IOException
//     {
//         //final XmlSchema schema = new RssSchema();
        
//         errout("Max mem: " + Util.abbrevBytes(Runtime.getRuntime().maxMemory()));
//         //getXMLStream();System.exit(0);
        
//         Document doc;
//         String src;

//         if (args.length < 1) {
//             doc = parseXML(getTestXMLStream(), false);
//             src = JIRA_VUE_URL;
//         } else {
//             doc = parseXML(args[0], false);
//             src = args[0];
//         }
//         //doc.normalizeDocument();
//         errout("GOT DOC " + Util.tag(doc) + " " + doc);
//         errout("InputEncoding: " + doc.getInputEncoding());
//         errout("xmlEncoding: " + doc.getXmlEncoding());
//         errout("xmlVersion: " + doc.getXmlVersion());
//         errout("docType: " + Util.tags(doc.getDoctype()));
//         errout("impl: " + Util.tags(doc.getImplementation()));
//         errout("docElement: " + Util.tags(doc.getDocumentElement()));
//         //out("element: " + Util.tags(doc.getDocumentElement()));

//         outln("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
//         outln("<!-- created by RSSTest " + new Date() + " from " + src + " -->");

//         final XmlSchema schema = new XmlSchema(Util.tag(doc), "rss.channel.item");
        
//         if (true)
//             XPathExtract(schema, doc);
//         else
//             scanNode(schema, doc.getDocumentElement(), null, null);

//         schema.dumpSchema(System.err);
//     }

    

    

    
    

    


}