// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. // // TagSoup is licensed under the Apache License, // Version 2.0. You may obtain a copy of this license at // http://www.apache.org/licenses/LICENSE-2.0 . You may also have // additional legal rights not granted by this license. // // TagSoup is distributed in the hope that it will be useful, but // unless required by applicable law or agreed to in writing, TagSoup // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS // OF ANY KIND, either express or implied; not even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // // // The TagSoup command line UI package org.ccil.cowan.tagsoup; import java.util.Hashtable; import java.util.Enumeration; import java.io.*; import java.net.URL; import java.net.URLConnection; import org.xml.sax.*; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.ext.LexicalHandler; /** The stand-alone TagSoup program. **/ public class CommandLine { static Hashtable options = new Hashtable(); static { options.put("--nocdata", Boolean.FALSE); // CDATA elements are normal options.put("--files", Boolean.FALSE); // process arguments as separate files options.put("--reuse", Boolean.FALSE); // reuse a single Parser options.put("--nons", Boolean.FALSE); // no namespaces options.put("--nobogons", Boolean.FALSE); // suppress unknown elements options.put("--any", Boolean.FALSE); // unknowns have ANY content model options.put("--emptybogons", Boolean.FALSE); // unknowns have EMPTY content model options.put("--norootbogons", Boolean.FALSE); // unknowns can't be the root options.put("--pyxin", Boolean.FALSE); // input is PYX options.put("--lexical", Boolean.FALSE); // output comments options.put("--pyx", Boolean.FALSE); // output is PYX options.put("--html", Boolean.FALSE); // output is HTML options.put("--method=", Boolean.FALSE); // output method options.put("--doctype-public=", Boolean.FALSE); // override public id options.put("--doctype-system=", Boolean.FALSE); // override system id options.put("--output-encoding=", Boolean.FALSE); // output encoding options.put("--omit-xml-declaration", Boolean.FALSE); // omit XML decl options.put("--encoding=", Boolean.FALSE); // specify encoding options.put("--help", Boolean.FALSE); // display help options.put("--version", Boolean.FALSE); // display version options.put("--nodefaults", Boolean.FALSE); // no default attrs options.put("--nocolons", Boolean.FALSE); // colon to underscore options.put("--norestart", Boolean.FALSE); // no restartable elements options.put("--ignorable", Boolean.FALSE); // return ignorable whitespace } /** Main method. Processes specified files or standard input. **/ public static void main(String[] argv) throws IOException, SAXException { int optind = getopts(options, argv); if (hasOption(options, "--help")) { doHelp(); return; } if (hasOption(options, "--version")) { System.err.println("TagSoup version 1.2"); return; } if (argv.length == optind) { process("", System.out); } else if (hasOption(options, "--files")) { for (int i = optind; i < argv.length; i++) { String src = argv[i]; String dst; int j = src.lastIndexOf('.'); if (j == -1) dst = src + ".xhtml"; else if (src.endsWith(".xhtml")) dst = src + "_"; else dst = src.substring(0, j) + ".xhtml"; System.err.println("src: " + src + " dst: " + dst); OutputStream os = new FileOutputStream(dst); process(src, os); } } else { for (int i = optind; i < argv.length; i++) { System.err.println("src: " + argv[i]); process(argv[i], System.out); } } } // Print the help message private static void doHelp() { System.err.print("usage: java -jar tagsoup-*.jar "); System.err.print(" [ "); boolean first = true; for (Enumeration e = options.keys(); e.hasMoreElements(); ) { if (!first) { System.err.print("| "); } first = false; String key = (String)(e.nextElement()); System.err.print(key); if (key.endsWith("=")) System.err.print("?"); System.err.print(" "); } System.err.println("]*"); } private static Parser theParser = null; private static HTMLSchema theSchema = null; private static String theOutputEncoding = null; // Process one source onto an output stream. private static void process(String src, OutputStream os) throws IOException, SAXException { XMLReader r; if (hasOption(options, "--reuse")) { if (theParser == null) theParser = new Parser(); r = theParser; } else { r = new Parser(); } theSchema = new HTMLSchema(); r.setProperty(Parser.schemaProperty, theSchema); if (hasOption(options, "--nocdata")) { r.setFeature(Parser.CDATAElementsFeature, false); } if (hasOption(options, "--nons") || hasOption(options, "--html")) { r.setFeature(Parser.namespacesFeature, false); } if (hasOption(options, "--nobogons")) { r.setFeature(Parser.ignoreBogonsFeature, true); } if (hasOption(options, "--any")) { r.setFeature(Parser.bogonsEmptyFeature, false); } else if (hasOption(options, "--emptybogons")) { r.setFeature(Parser.bogonsEmptyFeature, true); } if (hasOption(options, "--norootbogons")) { r.setFeature(Parser.rootBogonsFeature, false); } if (hasOption(options, "--nodefaults")) { r.setFeature(Parser.defaultAttributesFeature, false); } if (hasOption(options, "--nocolons")) { r.setFeature(Parser.translateColonsFeature, true); } if (hasOption(options, "--norestart")) { r.setFeature(Parser.restartElementsFeature, false); } if (hasOption(options, "--ignorable")) { r.setFeature(Parser.ignorableWhitespaceFeature, true); } if (hasOption(options, "--pyxin")) { r.setProperty(Parser.scannerProperty, new PYXScanner()); } Writer w; if (theOutputEncoding == null) { w = new OutputStreamWriter(os); } else { w = new OutputStreamWriter(os, theOutputEncoding); } ContentHandler h = chooseContentHandler(w); r.setContentHandler(h); if (hasOption(options, "--lexical") && h instanceof LexicalHandler) { r.setProperty(Parser.lexicalHandlerProperty, h); } InputSource s = new InputSource(); if (src != "") { s.setSystemId(src); } else { s.setByteStream(System.in); } if (hasOption(options, "--encoding=")) { // System.out.println("%% Found --encoding"); String encoding = (String)options.get("--encoding="); if (encoding != null) s.setEncoding(encoding); } r.parse(s); } // Pick a content handler to generate the desired format. private static ContentHandler chooseContentHandler(Writer w) { XMLWriter x; if (hasOption(options, "--pyx")) { return new PYXWriter(w); } x = new XMLWriter(w); if (hasOption(options, "--html")) { x.setOutputProperty(XMLWriter.METHOD, "html"); x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes"); } if (hasOption(options, "--method=")) { String method = (String)options.get("--method="); if (method != null) { x.setOutputProperty(XMLWriter.METHOD, method); } } if (hasOption(options, "--doctype-public=")) { String doctype_public = (String)options.get("--doctype-public="); if (doctype_public != null) { x.setOutputProperty(XMLWriter.DOCTYPE_PUBLIC, doctype_public); } } if (hasOption(options, "--doctype-system=")) { String doctype_system = (String)options.get("--doctype-system="); if (doctype_system != null) { x.setOutputProperty(XMLWriter.DOCTYPE_SYSTEM, doctype_system); } } if (hasOption(options, "--output-encoding=")) { theOutputEncoding = (String)options.get("--output-encoding="); // System.err.println("%%%% Output encoding is " + theOutputEncoding); if (theOutputEncoding != null) { x.setOutputProperty(XMLWriter.ENCODING, theOutputEncoding); } } if (hasOption(options, "--omit-xml-declaration")) { x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes"); } x.setPrefix(theSchema.getURI(), ""); return x; } // Options processing private static int getopts(Hashtable options, String[] argv) { int optind; for (optind = 0; optind < argv.length; optind++) { String arg = argv[optind]; String value = null; if (arg.charAt(0) != '-') break; int eqsign = arg.indexOf('='); if (eqsign != -1) { value = arg.substring(eqsign + 1, arg.length()); arg = arg.substring(0, eqsign + 1); } if (options.containsKey(arg)) { if (value == null) options.put(arg, Boolean.TRUE); else options.put(arg, value); // System.out.println("%% Parsed [" + arg + "]=[" + value + "]"); } else { System.err.print("Unknown option "); System.err.println(arg); System.exit(1); } } return optind; } // Return true if an option exists. private static boolean hasOption(Hashtable options, String option) { if (Boolean.getBoolean(option)) return true; else if (options.get(option) != Boolean.FALSE) return true; return false; } }