sgmlp.java example

Explorer
robonobo-master
 /* 
  *  `gnu.iou' 
  *  Copyright (C) 2006 John Pritchard.
  *
  *  This program is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU General Public License as
  *  published by the Free Software Foundation; either version 2 of
  *  the License, or (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  *  General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  *  02111-1307 USA
  */
package gnu.iou ;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

/**
 * SGML Lexer splits document text string into lines and columns
 * delimited by tags.  
 *
 * <p> <b>Static function</b>
 *
 * <p> The `parse' function returns a two dimensional array.  The
 * first dimension is lines, the second we'll sometimes call "columns"
 * -- although each line can have an irregular number of columns (zero
 * or more second dimension elements per line).
 *
 * <p> This is less than an SGML tokenizer, but a pre- tokenizer
 * preserving lines, quoted attributes, and creating arrays within
 * lines for tags and non- tags.  
 *
 * <p> Each element of a line is checked for starting or ending with a
 * less- than (<tt>"<b>&lt;</b>"</tt>) or greater- than
 * (<tt>"<b>&gt;</b>"</tt>) character to see if it is a tag, or part
 * of a tag.  Tags can span multiple lines, so the opening less- than
 * (LT) character may not be closed by a closing greater- than (GT)
 * character till the next line.
 *
 * <p> The `parseValidate' functions only return a non- null result
 * when the source text contains a <tt>"<b>&lt; - &gt;</b>"</tt> valid
 * document possessing at least one <tt>"<b>&lt; - &gt;</b>"</tt> tag.
 *
 * @author John Pritchard (john@syntelos.org)
 */
public abstract class sgmlp {

    /**
     * Append one line to the end of a table of lines.
     *
     * @param src Existing lines, or null when there are none yet
     *
     * @param element Columns of the new last line, may be null
     *
     * @return A newly allocated array, one longer than
     * <code>src</code>, holding the existing lines followed by
     * <code>element</code>
     */
    private final static String[][] add ( String[][] src, String[] element){
	int count = ( null == src)?(0):(src.length);

	String[][] grown = new String[count+1][];

	if ( 0 < count)
	    System.arraycopy(src,0,grown,0,count);

	grown[count] = element;

	return grown;
    }

    /**
     * Split template text into lines and tag columns using the
     * "server- side" model, in which quotes are not tracked and so
     * tag delimiters found inside quotes are recognized.
     *
     * @param text Template source text
     *
     * @return Result of {@link #parse(String,boolean,boolean)} with
     * validation on and blind quotes off; null when no tag was found
     */
    public final static String[][] parseValidateServer ( String text){

	final boolean validate = true, blindquotes = false;

	return parse(text,validate,blindquotes);
    }
    /**
     * Split template text into lines and tag columns using the
     * "client- side" model, in which tag delimiters found inside
     * quotes are ignored.
     *
     * @param text Template source text
     *
     * @return Result of {@link #parse(String,boolean,boolean)} with
     * validation on and blind quotes on; null when no tag was found
     */
    public final static String[][] parseValidateClient ( String text){

	final boolean validate = true, blindquotes = true;

	return parse(text,validate,blindquotes);
    }
    /**
     * Split a text string into lines and columns without validating
     * that it contains any tags.  A line has one column unless it
     * contains tags, in which case tags and the text around them are
     * separated into columns.  Quoted content is ignored.
     *
     * @param text Text string
     *
     * @return Result of {@link #parse(String,boolean,boolean)} with
     * validation off and blind quotes on
     */
    public final static String[][] parse ( String text){

	final boolean validate = false, blindquotes = true;

	return parse(text,validate,blindquotes);
    }
    /**
     * Read and split a template stream into lines and tag columns
     * using the "server- side" model, in which quotes are not tracked
     * and so tag delimiters found inside quotes are recognized.
     *
     * @param in Template source
     *
     * @return Result of {@link #parse(InputStream,boolean,boolean)}
     * with validation on and blind quotes off; null when no tag was
     * found
     */
    public final static String[][] parseValidateServer ( InputStream in){

	final boolean validate = true, blindquotes = false;

	return parse(in,validate,blindquotes);
    }
    /**
     * Read and split a template stream into lines and tag columns
     * using the "client- side" model, in which tag delimiters found
     * inside quotes are ignored.
     *
     * @param in Template source
     *
     * @return Result of {@link #parse(InputStream,boolean,boolean)}
     * with validation on and blind quotes on; null when no tag was
     * found
     */
    public final static String[][] parseValidateClient ( InputStream in){

	final boolean validate = true, blindquotes = true;

	return parse(in,validate,blindquotes);
    }
    /**
     * Read a stream and split it into lines and columns without
     * validating that it contains any tags.  A line has one column
     * unless it contains tags, in which case tags and the text around
     * them are separated into columns.  Quoted content is ignored.
     *
     * @param in Source stream
     *
     * @return Result of {@link #parse(InputStream,boolean,boolean)}
     * with validation off and blind quotes on
     */
    public final static String[][] parse ( InputStream in){

	final boolean validate = false, blindquotes = true;

	return parse(in,validate,blindquotes);
    }
    /**
     * Break a text string into lines, then split each line into tag
     * and non- tag columns.
     *
     * <p> Carriage returns are discarded and each line feed ends a
     * line.  A blank line is recorded as a null element of the first
     * dimension, including a blank line that opens the text, so that
     * line positions agree with the stream variant of this method.
     * A line feed that ends the text does not add a line.
     *
     * @param text Source with CRLF or LF newlines; null is treated
     * as having no content
     *
     * @param validate If source doesn't contain <tt>&lt;sgml
     * tokens&gt;</tt>, return null.
     *
     * @param blindquotes If anything in quotes is blindly ignored.
     *
     * @return Lines and their columns, or null when the text is null
     * or empty, or when validating and no tag was found
     */
    public final static String[][] parse ( String text, boolean validate, boolean blindquotes) {

	if ( null == text)
	    return null;

	StringTokenizer strtok = new StringTokenizer(text,"\r\n",true);

	String[][] src = null;

	// Line feeds seen since the last line of content.  Starts at
	// one, as if a line feed preceded the text, so that a line feed
	// opening the text is counted as a blank line.
	int newlines = 1;

	String tok;

	while (strtok.hasMoreTokens()){

	    tok = strtok.nextToken();

	    if ( 1 == tok.length() && '\r' == tok.charAt(0)){
		// CR of a CRLF pair: neither content nor a line end
		continue;
	    }
	    else if ( 1 == tok.length() && '\n' == tok.charAt(0)){

		newlines += 1;

		// the first line feed after content only ends that
		// line, each further one is a blank line
		if ( 2 <= newlines)
		    src = add(src,null);
	    }
	    else {
		newlines = 0;

		src = add(src,new String[]{tok});
	    }
	}

	return _parse(src, validate, blindquotes);
    }

    /**
     * Read a stream line by line, then split each line into tag and
     * non- tag columns.
     *
     * <p> Each byte is decoded to the character of the same value
     * (ISO-8859-1), which is the conversion performed by the
     * deprecated <code>DataInputStream.readLine</code> that this
     * method formerly used.  The stream is read to its end and is
     * not closed here.
     *
     * @param in Source; null is treated as having no content
     *
     * @param validate If source doesn't contain <tt>&lt;sgml
     * tokens&gt;</tt>, return null.
     *
     * @param blindquotes If anything in quotes is blindly ignored.
     *
     * @return Lines and their columns with blank lines as null
     * elements, or null when the stream is null or empty, when
     * validating and no tag was found, or when reading failed
     */
    public final static String[][] parse ( InputStream in, boolean validate, boolean blindquotes) {

	if ( null == in)
	    return null;

	String src[][] = null;

	try {
	    BufferedReader reader = new BufferedReader(new InputStreamReader(in,"ISO-8859-1"));

	    String line;

	    while ( null != (line = reader.readLine())){

		if ( 0 == line.length())

		    src = add(src,null);
		else
		    src = add(src,new String[]{line});
	    }
	    return _parse( src, validate, blindquotes);
	}
	catch ( IOException iox){
	    // The signature declares no checked exception: report the
	    // failure and signal it to the caller with null.
	    iox.printStackTrace();

	    return null;
	}
    }

    /**
     * Split every line at its tag boundaries.
     *
     * <p> Each non- null line arrives with its whole text in column
     * zero.  That text is scanned once, and the line is split before
     * a tag opening '&lt;' (unless it starts the line) and after a
     * tag closing '&gt;' (unless it ends the line).  The open tag
     * count and the quote state are carried from one line to the
     * next, so a tag opened on one line may be closed on a later one.
     *
     * @param src Lines to split, null elements are blank lines; its
     * elements are replaced by their split forms
     *
     * @param validate Return null unless at least one tag was closed
     *
     * @param blindquotes Track quotes opened inside a tag and ignore
     * tag delimiters within them, and ignore a '&lt;' inside an open
     * tag
     *
     * @return The <code>src</code> array, or null when it is null or
     * when validating and no tag was closed
     */
    private final static String[][] _parse ( String[][] src, boolean validate, boolean blindquotes){

	if ( null == src) return null;

	// TODO (blindquotes)?(HTML-COMMENTS) [[for correct client side interp]]

	char quote = 0; // open quote character, zero when none

	int depth = 0;  // tags opened and not yet closed

	int tags = 0;   // tags closed so far

	for ( int row = 0; row < src.length; row++){

	    if ( null == src[row])
		continue;

	    // Scan the unsplit text: the splitter takes positions
	    // relative to the start of the whole line.
	    final char[] chars = src[row][0].toCharArray();

	    final int count = chars.length;

	    for ( int col = 0; col < count; col++){

		final char ch = chars[col];

		if ( '<' == ch){

		    // quoted, or '<' within '<'
		    boolean ignore = ( blindquotes && ( 0 < quote || 0 < depth));

		    if (!ignore){

			depth += 1;

			if ( 0 < col)
			    src[row] = _parse_splitter(src[row],col);
		    }
		}
		else if ( '>' == ch){

		    boolean quoted = ( blindquotes && 0 < quote);

		    if ( (!quoted) && 0 < depth){

			tags += 1;

			depth -= 1; // multiline tag

			if ( col+1 < count)
			    src[row] = _parse_splitter(src[row],col+1);
		    }
		}
		else if ( blindquotes && ( '\'' == ch || '`' == ch || '"' == ch)){

		    if ( quote == ch)
			quote = 0; // closes the open quote, in or out of a tag

		    else if ( 0 == quote && 0 < depth)
			quote = ch; // quotes only open inside a tag
		}
	    }
	}

	/* JS comparison "<>" is why tag symmetry is not checked */

	return ( (!validate) || 0 < tags)?(src):(null);
    }



    /**
     * Split the last column of a line in two at a position counted
     * from the start of the whole line.
     *
     * <p> The lengths of all columns before the last are subtracted
     * from the position.  When what remains is zero or less the
     * position is at or before the start of the last column, and the
     * line is returned as it is.
     *
     * <p> This doesn't check for appending a zero- length substring.
     *
     * @param line Columns of one line
     *
     * @param atidx Split position within the concatenated columns
     *
     * @return The same array when no split applies, otherwise a new
     * array one column longer
     */
    private final static String[] _parse_splitter( String[] line, int atidx){

	final int last = line.length-1;

	int offset = atidx;

	for ( int col = 0; col < last; col++)
	    offset -= line[col].length();

	if ( 0 >= offset)
	    return line;

	final String tail = line[last];

	final String head = tail.substring(0,offset);
	final String rest = tail.substring(offset);

	String[] split = new String[line.length+1];

	System.arraycopy( line, 0, split, 0, last);

	split[last]   = head;
	split[last+1] = rest;

	return split;
    }



    /**
     * Interned "=" string.  This is the object that {@link
     * #tag_tokenizer} appends for each unquoted equals character; it
     * is intended to let a caller recognize that token by comparing
     * references ("==") with a similarly interned string.
     */
    public final static String EQ = "=".intern();

    /**
     * Split a tag into tokens according to tag syntax, preserving
     * quoted attribute values, stripping leading SGML tag "start" and
     * "end" ('&lt;', '&gt;') characters.  Does not require any
     * particular elements of a tag, does not require a start or end
     * character.  Stops at an SGML tag end character that is not
     * within a symmetrically quoted string.
     *
     * <p> Tag attributes are guaranteed as three tokens, as
     * available: name string, equals character string, value string.
     * The equals token is always the {@link #EQ} object, so that in
     * processing the result it can be recognized with the reference
     * comparison ("==") operator.
     *
     * <p> A quoted value is one token that keeps both its opening and
     * its closing quote character; see {@link #trim_value} to remove
     * them.  Unquoted tokens are delimited by space and tab and have
     * no leading or trailing whitespace.  When the input ends without
     * a tag end character, the text remaining after the last
     * delimiter is the last token.
     *
     * @param tag A whole or part of an SGML tag.  Ignores anything
     * before the tag open character ('&lt;').
     *
     * @return Tokens as produced by the line buffer, or null for a
     * null or empty argument
     */
    public final static String[] tag_tokenizer ( String tag){
	if ( null == tag)
	    return null;

	int len = tag.length();

	if ( 0 >= len) return null;

	linebuf buf = new linebuf();

	char quot = 0, ch, cary[] = tag.toCharArray();

	// start of the token being collected
	int mark = 0;

	for ( int c = 0; c < len; c++){

	    ch = cary[c];

	    switch(ch){

	    case '<':

		if ( 0 == quot)

		    mark = c+1;

		break;

	    case '>':

		if ( 0 == quot){

		    if ( mark < c)
			buf.append(new String(cary,mark,c-mark));

		    return buf.toStringArray();
		}
		break;

	    case '=':

		if ( 0 == quot){

		    if ( mark < c)
			buf.append(new String(cary,mark,c-mark));

		    buf.append(EQ); // "=".intern()

		    mark = c+1;
		}
		break;

	    case '\'':
	    case '`':
	    case '"':

		if ( 0 == quot){

		    quot = ch;

		    mark = c;
		}
		else if ( ch == quot){

		    quot = 0;

		    // from the opening quote at mark through the
		    // closing quote at c, both included
		    buf.append(new String(cary,mark,(c-mark)+1));

		    mark = c+1;
		}
		break;

	    case ' ':
	    case '\t':

		if ( 0 == quot){

		    if ( mark < c)
			buf.append(new String(cary,mark,c-mark));

		    mark = c+1;
		}
		break;

	    default:
		break;
	    }
	}

	// No tag end character: keep what follows the last delimiter
	// rather than dropping it.
	if ( mark < len)
	    buf.append(new String(cary,mark,len-mark));

	return buf.toStringArray();
    }


    /**
     * Normalize an SGML tag attribute value, stripping symmetric
     * quotes, returning null for empty strings.
     *
     * <p> Quotes are stripped only when the value has at least two
     * characters and its first and last characters are the same quote
     * character (single quote, back quote or double quote).  A value
     * that is nothing but a pair of quotes is empty and gives null.
     * Any other value, including a lone quote character, is returned
     * unchanged.
     *
     * @param att_value Must not include leading or trailing
     * whitespace, or otherwise be a string other than a bare tag
     * attribute value.
     *
     * @return The value without its enclosing quotes, the value
     * itself when it is not symmetrically quoted, or null when the
     * argument is null, empty, or an empty quoted string
     */
    public final static String trim_value( String att_value){
	if ( null == att_value)
	    return null;

	int len = att_value.length();

	if ( 0 >= len) return null;

	char ch = att_value.charAt(0);

	switch(ch){

	case '\'':
	case '`':
	case '"':

	    // a single character is its own first and last, not a pair
	    if ( 2 <= len && ch == att_value.charAt(len-1)){

		if ( 2 == len)
		    return null;
		else
		    // end index is exclusive: drop only the last quote
		    return att_value.substring(1,len-1);
	    }
	    else
		return att_value;

	default:
	    return att_value;
	}
    }


    /** Printed by `main' on a line of its own before the parsed text. */
    private final static String TEXT_PREFIX = "{\\{\\{";

    /** Printed by `main' after a tab at the start of each parsed line. */
    private final static String LINE_LEFT = "{\\{";

    /** Printed by `main' at the end of each parsed line. */
    private final static String LINE_RIGHT = "}/}";

    /** Printed by `main' between the columns of a parsed line. */
    private final static String LINE_INFIX = "|,|";

    /** Printed by `main' on a line of its own after the parsed text. */
    private final static String TEXT_SUFFIX = "}/}/}";

    /**
     * Lines of the command line usage message.  The null elements
     * are presumably rendered as blank lines by the line buffer that
     * joins them -- confirm against `linebuf'.
     */
    private final static String[] helpary = {
	null,
	" Usage: sgmlp -f filename [ -s | -c ]",
	null,
	" Description",
	null,
	"\tDisplays parsed input file using tryglyph token delimiters.",
	null,
	"\tUses \""+TEXT_PREFIX+"\" before the text.",
	null,
	"\tUses \""+LINE_LEFT+"\" on the left hand side of a line.",
	null,
	"\tUses \""+LINE_INFIX+"\" among tokens within a line.",
	null,
	"\tUses \""+LINE_RIGHT+"\" on the right hand side of a line.",
	null,
	"\tUses \""+TEXT_SUFFIX+"\" after the text.",
	null,
	"\tIn this way, the tokenization of the text is readable.",
	null,
	" Options",
	null,
	"\t-s\t-- Use server side parsing model.",
	"\t  \t   Enter quoted content.",
	null,
	"\t-c\t-- Use client side parsing model (default).",
	"\t  \t   Ignore quoted content.",
	null,
    };

    /**
     * Usage message built once from `helpary'; `main' uses it as the
     * message of the exception it throws for a help request, for a
     * missing argument list, and for a missing `-f' option.
     */
    private final static String help = new linebuf(helpary).toString();

    /**
     * Command line tool: parse a file and print its lines and columns
     * between visible delimiters.
     *
     * <p> Options start with '-' or '/' and are recognized by their
     * second character only: `h', `H' or `?' print the usage message,
     * `f' takes the following argument as the input file name, `s'
     * toggles server side validation and `c' toggles client side
     * validation.  With neither validation the file is parsed without
     * validation.  Usage and argument errors are printed to standard
     * error; any other exception has its stack trace printed.
     *
     * @param argv Command line arguments
     */
    public static void main( String[] argv){
	try {
	    if ( null == argv || 0 == argv.length)
		throw new IllegalArgumentException(help);

	    File source = null;

	    boolean server = false, client = false;

	    for ( int idx = 0; idx < argv.length; idx++){

		String opt = argv[idx];

		boolean dashed = ( 2 <= opt.length() && ( '-' == opt.charAt(0) || '/' == opt.charAt(0)));

		if (!dashed)
		    throw new IllegalArgumentException("Unrecognized argument `"+opt+"'");

		switch(opt.charAt(1)){

		case 'h':
		case 'H':
		case '?':
		    throw new IllegalArgumentException(help);

		case 'f':
		    // the file name is the next argument
		    idx += 1;

		    if ( idx >= argv.length)
			throw new IllegalArgumentException("Argument `-f' requires filename.");

		    source = new File(argv[idx]);
		    break;

		case 's':
		    server = (!server);
		    break;

		case 'c':
		    client = (!client);
		    break;

		default:
		    throw new IllegalArgumentException("Unrecognized argument `"+opt+"'");
		}
	    }

	    if ( null == source)
		throw new IllegalArgumentException(help);

	    if ( (!source.exists()) || (!source.canRead()))
		throw new IllegalArgumentException("Can't read file `"+source+"'");

	    InputStream in = new FileInputStream(source);

	    try {
		String[][] lines;

		// server side validation wins when both are set
		if (server)
		    lines = parseValidateServer(in);
		else if (client)
		    lines = parseValidateClient(in);
		else
		    lines = parse(in);

		if ( null == lines)
		    System.out.println("Not SGML.");
		else {
		    System.out.println(TEXT_PREFIX);

		    for ( int row = 0; row < lines.length; row++){

			String[] cols = lines[row];

			System.out.print("\t"+LINE_LEFT);

			if ( null != cols){

			    int lastcol = cols.length-1;

			    for ( int col = 0; col <= lastcol; col++){

				if ( null != cols[col])
				    System.out.print(cols[col]);

				if ( col < lastcol)
				    System.out.print(LINE_INFIX);
			    }
			}
			System.out.println(LINE_RIGHT);
		    }

		    System.out.println(TEXT_SUFFIX);
		}
	    }
	    finally {
		in.close();
	    }
	}
	catch ( IllegalArgumentException usage){
	    System.err.println(usage.getMessage());
	}
	catch ( Exception other){
	    other.printStackTrace();
	}
    }

}