/*
* `gnu.iou'
* Copyright (C) 2006 John Pritchard.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
* 02111-1307 USA
*/
package gnu.iou ;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
/**
* SGML Lexer splits document text string into lines and columns
* delimited by tags.
*
* <p> <b>Static function</b>
*
* <p> The `parse' function returns a two dimensional array. The
* first dimension is lines, the second we'll sometimes call "columns"
* -- although each line can have an irregular number of columns (zero
* or more second dimension elements per line).
*
* <p> This is less than an SGML tokenizer, but a pre- tokenizer
* preserving lines, quoted attributes, and creating arrays within
* lines for tags and non- tags.
*
* <P> Each element of a line is checked for starting or ending with a
* less- than (<tt>"<b><</b>"</tt>) or greater- than
* (<tt>"<b>></b>"</tt>) character to see if it is a tag, or part
* of a tag. Tags can span multiple lines, so the opening less- than
* (LT) character may not be closed by a closing greater- than (GT)
* character till the next line.
*
* <p> The `parseValidate' function only returns a non- null result
* when the source text contains a <tt>"<b>< - ></b>"</tt> valid
* document possessing at least one <tt>"<b>< - ></b>"</tt> tag.
*
* @author John Pritchard (john@syntelos.org)
*/
public abstract class sgmlp {
private final static String[][] add ( String[][] src, String[] element){
if ( null == src)
src = new String[1][];
else {
String[][] copier = new String[src.length+1][];
System.arraycopy(src,0,copier,0,src.length);
src = copier;
}
src[src.length-1] = element;
return src;
}
/**
* Parse as a template, returning null if there are no template
* objects in the text.
*
* <p> "Server- side" parse validation goes into quotes.
*/
public final static String[][] parseValidateServer ( String text){
return parse(text,true,false);
}
/**
* Parse as a template, returning null if there are no template
* objects in the text.
*
* <p> "Client- side" parse validation ignores everything in quotes.
*/
public final static String[][] parseValidateClient ( String text){
return parse(text,true,true);
}
/**
* Parse text string into lines and columns. One column per line
* unless there are "%...%" tokens in the line, in which case the
* "%...%" tokens are separated.
*
* @param text Text string */
public final static String[][] parse ( String text){
return parse(text,false,true);
}
/**
* Parse as a template, returning null if there are no template
* objects in the text.
*
* <p> "Server- side" parse validation goes into quotes.
*/
public final static String[][] parseValidateServer ( InputStream in){
return parse(in,true,false);
}
/**
* Parse as a template, returning null if there are no template
* objects in the text.
*
* <p> "Client- side" parse validation ignores everything in quotes.
*/
public final static String[][] parseValidateClient ( InputStream in){
return parse(in,true,true);
}
/**
* Parse text string into lines and columns. One column per line
* unless there are "%...%" tokens in the line, in which case the
* "%...%" tokens are separated.
*
* @param text Text string */
public final static String[][] parse ( InputStream in){
return parse(in,false,true);
}
/**
*
* @param text Source with CRLF or LF newlines
*
* @param validate If source doesn't contain <tt><sgml
* tokens></tt>, return null.
*
* @param blindquotes If anything in quotes is blindly ignored.
*/
public final static String[][] parse ( String text, boolean validate, boolean blindquotes) {
StringTokenizer strtok = new StringTokenizer(text,"\r\n",true);
String[][] src = null;
int c = 0, cc = 0;
String s1;
try {
/* parse input into lines */
String[] tmp;
while (true){
s1 = strtok.nextToken();
if ( 1 == s1.length()){
switch(s1.charAt(0)){
case '\r':
break;
case '\n':
cc += 1;
if ( 2 <= cc)
src = add(src,null);
break;
default:
cc = 0;
tmp = new String[1];
tmp[0] = s1;
src = add(src,tmp);
break;
}
}
else {
cc = 0;
tmp = new String[1];
tmp[0] = s1;
src = add(src,tmp);
}
}
}
catch ( NoSuchElementException nsx){}
return _parse(src, validate, blindquotes);
}
/**
*
* @param in Source
*
* @param validate If source doesn't contain <tt><sgml
* tokens></tt>, return null.
*
* @param blindquotes If anything in quotes is blindly ignored.
*/
public final static String[][] parse ( InputStream in, boolean validate, boolean blindquotes) {
String src[][] = null;
try {
DataInputStream din;
if ( in instanceof DataInputStream)
din = (DataInputStream)in;
else
din = new DataInputStream (in);
String line, tmp[];
while ( null != (line = din.readLine())){
if ( 0 == line.length())
src = add(src,null);
else {
tmp = new String[1];
tmp[0] = line;
src = add(src,tmp);
}
}
return _parse( src, validate, blindquotes);
}
catch ( IOException iox){
iox.printStackTrace();
return null;
}
}
private final static String[][] _parse ( String[][] src, boolean validate, boolean blindquotes){
if ( null == src) return null;
// TODO (blindquotes)?(HTML-COMMENTS) [[for correct client side interp]]
/*
* Parse lines for tags and tagged- data
*/
char line[], quot = 0;
int llen, intag = 0, /* sgml_validity = 0, */ sgml_tagcount = 0, idx;
String[] linary;
for ( int c = 0, cc; c < src.length; c++){
linary = src[c];
if ( null != linary){
line = linary[0].toCharArray();
llen = line.length;
for ( cc = 0; cc < llen; cc++){
switch(line[cc]){
case '<':
if (blindquotes){
if ( 0 < quot)
break;
else if (0 < intag)
break; // '<' within '<'
}
// sgml_validity += 1;
intag += 1;
if ( 0 < cc)
src[c] = _parse_splitter(src[c],cc);
break;
case '>':
if ( blindquotes && 0 < quot)
break;
else if (0 < intag){
//sgml_validity -= 1;
sgml_tagcount += 1;
intag -= 1; // multiline tag
idx = cc+1;
if ( idx < llen)
src[c] = _parse_splitter( src[c], idx);
}
break;
case '\'':
case '`':
case '"':
if (blindquotes){
if (0 < intag){
if ( 0 == quot)
quot = line[cc];
else if ( quot == line[cc])
quot = 0;
}
else if ( 0 < quot && quot == line[cc])
quot = 0;
}
break;
default:
break;
}
}
}
}
if (validate){
if ( 0 < sgml_tagcount) /* && 0 == sgml_validity //JS comparison "<>" */
return src;
else
return null;
}
else
return src;
}
/**
* This doesn't check for appending a zero- length substring
*/
private final static String[] _parse_splitter( String[] line, int atidx){
int llen = line.length;
if ( 1 < llen){
for ( int clen = (llen-1), cc = 0; cc < clen; cc++)
atidx -= line[cc].length();
if ( 0 < atidx){
String s = line[llen-1];
String s0 = s.substring(0,atidx);
String s1 = s.substring(atidx);
String[] copier = new String[llen+1];
System.arraycopy( line, 0, copier, 0, llen);
copier[llen-1] = s0;
copier[llen] = s1;
return copier;
}
else
return line;
}
else if ( 0 < atidx){
String s = line[0];
String s0 = s.substring(0,atidx);
String s1 = s.substring(atidx);
line = new String[2];
line[0] = s0;
line[1] = s1;
return line;
}
else
return line;
}
/**
* Interned "="
*/
public final static String EQ = "=".intern();
/**
* Split a tag into tokens according to tag syntax, preserving
* quoted attribute values, stripping leading SGML tag "start" and
* "end" ('<', '>') characters. Does not require any
* particular elements of a tag, does not require a start or end
* character. Stops at an SGML tag end character that is not
* within a symmetrically quoted string.
*
* <p> Tag attributes are guaranteed as three tokens, as
* available: name string, equals character string, value string.
* This equals string is "interned" so that in processing the
* result, each element can be compared by value to a similarly
* interned string ("=".intern()) using the java equivalent value
* ("==") operator.
*
* <p> Returned tokens are all trim: no leading or trailing
* whitespace.
*
* @param tag A whole or part of an SGML tag. Ignores anything
* before the tag open character ('<'). */
public final static String[] tag_tokenizer ( String tag){
if ( null == tag)
return null;
else {
int len = tag.length();
if ( 0 >= len) return null;
linebuf buf = new linebuf();
char quot = 0, ch, cary[] = tag.toCharArray();
int mark = 0, tokl;
String tok;
for ( int c = 0; c < len; c++){
ch = cary[c];
switch(ch){
case '<':
if ( 0 == quot)
mark = c+1;
break;
case '>':
if ( 0 == quot){
tokl = c-mark;
if ( 0 < tokl){
tok = new String(cary,mark,tokl);
buf.append(tok);
}
return buf.toStringArray();
}
else
break;
case '=':
if ( 0 == quot){
tokl = c-mark;
if ( 0 < tokl){
tok = new String(cary,mark,tokl);
buf.append(tok);
}
buf.append(EQ); // "=".intern()
mark = c+1;
}
break;
case '\'':
case '`':
case '"':
if ( 0 == quot){
quot = ch;
mark = c;
}
else if ( ch == quot){
quot = 0;
tokl = c-mark;
if ( 0 < tokl){
tok = new String(cary,mark,tokl);
buf.append(tok);
}
mark = c+1;
}
break;
case ' ':
case '\t':
if ( 0 == quot){
tokl = c-mark;
if ( 0 < tokl){
tok = new String(cary,mark,tokl);
buf.append(tok);
}
mark = c+1;
}
break;
default:
break;
}
}
return buf.toStringArray();
}
}
/**
* Normalize an SGML tag attribute value, stripping symmetric
* quotes, returning null for empty strings.
*
* @param att_value Must not include leading or trailing
* whitespace, or otherwise be a string other than a bare tag
* attribute value. */
public final static String trim_value( String att_value){
if ( null == att_value)
return null;
else {
int len = att_value.length();
if ( 0 >= len) return null;
char ch = att_value.charAt(0);
switch(ch){
case '\'':
case '`':
case '"':
if ( ch == att_value.charAt(len-1))
return att_value.substring(1,len-2);
else
return att_value;
default:
return att_value;
}
}
}
private final static String TEXT_PREFIX = "{\\{\\{";
private final static String LINE_LEFT = "{\\{";
private final static String LINE_RIGHT = "}/}";
private final static String LINE_INFIX = "|,|";
private final static String TEXT_SUFFIX = "}/}/}";
private final static String[] helpary = {
null,
" Usage: sgmlp -f filename [ -s | -c ]",
null,
" Description",
null,
"\tDisplays parsed input file using tryglyph token delimiters.",
null,
"\tUses \""+TEXT_PREFIX+"\" before the text.",
null,
"\tUses \""+LINE_LEFT+"\" on the left hand side of a line.",
null,
"\tUses \""+LINE_INFIX+"\" among tokens within a line.",
null,
"\tUses \""+LINE_RIGHT+"\" on the right hand side of a line.",
null,
"\tUses \""+TEXT_SUFFIX+"\" after the text.",
null,
"\tIn this way, the tokenization of the text is readable.",
null,
" Options",
null,
"\t-s\t-- Use server side parsing model.",
"\t \t Enter quoted content.",
null,
"\t-c\t-- Use client side parsing model (default).",
"\t \t Ignore quoted content.",
null,
};
private final static String help = new linebuf(helpary).toString();
public static void main( String[] argv){
try {
File inf = null;
boolean validate_server = false, validate_client = false;
if ( null == argv || 1 > argv.length)
throw new IllegalArgumentException(help);
else {
int alen = argv.length;
String arg;
for ( int argc = 0; argc < alen; argc++){
arg = argv[argc];
if ( 2 > arg.length())
throw new IllegalArgumentException("Unrecognized argument `"+arg+"'");
else {
switch(arg.charAt(0)){
case '-':
case '/':
switch(arg.charAt(1)){
case 'h':
case 'H':
case '?':
throw new IllegalArgumentException(help);
case 'f':
argc += 1;
if ( argc < alen){
arg = argv[argc];
inf = new File(arg);
}
else
throw new IllegalArgumentException("Argument `-f' requires filename.");
break;
case 's':
validate_server = (!validate_server);
break;
case 'c':
validate_client = (!validate_client);
break;
default:
throw new IllegalArgumentException("Unrecognized argument `"+arg+"'");
}
break;
default:
throw new IllegalArgumentException("Unrecognized argument `"+arg+"'");
}
}
}
}
if ( null == inf)
throw new IllegalArgumentException(help);
else if (! (inf.exists() && inf.canRead()))
throw new IllegalArgumentException("Can't read file `"+inf+"'");
InputStream in = new FileInputStream(inf);
try {
String[][] pars = null;
if (validate_server)
pars = parseValidateServer(in);
else if (validate_client)
pars = parseValidateClient(in);
else
pars = parse(in);
if ( null != pars){
System.out.println(TEXT_PREFIX);
for ( int c = 0, cc; c < pars.length; c++){
System.out.print("\t"+LINE_LEFT);
for ( cc = 0; null != pars[c] && cc < pars[c].length; cc++){
if ( null != pars[c][cc])
System.out.print(pars[c][cc]);
if ( cc < pars[c].length-1)
System.out.print(LINE_INFIX);
}
System.out.println(LINE_RIGHT);
}
System.out.println(TEXT_SUFFIX);
}
else
System.out.println("Not SGML.");
}
finally {
in.close();
}
}
catch ( IllegalArgumentException ilx){
System.err.println(ilx.getMessage());
}
catch ( Exception exc){
exc.printStackTrace();
}
}
}