/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cocoon.slop.parsing; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.apache.cocoon.ProcessingException; import org.apache.cocoon.xml.XMLUtils; import org.apache.cocoon.slop.interfaces.SlopParser; import org.apache.cocoon.slop.interfaces.SlopConstants; /** * Simplistic SLOP parser, recognizes the following constructs: * * Field: a line starting with letters and : is considered a field * * Empty lines are detected. * Other lines are output as line elements * * This is sufficient for basic parsing of RFC 822 headers, * but a configurable rfc822 mode would be good to differentiate * between the header and body of the email message and parse them * with different rules. * * @author <a href="mailto:bdelacretaz@apache.org">Bertrand Delacretaz</a> * @version $Id$ */ public class SimpleSlopParser implements SlopParser,SlopConstants { private ContentHandler contentHandler; /** chars that can be part of a field name (other than letters) */ private final static String DEFAULT_TAGNAME_CHARS = "-_"; private String tagnameChars = DEFAULT_TAGNAME_CHARS; /** valid characters in an XML element name (in addition to letters and digits) */ final static String VALID_TAGNAME_CHARS = "_-"; final static String TAGNAME_REPLACEMENT_CHAR = "_"; /** optionally preserve whitespace in input */ private boolean preserveSpace = false; /** count lines */ private int lineCounter; /** result of parsing a line */ static class ParsedLine { final String name; final String contents; ParsedLine(String elementName, String elementContents) { name = filterElementName(elementName); contents = elementContents; } } /** make sure element names are valid XML */ static String filterElementName(String str) { final StringBuffer sb = new StringBuffer(); for(int i=0; i < str.length(); i++) { final char c = str.charAt(i); if(Character.isLetter(c)) { sb.append(c); } else if(Character.isDigit(c) && i > 0) { sb.append(c); } else if(VALID_TAGNAME_CHARS.indexOf(c) >= 0) { sb.append(c); } else { sb.append(TAGNAME_REPLACEMENT_CHAR); } } return sb.toString(); } /** set the list of valid chars for tag names (in addition to letters) */ public void setValidTagnameChars(String str) { tagnameChars = (str == null ? DEFAULT_TAGNAME_CHARS : str.trim()); } /** optionally preserve whitespace in input */ public void setPreserveWhitespace(boolean b) { preserveSpace = b; } /** must be called before any call to processLine() */ public void startDocument(ContentHandler destination) throws SAXException, ProcessingException { contentHandler = destination; contentHandler.startDocument(); contentHandler.startPrefixMapping("", SLOP_NAMESPACE_URI); contentHandler.startElement(SLOP_NAMESPACE_URI, SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT, XMLUtils.EMPTY_ATTRIBUTES); } /** must be called once all calls to processLine() are done */ public void endDocument() throws SAXException, ProcessingException { contentHandler.endElement(SLOP_NAMESPACE_URI, SLOP_ROOT_ELEMENT, SLOP_ROOT_ELEMENT); contentHandler.endPrefixMapping(""); contentHandler.endDocument(); contentHandler = null; } /** add simple name-value attribute to attr */ private void setAttribute(AttributesImpl attr,String name,String value) { final String ATTR_TYPE = "NMTOKEN"; attr.addAttribute("",name,name,ATTR_TYPE,value); } /** call this to process input lines, does the actual parsing */ public void processLine(String line) throws SAXException, ProcessingException { if(contentHandler == null) { throw new ProcessingException("SimpleSlopParser content handler is null (startDocument not called?)"); } // find out which element name to use, based on the contents of the line final ParsedLine p = parseLine(line); // generate the element and its contents lineCounter++; final AttributesImpl atts = new AttributesImpl(); setAttribute(atts,SLOP_ATTR_LINENUMBER,String.valueOf(lineCounter)); contentHandler.startElement(SLOP_NAMESPACE_URI, p.name, p.name, atts); contentHandler.characters(p.contents.toCharArray(),0,p.contents.length()); contentHandler.endElement(SLOP_NAMESPACE_URI, p.name, p.name); } /** parse a line, extract element name and contents */ protected ParsedLine parseLine(String line) { ParsedLine result = null; // empty lines if(line == null || line.trim().length()==0) { result = new ParsedLine(SLOP_EMPTY_LINE_ELEMENT,""); } // simple extraction of field names, lines starting with alpha chars followed // by a colon are parsed as follows: // // input: // field-name: this line is a field // output: // <field-name>this line is a field</field-name> if(result == null) { final int colonPos = line.indexOf(':'); if(colonPos > 0) { boolean fieldFound = true; for(int i=0; i < colonPos; i++) { final char c = line.charAt(i); final boolean isFieldChar = Character.isLetter(c) || tagnameChars.indexOf(c) >= 0; if(!isFieldChar) { fieldFound = false; break; } } if(fieldFound) { String contents = ""; if(line.length() > colonPos + 1) { final String str = line.substring(colonPos+1); contents = (preserveSpace ? str : str.trim()); } result = new ParsedLine(line.substring(0,colonPos),contents); } } } // default: output a line element if(result == null) { final String str = (preserveSpace ? line : line.trim()); result = new ParsedLine(SLOP_LINE_ELEMENT,str); } return result; } }