CSVGenerator.java example

Explorer
cocoon-master
- cocoon-BRANCH_2_1_X
  - src
  - tools
    - src
      - anttasks
        DocumentCache.java
        ManifestToolTask.java
        PoolSetterTask.java
        SitemapTask.java
        XConfToolTask.java
      - loader
        Loader.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.generation;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import org.apache.avalon.framework.parameters.Parameters;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.excalibur.source.Source;
import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * <p>A simple parser converting a Comma Separated Values (CSV) file into XML.</p>
 * 
 * <p>This parser is controlled by the following sitemap parameters:</p>
 * 
 * <ul>
 *   <li>
 *     <b>process-headers</b>: whether the first line in the CSV is considered
 *     to be the header defining column names; the resulting output will be
 *     different depending on whether this is <i>true</i> or <i>false</i>
 *     (default: <i>false</i>).
 *   </li>
 *   <li>
 *     <b>max-records</b>: the maximum number of records to read
 *     (default: <i>-1</i> read all records).
 *   </li>
 *   <li>
 *     <b>encoding</b>: the character encoding (UTF-8, ISO8859-1, ...) used to
 *     interpret the input CSV source file (default: <i>system default</i>).
 *   </li>
 *   <li>
 *     <b>separator</b>: the field-separator character in the CSV file (comma,
 *     tab, ...) (default: <i>,</i> <small>comma</small>).
 *   </li>
 *   <li>
 *     <b>escape</b>: the character used to escape fields, or part of them, in
 *     the CSV file (default: <i>"</i> <small>quote</small>).
 *   </li>
 *   <li>
 *     <b>buffer-size</b>: the size of the buffer used for reading the source
 *     CSV file (default: <i>4096 bytes</i>).
 *   </li>
 * </ul>
 *
 * <p>The generated output will look something like the following:</p>
 * 
 * <pre>
 * &lt;?xml version="1.0" encoding="ISO-8859-1"?&gt;
 * &lt;csv:document xmlns:csv="http://apache.org/cocoon/csv/1.0"&gt;
 *   &lt;csv:header&gt;
 *     &lt;csv:column number="1"&gt;Column A&lt;/csv:column&gt;
 *     &lt;csv:column number="2"&gt;Column B&lt;/csv:column&gt;
 *     &lt;csv:column number="3"&gt;Column C&lt;/csv:column&gt;
 *   &lt;/csv:header&gt;
 *   &lt;csv:record number="1"&gt;
 *     &lt;csv:field number="1" column="Column A"&gt;Field A1&lt;/csv:field&gt;
 *     &lt;csv:field number="2" column="Column B"&gt;Field B1&lt;/csv:field&gt;
 *     &lt;csv:field number="3" column="Column C"&gt;Field C1&lt;/csv:field&gt;
 *   &lt;/csv:record&gt;
 *   &lt;csv:record number="2"&gt;
 *     &lt;csv:field number="1" column="Column A"&gt;Field A2&lt;/csv:field&gt;
 *     &lt;csv:field number="2" column="Column B"&gt;Field B2&lt;/csv:field&gt;
 *     &lt;csv:field number="3" column="Column C"&gt;Field C2&lt;/csv:field&gt;
 *   &lt;/csv:record&gt;
 * &lt;/csv:document&gt;
 * </pre>
 *
 * <p>Note that this generator has been thoroughly tested with CSV files generated
 * by <a href="http://office.microsoft.com/" target="_new">Microsoft Excel</a>.
 * Unfortunately no official CSV specification has ever been published by
 * any standards body, so the interpretation of the format might differ
 * slightly in some cases.</p>
 *
 * @author <a href="mailto:pier@apache.org">Pier Fumagalli</a>
 */
public class CSVGenerator extends FileGenerator {

    /** <p>The namespace URI of XML generated by this instance.</p> */
    public static final String NAMESPACE_URI = "http://apache.org/cocoon/csv/1.0";
    /** <p>The namespace prefix of XML generated by this instance.</p> */
    public static final String NAMESPACE_PREFIX = "csv";

    /**
     * <p>The default encoding configured in the Java VM, obtained by asking an
     * {@link InputStreamReader} created without an explicit charset for the
     * name of the encoding it uses.</p>
     */
    private static final String DEFAULT_ENCODING = 
        new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
    /** <p>The default field separator character (only the first character is used).</p> */
    private static final String DEFAULT_SEPARATOR = ",";
    /** <p>The default escape (quote) character (only the first character is used).</p> */
    private static final String DEFAULT_ESCAPE = "\"";
    /** <p>The default size of the buffer used for reading the source.</p> */
    private static final int DEFAULT_BUFFER_SIZE = 4096;
    /** <p>The value of <code>max-records</code> meaning "read all records".</p> */
    private static final int UNLIMITED_MAXRECORDS = -1;
    /**
     * <p>A newline followed by spaces, used for indenting: the indentation
     * helper emits a leading portion of this array.</p>
     */
    private static final char INDENT_STRING[] = "\n          ".toCharArray();

    /** <p>The encoding used to read the CSV resource from a stream.</p> */
    private String encoding = DEFAULT_ENCODING;
    /** <p>The character used to separate fields.</p> */
    private char separator = DEFAULT_SEPARATOR.charAt(0);
    /** <p>The character used to initiate and terminate escaped sequences.</p> */
    private char escape = DEFAULT_ESCAPE.charAt(0);
    /** <p>The size of the buffer used to read the input.</p> */
    private int buffersize = DEFAULT_BUFFER_SIZE;
    /** <p>The current field (column) number in the current record (starts at 1).</p> */
    private int fieldnumber = 1;
    /**
     * <p>The current record (line) number in the current CSV; a value lower
     * than 1 means the header line is being processed.</p>
     */
    private int recordnumber = 1;
    /** <p>The maximum number of records to read (-1 = read all records)</p> */
    private int maxrecords;
    /** <p>A flag indicating whether the &lt;record&gt; (or &lt;header&gt;) tag was opened.</p> */
    private boolean openrecord = false;
    /** <p>The character buffer for the current field.</p> */
    private CharArrayWriter buffer = null;
    /**
     * <p>A map of all known columns (field number as {@link Integer} to column
     * name as {@link String}) or null if no headers are processed.</p>
     */
    private Map columns = null;

    /**
     * <p>Create a new {@link CSVGenerator} instance.</p>
     */
    public CSVGenerator() {
        super();
    }

    /**
     * <p>Recycle this component, restoring every per-request setting and all
     * parsing state to their defaults.</p>
     */
    public void recycle() {
        super.recycle();

        /* Configuration read from the sitemap parameters */
        this.encoding = DEFAULT_ENCODING;
        this.separator = DEFAULT_SEPARATOR.charAt(0);
        this.escape = DEFAULT_ESCAPE.charAt(0);
        this.buffersize = DEFAULT_BUFFER_SIZE;
        /* Previously left untouched, so the limit of the last request lingered */
        this.maxrecords = UNLIMITED_MAXRECORDS;

        /* Parsing state */
        this.buffer = null;
        this.columns = null;
        this.recordnumber = 1;
        this.fieldnumber = 1;
        this.openrecord = false;
    }

    /**
     * <p>Setup this {@link CSVGenerator} instance.</p>
     *
     * <p>After delegating to the superclass, the <code>process-headers</code>,
     * <code>encoding</code>, <code>separator</code>, <code>escape</code>,
     * <code>buffer-size</code> and <code>max-records</code> parameters are
     * read and the parsing state is reset. An empty <code>encoding</code>,
     * <code>separator</code> or <code>escape</code> value, or a
     * <code>buffer-size</code> lower than one, is replaced by the default
     * instead of failing later with an obscure runtime exception.</p>
     *
     * @param resolver the source resolver, handed to the superclass.
     * @param object_model the object model, handed to the superclass.
     * @param source the source to generate from, handed to the superclass.
     * @param parameters the sitemap parameters controlling this generator.
     * @throws ProcessingException propagated from the superclass setup.
     * @throws SAXException propagated from the superclass setup.
     * @throws IOException propagated from the superclass setup.
     */
    public void setup(SourceResolver resolver, Map object_model, String source,
                      Parameters parameters)
    throws ProcessingException, SAXException, IOException {
        super.setup(resolver, object_model, source, parameters);

        boolean header = parameters.getParameterAsBoolean("process-headers", false);

        /* Empty values would break charAt(0) or the reader: use the defaults */
        String encodingValue = parameters.getParameter("encoding", DEFAULT_ENCODING);
        if (encodingValue.length() == 0) encodingValue = DEFAULT_ENCODING;

        String separatorValue = parameters.getParameter("separator", DEFAULT_SEPARATOR);
        if (separatorValue.length() == 0) separatorValue = DEFAULT_SEPARATOR;

        String escapeValue = parameters.getParameter("escape", DEFAULT_ESCAPE);
        if (escapeValue.length() == 0) escapeValue = DEFAULT_ESCAPE;

        /* A BufferedReader can not be created with a size lower than one */
        int size = parameters.getParameterAsInteger("buffer-size", DEFAULT_BUFFER_SIZE);
        if (size < 1) size = DEFAULT_BUFFER_SIZE;

        this.encoding = encodingValue;
        this.separator = separatorValue.charAt(0);
        this.escape = escapeValue.charAt(0);
        this.buffersize = size;
        this.maxrecords = parameters.getParameterAsInteger("max-records", UNLIMITED_MAXRECORDS);
        this.buffer = new CharArrayWriter();
        this.columns =  (header ? new HashMap() : null);
        /* Record number zero identifies the header line */
        this.recordnumber = (header ? 0 : 1);
        this.fieldnumber = 1;
        this.openrecord = false;
    }

    /**
     * <p>Generate the unique key.</p>
     *
     * <p>The key is built from the source URI and from every setting that
     * changes the generated XML: header processing, separator, maximum number
     * of records, escape character and encoding. It is returned as a
     * {@link String} because a {@link StringBuffer} does not override
     * <code>equals()</code> and <code>hashCode()</code>, so two keys built for
     * the same request would never compare equal.</p>
     *
     * @return the key as a {@link String}.
     */
    public Serializable getKey() {
        StringBuffer key = new StringBuffer(this.inputSource.getURI());
        if (this.columns != null) key.append("headers");
        key.append(separator);
        key.append(maxrecords);
        key.append(escape);
        /* The same bytes decoded with another encoding give another document */
        key.append(encoding);
        return key.toString();
    }

    /**
     * <p>Generate XML data from a Comma Separated Value resource.</p>
     *
     * <p>The source is read one character at a time: escape characters toggle
     * the quoted state, unquoted separators terminate a field, unquoted line
     * terminators terminate a record, and anything else is accumulated in the
     * field buffer.</p>
     */
    public void generate()
    throws IOException, SAXException, ProcessingException {

        /* The reader decodes the source stream and doubles as SAX locator */
        final CSVReader reader = new CSVReader(this.inputSource, this.encoding, this.buffersize);

        try {
            /* Open the document and its root element */
            this.contentHandler.setDocumentLocator(reader);
            this.contentHandler.startDocument();
            this.contentHandler.startPrefixMapping(NAMESPACE_PREFIX, NAMESPACE_URI);
            this.indent(0);
            this.startElement("document");

            /* Parser status: inside a quoted section, and the last character */
            boolean quoted = false;
            int previous = -1;

            for (int current = reader.read(); current >= 0; current = reader.read()) {

                /* Stop as soon as the configured number of records was read */
                if ((this.maxrecords != UNLIMITED_MAXRECORDS)
                        && (this.recordnumber > this.maxrecords)) {
                    break;
                }

                if (current == this.escape) {
                    /* Two escapes in a row outside quotes: a literal escape */
                    if ((! quoted) && (previous == this.escape)) {
                        this.buffer.write(this.escape);
                    }
                    quoted = ! quoted;

                } else if ((! quoted) && (current == this.separator)) {
                    /* End of the current field */
                    this.dumpField();

                } else if ((! quoted) && ((current == '\r') || (current == '\n'))) {
                    /* End of the current record */
                    this.dumpField();
                    this.dumpRecord();

                    /* The LF of a CR+LF pair was already counted with the CR */
                    boolean secondHalfOfCrLf = (current == '\n') && (previous == '\r');
                    if (! secondHalfOfCrLf) {
                        this.recordnumber ++;
                    }

                } else {
                    /* Plain content of the current field */
                    this.buffer.write(current);
                }

                previous = current;
            }

            /* Flush whatever was left open by a missing final line terminator */
            this.dumpField();
            this.dumpRecord();

            /* Close the root element and the document */
            this.indent(0);
            this.endElement("document");
            this.contentHandler.endPrefixMapping(NAMESPACE_PREFIX);
            this.contentHandler.endDocument();

        } finally {
            reader.close();
        }
    }

    
    /**
     * <p>Emit the buffered field as a <code>column</code> (header line) or
     * <code>field</code> element, opening the enclosing <code>header</code> or
     * <code>record</code> element first when needed.</p>
     *
     * <p>An empty buffer produces no element, but the field number is
     * incremented in either case.</p>
     */
    private void dumpField()
    throws SAXException {
        if (this.buffer.size() >= 1) {
            final boolean inHeader = (this.recordnumber < 1);

            /* Lazily open the element enclosing the fields of this line */
            if (! this.openrecord) {
                this.indent(4);
                if (inHeader) {
                    this.startElement("header");
                } else {
                    AttributesImpl recordAttributes = new AttributesImpl();
                    recordAttributes.addAttribute("", "number", "number", "CDATA",
                                                  Integer.toString(this.recordnumber));
                    this.startElement("record", recordAttributes);
                }
                this.openrecord = true;
            }

            final char content[] = this.buffer.toCharArray();
            this.indent(8);

            AttributesImpl fieldAttributes = new AttributesImpl();
            fieldAttributes.addAttribute("", "number", "number", "CDATA",
                                         Integer.toString(this.fieldnumber));

            /* Header fields define column names, record fields refer to them */
            final String tag;
            if (inHeader) {
                this.columns.put(new Integer(this.fieldnumber), new String(content));
                tag = "column";
            } else {
                tag = "field";
                if (this.columns != null) {
                    String name = (String) this.columns.get(new Integer(this.fieldnumber));
                    if (name != null) {
                        fieldAttributes.addAttribute("", "column", "column", "CDATA", name);
                    }
                }
            }

            this.startElement(tag, fieldAttributes);
            this.contentHandler.characters(content, 0, content.length);
            this.endElement(tag);
            this.buffer.reset();
        }

        this.fieldnumber ++;
    }

    /**
     * <p>Close the <code>record</code> (or <code>header</code>) element if one
     * is open, and restart field numbering from one.</p>
     */
    private void dumpRecord()
    throws SAXException {
        if (this.openrecord) {
            this.indent(4);
            /* Record numbers lower than one identify the header line */
            this.endElement((this.recordnumber > 0) ? "record" : "header");
            this.openrecord = false;
        }
        this.fieldnumber = 1;
    }

    /**
     * <p>Emit the first <code>level + 1</code> characters of the indent
     * string as character data.</p>
     *
     * @param level the number of characters to emit after the first one.
     */
    private void indent(int level)
    throws SAXException {
        final int length = 1 + level;
        this.contentHandler.characters(INDENT_STRING, 0, length);
    }

    /**
     * <p>Start an element without attributes in the CSV namespace.</p>
     *
     * @param name the local name of the element.
     */
    private void startElement(String name)
    throws SAXException {
        final AttributesImpl none = new AttributesImpl();
        this.startElement(name, none);
    }

    /**
     * <p>Start an element in the CSV namespace.</p>
     *
     * @param name the local name of the element.
     * @param atts the attributes of the element, or null for none.
     * @throws NullPointerException if the name is null.
     */
    private void startElement(String name, Attributes atts)
    throws SAXException {
        if (name == null) {
            throw new NullPointerException("Null name");
        }
        final Attributes attributes = (atts != null) ? atts : new AttributesImpl();
        this.contentHandler.startElement(NAMESPACE_URI, name,
                                         NAMESPACE_PREFIX + ':' + name, attributes);
    }

    /**
     * <p>End an element in the CSV namespace.</p>
     *
     * @param name the local name of the element.
     */
    private void endElement(String name)
    throws SAXException {
        this.contentHandler.endElement(NAMESPACE_URI, name,
                                       NAMESPACE_PREFIX + ':' + name);
    }

    /**
     * <p>A buffered {@link Reader} over a {@link Source} which keeps track of
     * the current line and column, so that it can be used as the SAX
     * {@link Locator} of the generated document.</p>
     */
    private static final class CSVReader extends Reader implements Locator {
        
        /** <p>The URI of the source, reported as the system identifier.</p> */
        private String uri = null;
        /** <p>The buffered, decoding reader all characters are read from.</p> */
        private Reader input = null;
        /** <p>The one-based column of the next character to be read.</p> */
        private int column = 1;
        /** <p>The one-based line of the next character to be read.</p> */
        private int line = 1;
        /** <p>The last character read, or -1 if nothing was read yet.</p> */
        private int last = -1;

        /**
         * <p>Create a new reader decoding the stream of the given source.</p>
         *
         * @param source the source to read from.
         * @param encoding the name of the encoding used to decode the stream.
         * @param buffer the size of the read buffer.
         * @throws IOException if the stream can not be opened or the encoding
         *                     is not supported.
         */
        private CSVReader(Source source, String encoding, int buffer)
        throws IOException {
            InputStream stream = source.getInputStream();
            boolean wrapped = false;
            try {
                Reader reader = new InputStreamReader(stream, encoding);
                this.input = new BufferedReader(reader, buffer);
                this.uri = source.getURI();
                wrapped = true;
            } finally {
                /* Don't leak the stream when it could not be wrapped */
                if ((! wrapped) && (stream != null)) {
                    try {
                        stream.close();
                    } catch (IOException ignored) {
                        /* The failure being propagated is the relevant one */
                    }
                }
            }
        }

        /**
         * @return always null, as no public identifier is available.
         */
        public String getPublicId() {
            return null;
        }

        /**
         * @return the URI of the source being read.
         */
        public String getSystemId() {
            return this.uri;
        }

        /**
         * @return the one-based line number of the next character to be read.
         */
        public int getLineNumber() {
            return this.line;
        }

        /**
         * @return the one-based column number of the next character to be read.
         */
        public int getColumnNumber() {
            return this.column;
        }

        /**
         * <p>Close the underlying reader.</p>
         */
        public void close()
        throws IOException {
            this.input.close();
        }
        
        /**
         * <p>Read a single character, updating line and column numbers.</p>
         *
         * <p>CR, LF and CR+LF each count as one line terminator.</p>
         *
         * @return the character read, or a negative number at the end of the
         *         stream.
         */
        public int read()
        throws IOException {
            int c = this.input.read();
            if (c < 0) return c;

            if (((c == '\n') && (this.last != '\r')) || (c == '\r')) {
                this.column = 1;
                this.line ++;
            } else if (c != '\n') {
                /* Only an LF completing a CR+LF pair gets here as '\n', and
                 * it must not move the column: the CR started the new line */
                this.column ++;
            }

            this.last = c;
            return c;
        }

        /**
         * <p>Read up to <code>l</code> characters into the given array, one at
         * a time through {@link #read()}, so that positions stay accurate.</p>
         *
         * <p>An {@link IOException} raised after the first character was read
         * ends the read early rather than being thrown, and the number of
         * characters read so far is returned, in the same way as
         * <code>java.io.InputStream.read(byte[], int, int)</code> does.</p>
         *
         * @param b the destination array.
         * @param o the offset of the first character to store.
         * @param l the maximum number of characters to read.
         * @return the number of characters read, or -1 at the end of the stream.
         * @throws IOException if the first character can not be read.
         */
        public int read(char b[], int o, int l)
        throws IOException {
            if (b == null) throw new NullPointerException();
            if ((o<0)||(o>b.length)||(l<0)||((o+l)>b.length)||((o+l)<0)) {
                throw new IndexOutOfBoundsException();
            }
            if (l == 0) return 0;

            int c = read();
            if (c == -1) return -1;
            b[o] = (char)c;

            int i = 1;
            try {
                for (i = 1; i < l ; i++) {
                    c = read();
                    if (c == -1) break;
                    b[o + i] = (char)c;
                }
            } catch (IOException ignored) {
                /* Deliberate: report the characters successfully read so far */
                return i;
            }
            return i;
        }
    }
}