DelimitedDataLoader.java example

Explorer

recordloader-master
- src
  - java
    - com
      - marklogic
        ps
        Crypto.java
        JEncryption.java
        PasswordDecrypter.java
        PasswordEncrypter.java
        PropertyClientInterface.java
        RecordLoader.java
        SimpleLogger.java
        TestCipher.java
        TestCrypto.java
        Utilities.java
        timing
        TimedEvent.java
        TimedEventDurationComparator.java
        TimedEventTests.java
        Timer.java
        recordloader
        AbstractConfiguration.java
        AbstractContent.java
        AbstractInputHandler.java
        AbstractLoader.java
        Configuration.java
        ContentFactory.java
        ContentInterface.java
        DefaultInputHandler.java
        FatalException.java
        FileLoader.java
        InputHandlerInterface.java
        Loader.java
        LoaderException.java
        LoaderFactory.java
        LoaderInterface.java
        Monitor.java
        Producer.java
        ProducerFactory.java
        TranscodingFileLoader.java
        TranscodingLoader.java
        ZipReference.java
        http
        HttpModuleContent.java
        HttpModuleContentFactory.java
        junit
        ProducerTest.java
        svn
        SvnInputHandler.java
        xcc
        DelimitedDataConfiguration.java
        DelimitedDataLoader.java
        XccAbstractContent.java
        XccAbstractContentFactory.java
        XccConfiguration.java
        XccContent.java
        XccContentFactory.java
        XccModuleContent.java
        XccModuleContentFactory.java

/**
 * Copyright (c) 2008-2010 Mark Logic Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * The use of the Apache License does not indicate that this project is
 * affiliated with the Apache Software Foundation.
 */
package com.marklogic.recordloader.xcc;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import com.marklogic.ps.Utilities;
import com.marklogic.ps.timing.TimedEvent;
import com.marklogic.recordloader.Configuration;
import com.marklogic.recordloader.FatalException;
import com.marklogic.recordloader.LoaderException;
import com.marklogic.recordloader.TranscodingLoader;

/**
 * @author Michael Blakeley, michael.blakeley@marklogic.com
 *
 */
public class DelimitedDataLoader extends TranscodingLoader {

    DelimitedDataConfiguration config;

    private String recordName;

    private String idName;

    private String fieldDelimiter;

    private int lineNumber;

    private boolean isFatalErrors;

    private String fields[];

    private String[] labels;

    private int labelIndex;

    private String charsetName;

    /*
     * (non-Javadoc)
     *
     * @see com.marklogic.recordloader.AbstractLoader#process()
     */
    @SuppressWarnings("unused")
    public void process() throws LoaderException {
        super.process();

        logger.fine("starting with decoder = " + decoder);
        if (null != decoder) {
            charsetName = decoder.charset().name();
            logger.fine("using " + charsetName);
        }

        if (!(super.config instanceof DelimitedDataConfiguration)) {
            throw new FatalException(
                    Configuration.CONFIGURATION_CLASSNAME_KEY
                            + " must be set to "
                            + DelimitedDataConfiguration.class.getName());
        }
        config = (DelimitedDataConfiguration) super.config;
        fieldDelimiter = config.getFieldDelimiter();
        idName = config.getIdNodeName();
        recordName = config.getRecordName();
        isFatalErrors = config.isFatalErrors();

        boolean downcaseLabels = config.isDowncaseLabels();

        BufferedReader br = new BufferedReader(new InputStreamReader(
                input, decoder));
        String line;
        String id;

        lineNumber = 0;
        labelIndex = 0;
        if (downcaseLabels) {
            recordName = recordName.toLowerCase();
        }

        try {
            // first line contains the labels
            line = br.readLine();
            lineNumber++;
            labels = line.split(fieldDelimiter);
            // match the configured idName with the input labels
            for (int i = 0; i < labels.length; i++) {
                if (downcaseLabels) {
                    labels[i] = labels[i].toLowerCase();
                }
                if (idName.equals(labels[i])) {
                    labelIndex = i;
                    // do not exit loop - must downcase remaining labels
                }
            }
            logger.info("found labels " + labels.length);

            while (null != (line = br.readLine())) {
                String xml = null;
                // line-by-line, so we can move on after errors
                try {
                    xml = handleRecord(line);
                    event.stop();
                } catch (Exception e) {
                    if (isFatalErrors) {
                        throw new FatalException(e);
                    }
                    event.stop(true);
                    logger.logException(e);
                } finally {
                    updateMonitor((null != xml) ? xml.length() : 0);
                    cleanupRecord();
                }
            }
        } catch (Exception e) {
            if (isFatalErrors) {
                throw new FatalException(e);
            }
            event.stop(true);
            logger.logException(e);
        } finally {
            try {
                br.close();
            } catch (IOException e) {
                // no point in doing anything...
                logger.logException(e);
            }
            cleanupInput(event.isError());
        }
    }

    /**
     * @param line
     * @return
     * @throws LoaderException
     * @throws IOException
     */
    private String handleRecord(String line) throws LoaderException,
            IOException {
        String xml;
        String id;
        event = new TimedEvent();
        lineNumber++;
        // TODO this is too simplistic for CSV with quoted values
        // by default, split() discards empty strings
        fields = line.split(fieldDelimiter, labels.length);

        // sanity check
        if (fields.length != labels.length) {
            String msg = "document mismatch:"
                + " fields=" + fields.length
                + ", labels=" + labels.length
                + " at "
                + ((null == currentRecordPath)
                   ? "stdin" : currentRecordPath)
                + ":" + lineNumber + ": " + line;
            // caller will decide if this is fatal or not
            throw new LoaderException(msg);
        }

        id = fields[labelIndex];
        currentUri = composeUri(id);
        content = contentFactory.newContent(currentUri);
        boolean skippingRecord = checkIdAndUri(currentRecordPath);

        xml = getXml(labels, fields);

        if (null != xml && !skippingRecord) {
            // write the xml
            // NB - getBytes will return the default-encoding bytes
            content.setBytes(null == decoder ? xml.getBytes() : xml
                    .getBytes(charsetName));
            insert();
        }
        return xml;
    }

    /**
     * @param labels
     * @param fields
     * @return
     */
    private String getXml(String[] labels, String[] fields) {
        // build the xml
        StringBuilder xml = new StringBuilder("<" + recordName + ">");
        for (int i = 0; i < labels.length; i++) {
            xml.append("<" + labels[i] + ">"
                    + Utilities.escapeXml(fields[i]) + "</" + labels[i]
                    + ">");
        }
        xml.append("</" + recordName + ">");
        return xml.toString();
    }

}