DSpaceCSV.java example

Explorer
DSpace-SVN-Deprecated-master
/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.app.bulkedit;

import org.dspace.content.*;
import org.dspace.content.Collection;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Context;

import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.*;

/**
 * Utility class to read and write CSV files
 *
 * **************
 * Important Note
 * **************
 *
 * This class has been made serializable, as it is stored in a Session.
 * Is it wise to:
 *    a) be putting this into a user's session?
 *    b) holding an entire CSV upload in memory?
 *
 * @author Stuart Lewis
 */
public class DSpaceCSV implements Serializable
{
    /** The headings of the CSV file */
    private List<String> headings;

    /** An array list of CSV lines */
    private List<DSpaceCSVLine> lines;

    /** A counter of how many CSV lines this object holds */
    private int counter;

    /** The value separator (defaults to double pipe '||') */
    protected static String valueSeparator;

    /** The value separator in an escaped form for using in regexs */
    protected static String escapedValueSeparator;

    /** The field separator (defaults to comma) */
    protected static String fieldSeparator;

    /** The field separator in an escaped form for using in regexs */
    protected static String escapedFieldSeparator;

    /** Whether to export all metadata such as handles and provenance information */
    private boolean exportAll;

    /** The metadata fields to leave out on export; each field name (schema.element[.qualifier]) is stored as both key and value */
    private Map<String, String> ignore;


    /**
     * Build an empty CSV holder to which items can then be added.
     *
     * @param exportAll Whether to export all metadata such as handles and provenance information
     */
    public DSpaceCSV(boolean exportAll)
    {
        // Remember the export setting (init() neither reads nor writes it)
        this.exportAll = exportAll;

        // Load the separators and the ignore list, and create the empty containers
        init();
    }

    /**
     * Create a new instance, reading the lines in from file.
     *
     * The first line of the file is the heading row. The 'id' heading is skipped,
     * 'collection' and 'action' are stored as-is, and every other heading must be of
     * the form schema.element[.qualifier] (optionally followed by [language]) and must
     * exist in the metadata registry. Each following line is handed to
     * {@link #addItem(String)}; a line with an odd number of double quotes is
     * joined with the following line(s) until the quotes balance, so that quoted
     * values may contain line breaks.
     *
     * @param f The file to read from
     * @param c The DSpace Context
     *
     * @throws Exception thrown if there is an error reading or processing the file,
     *                   including an empty file, an invalid heading, or a quoted
     *                   value that is still open at the end of the file
     */
    public DSpaceCSV(File f, Context c) throws Exception
    {
        // Initialise the class
        init();

        // Open the CSV file
        BufferedReader input = null;
        try
        {
            input = new BufferedReader(new InputStreamReader(new FileInputStream(f),"UTF-8"));

            // Read the heading line; readLine() returns null for a completely empty file
            String head = input.readLine();
            if (head == null)
            {
                throw new IOException("CSV file is empty, no heading line found: " + f.getPath());
            }

            int columnCounter = 0;
            for (String element : head.split(escapedFieldSeparator))
            {
                columnCounter++;

                // Remove surrounding quotes if there are any (a lone quote is not a quoted value)
                if ((element.length() > 1) && (element.startsWith("\"")) && (element.endsWith("\"")))
                {
                    element = element.substring(1, element.length() - 1);
                }

                if ("collection".equals(element) || "action".equals(element))
                {
                    // Special headings are stored without registry validation
                    headings.add(element);
                }
                else if (!"id".equals(element))
                {
                    // Everything except the id column must be a known metadata field
                    checkMetadataHeading(c, element, columnCounter);
                    headings.add(element);
                }
            }

            // Read each subsequent line, joining physical lines while a quoted value is open
            StringBuilder pending = new StringBuilder();
            int pendingQuotes = 0;
            String lineRead;

            while ((lineRead = input.readLine()) != null)
            {
                if (pending.length() > 0)
                {
                    // Continuing a value that spans lines: restore the line break
                    pending.append("\n");
                }
                pending.append(lineRead);

                // Only the new text needs scanning; the running total covers the buffer
                pendingQuotes += countQuotes(lineRead);

                if (pendingQuotes % 2 == 0)
                {
                    // Quotes are balanced, so this is a complete logical line
                    addItem(pending.toString());
                    pending.setLength(0);
                    pendingQuotes = 0;
                }
            }

            // Anything left over means a quote was opened and never closed; report it
            // rather than silently discarding the remaining rows
            if (pending.length() > 0)
            {
                throw new IOException("Unterminated quoted value at end of CSV file: " + f.getPath());
            }
        }
        finally
        {
            if (input != null)
            {
                input.close();
            }
        }
    }

    /**
     * Verify that a heading names a metadata field present in the metadata registry.
     *
     * @param c The DSpace Context
     * @param heading The heading text, e.g. schema.element.qualifier[language]
     * @param column The 1-based column number, used for error reporting
     *
     * @throws Exception thrown if the heading is malformed, or the schema or the
     *                   element/qualifier cannot be found
     */
    private static void checkMetadataHeading(Context c, String heading, int column) throws Exception
    {
        // Drop any [language] suffix, then split into schema / element / qualifier
        String[] clean = heading.split("\\[");
        String fieldName = (clean.length > 0) ? clean[0] : "";
        String[] parts = fieldName.split("\\.");

        if (parts.length < 2)
        {
            throw new MetadataImportInvalidHeadingException(heading,
                                                            MetadataImportInvalidHeadingException.ENTRY,
                                                            column);
        }

        String metadataSchema = parts[0];
        String metadataElement = parts[1];
        String metadataQualifier = (parts.length > 2) ? parts[2] : null;

        // Check that the schema exists
        MetadataSchema foundSchema = MetadataSchema.find(c, metadataSchema);
        if (foundSchema == null)
        {
            throw new MetadataImportInvalidHeadingException(fieldName,
                                                            MetadataImportInvalidHeadingException.SCHEMA,
                                                            column);
        }

        // Check that the metadata element exists in the schema
        MetadataField foundField = MetadataField.findByElement(c, foundSchema.getSchemaID(),
                                                               metadataElement, metadataQualifier);
        if (foundField == null)
        {
            throw new MetadataImportInvalidHeadingException(fieldName,
                                                            MetadataImportInvalidHeadingException.ELEMENT,
                                                            column);
        }
    }

    /**
     * Count the double-quote characters in a piece of text.
     *
     * @param text The text to scan
     * @return The number of '"' characters found
     */
    private static int countQuotes(String text)
    {
        int quotes = 0;
        for (int pos = 0; pos < text.length(); pos++)
        {
            if (text.charAt(pos) == '"')
            {
                quotes++;
            }
        }
        return quotes;
    }

    /**
     * Prepare a fresh instance: load both separators, create the empty heading and
     * line containers, reset the line counter and build the set of metadata fields
     * that are ignored on export (from the bulkedit 'ignore-on-export' setting, or a
     * built-in default when that is missing or blank).
     */
    private void init()
    {
        // Separators come first
        setValueSeparator();
        setFieldSeparator();

        // Start with no headings, no lines and a zero count
        headings = new ArrayList<String>();
        lines = new ArrayList<DSpaceCSVLine>();
        counter = 0;

        // Decide which comma-separated list of fields to ignore
        String configured = ConfigurationManager.getProperty("bulkedit", "ignore-on-export");
        boolean useDefault = (configured == null) || ("".equals(configured.trim()));
        String toIgnore = useDefault
                ? "dc.date.accessioned, dc.date.available, " +
                  "dc.date.updated, dc.description.provenance"
                : configured;

        // Store every non-blank entry, trimmed, as both key and value
        ignore = new HashMap<String, String>();
        for (String candidate : toIgnore.split(","))
        {
            String field = candidate.trim();
            if (field.length() > 0)
            {
                ignore.put(field, field);
            }
        }
    }

    /**
     * Report whether one of the stored headings is exactly 'action'
     * (the comparison is case-sensitive).
     *
     * @return Whether or not there is an 'action' header
     */
    public boolean hasActions() {
        Iterator<String> remaining = headings.iterator();
        while (remaining.hasNext()) {
            if (remaining.next().equals("action")) {
                return true;
            }
        }

        // Walked every heading without a match
        return false;
    }

    /**
     * Load the separator used between multiple values held in a single csv cell.
     *
     * It is read from the 'valueseparator' key of the bulkedit configuration and
     * trimmed; when missing or blank it falls back to the double pipe '||'.
     * A regex-escaped copy is stored alongside it.
     */
    private void setValueSeparator()
    {
        String configured = ConfigurationManager.getProperty("bulkedit", "valueseparator");
        if ((configured == null) || ("".equals(configured.trim())))
        {
            // Nothing usable configured, use the default
            valueSeparator = "||";
        }
        else
        {
            valueSeparator = configured.trim();
        }

        // Prefix every regex metacharacter with a backslash to get the escaped form
        escapedValueSeparator = Pattern.compile("([\\\\*+\\[\\](){}\\$.?\\^|])")
                                       .matcher(valueSeparator)
                                       .replaceAll("\\\\$1");
    }

    /**
     * Load the separator used between fields (columns) of the csv.
     *
     * It is read from the 'fieldseparator' key of the bulkedit configuration and
     * trimmed; when missing or blank it falls back to a comma ','.
     *
     * The words 'tab', 'semicolon' and 'hash' are replaced by the character they
     * name; any other value is used literally. A regex-escaped copy is stored
     * alongside it.
     */
    private void setFieldSeparator()
    {
        String configured = ConfigurationManager.getProperty("bulkedit", "fieldseparator");

        // Default, used when nothing usable is configured
        String separator = ",";
        if ((configured != null) && (!"".equals(configured.trim())))
        {
            String name = configured.trim();
            if ("tab".equals(name))
            {
                separator = "\t";
            }
            else if ("semicolon".equals(name))
            {
                separator = ";";
            }
            else if ("hash".equals(name))
            {
                separator = "#";
            }
            else
            {
                // Not a special word, take the configured text as the separator
                separator = name;
            }
        }
        fieldSeparator = separator;

        // Prefix every regex metacharacter with a backslash to get the escaped form
        escapedFieldSeparator = Pattern.compile("([\\\\*+\\[\\](){}\\$.?\\^|])")
                                       .matcher(fieldSeparator)
                                       .replaceAll("\\\\$1");
    }

    /**
     * Append a DSpace item to the CSV as a new line.
     *
     * The line carries the item's id, the handle of its owning collection followed
     * by the handles of any other collections it belongs to, and its metadata.
     * Metadata keys have the form schema.element[.qualifier] with the language
     * appended in square brackets when one is set. Unless exportAll is on, values
     * rejected by okToExport are left out. Keys not seen before are added to the
     * headings.
     *
     * @param i The DSpace item
     *
     * @throws Exception if something goes wrong with adding the Item
     */
    public final void addItem(Item i) throws Exception
    {
        DSpaceCSVLine csvLine = new DSpaceCSVLine(i.getID());

        // The owning collection always comes first
        String owningHandle = i.getOwningCollection().getHandle();
        csvLine.add("collection", owningHandle);

        // Then any mapped collections, skipping the owning one
        for (Collection mapped : i.getCollections())
        {
            String mappedHandle = mapped.getHandle();
            if (!mappedHandle.equals(owningHandle))
            {
                csvLine.add("collection", mappedHandle);
            }
        }

        // Finally the metadata values
        DCValue[] allValues = i.getMetadata(Item.ANY, Item.ANY, Item.ANY, Item.ANY);
        for (DCValue value : allValues)
        {
            // Skip values that must not be exported
            if (!exportAll && !okToExport(value))
            {
                continue;
            }

            // schema.element, plus .qualifier and [language] when present
            StringBuilder keyBuilder = new StringBuilder();
            keyBuilder.append(value.schema).append(".").append(value.element);
            if (value.qualifier != null)
            {
                keyBuilder.append(".").append(value.qualifier);
            }
            if (value.language != null)
            {
                keyBuilder.append("[").append(value.language).append("]");
            }
            String key = keyBuilder.toString();

            csvLine.add(key, value.value);
            if (!headings.contains(key))
            {
                headings.add(key);
            }
        }

        lines.add(csvLine);
        counter++;
    }

    /**
     * Add an item to the CSV file, from a CSV line of elements.
     *
     * The line is split on the field separator; pieces belonging to one quoted
     * value that contains the separator are joined back together, surrounding
     * quotes are removed and doubled quotes ("") become single quotes. The first
     * column is the item id: '+' means a new item, anything else must be an
     * integer. Each further column is matched by position to the stored headings
     * and split on the value separator into individual values; empty values are
     * skipped, but the column itself is always registered on the line.
     *
     * @param line The line of elements
     * @throws Exception Thrown if an error occurs when adding the item: the id is
     *                   not numeric (NumberFormatException), a quoted value is never
     *                   closed (IllegalArgumentException), or the line has more
     *                   columns than there are headings
     */
    public final void addItem(String line) throws Exception
    {
        // Split up on field separator. The negative limit keeps trailing empty
        // columns, so a line ending in a separator still yields its last (empty) cell.
        String[] parts = line.split(escapedFieldSeparator, -1);

        // Merge parts with embedded separators: while a part opens a quoted value
        // without closing it, glue the next part back on with the separator.
        List<String> bits = new ArrayList<String>(parts.length);
        int next = 0;
        while (next < parts.length)
        {
            String part = parts[next++];
            while (isOpenQuotedPart(part))
            {
                if (next >= parts.length)
                {
                    // Ran out of parts with the quote still open
                    throw new IllegalArgumentException("Unbalanced quotes in CSV line: " + line);
                }
                part = part + fieldSeparator + parts[next++];
            }
            bits.add(part);
        }

        // Deal with quotes around the elements, then un-double embedded quotes
        for (int b = 0; b < bits.size(); b++)
        {
            String part = bits.get(b);
            if ((part.length() > 1) && (part.startsWith("\"")) && (part.endsWith("\"")))
            {
                part = part.substring(1, part.length() - 1);
            }
            if (part.contains("\"\""))
            {
                part = part.replaceAll("\"\"", "\"");
            }
            bits.set(b, part);
        }

        // The id is taken from the first raw column, with any quotes removed
        String id = parts[0].replaceAll("\"", "");
        DSpaceCSVLine csvLine;

        // Is this an existing item, or a new item (where id = '+')
        if ("+".equals(id))
        {
            csvLine = new DSpaceCSVLine();
        }
        else
        {
            try
            {
                csvLine = new DSpaceCSVLine(Integer.parseInt(id));
            }
            catch (NumberFormatException nfe)
            {
                System.err.println("Invalid item identifier: " + id);
                System.err.println("Please check your CSV file for information. " +
                                   "Item id must be numeric, or a '+' to add a new item");
                throw(nfe);
            }
        }

        // Add the rest of the parts; column n of the line maps to heading n - 1
        // because the id column is not stored in the headings
        for (int column = 1; column < bits.size(); column++)
        {
            // A column without a matching heading cannot be stored
            if (headings.size() < column)
            {
                throw new MetadataImportInvalidHeadingException("",
                                                                MetadataImportInvalidHeadingException.MISSING,
                                                                column + 1);
            }

            String heading = headings.get(column - 1);

            // Make sure we register that this column was there, even if it is empty
            csvLine.add(heading, null);
            for (String element : bits.get(column).split(escapedValueSeparator))
            {
                if (!"".equals(element))
                {
                    csvLine.add(heading, element);
                }
            }
        }
        lines.add(csvLine);
        counter++;
    }

    /**
     * Decide whether a piece of a split line starts a quoted value that it does
     * not finish: it begins with a double quote and either does not end with one
     * or contains an odd number of them.
     *
     * @param part The piece of the line to examine
     * @return Whether more pieces must be joined on to complete the value
     */
    private static boolean isOpenQuotedPart(String part)
    {
        if (!part.startsWith("\""))
        {
            return false;
        }

        int quotes = 0;
        for (int pos = 0; pos < part.length(); pos++)
        {
            if (part.charAt(pos) == '"')
            {
                quotes++;
            }
        }
        return (!part.endsWith("\"")) || ((quotes & 1) == 1);
    }

    /**
     * Give access to the CSV lines held by this object.
     *
     * The internal list itself is returned, not a copy.
     *
     * @return The lines
     */
    public final List<DSpaceCSVLine> getCSVLines()
    {
        return this.lines;
    }

    /**
     * Get the CSV lines as an array of CSV formatted strings.
     *
     * Element 0 is the heading row: 'id', 'collection', then the stored headings
     * in sorted order. Each following element is one line, formatted by
     * DSpaceCSVLine.toCSV against the same sorted headings. The headings are
     * sorted on a copy, so the stored heading order (which addItem(String) uses
     * to match columns by position) is left untouched.
     *
     * @return the array of CSV formatted Strings
     */
    public final String[] getCSVLinesAsStringArray()
    {
        // Sort a copy so that producing output never reorders the live headings
        List<String> sortedHeadings = new ArrayList<String>(headings);
        Collections.sort(sortedHeadings);

        // Create the headings line
        // NOTE(review): 'collection' is always written here and may also be present in
        // the headings when the CSV was read from a file - confirm against DSpaceCSVLine.toCSV
        StringBuilder headingLine = new StringBuilder();
        headingLine.append("id").append(fieldSeparator).append("collection");
        for (String value : sortedHeadings)
        {
            headingLine.append(fieldSeparator).append(value);
        }

        // Size from the list itself so the array always matches what is iterated
        String[] csvLines = new String[lines.size() + 1];
        csvLines[0] = headingLine.toString();

        int row = 1;
        for (DSpaceCSVLine csvLine : lines)
        {
            csvLines[row++] = csvLine.toCSV(sortedHeadings);
        }

        return csvLines;
    }

    /**
     * Save the CSV file to the given filename, encoded as UTF-8, one CSV line
     * per text line (each terminated by '\n').
     *
     * The file is closed even if writing fails part-way through.
     *
     * @param filename The filename to save the CSV file to
     *
     * @throws IOException Thrown if an error occurs when writing the file
     */
    public final void save(String filename) throws IOException
    {
        BufferedWriter out = null;
        try
        {
            out = new BufferedWriter(
                  new OutputStreamWriter(
                  new FileOutputStream(filename), "UTF-8"));
            for (String csvLine : getCSVLinesAsStringArray())
            {
                out.write(csvLine + "\n");
            }
            out.flush();
        }
        finally
        {
            // Always release the file handle, including when a write throws
            if (out != null)
            {
                out.close();
            }
        }
    }

    /**
     * Decide whether a metadata value may be exported. This is consulted when
     * exportAll is false, in which case some metadata elements are left out.
     *
     * The value's schema.element[.qualifier] name is looked up in the ignore
     * list, which can be configured via the key ignore-on-export in bulkedit.cfg.
     *
     * @param md The DCValue to examine
     * @return Whether or not it is OK to export this element
     */
    private final boolean okToExport(DCValue md)
    {
        // Build the field name the ignore list is keyed on
        StringBuilder name = new StringBuilder();
        name.append(md.schema).append(".").append(md.element);
        if (md.qualifier != null)
        {
            name.append(".").append(md.qualifier);
        }

        // Exportable exactly when the field is not in the ignore list
        return ignore.get(name.toString()) == null;
    }

    /**
     * Give access to the headings used in this CSV file.
     *
     * The internal list itself is returned, not a copy.
     *
     * @return The headings
     */
    public List<String> getHeadings()
    {
        return this.headings;
    }

    /**
     * Render the whole csv (heading row followed by every line) as a single
     * string, with each row terminated by '\n'.
     *
     * @return The formatted String as a csv
     */
    @Override
    public final String toString()
    {
        StringBuilder csv = new StringBuilder();
        for (String row : getCSVLinesAsStringArray())
        {
            csv.append(row);
            csv.append("\n");
        }
        return csv.toString();
    }
}