/* * DSpaceCSV.java * * Version: $Revision$ * * Date: $Date$ * * Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the DSpace Foundation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ package org.dspace.app.bulkedit; import org.dspace.content.*; import org.dspace.content.Collection; import org.dspace.core.ConfigurationManager; import java.util.*; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.io.*; /** * Utility class to read and write CSV files * * @author Stuart Lewis */ public class DSpaceCSV { /** The headings of the CSV file */ private ArrayList<String> headings; /** An array list of CSV lines */ private ArrayList<DSpaceCSVLine> lines; /** A counter of how many CSV lines this object holds */ private int counter; /** The value separator (defaults to double pipe '||') */ protected static String valueSeparator; /** The value separator in an escaped form for using in regexs */ protected static String escpaedValueSeparator; /** The field separator (defaults to comma) */ protected static String fieldSeparator; /** The field separator in an escaped form for using in regexs */ protected static String escapedFieldSeparator; /** Whether to export all metadata such as handles and provenance information */ private boolean exportAll; /** A list of metadata elements to ignore */ private Hashtable ignore; /** * Create a new instance of a CSV line holder * * @param exportAll Whether to export all metadata such as handles and provenance information */ public DSpaceCSV(boolean exportAll) { // Initalise the class init(); // Stoee the exportAll setting this.exportAll = exportAll; } /** * Create a new instance, reading the lines in from file * * @param f The file to read from * * @throws Exception thrown if there is an error reading or processing the file */ public DSpaceCSV(File f) throws Exception { // Initalise the class init(); // Open the CSV file BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(f),"UTF8")); // Read the heading line String head = input.readLine(); String[] headingElements = head.split(escapedFieldSeparator); for (String element : headingElements) { // Remove surrounding quotes if there are any if ((element.startsWith("\"")) && (element.endsWith("\""))) { element = element.substring(1, element.length() - 1); } if (!"id".equals(element)) { // Store the heading headings.add(element); } } // Read each subsequent line String line; while ((line = input.readLine()) != null){ // Are there an odd number of quotes? while (((" " + line + " ").split("\"").length)%2 == 0) { line = line + "\n" + input.readLine(); } // Parse the item metadata addItem(line); } } /** * Initalise this class with values from dspace.cfg */ private void init() { // Set the value separator setValueSeparator(); // Set the field separator setFieldSeparator(); // Create the headings headings = new ArrayList<String>(); // Create the blank list of items lines = new ArrayList<DSpaceCSVLine>(); // Initalise the counter counter = 0; // Set the metadata fields to ignore ignore = new Hashtable(); String toIgnore = ConfigurationManager.getProperty("bulkedit.ignore-on-export"); if ((toIgnore == null) || ("".equals(toIgnore.trim()))) { // Set a default value toIgnore = "dc.date.accession, dc.date.available, " + "dc.description.provenance"; } String[] toIgnoreArray = toIgnore.split(","); for (String toIgnoreString : toIgnoreArray) { if (!"".equals(toIgnoreString.trim())) { ignore.put(toIgnoreString.trim(), toIgnoreString.trim()); } } } /** * Set the value separator for multiple values stored in one csv value. * * Is set in dspace.cfg as bulkedit.valueseparator * * If not set, defaults to double pipe '||' */ private void setValueSeparator() { // Get the value separator valueSeparator = ConfigurationManager.getProperty("bulkedit.valueseparator"); if ((valueSeparator != null) && (!"".equals(valueSeparator.trim()))) { valueSeparator = valueSeparator.trim(); } else { valueSeparator = "||"; } // Now store the escaped version Pattern spchars = Pattern.compile("([\\\\*+\\[\\](){}\\$.?\\^|])"); Matcher match = spchars.matcher(valueSeparator); escpaedValueSeparator = match.replaceAll("\\\\$1"); } /** * Set the field separator use to separate fields in the csv. * * Is set in dspace.cfg as bulkedit.fieldseparator * * If not set, defaults to comma ','. * * Special values are 'tab', 'hash' and 'semicolon' which will * get substituted from the text to the value. */ private void setFieldSeparator() { // Get the value separator fieldSeparator = ConfigurationManager.getProperty("bulkedit.fieldseparator"); if ((fieldSeparator != null) && (!"".equals(fieldSeparator.trim()))) { fieldSeparator = fieldSeparator.trim(); if ("tab".equals(fieldSeparator)) { fieldSeparator = "\t"; } else if ("semicolon".equals(fieldSeparator)) { fieldSeparator = ";"; } else if ("hash".equals(fieldSeparator)) { fieldSeparator = "#"; } else { fieldSeparator = fieldSeparator.trim(); } } else { fieldSeparator = ","; } // Now store the escaped version Pattern spchars = Pattern.compile("([\\\\*+\\[\\](){}\\$.?\\^|])"); Matcher match = spchars.matcher(fieldSeparator); escapedFieldSeparator = match.replaceAll("\\\\$1"); } /** * Add a DSpace item to the CSV file * * @param i The DSpace item * * @throws Exception if something goes wrong with adding the Item */ public void addItem(Item i) throws Exception { // Create the CSV line DSpaceCSVLine line = new DSpaceCSVLine(i.getID()); // Add in owning collection String owningCollectionHandle = i.getOwningCollection().getHandle(); line.add("collection", owningCollectionHandle); // Add in any mapped collections Collection[] collections = i.getCollections(); for (Collection c : collections) { // Only add if it is not the owning collection if (!c.getHandle().equals(owningCollectionHandle)) { line.add("collection", c.getHandle()); } } // Populate it DCValue md[] = i.getMetadata(Item.ANY, Item.ANY, Item.ANY, Item.ANY); for (DCValue value : md) { // Get the key (schema.element) String key = value.schema + "." + value.element; // Add the qualifer if there is one (schema.element.qualifier) if (value.qualifier != null) { key = key + "." + value.qualifier; } // Add the language if there is one (schema.element.qualifier[langauge]) //if ((value.language != null) && (!"".equals(value.language))) if (value.language != null) { key = key + "[" + value.language + "]"; } // Store the item if (exportAll || okToExport(value)) { line.add(key, value.value); if (!headings.contains(key)) { headings.add(key); } } } lines.add(line); counter++; } /** * Add an item to the CSV file, from a CSV line of elements * * @param line The line of elements * @throws Exception Thrown if an error occurs when adding the item */ public void addItem(String line) throws Exception { // Check to see if the last character is a field separator, which hides the last empy column boolean last = false; if (line.endsWith(fieldSeparator)) { // Add a space to the end, then remove it later last = true; line += " "; } // Split up on field separator String[] parts = line.split(escapedFieldSeparator); ArrayList<String> bits = new ArrayList<String>(); bits.addAll(Arrays.asList(parts)); // Merge parts with embedded separators boolean alldone = false; while (!alldone) { boolean found = false; int i = 0; for (String part : bits) { int bitcounter = part.length() - part.replaceAll("\"", "").length(); if ((part.startsWith("\"")) && ((!part.endsWith("\"")) || ((bitcounter %2) == 1))) { found = true; String add = bits.get(i) + fieldSeparator + bits.get(i + 1); bits.remove(i); bits.add(i, add); bits.remove(i + 1); break; } i++; } alldone = !found; } // Deal with quotes around the elements int i = 0; for (String part : bits) { if ((part.startsWith("\"")) && (part.endsWith("\""))) { part = part.substring(1, part.length() - 1); bits.set(i, part); } i++; } // Remove embedded quotes i = 0; for (String part : bits) { if (part.contains("\"\"")) { part = part.replaceAll("\"\"", "\""); bits.set(i, part); } i++; } // Add elements to a DSpaceCSVLine String id = parts[0].replaceAll("\"", ""); DSpaceCSVLine csvLine; // Is this an existing item, or a new item (where id = '+') if ("+".equals(id)) { csvLine = new DSpaceCSVLine(); } else { try { csvLine = new DSpaceCSVLine(Integer.parseInt(id)); } catch (NumberFormatException nfe) { System.err.println("Invalid item identifier: " + id); System.err.println("Please check your CSV file for informaton. " + "Item id must be numeric, or a '+' to add a new item"); throw(nfe); } } // Add the rest of the parts i = 0; for (String part : bits) { if (i > 0) { // Is this a last empty item? if ((last) && (i == headings.size())) { part = ""; } // Make sure we register that this column was there csvLine.add(headings.get(i - 1), null); String[] elements = part.split(escpaedValueSeparator); for (String element : elements) { if ((element != null) && (!"".equals(element))) { csvLine.add(headings.get(i - 1), element); } } } i++; } lines.add(csvLine); counter++; } /** * Get the lines in CSV holders * * @return The lines */ public ArrayList<DSpaceCSVLine> getCSVLines() { // Return the lines return lines; } /** * Get the CSV lines as an array of CSV formatted strings * * @return the array of CSV formatted Strings */ public String[] getCSVLinesAsStringArray() { // Create the headings line String[] csvLines = new String[counter + 1]; csvLines[0] = "id" + fieldSeparator + "collection"; Collections.sort(headings); for (String value : headings) { csvLines[0] = csvLines[0] + fieldSeparator + value; } Iterator<DSpaceCSVLine> i = lines.iterator(); int c = 1; while (i.hasNext()) { csvLines[c++] = i.next().toCSV(headings); } return csvLines; } /** * Save the CSV file to the given filename * * @param filename The filename to save the CSV file to * * @throws IOException Thrown if an error occurs when writing the file */ public void save(String filename) throws IOException { // Save the file BufferedWriter out = new BufferedWriter( new OutputStreamWriter( new FileOutputStream(filename), "UTF8")); for (String csvLine : getCSVLinesAsStringArray()) { out.write(csvLine + "\n"); } out.flush(); out.close(); } /** * Is it Ok to export this value? When exportAll is set to false, we don't export * some of the metadata elements. * * The list can be configured via the key bulkedit.ignore-on-export in dspace.cfg * * @param md The DCValue to examine * @return Whether or not it is OK to export this element */ private boolean okToExport(DCValue md) { // First check the metadata format, and K all non DC elements if (!"dc".equals(md.schema)) { return true; } // Now compare with the list to ignore String key = md.schema + "." + md.element; if (md.qualifier != null) { key += "." + md.qualifier; } if (ignore.get(key) != null) { return false; } // Must be OK, so don't ignore return true; } /** * Return the csv file as one long formatted string * * @return The formatted String as a csv */ public String toString() { // Return the csv as one long string StringBuffer csvLines = new StringBuffer(); String[] lines = this.getCSVLinesAsStringArray(); for (String line : lines) { csvLines.append(line).append("\n"); } return csvLines.toString(); } /** * Test main method to check the marshalling and unmarshalling of strings in and out of CSV format * * @param args Not used * @throws Exception Thrown if something goes wrong */ public static void main(String[] args) throws Exception { // Test the CSV parsing String[] csv = {"id,\"dc.title\",dc.contributor.author,dc.description.abstract", "1,Easy line,\"Lewis, Stuart\",A nice short abstract", "2,Two authors,\"Lewis, Stuart||Bloggs, Joe\",Two people wrote this item", "3,Three authors,\"Lewis, Stuart||Bloggs, Joe||Loaf, Meat\",Three people wrote this item", "4,\"Two line\ntitle\",\"Lewis, Stuart\",abstract", "5,\"\"\"Embedded quotes\"\" here\",\"Lewis, Stuart\",\"Abstract with\ntwo\nnew lines\"", "6,\"\"\"Unbalanced embedded\"\" quotes\"\" here\",\"Lewis, Stuart\",\"Abstract with\ntwo\nnew lines\"",}; // Write the string to a file String filename = "test.csv"; BufferedWriter out = new BufferedWriter( new OutputStreamWriter( new FileOutputStream(filename), "UTF8")); for (String csvLine : csv) { out.write(csvLine + "\n"); } out.flush(); out.close(); System.gc(); // test the CSV parsing DSpaceCSV dcsv = new DSpaceCSV(new File(filename)); String[] lines = dcsv.getCSVLinesAsStringArray(); for (String line : lines) { System.out.println(line); } // Delete the test file File toDelete = new File(filename); toDelete.delete(); } }