/**
 *  csvParser
 *  Copyright 2009 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 *  First released 02.10.2009 at http://yacy.net
 *
 *  $LastChangedDate$
 *  $LastChangedRevision$
 *  $LastChangedBy$
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document.parser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;

/**
 * a parser for comma-separated values
 * The values may also be separated by semicolon or tab,
 * the separator character is detected automatically
 */
public class csvParser extends AbstractParser implements Parser {

    public csvParser() {
        super("Comma Separated Value Parser");
        this.SUPPORTED_EXTENSIONS.add("csv");
    }

    /**
     * Parses the source stream as a separated-value table and returns a single
     * document built from all of its cells.
     *
     * @param location the URL of the parsed resource
     * @param mimeType the mime type passed on to the resulting document
     * @param charset the name of the character encoding of the source; may be null,
     *        in which case the platform default encoding is used
     * @param scraper not used by this parser
     * @param timezoneOffset not used by this parser
     * @param source the stream to read the table from
     * @return an array holding exactly one document
     * @throws Parser.Failure if the source contains no non-empty line
     */
    @Override
    public Document[] parse(
            final DigestURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source) throws Parser.Failure, InterruptedException {
        // construct a document using all cells of the document
        // the first row is used as headline
        // all lines are artificially terminated by a '.' to separate them as sentence for the condenser.
        final List<String[]> table = getTable(charset, source);
        if (table.isEmpty()) throw new Parser.Failure("document has no lines", location);
        final StringBuilder sb = new StringBuilder();
        for (final String[] row : table) {
            sb.append(concatRow(row)).append(' ');
        }
        return new Document[]{new Document(
                location,
                mimeType,
                charset,
                this,
                null,
                null,
                singleList(concatRow(table.get(0))),
                null,
                "",
                null,
                null,
                0.0d, 0.0d,
                sb.toString(),
                null,
                null,
                null,
                false,
                new Date())};
    }

    /**
     * Joins the cells of one row with single spaces and terminates the result
     * with a '.' so that every row reads as one sentence.
     *
     * @param columns the cells of the row
     * @return the space-separated cells followed by a '.'
     */
    private static String concatRow(String[] columns) {
        final StringBuilder sb = new StringBuilder(80);
        for (final String column : columns) {
            if (sb.length() > 0) sb.append(' ');
            sb.append(column);
        }
        sb.append('.');
        return sb.toString();
    }

    /**
     * Reads the source line by line and splits every non-empty line into cells.
     * The separator is detected once, from the first non-empty line, and then
     * applied to all lines. Reading is best-effort: if the stream fails, the
     * rows read so far are returned.
     *
     * @param charset the name of the character encoding; if it is null or not
     *        supported, the platform default encoding is used
     * @param source the stream to read from; it is not closed by this method
     * @return the list of rows, each row being an array of cells; never null
     */
    private static List<String[]> getTable(String charset, InputStream source) {
        final List<String[]> rows = new ArrayList<String[]>();
        BufferedReader reader;
        try {
            // InputStreamReader throws a NullPointerException for a null charset name,
            // therefore a missing charset takes the same fallback as an unsupported one
            reader = new BufferedReader(charset == null
                    ? new InputStreamReader(source)
                    : new InputStreamReader(source, charset));
        } catch (final UnsupportedEncodingException e1) {
            reader = new BufferedReader(new InputStreamReader(source));
        }
        String row;
        String separator = null;
        try {
            while ((row = reader.readLine()) != null) {
                row = row.trim();
                if (row.isEmpty()) continue;
                if (separator == null) separator = detectSeparator(row);
                // blank out separator characters inside quoted cells before splitting
                row = stripQuotes(row, '\"', separator.charAt(0), ' ');
                row = stripQuotes(row, '\'', separator.charAt(0), ' ');
                rows.add(row.split(separator));
            }
        } catch (final IOException ignored) {
            // best-effort parsing: keep the rows that were read before the failure
        }
        return rows;
    }

    /**
     * Tries comma, semicolon and tab on the given line and takes the one that
     * results in the most columns. On a tie, tab wins over semicolon and
     * semicolon wins over comma; a line without any separator yields tab.
     *
     * @param row a non-empty line of the source
     * @return the separator as a one-character string: ",", ";" or "\t"
     */
    private static String detectSeparator(final String row) {
        final int commaColumns = CommonPattern.COMMA.split(row).length;
        final int semicolonColumns = CommonPattern.SEMICOLON.split(row).length;
        final int tabColumns = CommonPattern.TAB.split(row).length;
        if (tabColumns >= semicolonColumns && tabColumns >= commaColumns) return "\t";
        if (semicolonColumns >= commaColumns && semicolonColumns >= tabColumns) return ";";
        return ",";
    }

    /**
     * remove quotes AND separator characters within the quotes
     * to make it possible to split the line using the String.split method
     * @param line the line to process
     * @param quote the quote character to remove
     * @param separator the separator character that must not survive inside quotes
     * @param replacement the character that replaces a separator found inside quotes
     * @return the line without the quotes
     */
    private static String stripQuotes(final String line, final char quote, final char separator, final char replacement) {
        String ret = line;
        int p, q;
        // find left quote
        while ((p = ret.indexOf(quote)) >= 0) {
            q = ret.indexOf(quote, p + 1);
            if (q < 0) {
                // there is only a single quote but no 'right' quote.
                // This data is not well-formed. Just remove the quote and give up.
                return ret.substring(0, p) + ret.substring(p + 1);
            }
            ret = ret.substring(0, p) + ret.substring(p + 1, q).replace(separator, replacement) + ret.substring(q + 1);
        }
        return ret;
    }
}