/*
* Copyright (c) 2016, Metron, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Metron, Inc. nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL METRON, INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.metsci.glimpse.dspl.parser;
import static com.metsci.glimpse.dspl.parser.util.ParserUtils.buildPropertyTableData;
import static com.metsci.glimpse.dspl.parser.util.ParserUtils.buildSliceTableData;
import static com.metsci.glimpse.dspl.parser.util.QuoteAwareStringSplitter.splitLine;
import static com.metsci.glimpse.util.logging.LoggerUtils.logWarning;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.bind.JAXBException;
import com.metsci.glimpse.dspl.DsplParser;
import com.metsci.glimpse.dspl.parser.column.CompactInternStringTableColumn;
import com.metsci.glimpse.dspl.parser.column.CompactStringTableColumn;
import com.metsci.glimpse.dspl.parser.column.TableColumn;
import com.metsci.glimpse.dspl.parser.table.PropertyTableData;
import com.metsci.glimpse.dspl.parser.table.SliceTableData;
import com.metsci.glimpse.dspl.parser.util.ParserUtils.DateColumnParser;
import com.metsci.glimpse.dspl.parser.util.ParserUtils.ParserFactory;
import com.metsci.glimpse.dspl.parser.util.ParserUtils.TableParserInfo;
import com.metsci.glimpse.dspl.schema.Concept;
import com.metsci.glimpse.dspl.schema.Data;
import com.metsci.glimpse.dspl.schema.Data.File;
import com.metsci.glimpse.dspl.schema.DataSet;
import com.metsci.glimpse.dspl.schema.DataType;
import com.metsci.glimpse.dspl.schema.Slice;
import com.metsci.glimpse.dspl.schema.Table;
import com.metsci.glimpse.dspl.schema.Table.Column;
import com.metsci.glimpse.dspl.util.DsplException;
import com.metsci.glimpse.util.primitives.BytesArray;
import com.metsci.glimpse.util.primitives.IntsArray;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
public class ExtendedCsvParser extends CsvParser implements MultipleFileTableParser
{
/**
 * Creates an extended CSV parser. The argument is passed unchanged to the
 * superclass constructor.
 *
 * @param dsplParser the DSPL parser instance forwarded to the superclass
 */
public ExtendedCsvParser( DsplParser dsplParser )
{
super( dsplParser );
}
// The characters that carry special meaning in java.util.regex patterns.
// This constant is not referenced anywhere else in this class.
// NOTE(review): presumably intended for escaping file names inside
// toRegularExpression( ) -- confirm against subclasses and callers.
public static final String META = "([{\\^-$|]})?*+.";
/**
 * Supplies the column-parser factory used by this parser.
 *
 * @return a new {@link ExtendedCSVParserFactory}
 */
@Override
public ParserFactory createParserFactory( )
{
return new ExtendedCSVParserFactory( );
}
/**
 * Consumes every remaining line of {@code in}, splits it into tokens and hands
 * token {@code i} to column parser {@code i}.
 * <p>
 * Empty lines are skipped. A row with fewer tokens than columns records a gap
 * for each missing trailing column; tokens beyond the last column are ignored.
 * When the last column declares the format {@code "remainder"}, the split is
 * limited to the number of columns (the limit is passed to {@code splitLine}).
 *
 * @param in reader to consume; it is not closed here
 * @param info supplies the column definitions and their parsers
 * @throws IOException if reading a line fails
 * @throws DsplException if a column parser rejects a token
 */
@Override
protected void parse( BufferedReader in, TableParserInfo info ) throws IOException, DsplException
{
    CSVTableColumnParser[] columnParsers = ( CSVTableColumnParser[] ) info.getParsers( );
    Column[] columns = info.getColumns( );
    int columnCount = columnParsers.length;

    // the "remainder" format on the final column extends the default parsing
    // semantics: that column should receive all extra tokens as one string
    String lastFormat = columns[columnCount - 1].getFormat( );
    boolean lastTakesRemainder = "remainder".equals( lastFormat );
    int maxTokens = lastTakesRemainder ? columnCount : Integer.MAX_VALUE;

    for ( String row = in.readLine( ); row != null; row = in.readLine( ) )
    {
        if ( row.isEmpty( ) ) continue;

        String[] tokens = splitLine( row, maxTokens );
        int present = Math.min( tokens.length, columnCount );

        // columns that have a token on this row
        for ( int c = 0; c < present; c++ )
        {
            columnParsers[c].addData( tokens[c] );
        }

        // columns the row is too short to reach
        for ( int c = present; c < columnCount; c++ )
        {
            columnParsers[c].addGap( );
        }
    }
}
/**
 * Parses every data file belonging to {@code slice} into a single table.
 * <p>
 * The first file is opened once on its own so that {@code newParserInfo} can
 * set up the columns and parsers. Every file (the first included) is then
 * opened again, its header line is discarded, and its rows are appended to the
 * same parser state.
 *
 * @param slice the slice whose table data should be loaded
 * @return the table data built from the rows of all matching files
 * @throws IOException if a data file cannot be opened or read
 * @throws JAXBException propagated from resolving the slice's data files
 * @throws DsplException if no data file matches the slice's table definition,
 *         or if the data cannot be parsed
 */
@Override
public SliceTableData parse( Slice slice ) throws IOException, JAXBException, DsplException
{
    List<URL> files = getDataFiles( slice );

    // fail with a descriptive error instead of an IndexOutOfBoundsException from files.get( 0 )
    if ( files == null || files.isEmpty( ) )
    {
        throw new DsplException( "No data files were found for slice %s.", slice.getId( ) );
    }

    BufferedReader in = new BufferedReader( new InputStreamReader( files.get( 0 ).openStream( ) ) );
    TableParserInfo info = null;
    try
    {
        info = newParserInfo( slice, in );
    }
    finally
    {
        in.close( );
    }

    for ( URL f : files )
    {
        in = new BufferedReader( new InputStreamReader( f.openStream( ) ) );
        try
        {
            in.readLine( ); // read the header line, which must be the same in all the files
            parse( in, info );
        }
        finally
        {
            in.close( );
        }
    }

    return buildSliceTableData( slice, info, factory );
}
/**
 * Parses every data file belonging to {@code concept} into a single property table.
 * <p>
 * The first file is opened once on its own so that {@code newParserInfo} can
 * set up the columns and parsers. Every file (the first included) is then
 * opened again, its header line is discarded, and its rows are appended to the
 * same parser state.
 *
 * @param concept the concept whose table data should be loaded
 * @return the property table data built from the rows of all matching files
 * @throws IOException if a data file cannot be opened or read
 * @throws JAXBException propagated from resolving the concept's data files
 * @throws DsplException if no data file matches the concept's table definition,
 *         or if the data cannot be parsed
 */
@Override
public PropertyTableData parse( Concept concept ) throws IOException, JAXBException, DsplException
{
    List<URL> files = getDataFiles( concept );

    // fail with a descriptive error instead of an IndexOutOfBoundsException from files.get( 0 )
    if ( files == null || files.isEmpty( ) )
    {
        throw new DsplException( "No data files were found for concept %s.", concept.getId( ) );
    }

    BufferedReader in = new BufferedReader( new InputStreamReader( files.get( 0 ).openStream( ) ) );
    TableParserInfo info = null;
    try
    {
        info = newParserInfo( concept, in );
    }
    finally
    {
        in.close( );
    }

    for ( URL f : files )
    {
        in = new BufferedReader( new InputStreamReader( f.openStream( ) ) );
        try
        {
            in.readLine( ); // read the header line, which must be the same in all the files
            parse( in, info );
        }
        finally
        {
            in.close( );
        }
    }

    return buildPropertyTableData( concept, info, factory );
}
/**
 * Resolves the data files backing a slice by delegating to
 * {@code getDataFiles( DataSet, Table, String )} with the slice's data set,
 * table and id.
 *
 * @param slice the slice to resolve files for
 * @return the URLs of the matching data files
 */
@Override
public List<URL> getDataFiles( Slice slice ) throws JAXBException, IOException, DsplException
{
    DataSet owner = slice.getDataSet( );
    Table sliceTable = slice.getTable( );
    String sliceId = slice.getId( );

    return getDataFiles( owner, sliceTable, sliceId );
}
/**
 * Resolves the data files backing a concept by delegating to
 * {@code getDataFiles( DataSet, Table, String )} with the concept's data set,
 * table and id.
 *
 * @param concept the concept to resolve files for
 * @return the URLs of the matching data files
 */
@Override
public List<URL> getDataFiles( Concept concept ) throws JAXBException, IOException, DsplException
{
    DataSet owner = concept.getDataSet( );
    Table conceptTable = concept.getTable( );
    String conceptId = concept.getId( );

    return getDataFiles( owner, conceptTable, conceptId );
}
/**
 * Finds every data file whose name matches the file expression declared by
 * {@code table}.
 * <p>
 * The table's file value is converted by {@link #toRegularExpression(String)}
 * and compiled as a regular expression. If the data set file ends in
 * {@code .xml}, the directory containing it is searched recursively; if it ends
 * in {@code .zip}, the entries of that archive are searched.
 *
 * @param dataset the data set; its {@code getFile( )} must be non-null
 * @param table the table whose data file expression is used
 * @param id identifier of the owning slice or concept, used in error messages only
 * @return URLs of all matching files (possibly empty)
 * @throws IOException if the zip archive cannot be opened or closed
 * @throws DsplException if the data set has no file, the table has no data or
 *         file definition, or the data set file is neither xml nor zip
 */
protected List<URL> getDataFiles( DataSet dataset, Table table, String id ) throws JAXBException, IOException, DsplException
{
    java.io.File baseFile = dataset.getFile( );
    if ( baseFile == null )
    {
        throw new DsplException( "ExtendedCsvParser (csvx) can only be used with datasets whose getFile( ) is set." );
    }

    Data data = ( table == null ) ? null : table.getData( );
    if ( data == null )
    {
        throw new DsplException( "No associated table is defined for %s.", id );
    }

    File file = data.getFile( );
    String name = ( file == null ) ? null : file.getValue( );
    if ( name == null )
    {
        throw new DsplException( "No data file is specified for the table of %s.", id );
    }

    Pattern pattern = Pattern.compile( toRegularExpression( name ) );

    String baseName = baseFile.getName( );
    if ( baseName.endsWith( ".xml" ) )
    {
        java.io.File parentFile = baseFile.getParentFile( );
        if ( parentFile == null )
        {
            // a bare relative name such as "dataset.xml" has no parent component;
            // resolve it against the working directory instead of passing null on
            parentFile = baseFile.getAbsoluteFile( ).getParentFile( );
        }
        return toURLs( getAllFiles( parentFile, pattern ) );
    }
    else if ( baseName.endsWith( ".zip" ) )
    {
        // the returned URLs are built from names only, so the archive handle
        // can (and must) be released once the entries have been listed
        ZipFile archive = new ZipFile( baseFile );
        try
        {
            return getAllFiles( archive, pattern );
        }
        finally
        {
            archive.close( );
        }
    }

    throw new DsplException( "DataSet file field must reference xml file or zip archive." );
}
/**
 * Converts a table's file expression into the regular expression used to
 * select data files. This implementation returns its argument unchanged, so
 * the file value is compiled as a regular expression as-is by the caller.
 *
 * @param expression the file value declared for a table
 * @return the same string
 */
protected String toRegularExpression( String expression )
{
return expression;
}
/**
 * Converts files to URLs, preserving order.
 *
 * @param files the files to convert
 * @return a new list holding {@code file.toURI( ).toURL( )} for each input file
 * @throws MalformedURLException if a file's URI cannot be expressed as a URL
 */
protected List<URL> toURLs( List<java.io.File> files ) throws MalformedURLException
{
    List<URL> converted = new ArrayList<URL>( );

    Iterator<java.io.File> remaining = files.iterator( );
    while ( remaining.hasNext( ) )
    {
        java.io.File next = remaining.next( );
        URL location = next.toURI( ).toURL( );
        converted.add( location );
    }

    return converted;
}
/**
 * Lists the entries of a zip archive whose full entry name contains a match
 * for {@code pattern}, as {@code jar:} URLs.
 * <p>
 * The archive part of each URL is derived from {@link java.io.File#toURI()},
 * so the path is absolute and characters such as {@code #}, {@code %} or
 * spaces are escaped instead of being pasted into the URL verbatim. Directory
 * entries are skipped because they carry no data to parse.
 *
 * @param file the open archive to scan; it is not closed by this method
 * @param pattern expression searched for (via {@code find}) in each entry name
 * @return one {@code jar:<archive-url>!/<entry-name>} URL per matching entry
 * @throws MalformedURLException if a URL cannot be constructed for an entry
 */
protected List<URL> getAllFiles( ZipFile file, Pattern pattern ) throws MalformedURLException
{
    // ZipFile.getName( ) is the path the archive was opened with
    String archiveUrl = new java.io.File( file.getName( ) ).toURI( ).toString( );

    List<URL> urls = new ArrayList<URL>( );
    Enumeration<? extends ZipEntry> entries = file.entries( );
    while ( entries.hasMoreElements( ) )
    {
        ZipEntry entry = entries.nextElement( );
        if ( entry.isDirectory( ) ) continue;

        if ( pattern.matcher( entry.getName( ) ).find( ) )
        {
            urls.add( new URL( "jar:" + archiveUrl + "!/" + entry.getName( ) ) );
        }
    }
    return urls;
}
/**
 * Collects every file below {@code file} and keeps only those whose simple
 * name (not the full path) contains a match for {@code pattern}.
 *
 * @param file the file or directory to search
 * @param pattern expression searched for (via {@code find}) in each file name
 * @return the list produced by {@code getAllFiles( file )} with non-matching
 *         entries removed from it
 */
protected List<java.io.File> getAllFiles( java.io.File file, Pattern pattern )
{
    List<java.io.File> candidates = getAllFiles( file );

    for ( Iterator<java.io.File> cursor = candidates.iterator( ); cursor.hasNext( ); )
    {
        java.io.File candidate = cursor.next( );
        boolean nameMatches = pattern.matcher( candidate.getName( ) ).find( );
        if ( nameMatches ) continue;

        // prune in place so the same list instance is returned
        cursor.remove( );
    }

    return candidates;
}
/**
 * Collects {@code file} itself, or when it is a directory every file found
 * beneath it, into a fresh list.
 *
 * @param file the file or directory to start from
 * @return a new mutable list filled by {@code getAllFiles( file, list )}
 */
protected List<java.io.File> getAllFiles( java.io.File file )
{
    List<java.io.File> collected = new ArrayList<java.io.File>( );
    this.getAllFiles( file, collected );
    return collected;
}
/**
 * Recursively appends to {@code files} every non-directory reachable from
 * {@code file}.
 * <p>
 * A path that is not a directory is appended as-is. A directory whose contents
 * cannot be listed ({@link java.io.File#listFiles()} returns {@code null}, for
 * example on an I/O error) contributes nothing instead of causing a
 * {@code NullPointerException}.
 *
 * @param file the file or directory to visit
 * @param files the list that receives the files found
 */
protected void getAllFiles( java.io.File file, List<java.io.File> files )
{
    if ( !file.isDirectory( ) )
    {
        files.add( file );
        return;
    }

    java.io.File[] childFiles = file.listFiles( );
    if ( childFiles == null )
    {
        // unreadable directory: nothing to collect below this point
        return;
    }

    for ( java.io.File childFile : childFiles )
    {
        getAllFiles( childFile, files );
    }
}
/**
 * Column-parser factory that swaps in this file's date parser and compact
 * string parsers in place of the ones supplied by {@code CSVParserFactory}.
 */
public class ExtendedCSVParserFactory extends CSVParserFactory
{
    /**
     * Replaces the joda date parser which Google specifies with one based on the
     * standard Java date parser, which handles string formatted time zone
     * specifications better.
     *
     * @param column the date column being parsed
     * @return a new {@link ExtendedCSVDateColumnParser} for the column
     */
    @Override
    public CSVTableColumnParser newDateParser( Column column )
    {
        return new ExtendedCSVDateColumnParser( column );
    }

    /**
     * Chooses the string column parser. The column format {@code "intern"} gives
     * control over when strings are interned: it selects the interning parser,
     * and any other (or missing) format selects the plain compact parser.
     *
     * @param column the string column being parsed
     * @return a new string column parser
     */
    @Override
    public CSVTableColumnParser newStringParser( Column column )
    {
        if ( "intern".equals( column.getFormat( ) ) )
        {
            return new CSVCompactInternStringColumnParser( );
        }

        return new CSVCompactStringColumnParser( );
    }
}
/**
 * String column parser that appends every value to one shared
 * {@code BytesArray} and records, per row, the array length at the moment the
 * row was added.
 */
protected static class CSVCompactStringColumnParser implements CSVTableColumnParser
{
    // concatenation of all appended values
    protected BytesArray allStrings;
    // one entry per row: the value of allStrings.n just before that row's value was appended
    protected IntsArray offsets;

    /** Creates a parser with empty storage. */
    public CSVCompactStringColumnParser( )
    {
        this.allStrings = new BytesArray( );
        this.offsets = new IntsArray( );
    }

    /**
     * Builds the table column from the data accumulated so far, passing
     * {@code offsets.n} as the row count together with the accumulated
     * strings and a copy of the offsets.
     */
    @Override
    public TableColumn createTableColumn( Column column, Concept concept, DataType type )
    {
        int rowCount = offsets.n;
        return new CompactStringTableColumn( column, concept, rowCount, allStrings.string( ), offsets.copyOf( ) );
    }

    /** Returns a fresh copy of {@code data}. */
    @Override
    public String parse( String data )
    {
        return new String( data );
    }

    /**
     * Records a row. The current length of the byte storage is always appended
     * to the offsets; the value itself is appended only when it is non-null.
     */
    @Override
    public void addData( String value )
    {
        offsets.append( allStrings.n );

        if ( value == null ) return;

        allStrings.append( value );
    }

    /** Records a row with no value by delegating to {@code addData( null )}. */
    @Override
    public void addGap( )
    {
        addData( null );
    }
}
/**
 * String column parser that stores each distinct value once and records, per
 * row, the index of that row's value. Rows without a value are recorded as
 * index {@code -1}.
 */
protected static class CSVCompactInternStringColumnParser implements CSVTableColumnParser
{
    // distinct value -> index assigned in first-seen order; -1 means "not present"
    protected Object2IntOpenHashMap<String> indexByString;
    // one entry per row: index of the row's value, or -1 for a gap
    protected IntsArray indexByRow;

    /** Creates a parser with empty storage. */
    public CSVCompactInternStringColumnParser( )
    {
        indexByString = new Object2IntOpenHashMap<String>( );
        indexByString.defaultReturnValue( -1 );
        indexByRow = new IntsArray( );
    }

    /**
     * Records a row holding {@code value}. A {@code null} value is recorded as a
     * gap, matching how {@code CSVCompactStringColumnParser} tolerates null,
     * rather than failing with a {@code NullPointerException}.
     *
     * @param value the token for this row, or {@code null} for a missing value
     */
    @Override
    public void addData( String value )
    {
        if ( value == null )
        {
            addGap( );
            return;
        }

        int index = indexByString.getInt( value );
        if ( index == -1 )
        {
            index = indexByString.size( );

            // We don't want to store a substring, which might hang on
            // to more characters than it needs. So make a clean copy --
            // but only for strings that are actually kept as map keys.
            indexByString.put( new String( value ), index );
        }
        indexByRow.append( index );
    }

    /** Records a row with no value as index {@code -1}. */
    @Override
    public void addGap( )
    {
        indexByRow.append( -1 );
    }

    /**
     * Builds the table column from the data accumulated so far: the distinct
     * strings ordered by their index, plus a copy of the per-row indices.
     */
    @Override
    public TableColumn createTableColumn( Column column, Concept concept, DataType type )
    {
        // This seems a little dangerous, but works because we're careful
        // to put each integer on [0,size) into the map exactly once
        String[] uniqueStrings = new String[indexByString.size( )];
        for ( Object2IntMap.Entry<String> entry : indexByString.object2IntEntrySet( ) )
        {
            int index = entry.getIntValue( );
            uniqueStrings[index] = entry.getKey( );
        }

        return new CompactInternStringTableColumn( column, concept, indexByRow.n, uniqueStrings, indexByRow.copyOf( ) );
    }

    /** Returns a fresh copy of {@code data}. */
    @Override
    public String parse( String data )
    {
        return new String( data );
    }
}
/**
 * Date column parser backed by {@link SimpleDateFormat}, built from the
 * {@code format} inherited from {@code DateColumnParser}. Parsed values are
 * stored as epoch milliseconds; missing or unparseable values are stored
 * as {@code -1}.
 */
protected class ExtendedCSVDateColumnParser extends DateColumnParser implements CSVTableColumnParser
{
    protected DateFormat alternateDateFormat;

    // Matches a "Z" zone designator that is not part of a longer word, so that
    // zone names which merely contain the letter (e.g. "NZST") are left intact.
    private final Pattern standaloneZ = Pattern.compile( "(?<![A-Za-z])Z(?![A-Za-z])" );

    /**
     * @param column the date column being parsed, passed to the superclass
     */
    public ExtendedCSVDateColumnParser( Column column )
    {
        super( column );
        this.alternateDateFormat = new SimpleDateFormat( format );
    }

    /**
     * Records one row. A null or empty token is recorded as a gap. A token that
     * fails to parse raises a {@code DsplException} when the enclosing parser is
     * in fail-on-error mode; otherwise a warning is logged and a gap is recorded.
     *
     * @param token the raw text for this row
     * @throws DsplException if parsing fails in fail-on-error mode
     */
    @Override
    public void addData( String token ) throws DsplException
    {
        if ( token == null || token.isEmpty( ) )
        {
            addGap( );
            return;
        }

        try
        {
            data.append( parse( token ) );
        }
        catch ( Exception e )
        {
            if ( dsplParser.isFailOnErrorMode( ) )
            {
                throw new DsplException( "Problem parsing: %s", e, token );
            }
            else
            {
                logWarning( logger, "Trouble parsing date: %s. Adding gap instead.", token );
                addGap( );
            }
        }
    }

    //TODO we need a much better way to mark missing data
    /** Records a missing value as {@code -1}. */
    @Override
    public void addGap( )
    {
        data.append( -1 );
    }

    /**
     * Parses a date string into epoch milliseconds.
     *
     * @param data the text to parse
     * @return the parsed time as returned by {@code Date.getTime( )}
     * @throws DsplException if the text does not match the column's format;
     *         the message reports the text exactly as it was received
     */
    @Override
    public Long parse( String data ) throws DsplException
    {
        try
        {
            //XXX The Java parser doesn't handle "Z" as a time zone
            //XXX this slows down the parse though, so it's an ugly hack
            String normalized = data;
            if ( data.indexOf( 'Z' ) >= 0 )
            {
                // only the first standalone "Z" is rewritten
                normalized = standaloneZ.matcher( data ).replaceFirst( "GMT" );
            }

            return alternateDateFormat.parse( normalized ).getTime( );
        }
        catch ( ParseException e )
        {
            throw new DsplException( "Trouble parsing date: %s.", e, data );
        }
    }
}
}