/* * Copyright (c) 2016, Metron, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Metron, Inc. nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL METRON, INC. BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ package com.metsci.glimpse.dspl.parser; import static com.metsci.glimpse.dspl.parser.util.ParserUtils.buildPropertyTableData; import static com.metsci.glimpse.dspl.parser.util.ParserUtils.buildSliceTableData; import static com.metsci.glimpse.dspl.parser.util.QuoteAwareStringSplitter.splitLine; import static com.metsci.glimpse.util.logging.LoggerUtils.logWarning; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Enumeration; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import javax.xml.bind.JAXBException; import com.metsci.glimpse.dspl.DsplParser; import com.metsci.glimpse.dspl.parser.column.CompactInternStringTableColumn; import com.metsci.glimpse.dspl.parser.column.CompactStringTableColumn; import com.metsci.glimpse.dspl.parser.column.TableColumn; import com.metsci.glimpse.dspl.parser.table.PropertyTableData; import com.metsci.glimpse.dspl.parser.table.SliceTableData; import com.metsci.glimpse.dspl.parser.util.ParserUtils.DateColumnParser; import com.metsci.glimpse.dspl.parser.util.ParserUtils.ParserFactory; import com.metsci.glimpse.dspl.parser.util.ParserUtils.TableParserInfo; import com.metsci.glimpse.dspl.schema.Concept; import com.metsci.glimpse.dspl.schema.Data; import com.metsci.glimpse.dspl.schema.Data.File; import com.metsci.glimpse.dspl.schema.DataSet; import com.metsci.glimpse.dspl.schema.DataType; import com.metsci.glimpse.dspl.schema.Slice; import com.metsci.glimpse.dspl.schema.Table; import com.metsci.glimpse.dspl.schema.Table.Column; import com.metsci.glimpse.dspl.util.DsplException; import com.metsci.glimpse.util.primitives.BytesArray; import com.metsci.glimpse.util.primitives.IntsArray; import 
it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

/**
 * A {@link CsvParser} variant that adds the following behavior on top of the base parser:
 * <ul>
 * <li>a {@code "remainder"} format on the last column, which makes that column receive all
 * extra trailing tokens of a line as one string;</li>
 * <li>table data spread over several files, selected by treating the table's file value as a
 * regular expression matched against files next to the dataset xml file or against entries
 * of the dataset zip archive;</li>
 * <li>an {@code "intern"} format on string columns, which stores each distinct string once;</li>
 * <li>date columns parsed with {@link SimpleDateFormat}.</li>
 * </ul>
 */
public class ExtendedCsvParser extends CsvParser implements MultipleFileTableParser
{
    /**
     * Characters that have a special meaning in regular expressions. Not referenced by this class
     * itself. NOTE(review): presumably meant for subclasses overriding
     * {@link #toRegularExpression(String)} -- confirm against callers.
     */
    public static final String META = "([{\\^-$|]})?*+.";

    /** Column format that lets the last column swallow all extra trailing tokens. */
    private static final String REMAINDER_FORMAT = "remainder";

    /** Column format that selects the interning string column parser. */
    private static final String INTERN_FORMAT = "intern";

    /**
     * Matches a {@code Z} that is not adjacent to another ASCII letter, i.e. a stand-alone zone
     * designator such as the one ending {@code 2011-01-01T00:00:00Z}, but not the {@code Z}
     * inside an abbreviation such as {@code NZST}. Compiled once because it is applied to every
     * date cell.
     */
    private static final Pattern STANDALONE_Z = Pattern.compile( "(?<![A-Za-z])Z(?![A-Za-z])" );

    public ExtendedCsvParser( DsplParser dsplParser )
    {
        super( dsplParser );
    }

    @Override
    public ParserFactory createParserFactory( )
    {
        return new ExtendedCSVParserFactory( );
    }

    /**
     * Reads every remaining line of {@code in} and feeds its tokens to the column parsers held
     * by {@code info}. Empty lines are skipped; columns for which a line has no token receive a
     * gap.
     *
     * @param in reader positioned after the header line
     * @param info parser state holding one column parser per column
     * @throws IOException if reading from {@code in} fails
     * @throws DsplException if a column parser rejects a token
     */
    @Override
    protected void parse( BufferedReader in, TableParserInfo info ) throws IOException, DsplException
    {
        CSVTableColumnParser[] parsers = ( CSVTableColumnParser[] ) info.getParsers( );
        Column[] columns = info.getColumns( );
        int size = parsers.length;

        // extends the default parsing semantics to include a possible "remainder" format
        // for the last column, indicating that that column should get all extra tokens as one string
        // (guarded so that a table without columns does not index columns[-1])
        String trailingFormat = size > 0 ? columns[size - 1].getFormat( ) : null;
        boolean allowExtraTrailingTokens = REMAINDER_FORMAT.equals( trailingFormat );
        int splitLimit = allowExtraTrailingTokens ? size : Integer.MAX_VALUE;

        String line;
        while ( ( line = in.readLine( ) ) != null )
        {
            if ( line.isEmpty( ) ) continue;

            String[] tokens = splitLine( line, splitLimit );

            for ( int i = 0; i < size; i++ )
            {
                if ( i < tokens.length )
                {
                    parsers[i].addData( tokens[i] );
                }
                else
                {
                    parsers[i].addGap( );
                }
            }
        }
    }

    /**
     * Parses all data files of the given slice into one table. The parser info is created from
     * the first file; every file (including the first) is then read with its first line skipped.
     *
     * @throws DsplException if no data file is found for the slice, or parsing fails
     */
    @Override
    public SliceTableData parse( Slice slice ) throws IOException, JAXBException, DsplException
    {
        List<URL> files = getDataFiles( slice );
        requireDataFiles( files, slice.getId( ) );

        TableParserInfo info;
        BufferedReader in = openReader( files.get( 0 ) );
        try
        {
            info = newParserInfo( slice, in );
        }
        finally
        {
            in.close( );
        }

        parseAll( files, info );

        return buildSliceTableData( slice, info, factory );
    }

    /**
     * Parses all data files of the given concept into one table. The parser info is created from
     * the first file; every file (including the first) is then read with its first line skipped.
     *
     * @throws DsplException if no data file is found for the concept, or parsing fails
     */
    @Override
    public PropertyTableData parse( Concept concept ) throws IOException, JAXBException, DsplException
    {
        List<URL> files = getDataFiles( concept );
        requireDataFiles( files, concept.getId( ) );

        TableParserInfo info;
        BufferedReader in = openReader( files.get( 0 ) );
        try
        {
            info = newParserInfo( concept, in );
        }
        finally
        {
            in.close( );
        }

        parseAll( files, info );

        return buildPropertyTableData( concept, info, factory );
    }

    // Fails with a descriptive error instead of an IndexOutOfBoundsException on files.get( 0 ).
    private void requireDataFiles( List<URL> files, String id ) throws DsplException
    {
        if ( files == null || files.isEmpty( ) )
        {
            throw new DsplException( "No data files were found for %s.", id );
        }
    }

    // Opens a buffered reader on the given URL (platform default charset, as before).
    private BufferedReader openReader( URL url ) throws IOException
    {
        return new BufferedReader( new InputStreamReader( url.openStream( ) ) );
    }

    // Appends the rows of every file to info, skipping the header line of each file.
    private void parseAll( List<URL> files, TableParserInfo info ) throws IOException, DsplException
    {
        for ( URL f : files )
        {
            BufferedReader in = openReader( f );
            try
            {
                in.readLine( ); // read the header line, which must be the same in all the files
                parse( in, info );
            }
            finally
            {
                in.close( );
            }
        }
    }

    @Override
    public List<URL> getDataFiles( Slice slice ) throws JAXBException, IOException, DsplException
    {
        return getDataFiles( slice.getDataSet( ), slice.getTable( ), slice.getId( ) );
    }

    @Override
    public List<URL> getDataFiles( Concept concept ) throws JAXBException, IOException, DsplException
    {
        return getDataFiles( concept.getDataSet( ), concept.getTable( ), concept.getId( ) );
    }

    /**
     * Resolves the data files of a table. The table's file value is converted with
     * {@link #toRegularExpression(String)} and compiled as a regular expression. If the dataset
     * file name ends with {@code .xml}, the expression is matched against the names of all files
     * found recursively under the directory containing the dataset file; if it ends with
     * {@code .zip}, it is matched against the entry names of that archive.
     *
     * @param dataset dataset whose {@code getFile( )} locates the xml file or zip archive
     * @param table table whose data file value is used as the expression
     * @param id identifier used in error messages
     * @return URLs of all matching files, possibly empty
     * @throws DsplException if the dataset has no file, the table has no data or no file value,
     *         or the dataset file is neither an xml file nor a zip archive
     * @throws IOException if the zip archive cannot be read
     */
    protected List<URL> getDataFiles( DataSet dataset, Table table, String id ) throws JAXBException, IOException, DsplException
    {
        java.io.File baseFile = dataset.getFile( );

        if ( baseFile == null )
        {
            throw new DsplException( "ExtendedCsvParser (csvx) can only be used with datasets whose getFile( ) is set." );
        }

        Data data = table.getData( );

        if ( data == null )
        {
            throw new DsplException( "No assoicated table is defined for %s.", id );
        }

        File file = data.getFile( );
        String name = ( file == null ) ? null : file.getValue( );

        if ( name == null )
        {
            throw new DsplException( "No data file is defined for the table of %s.", id );
        }

        Pattern pattern = Pattern.compile( toRegularExpression( name ) );
        String baseName = baseFile.getName( );

        if ( baseName.endsWith( ".xml" ) )
        {
            // getParentFile( ) is null for a bare relative path such as "dataset.xml";
            // resolving against the working directory first always yields the directory
            java.io.File parentFile = baseFile.getAbsoluteFile( ).getParentFile( );
            return toURLs( getAllFiles( parentFile, pattern ) );
        }
        else if ( baseName.endsWith( ".zip" ) )
        {
            // the returned URLs are built from names only, so the archive can be closed here
            ZipFile zipFile = new ZipFile( baseFile );
            try
            {
                return getAllFiles( zipFile, pattern );
            }
            finally
            {
                zipFile.close( );
            }
        }

        throw new DsplException( "DataSet file field must reference xml file or zip archive." );
    }

    /**
     * Hook for translating the table's file value into a regular expression. This implementation
     * returns the value unchanged.
     */
    protected String toRegularExpression( String expression )
    {
        return expression;
    }

    /** Converts each file into its {@code file:} URL. */
    protected List<URL> toURLs( List<java.io.File> files ) throws MalformedURLException
    {
        List<URL> urls = new ArrayList<URL>( );

        for ( java.io.File file : files )
        {
            urls.add( file.toURI( ).toURL( ) );
        }

        return urls;
    }

    /**
     * Returns a {@code jar:} URL for every non-directory entry of the archive whose full entry
     * name contains a match of {@code pattern}. The archive is not closed by this method.
     */
    protected List<URL> getAllFiles( ZipFile file, Pattern pattern ) throws MalformedURLException
    {
        List<URL> urls = new ArrayList<URL>( );

        // go through File.toURI( ) so that spaces, '#', '%' and platform specific separators
        // in the archive path are encoded into a well formed file: URL
        String archiveUrl = new java.io.File( file.getName( ) ).toURI( ).toURL( ).toExternalForm( );

        Enumeration<? extends ZipEntry> entries = file.entries( );
        while ( entries.hasMoreElements( ) )
        {
            ZipEntry entry = entries.nextElement( );

            // directories carry no table data; the file system variant skips them as well
            if ( entry.isDirectory( ) ) continue;

            if ( pattern.matcher( entry.getName( ) ).find( ) )
            {
                urls.add( new URL( "jar:" + archiveUrl + "!/" + entry.getName( ) ) );
            }
        }

        return urls;
    }

    /**
     * Returns all non-directory files found recursively under {@code file} whose simple name
     * (without parent path) contains a match of {@code pattern}.
     */
    protected List<java.io.File> getAllFiles( java.io.File file, Pattern pattern )
    {
        List<java.io.File> files = getAllFiles( file );

        Iterator<java.io.File> iter = files.iterator( );
        while ( iter.hasNext( ) )
        {
            java.io.File f = iter.next( );
            if ( !pattern.matcher( f.getName( ) ).find( ) )
            {
                iter.remove( );
            }
        }

        return files;
    }

    /** Returns all non-directory files found recursively under {@code file}. */
    protected List<java.io.File> getAllFiles( java.io.File file )
    {
        List<java.io.File> files = new ArrayList<java.io.File>( );
        getAllFiles( file, files );
        return files;
    }

    /**
     * Adds {@code file} to {@code files} if it is not a directory, otherwise descends into it.
     * A null {@code file} and directories whose content cannot be listed contribute nothing.
     */
    protected void getAllFiles( java.io.File file, List<java.io.File> files )
    {
        if ( file == null ) return;

        if ( file.isDirectory( ) )
        {
            // listFiles( ) returns null on an I/O error or when the directory is unreadable
            java.io.File[] childFiles = file.listFiles( );
            if ( childFiles == null ) return;

            for ( java.io.File childFile : childFiles )
            {
                getAllFiles( childFile, files );
            }
        }
        else
        {
            files.add( file );
        }
    }

    /** Parser factory supplying the extended date and string column parsers. */
    public class ExtendedCSVParserFactory extends CSVParserFactory
    {
        // replace the joda date parser which Google specifies with the standard Java date parser
        // which handles string formatted time zone specifications better
        @Override
        public CSVTableColumnParser newDateParser( Column column )
        {
            return new ExtendedCSVDateColumnParser( column );
        }

        // allow the "intern" format string which provides control over when strings are interned
        @Override
        public CSVTableColumnParser newStringParser( Column column )
        {
            if ( INTERN_FORMAT.equals( column.getFormat( ) ) )
            {
                return new CSVCompactInternStringColumnParser( );
            }
            else
            {
                return new CSVCompactStringColumnParser( );
            }
        }
    }

    /**
     * String column parser that appends all values to one shared byte buffer and records, per
     * row, the buffer length at the time the row was added.
     */
    protected static class CSVCompactStringColumnParser implements CSVTableColumnParser
    {
        protected BytesArray allStrings;
        protected IntsArray offsets;

        public CSVCompactStringColumnParser( )
        {
            allStrings = new BytesArray( );
            offsets = new IntsArray( );
        }

        @Override
        public TableColumn createTableColumn( Column column, Concept concept, DataType type )
        {
            return new CompactStringTableColumn( column, concept, offsets.n, allStrings.string( ), offsets.copyOf( ) );
        }

        @Override
        public String parse( String data )
        {
            return new String( data );
        }

        /** Records the start offset of the row and appends {@code value} unless it is null. */
        @Override
        public void addData( String value )
        {
            offsets.append( allStrings.n );

            if ( value != null )
            {
                allStrings.append( value );
            }
        }

        /** A gap is stored as a row that appends nothing to the shared buffer. */
        @Override
        public void addGap( )
        {
            addData( null );
        }
    }

    /**
     * String column parser that stores each distinct string once and records, per row, the
     * index of its string. Gaps are recorded as index {@code -1}.
     */
    protected static class CSVCompactInternStringColumnParser implements CSVTableColumnParser
    {
        protected Object2IntOpenHashMap<String> indexByString;
        protected IntsArray indexByRow;

        public CSVCompactInternStringColumnParser( )
        {
            indexByString = new Object2IntOpenHashMap<String>( );
            indexByString.defaultReturnValue( -1 );

            indexByRow = new IntsArray( );
        }

        /** Records the index of {@code value} for the next row; a null value is stored as a gap. */
        @Override
        public void addData( String value )
        {
            // the sibling compact string parser tolerates null, so do the same here
            if ( value == null )
            {
                addGap( );
                return;
            }

            // We don't want to store a substring, which might hang on
            // to more characters than it needs. So make a clean copy.
            value = new String( value );

            int index = indexByString.getInt( value );
            if ( index == -1 )
            {
                index = indexByString.size( );
                indexByString.put( value, index );
            }

            indexByRow.append( index );
        }

        @Override
        public void addGap( )
        {
            indexByRow.append( -1 );
        }

        @Override
        public TableColumn createTableColumn( Column column, Concept concept, DataType type )
        {
            // This seems a little dangerous, but works because we're careful
            // to put each integer on [0,size) into the map exactly once
            String[] uniqueStrings = new String[indexByString.size( )];
            for ( Object2IntMap.Entry<String> entry : indexByString.object2IntEntrySet( ) )
            {
                int index = entry.getIntValue( );
                uniqueStrings[index] = entry.getKey( );
            }

            return new CompactInternStringTableColumn( column, concept, indexByRow.n, uniqueStrings, indexByRow.copyOf( ) );
        }

        @Override
        public String parse( String data )
        {
            return new String( data );
        }
    }

    /**
     * Date column parser backed by a {@link SimpleDateFormat} built from the column format.
     * Values are stored as milliseconds; gaps are stored as {@code -1}.
     */
    protected class ExtendedCSVDateColumnParser extends DateColumnParser implements CSVTableColumnParser
    {
        protected DateFormat alternateDateFormat;

        public ExtendedCSVDateColumnParser( Column column )
        {
            super( column );

            this.alternateDateFormat = new SimpleDateFormat( format );
        }

        /**
         * Parses and stores one cell. Null or empty tokens become gaps. A token that cannot be
         * parsed raises a {@link DsplException} in fail-on-error mode and otherwise is logged
         * and stored as a gap.
         */
        @Override
        public void addData( String token ) throws DsplException
        {
            if ( token == null || token.isEmpty( ) )
            {
                addGap( );
                return;
            }

            try
            {
                data.append( parse( token ) );
            }
            catch ( Exception e )
            {
                if ( dsplParser.isFailOnErrorMode( ) )
                {
                    throw new DsplException( "Problem parsing: %s", e, token );
                }
                else
                {
                    logWarning( logger, "Trouble parsing date: %s. Adding gap instead.", token );
                    addGap( );
                }
            }
        }

        //TODO we need a much better way to mark missing data
        @Override
        public void addGap( )
        {
            data.append( -1 );
        }

        /**
         * Parses a date string into milliseconds.
         *
         * @throws DsplException if the string does not match the column format
         */
        @Override
        public Long parse( String data ) throws DsplException
        {
            //XXX The Java parser doesn't handle "Z" as a time zone, so rewrite it to "GMT".
            //XXX Only a stand-alone "Z" is rewritten, so that zone abbreviations which merely
            //XXX contain the letter (e.g. "NZST") are left intact.
            String normalized = STANDALONE_Z.matcher( data ).replaceFirst( "GMT" );

            try
            {
                return alternateDateFormat.parse( normalized ).getTime( );
            }
            catch ( ParseException e )
            {
                throw new DsplException( "Trouble parsing date: %s.", e, normalized );
            }
        }
    }
}