GDataImporter.java example

Explorer
OpenRefine-master
/*
 * Copyright (c) 2010, Thomas F. Morris
 *        All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions are met:
 * - Redistributions of source code must retain the above copyright notice, this 
 *   list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice, 
 *   this list of conditions and the following disclaimer in the documentation 
 *   and/or other materials provided with the distribution.
 * 
 * Neither the name of Google nor the names of its contributors may be used to 
 * endorse or promote products derived from this software without specific 
 * prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package com.google.refine.extension.gdata;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.json.JSONObject;

import com.google.gdata.client.spreadsheet.CellQuery;
import com.google.gdata.client.spreadsheet.SpreadsheetService;
import com.google.gdata.data.spreadsheet.Cell;
import com.google.gdata.data.spreadsheet.CellEntry;
import com.google.gdata.data.spreadsheet.CellFeed;
import com.google.gdata.data.spreadsheet.SpreadsheetEntry;
import com.google.gdata.data.spreadsheet.WorksheetEntry;
import com.google.gdata.util.ServiceException;

import com.google.refine.ProjectMetadata;
import com.google.refine.importers.TabularImportingParserBase;
import com.google.refine.importers.TabularImportingParserBase.TableDataReader;
import com.google.refine.importing.ImportingJob;
import com.google.refine.model.Project;
import com.google.refine.util.JSONUtilities;

/**
 * OpenRefine parser for Google Spreadsheets.
 * 
 * @author Tom Morris <tfmorris@gmail.com>
 * @copyright 2010 Thomas F. Morris
 * @license New BSD http://www.opensource.org/licenses/bsd-license.php
 */
public class GDataImporter {
    static public void parse(
        String token,
        Project project,
        ProjectMetadata metadata,
        final ImportingJob job,
        int limit,
        JSONObject options,
        List<Exception> exceptions) {
    
        String docType = JSONUtilities.getString(options, "docType", null);
        if ("spreadsheet".equals(docType)) {
            SpreadsheetService service = GDataExtension.getSpreadsheetService(token);
            parse(
                service,
                project,
                metadata,
                job,
                limit,
                options,
                exceptions
            );
        } else if ("table".equals(docType)) {
            FusionTableImporter.parse(token, 
                project,
                metadata,
                job,
                limit,
                options,
                exceptions
            );
        }
    }
    
    static public void parse(
        SpreadsheetService service,
        Project project,
        ProjectMetadata metadata,
        final ImportingJob job,
        int limit,
        JSONObject options,
        List<Exception> exceptions) {
        
        String docUrlString = JSONUtilities.getString(options, "docUrl", null);
        String worksheetUrlString = JSONUtilities.getString(options, "sheetUrl", null);
        if (docUrlString != null && worksheetUrlString != null) {
            try {
                parseOneWorkSheet(
                    service,
                    project,
                    metadata,
                    job,
                    new URL(docUrlString),
                    new URL(worksheetUrlString),
                    limit,
                    options,
                    exceptions);
            } catch (MalformedURLException e) {
                e.printStackTrace();
                exceptions.add(e);
            }
        }
    }
    
    static public void parseOneWorkSheet(
        SpreadsheetService service,
        Project project,
        ProjectMetadata metadata,
        final ImportingJob job,
        URL docURL,
        URL worksheetURL,
        int limit,
        JSONObject options,
        List<Exception> exceptions) {
        
        try {
            WorksheetEntry worksheetEntry = service.getEntry(worksheetURL, WorksheetEntry.class);
            String spreadsheetName = docURL.toExternalForm();
            try {
                SpreadsheetEntry spreadsheetEntry = service.getEntry(docURL, SpreadsheetEntry.class);
                spreadsheetName = spreadsheetEntry.getTitle().getPlainText();
            } catch (ServiceException e) { // RedirectRequiredException among others
                // fall back to just using the URL (better for traceability anyway?)
            }
            
            String fileSource = spreadsheetName + " # " +
                worksheetEntry.getTitle().getPlainText();
            
            setProgress(job, fileSource, 0);
            TabularImportingParserBase.readTable(
                project,
                metadata,
                job,
                new WorksheetBatchRowReader(job, fileSource, service, worksheetEntry, 20),
                fileSource,
                limit,
                options,
                exceptions
            );
            setProgress(job, fileSource, 100);
        } catch (IOException e) {
            e.printStackTrace();
            exceptions.add(e);
        } catch (ServiceException e) {
            e.printStackTrace();
            exceptions.add(e);
        }
    }
    
    static private void setProgress(ImportingJob job, String fileSource, int percent) {
        job.setProgress(percent, "Reading " + fileSource);
    }
    
    static private class WorksheetBatchRowReader implements TableDataReader {
        final ImportingJob job;
        final String fileSource;
        
        final SpreadsheetService service;
        final WorksheetEntry worksheet;
        final int batchSize;
        
        final int totalRows;
        
        int nextRow = 0; // 0-based
        int batchRowStart = 0; // 0-based
        List<List<Object>> rowsOfCells = null;
        
        public WorksheetBatchRowReader(ImportingJob job, String fileSource,
                SpreadsheetService service, WorksheetEntry worksheet,
                int batchSize) {
            this.job = job;
            this.fileSource = fileSource;
            this.service = service;
            this.worksheet = worksheet;
            this.batchSize = batchSize;
            
            this.totalRows = worksheet.getRowCount();
        }
        
        @Override
        public List<Object> getNextRowOfCells() throws IOException {
            if (rowsOfCells == null || (nextRow >= batchRowStart + rowsOfCells.size() && nextRow < totalRows)) {
                int newBatchRowStart = batchRowStart + (rowsOfCells == null ? 0 : rowsOfCells.size());
                try {
                    rowsOfCells = getRowsOfCells(
                        service,
                        worksheet,
                        newBatchRowStart + 1, // convert to 1-based
                        batchSize);
                    
                    batchRowStart = newBatchRowStart;
                    
                    setProgress(job, fileSource, batchRowStart * 100 / totalRows);
                } catch (ServiceException e) {
                    throw new IOException(e);
                }
            }
            
            if (rowsOfCells != null && nextRow - batchRowStart < rowsOfCells.size()) {
                return rowsOfCells.get(nextRow++ - batchRowStart);
            } else {
                return null;
            }
        }
        
        
        List<List<Object>> getRowsOfCells(
            SpreadsheetService service,
            WorksheetEntry worksheet,
            int startRow, // 1-based
            int rowCount
        ) throws IOException, ServiceException {
            URL cellFeedUrl = worksheet.getCellFeedUrl();
            
            int minRow = startRow;
            int maxRow = Math.min(worksheet.getRowCount(), startRow + rowCount - 1);
            int cols = worksheet.getColCount();
            int rows = worksheet.getRowCount();
            
            CellQuery cellQuery = new CellQuery(cellFeedUrl);
            cellQuery.setMinimumRow(minRow);
            cellQuery.setMaximumRow(maxRow);
            cellQuery.setMaximumCol(cols);
            cellQuery.setMaxResults(rows * cols);
            cellQuery.setReturnEmpty(false);
            
            CellFeed cellFeed = service.query(cellQuery, CellFeed.class);
            List<CellEntry> cellEntries = cellFeed.getEntries();
            
            List<List<Object>> rowsOfCells = new ArrayList<List<Object>>(rowCount);
            for (CellEntry cellEntry : cellEntries) {
                Cell cell = cellEntry.getCell();
                if (cell != null) {
                    int row = cell.getRow() - startRow;
                    int col = cell.getCol() - 1;
                    
                    while (row >= rowsOfCells.size()) {
                        rowsOfCells.add(new ArrayList<Object>());
                    }
                    List<Object> rowOfCells = rowsOfCells.get(row);
                    
                    while (col >= rowOfCells.size()) {
                        rowOfCells.add(null);
                    }
                    rowOfCells.set(col, cell.getValue());
                }
            }
            return rowsOfCells;
        }
    }
}