package org.aksw.sparqlify.csv; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; import java.io.Reader; import java.sql.ResultSet; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Map.Entry; import org.aksw.jena_sparql_api.core.ConstructIterator; import org.aksw.jena_sparql_api.core.ResultSetCloseable; import org.aksw.jena_sparql_api.utils.QuadPatternUtils; import org.aksw.jena_sparql_api.utils.SparqlFormatterUtils; import org.aksw.jena_sparql_api.views.RestrictedExpr; import org.aksw.jena_sparql_api.views.SparqlSubstitute; import org.aksw.sparqlify.config.lang.TemplateConfigParser; import org.aksw.sparqlify.config.syntax.NamedViewTemplateDefinition; import org.aksw.sparqlify.config.syntax.TemplateConfig; import org.aksw.sparqlify.config.syntax.ViewTemplateDefinition; import org.aksw.sparqlify.core.ResultSetSparqlify; import org.aksw.sparqlify.core.sparql.IteratorResultSetSparqlifyBinding; import org.aksw.sparqlify.util.SparqlifyCoreInit; import org.aksw.sparqlify.validation.LoggerCount; import org.aksw.sparqlify.web.SparqlifyCliHelper; import org.antlr.runtime.RecognitionException; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.Options; import org.apache.commons.lang.StringUtils; import org.apache.jena.graph.Node; import org.apache.jena.graph.Triple; import org.apache.jena.sparql.core.BasicPattern; import org.apache.jena.sparql.core.QuadPattern; import org.apache.jena.sparql.core.Var; import org.apache.jena.sparql.core.VarExprList; import org.apache.jena.sparql.engine.binding.Binding; import org.apache.jena.sparql.expr.Expr; import org.apache.jena.sparql.syntax.Template; import org.h2.tools.Csv; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.HashMultimap; import com.google.common.collect.Multimap; import com.google.common.io.InputSupplier; import au.com.bytecode.opencsv.CSVReader; import jxl.Sheet; import jxl.Workbook; import jxl.WorkbookSettings; import jxl.read.biff.BiffException; public class CsvMapperCliMain { private static final Logger logger = LoggerFactory .getLogger(CsvMapperCliMain.class); private static final Options cliOptions = new Options(); public static File extractFile(CommandLine commandLine, String optionName) { // Option option = commandLine.getO(optionName); String filename = commandLine.getOptionValue(optionName); String optionLabel = optionName; if (filename == null) { logger.error("No file given for option: " + optionLabel); SparqlifyCliHelper.printHelpAndExit(cliOptions, -1); } File file = new File(filename); if (!file.exists()) { logger.error("File given as argument for option " + optionLabel + " does not exist: " + file.getAbsolutePath()); SparqlifyCliHelper.printHelpAndExit(cliOptions, -1); } return file; } public static boolean isNullOrVar(Node node) { return node == null || node.isVariable(); } public static boolean containsNullOrVar(Triple triple) { boolean s = isNullOrVar(triple.getSubject()); boolean p = isNullOrVar(triple.getPredicate()); boolean o = isNullOrVar(triple.getObject()); boolean result = s || p || o; return result; } public static void countVariable(Node node, Map<Var, Integer> countMap) { if(node == null) { //MapUtils.increment(countMap, null); countMap.merge(null, 1, Integer::sum); } else if(node.isVariable()) { //MapUtils.increment(countMap, (Var)node); countMap.merge((Var)node, 1, Integer::sum); } } public static void show(Reader reader) { BufferedReader br = new BufferedReader(reader); String line; try { while((line = br.readLine()) != null) { System.out.println(line); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static TemplateConfig readTemplateConfig(InputStream in, Logger loggerCount) throws IOException, RecognitionException { TemplateConfigParser parser = new TemplateConfigParser(); TemplateConfig config; try { config = parser.parse(in, loggerCount); } finally { in.close(); } return config; } public static Character parseChar(String str) { Character result; str = str.trim(); if(str.startsWith("\\")) { String charValue = str.substring(1); int val = Integer.parseInt(charValue); if(val < 0 || val > Character.MAX_VALUE) { throw new RuntimeException("Character value must be in the range 0-" + (int)Character.MAX_VALUE); } result = (char)val; } else if(str.startsWith("0x")) { String hex = str.substring(2); int val = 0; for (int i = 0; i < hex.length(); ++i) { String s = hex.substring(i, i + 1); char part = (char)Integer.parseInt(s, 16); val <<= 4; val |= part; if(val < 0 || val > Character.MAX_VALUE) { throw new RuntimeException("Character must be in the range 0x0-0x" + Integer.toHexString(Character.MAX_VALUE)); } } result = (char)val; } else if(str.length() > 1) { throw new RuntimeException("Only a singe character allowed."); } else { result = str.charAt(0); } return result; } public static Character getChar(Logger logger, CommandLine commandLine, String opt) { Character result = null; String resultStr = commandLine.getOptionValue(opt, null); if(!StringUtils.isEmpty(resultStr)) { try { result = parseChar(resultStr); } catch(Exception e) { logger.error("Error parsing command line argument -" + opt + " " + resultStr + ": " + e.getClass().getSimpleName() + " for " + e.getMessage()); } } return result; } @SuppressWarnings("static-access") public static void main(String[] args) throws Exception { /* PropertyConfigurator.configure("log4j.properties"); LogManager.getLogManager().readConfiguration( new FileInputStream("jdklog.properties")); */ CommandLineParser cliParser = new GnuParser(); // cliOptions.addOption("P", "port", true, "Server port"); // cliOptions.addOption("C", "context", true, // "Context e.g. /sparqlify"); // cliOptions.addOption("B", "backlog", true, // "Maximum number of connections"); // // cliOptions.addOption("t", "type", true, // "Database type (posgres, mysql,...)"); // cliOptions.addOption("d", "database", true, "Database name"); // cliOptions.addOption("u", "username", true, ""); // cliOptions.addOption("p", "password", true, ""); // cliOptions.addOption("h", "hostname", true, ""); cliOptions.addOption("c", "config", true, "Sparqlify config file"); cliOptions.addOption("f", "config", true, "Input data file"); cliOptions.addOption("v", "config", true, "View name (only needed if config contains more than one view)"); cliOptions.addOption("h", "config", false, "Use first row as headers"); cliOptions.addOption("s", "config", true, "CSV field separator (default is ',')"); cliOptions.addOption("d", "config", true, "CSV field delimiter (default is '\"')"); cliOptions.addOption("e", "config", true, "CSV field escape delimiter (escapes the field delimiter) (default is '\\')"); // TODO NULL value string -n // TODO offset -t (top) // TODO limit -b (bottom) CommandLine commandLine = cliParser.parse(cliOptions, args); File configFile = extractFile(commandLine, "c"); File dataFile = extractFile(commandLine, "f"); String viewName = StringUtils.trim(commandLine.getOptionValue("v")); LoggerCount loggerCount = new LoggerCount(logger); CsvParserConfig csvConfig = new CsvParserConfig(); csvConfig.setFieldDelimiter(getChar(loggerCount, commandLine, "d")); csvConfig.setEscapeCharacter(getChar(loggerCount, commandLine, "e")); csvConfig.setFieldSeparator(getChar(loggerCount, commandLine, "s")); boolean useFirstRowAsHeaderNames = commandLine.hasOption("h"); InputStream in = new FileInputStream(configFile); TemplateConfig config; try { config = readTemplateConfig(in, loggerCount); } finally { in.close(); } logger.info("Errors: " + loggerCount.getErrorCount() + ", Warnings: " + loggerCount.getWarningCount()); if(loggerCount.getErrorCount() > 0) { throw new RuntimeException("Encountered " + loggerCount.getErrorCount() + " errors that need to be fixed first."); } List<NamedViewTemplateDefinition> views = config.getDefinitions(); if(views.isEmpty()) { logger.warn("No view definitions found"); } // Index the views by name Map<String, NamedViewTemplateDefinition> viewIndex = indexViews(views, loggerCount); ViewTemplateDefinition view; view = pickView(viewIndex, viewName); // if(view == null) { // logger.error("View '" + viewName + "' not found in config file"); // System.exit(1); // } Reader fileReader = new FileReader(dataFile); //convertCsvToRdf(fileReader, view); InputSupplier<CSVReader> csvReaderSupplier = new InputSupplierCSVReader(dataFile, csvConfig); ResultSet resultSet = createResultSetFromCsv(csvReaderSupplier, useFirstRowAsHeaderNames, 100); //csv.setEscapeCharacter('/'); //ResultSet resultSet = csv.read(fileReader, null); TripleIteratorTracking trackingIt = createTripleIterator(resultSet, view); //writeTriples(System.out, trackingIt); SparqlFormatterUtils.writeText(System.out, trackingIt); writeSummary(System.err, trackingIt.getState()); //convertCsvToRdf(resultSet, view); } public static Map<String, NamedViewTemplateDefinition> indexViews(List<NamedViewTemplateDefinition> views, Logger loggerCount) { // Index the views by name Map<String, NamedViewTemplateDefinition> nameToView = new HashMap<String, NamedViewTemplateDefinition>(); for(NamedViewTemplateDefinition view : views) { String name = view.getName(); if(nameToView.containsKey(name)) { loggerCount.warn("Omitting duplicate view definition: " + name); } nameToView.put(name, view); } return nameToView; } public static ViewTemplateDefinition pickView(Map<String, NamedViewTemplateDefinition> index, String viewName) { ViewTemplateDefinition view = null; if(StringUtils.isEmpty(viewName)) { if(index.size() == 1) { view = index.values().iterator().next(); } else { throw new RuntimeException("Multiple views exist. Please specify which one to use"); //logger.error("Multiple views present in config file; please specify which to use"); //printHelpAndExit(1); } } else { view = index.get(viewName); if(view == null) { throw new RuntimeException("View '" + viewName + "' not found"); // //logger.error("View '" + viewName + "' not found in config file"); // //System.exit(1); // } } return view; } /** * * @param readerSupplier We need to read the file twice: Once for figuring out the column headers - and if there are none, again for the data * @param fieldSeparator * @param quoteCharacter * @return * @throws IOException */ public static ResultSet createResultSetFromCsv(InputSupplier<? extends CSVReader> csvReaderSupplier, boolean useHeaders, Integer sampleSize) throws IOException { sampleSize = (sampleSize == null) ? 100 : sampleSize; CSVReader headerReader = csvReaderSupplier.getInput(); List<String> columnNames = new ArrayList<String>(); try { String row[]; int i = 0; while((row = headerReader.readNext()) != null && (i < sampleSize)) { if(i == 0 && useHeaders) { columnNames.addAll(Arrays.asList(row)); } int delta = row.length - columnNames.size(); for(int j = 0; j < delta; ++j) { columnNames.add("" + (i + j)); } ++i; } } finally { headerReader.close(); } CSVReader dataReader = csvReaderSupplier.getInput(); if(useHeaders) { // Skip header row dataReader.readNext(); } Reader reader = new ReaderCSVReader(dataReader); // BufferedReader br = new BufferedReader(reader); // String tmp; // while((tmp = br.readLine()) != null) { // System.out.println(tmp); // } Csv csv = new Csv(); csv.setEscapeCharacter('\\'); String[] colNames = columnNames.toArray(new String[0]); ResultSet result = csv.read(reader, colNames); logger.debug("Detected column names: " + columnNames); return result; } public static TripleIteratorTracking createTripleIterator(ResultSet rs, ViewTemplateDefinition view) { //System.out.println("Test here"); // TODO Move the method to a better place SparqlifyCoreInit.initSparqlifyFunctions(); //ResultSetMetaData meta = rs.getMetaData(); /* for(int i = 1; i <= meta.getColumnCount(); ++i) { System.out.println(meta.getColumnName(i)); } */ VarExprList varExprs = view.getVarExprList(); List<String> vars = new ArrayList<String>(); for(Var var : varExprs.getVars()) { vars.add(var.getName()); } Multimap<Var, RestrictedExpr> sparqlVarMap = HashMultimap.create(); for(Entry<Var, Expr> entry : varExprs.getExprs().entrySet()) { Expr e = SparqlSubstitute.substituteExpr(entry.getValue()); //Expr e = FunctionExpander.transform(ex); //System.out.println(e); sparqlVarMap.put(entry.getKey(), new RestrictedExpr(e)); } Iterator<Binding> itBinding = new IteratorResultSetSparqlifyBinding(null, rs, sparqlVarMap, 1, "rowId"); ResultSetSparqlify rss = new ResultSetSparqlify(itBinding, vars, 0); // insertPrefixesInto(result) ; //Template template = view.getConstructTemplate(); QuadPattern quadPattern = view.getConstructTemplate(); BasicPattern bgp = QuadPatternUtils.toBasicPattern(quadPattern); Template template = new Template(bgp); //System.out.println(template.getTriples()); ResultSetCloseable closableRs = new ResultSetCloseable(rss); Iterator<Triple> it = new ConstructIterator(template, closableRs); TripleIteratorTracking result = new TripleIteratorTracking(it); return result; } public static void writeSummary(PrintStream out, TripleIteratorState state) { int totalTripleCount = state.getTotalTripleCount(); int tripleGenCount = state.getTripleGenCount(); Map<Var, Integer> varCountMap = state.getVarCountMap(); int omittedTripleCount = totalTripleCount - tripleGenCount; System.err.println("Variable\t#Unbound"); for(Entry<Var, Integer> entry : varCountMap.entrySet()) { Var var = entry.getKey(); Integer count = entry.getValue(); System.err.println(var + ":\t" + count); } System.err.println("Triples generated:\t" + tripleGenCount); System.err.println("Potential triples omitted:\t" + omittedTripleCount); System.err.println("Triples total:\t" + totalTripleCount); } // public static void writeTriples(PrintStream out, Iterator<Triple> it) { // // while(it.hasNext()) { // // Triple t = it.next(); // String str = TripleUtils.toNTripleString(t); // // out.println(str); // } /* System.exit(0); for (ViewTemplateDefinition x : config.getDefinitions()) { System.out.println(x); // x.getConstructTemplate(). VarExprList vel = x.getVarExprList(); System.out.println(vel); }*/ // // } public static Iterator<List<String>> getCsvIterator(File file, String columnSeparator) throws FileNotFoundException { // BufferedReader reader = new BufferedReader(new InputStreamReader(new // FileInputStream(file))); Iterator<List<String>> result = new CsvRowIterator(file); return result; } public static Iterator<List<String>> getXlsCsvIterator(File file, int sheetIndex) throws BiffException, IOException { WorkbookSettings ws = new WorkbookSettings(); ws.setLocale(new Locale("en", "EN")); Workbook w = Workbook.getWorkbook(file, ws); // File f = new File("/tmp/new.csv"); // OutputStream os = (OutputStream) new FileOutputStream(f); // String encoding = "UTF8"; // OutputStreamWriter osw = new OutputStreamWriter(os, encoding); // BufferedWriter bw = new BufferedWriter(osw); if (sheetIndex >= w.getNumberOfSheets()) { throw new IndexOutOfBoundsException("Attemp to access sheet " + sheetIndex + "/" + w.getNumberOfSheets()); } Sheet s = w.getSheet(sheetIndex); Iterator<List<String>> result = new XlsRowIterator<String>(s, 0, CellToStringTransformer.getInstance(), w); return result; } }