/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.knittingboar.conf.cmdline; import java.io.IOException; import java.io.PrintWriter; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; import org.apache.commons.cli2.builder.ArgumentBuilder; import org.apache.commons.cli2.builder.DefaultOptionBuilder; import org.apache.commons.cli2.builder.GroupBuilder; import org.apache.commons.cli2.commandline.Parser; import org.apache.commons.cli2.util.HelpFormatter; import com.cloudera.knittingboar.utils.DatasetConverter; public class DataConverterCmdLineDriver { private static String strInputFile; private static String strOutputFile; private static String strrecordsPerBlock; public static void main(String[] args) throws Exception { mainToOutput(args, new PrintWriter(System.out, true)); } static void mainToOutput(String[] args, PrintWriter output) throws Exception { if (!parseArgs(args)) { output.write("Parse:Incorrect"); return; } // if output.write("Parse:correct"); int shard_rec_count = Integer.parseInt(strrecordsPerBlock); System.out.println("Converting "); System.out.println("From: " + strInputFile); System.out.println("To: " + strOutputFile); System.out.println("File shard size (record count/file): " + shard_rec_count); int count = DatasetConverter.ConvertNewsgroupsFromSingleFiles(strInputFile, strOutputFile, shard_rec_count); output.write("Total Records Converted: " + count); } // mainToOutput private static boolean parseArgs(String[] args) throws IOException { DefaultOptionBuilder builder = new DefaultOptionBuilder(); Option help = builder.withLongName("help").withDescription( "print this list").create(); // Option quiet = // builder.withLongName("quiet").withDescription("be extra quiet").create(); // Option scores = // builder.withLongName("scores").withDescription("output score diagnostics during training").create(); ArgumentBuilder argumentBuilder = new ArgumentBuilder(); Option inputFileOption = builder .withLongName("input") .withRequired(true) .withArgument(argumentBuilder.withName("input").withMaximum(1).create()) .withDescription("where to get input data").create(); Option outputFileOption = builder.withLongName("output").withRequired(true) .withArgument( argumentBuilder.withName("output").withMaximum(1).create()) .withDescription("where to write output data").create(); Option recordsPerBlockOption = builder.withLongName("recordsPerBlock") .withArgument( argumentBuilder.withName("recordsPerBlock").withDefault("20000") .withMaximum(1).create()).withDescription( "the number of records per output file shard to write").create(); // optionally can be { 20Newsgroups, rcv1 } Option RecordFactoryType = builder.withLongName("datasetType") .withArgument( argumentBuilder.withName("recordFactoryType").withDefault( "20Newsgroups").withMaximum(1).create()).withDescription( "the type of dataset to convert").create(); /* * Option passes = builder.withLongName("passes") .withArgument( * argumentBuilder.withName("passes") .withDefault("2") * .withMaximum(1).create()) * .withDescription("the number of times to pass over the input data") * .create(); * * Option lambda = builder.withLongName("lambda") * .withArgument(argumentBuilder * .withName("lambda").withDefault("1e-4").withMaximum(1).create()) * .withDescription("the amount of coefficient decay to use") .create(); * * Option rate = builder.withLongName("rate") * .withArgument(argumentBuilder.withName * ("learningRate").withDefault("1e-3").withMaximum(1).create()) * .withDescription("the learning rate") .create(); * * Option noBias = builder.withLongName("noBias") * .withDescription("don't include a bias term") .create(); */ Group normalArgs = new GroupBuilder().withOption(help).withOption( inputFileOption).withOption(outputFileOption).withOption( recordsPerBlockOption).withOption(RecordFactoryType).create(); Parser parser = new Parser(); parser.setHelpOption(help); parser.setHelpTrigger("--help"); parser.setGroup(normalArgs); parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130)); CommandLine cmdLine = parser.parseAndHelp(args); if (cmdLine == null) { System.out.println("null!"); return false; } // "/Users/jpatterson/Downloads/datasets/20news-bydate/20news-bydate-train/" strInputFile = getStringArgument(cmdLine, inputFileOption); // "/Users/jpatterson/Downloads/datasets/20news-kboar/train4/" strOutputFile = getStringArgument(cmdLine, outputFileOption); strrecordsPerBlock = getStringArgument(cmdLine, recordsPerBlockOption); return true; } private static boolean getBooleanArgument(CommandLine cmdLine, Option option) { return cmdLine.hasOption(option); } private static String getStringArgument(CommandLine cmdLine, Option inputFile) { return (String) cmdLine.getValue(inputFile); } }