SchemaDictionary.java example

Explorer
RecordBreaker-master
- src
/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.schemadict;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;

import java.util.List;
import java.util.Random;
import java.util.ArrayList;

import org.apache.avro.Schema;

/******************************************
 * A SchemaDictionary holds a number of serialized SchemaDictionaryEntry objects, each of
 * which holds some schema info, a SchemaStatisticalSummary, and a user comment.
 *
 * A SchemaDictionary is meant to be the "clean schema reference" that helps users give
 * a name to novel schemas.
 *
 * The dictionary is backed by a directory on disk; every entry is identified by a
 * "file root", and entries are discovered by looking for files whose names end with
 * SchemaDictionaryEntry.SUMMARY_ENDING.
 *
 * @author mjc
 ******************************************/
public class SchemaDictionary {
  // Canonical path of the directory that backs this dictionary.
  File dir;
  // Source of random file roots for newly stored entries.
  Random r = new Random();
  // Entries loaded at construction time, plus any added since.
  List<SchemaDictionaryEntry> dictElts = new ArrayList<SchemaDictionaryEntry>();

  /**
   * Load the schema dictionary from the given directory, creating the directory
   * first if it does not exist yet.
   *
   * @param dir directory that holds (or will hold) the dictionary entries
   * @throws IOException if the directory cannot be created, cannot be listed
   *         (for example because the path is a regular file), or an entry fails to load
   */
  public SchemaDictionary(File dir) throws IOException {
    this.dir = dir.getCanonicalFile();
    if (! this.dir.exists()) {
      if (! this.dir.mkdirs()) {
        throw new IOException("Could not create: " + this.dir);
      }
    }

    File[] dictFiles = this.dir.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File parent, String name) {
        return name.endsWith(SchemaDictionaryEntry.SUMMARY_ENDING);
      }
    });
    // File.listFiles() returns null (rather than throwing) when the path is not a
    // directory or an I/O error occurs; surface that as an IOException, not an NPE.
    if (dictFiles == null) {
      throw new IOException("Could not list directory: " + this.dir);
    }

    for (File dictFile : dictFiles) {
      String name = dictFile.getName();
      // The file root is the file name with the summary suffix stripped off.
      String fileRoot = name.substring(0, name.length() - SchemaDictionaryEntry.SUMMARY_ENDING.length());
      SchemaDictionaryEntry sde = new SchemaDictionaryEntry();
      sde.loadDictionaryEntry(this.dir, fileRoot);
      dictElts.add(sde);
    }
  }

  /**
   * Store a novel dictionary element (which is constructed with the original datafile and a user's comment).
   *
   * The entry is written to disk under a randomly chosen, non-negative numeric file root
   * and only then added to the in-memory list, so a failed save leaves the list untouched.
   *
   * @param avroFile the data file the new entry is built from
   * @param infoText the user's comment for the new entry
   * @throws IOException if the entry cannot be built or saved
   */
  public synchronized void addDictionaryElt(File avroFile, String infoText) throws IOException {
    SchemaDictionaryEntry entry = new SchemaDictionaryEntry(avroFile, infoText);

    // Math.abs(nextInt()) would stay negative for Integer.MIN_VALUE, so draw from
    // [0, Integer.MAX_VALUE) instead. Redraw while an entry with the same root is
    // already present, so an existing entry is never overwritten.
    // NOTE(review): the collision check assumes saveDictionaryEntry() writes a file named
    // fileRoot + SUMMARY_ENDING, mirroring how the constructor discovers entries -- confirm.
    String fileRoot;
    do {
      fileRoot = Integer.toString(r.nextInt(Integer.MAX_VALUE));
    } while (new File(dir, fileRoot + SchemaDictionaryEntry.SUMMARY_ENDING).exists());

    entry.saveDictionaryEntry(dir, fileRoot);
    dictElts.add(entry);
  }

  /**
   * Iterate through objects already in the directory.
   *
   * @return the live internal list of entries (not a copy)
   */
  public List<SchemaDictionaryEntry> contents() {
    return dictElts;
  }

  /**
   * Print the command-line usage summary, followed by the required positional input.
   */
  private static void printUsage(Options options) {
    HelpFormatter fmt = new HelpFormatter();
    fmt.printHelp("SchemaDictionary", options, true);
    System.err.println("Required input: <schemadictionary>");
  }

  //////////////////////////////////////////
  // main()
  //////////////////////////////////////////
  /**
   * Command-line entry point.
   *
   * Usage: SchemaDictionary [-?] [-d] [-a datafile -m message] schemadictionary
   *
   * -d dumps the dictionary contents; -a together with -m adds a new entry.
   * Exits with status 0 on success or when help is requested, and with a
   * non-zero status when the command line is invalid.
   *
   * @throws IOException if the dictionary cannot be loaded or the new entry cannot be stored
   */
  public static void main(String argv[]) throws IOException {
    CommandLine cmd = null;
    Options options = new Options();
    options.addOption("?", false, "Help for command-line");
    options.addOption("d", false, "Dump contents of schema dictionary");
    options.addOption("a", true, "Add datafile to new schema dictionary element");
    options.addOption("m", true, "Add comment message as part of new schema dictionary element");

    try {
      CommandLineParser parser = new PosixParser();
      cmd = parser.parse(options, argv);
    } catch (ParseException e) {
      printUsage(options);
      System.exit(-1);
    }

    if (cmd.hasOption("?")) {
      printUsage(options);
      System.exit(0);
    }

    boolean shouldDump = cmd.hasOption("d");
    boolean hasDataFile = cmd.hasOption("a");
    boolean hasMessage = cmd.hasOption("m");
    // Adding an entry needs both the data file and the comment.
    boolean shouldAdd = hasDataFile && hasMessage;

    File avroDataFile = null;
    String dictMessage = null;
    if (hasDataFile) {
      avroDataFile = new File(cmd.getOptionValue("a")).getCanonicalFile();
    }
    if (hasMessage) {
      dictMessage = cmd.getOptionValue("m");
    }

    if ((! shouldAdd) && (hasDataFile || hasMessage)) {
      System.err.println("Must indicate -a AND -m to add new schema dictionary item");
      printUsage(options);
      // Invalid invocation: report failure to the caller, like the parse-error path.
      System.exit(-1);
    }

    String[] argArray = cmd.getArgs();
    if (argArray.length == 0) {
      System.err.println("No schema dictionary path provided.");
      printUsage(options);
      // Invalid invocation: report failure to the caller, like the parse-error path.
      System.exit(-1);
    }

    File dictionaryDir = new File(argArray[0]).getCanonicalFile();
    SchemaDictionary dict = new SchemaDictionary(dictionaryDir);

    if (shouldAdd) {
      dict.addDictionaryElt(avroDataFile, dictMessage);
    }

    if (shouldDump) {
      int counter = 1;
      for (SchemaDictionaryEntry entry: dict.contents()) {
        System.err.println("" + counter + ".  " + entry.getInfo());
        System.err.println(entry.getSchema());
        System.err.println();
        counter++;
      }
      int numItems = counter-1;
      System.err.println("Dictionary at " + dictionaryDir.getCanonicalPath() + " has " + numItems + " item(s).");
    }
  }
}