LearnStructure.java example

Explorer
RecordBreaker-master
- src
/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.learnstructure;

import java.io.*;
import java.util.*;
import org.apache.avro.Schema;
import org.apache.avro.io.JsonEncoder;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumWriter;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.conf.Configuration;

/*********************************************************
 * LearnStructure is the main file for figuring out pattern-extractors and schemas for a text file.
 *
 * This code operates on a raw text file and emits the extractors/schemas.  The user
 * may decide to remove certain extractors/schemas if they only apply to a tiny number of
 * potential lines in the target text file.
 *
 *********************************************************/
public class LearnStructure {
  public static String SCHEMA_FILENAME = "schema.json";
  public static String JSONDATA_FILENAME = "data.avro.json";
  public static String DATA_FILENAME = "data.avro";
  public static String PARSER_FILENAME = "parser.dat";

  public LearnStructure() {
  }

  /**
   * Tokenizes the text file <code>p</code>, infers a type tree from the tokenized lines,
   * and writes whichever outputs were requested.  Every output path is optional: passing
   * <code>null</code> skips that output.
   *
   * @param fs            filesystem the input file is read from
   * @param p             input text file; one line is treated as one chunk
   * @param fs2           filesystem all outputs are written to
   * @param schemaFile    destination for the pretty-printed Avro schema, or null
   * @param parseTreeFile destination for the serialized type tree, or null
   * @param jsonDataFile  destination for the JSON-encoded parsed records, or null
   * @param avroDataFile  destination for the Avro data file of parsed records, or null
   * @param verbose       if true, timing and parse statistics are printed to stderr
   * @param maxLines      if non-negative, at most this many input lines are tokenized for
   *                      inference.  The limit applies to inference only; the JSON and Avro
   *                      passes always read the whole input file.
   * @throws IOException if reading the input or writing any output fails
   */
  public void inferRecordFormat(FileSystem fs, Path p, FileSystem fs2, Path schemaFile, Path parseTreeFile, Path jsonDataFile, Path avroDataFile, boolean verbose, int maxLines) throws IOException {
    //
    // Transform the text into a list of "chunks".  A single chunk corresponds to a line of text.  A chunk is a list of Tokens.
    //
    long startRead = System.currentTimeMillis();
    List<List<Token.AbstractToken>> allChunks = tokenizeLines(fs, p, maxLines);

    //
    // Infer type structure from the tokenized chunks
    //
    long start = System.currentTimeMillis();
    InferredType typeTree = TypeInference.infer(allChunks);
    long end = System.currentTimeMillis();
    double loadTime = (start - startRead) / 1000.0;
    double inferTime = (end - start) / 1000.0;
    double totalTime = (end - startRead) / 1000.0;
    if (verbose) {
      System.err.println("Number of chunks: " + allChunks.size());    
      System.err.println("Elapsed load time: " + loadTime);
      System.err.println("Elapsed inference time: " + inferTime);
      System.err.println("Total execution time: " + totalTime);
    }

    //
    // The existing type tree is now correct, but could probably be more succinct.
    // We can now improve/rewrite it.
    //

    //
    // Should every top-level type be ARRAY, so as to allow repeated log lines?
    // Or does the Avro format allow an implicit top-level repeating structure?
    //

    //
    // Dump the results.  We emit:
    // 1) A JSON/Avro schema
    // 2) A serialized parser program that can consume data and emit Avro files using the given schema
    //
    Schema s = typeTree.getAvroSchema();
    if (schemaFile != null) {
      writeSchema(fs2, schemaFile, s);
    }
    if (parseTreeFile != null) {
      writeParseTree(fs2, parseTreeFile, typeTree);
    }

    //
    // Apply the typetree's parser.
    //
    if (jsonDataFile != null) {
      writeJsonData(fs, p, fs2, jsonDataFile, typeTree);
    }
    if (avroDataFile != null) {
      writeAvroData(fs, p, fs2, avroDataFile, typeTree, verbose);
    }
  }

  /**
   * Opens <code>p</code> on <code>fs</code> as a buffered line reader.
   *
   * NOTE(review): the reader uses the platform default charset; confirm whether the
   * input should be decoded with an explicit charset instead.
   */
  private static BufferedReader openReader(FileSystem fs, Path p) throws IOException {
    return new BufferedReader(new InputStreamReader(fs.open(p)));
  }

  /**
   * Tokenizes the input line by line and returns the token lists of the lines that
   * tokenized successfully.  Lines for which the tokenizer returns null are skipped.
   * Stops after <code>maxLines</code> lines when <code>maxLines</code> is non-negative.
   */
  private static List<List<Token.AbstractToken>> tokenizeLines(FileSystem fs, Path p, int maxLines) throws IOException {
    List<List<Token.AbstractToken>> allChunks = new ArrayList<List<Token.AbstractToken>>();
    BufferedReader in = openReader(fs, p);
    try {
      int lineno = 0;
      for (String line = in.readLine(); line != null; line = in.readLine()) {
        if (maxLines >= 0 && lineno >= maxLines) {
          break;
        }
        List<Token.AbstractToken> chunkToks = Tokenizer.tokenize(line);
        if (chunkToks != null) {
          allChunks.add(chunkToks);
        }
        lineno++;
      }
    } finally {
      in.close();
    }
    return allChunks;
  }

  /**
   * Writes the pretty-printed form of <code>schema</code> to <code>schemaFile</code>.
   */
  private static void writeSchema(FileSystem fs2, Path schemaFile, Schema schema) throws IOException {
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs2.create(schemaFile)));
    try {
      out.write(schema.toString(true));
    } finally {
      out.close();
    }
  }

  /**
   * Serializes the type tree to <code>parseTreeFile</code> via its own write() method.
   */
  private static void writeParseTree(FileSystem fs2, Path parseTreeFile, InferredType typeTree) throws IOException {
    DataOutputStream outd = new DataOutputStream(new BufferedOutputStream(fs2.create(parseTreeFile)));
    try {
      typeTree.write(outd);
    } finally {
      outd.close();
    }
  }

  /**
   * Parses every input line with the type tree and writes each successfully parsed
   * record to <code>jsonDataFile</code> using Avro's JSON encoding.  Lines for which the
   * parser returns null are silently skipped.
   */
  private static void writeJsonData(FileSystem fs, Path p, FileSystem fs2, Path jsonDataFile, InferredType typeTree) throws IOException {
    Schema schema = typeTree.getAvroSchema();
    GenericDatumWriter<GenericContainer> jsonGDWriter = new GenericDatumWriter<GenericContainer>(schema);
    BufferedOutputStream outJson = new BufferedOutputStream(fs2.create(jsonDataFile));
    try {
      // The encoder is created inside the try so that the output stream is closed
      // even when encoder construction fails.
      JsonEncoder encoder = EncoderFactory.get().jsonEncoder(schema, outJson);
      try {
        BufferedReader in = openReader(fs, p);
        try {
          for (String str = in.readLine(); str != null; str = in.readLine()) {
            GenericContainer gct = typeTree.parse(str);
            if (gct != null) {
              jsonGDWriter.write(gct, encoder);
            }
          }
        } finally {
          in.close();
        }
      } finally {
        // Flush whatever was encoded, even on failure, so partial output is not lost.
        encoder.flush();
      }
    } finally {
      // Runs even if the flush above throws, so the stream never leaks.
      outJson.close();
    }
  }

  /**
   * Parses every input line with the type tree and appends each successfully parsed
   * record to an Avro data file at <code>avroDataFile</code>.  When <code>verbose</code>
   * is set, unparsed lines and the final line counts are reported on stderr.
   */
  private static void writeAvroData(FileSystem fs, Path p, FileSystem fs2, Path avroDataFile, InferredType typeTree, boolean verbose) throws IOException {
    int numGoodParses = 0;
    int lineno = 0;
    Schema schema = typeTree.getAvroSchema();

    GenericDatumWriter<GenericContainer> gdWriter = new GenericDatumWriter<GenericContainer>(schema);
    DataFileWriter<GenericContainer> outData = new DataFileWriter<GenericContainer>(gdWriter);
    OutputStream rawOut = fs2.create(avroDataFile);
    boolean opened = false;
    try {
      outData = outData.create(schema, rawOut);
      opened = true;
    } finally {
      // If the writer never took ownership of the stream, close it here so it does not leak.
      if (!opened) {
        rawOut.close();
      }
    }

    try {
      BufferedReader in = openReader(fs, p);
      try {
        for (String str = in.readLine(); str != null; str = in.readLine()) {
          GenericContainer gct = typeTree.parse(str);
          if (gct != null) {
            numGoodParses++;
            outData.append(gct);
          } else {
            if (verbose) {
              System.err.println("unparsed line: '" + str + "'");
            }
          }
          lineno++;
        }
      } finally {
        in.close();
      }
    } finally {
      outData.close();
    }
    if (verbose) {
      System.err.println();
      System.err.println("Total # input lines: " + lineno);
      System.err.println("Total # lines parsed correctly: " + numGoodParses);
    }
  }

  //////////////////////////////////////////
  // main()
  //////////////////////////////////////////
  /**
   * Command-line entry point.  Reads the input file from the local filesystem, creates
   * the (not yet existing) output directory, and writes the schema and parser files into
   * it.  The JSON and Avro data files are written as well unless "-emitAvro" is given a
   * value other than "true".
   *
   * @param argv input-datafile, outdir, and an optional "-emitAvro" flag followed by its value
   * @throws IOException if the output directory already exists or cannot be created,
   *                     or if any read or write fails
   */
  public static void main(String argv[]) throws IOException {
    final String usage = "Usage: LearnStructure <input-datafile> <outdir> (-emitAvro (true)|false)";
    if (argv.length < 2) {
      System.err.println(usage);
      return;
    }
    FileSystem localFS = FileSystem.getLocal(new Configuration());
    boolean emitAvro = true;
    int i = 0;
    Path f = new Path(new File(argv[i++]).getCanonicalPath());
    File outdir = new File(argv[i++]).getCanonicalFile();
    for (; i < argv.length; i++) {
      if ("-emitAvro".equals(argv[i])) {
        // The flag needs a value; without this check a trailing "-emitAvro"
        // would index past the end of argv.
        if (i + 1 >= argv.length) {
          System.err.println(usage);
          return;
        }
        i++;
        emitAvro = "true".equals(argv[i]);
      }
    }

    System.err.println("Input file: " + f.toString());
    System.err.println("Output directory: " + outdir.getCanonicalPath());
    if (outdir.exists()) {
      throw new IOException("Output directory already exists: " + outdir);
    }
    if (!outdir.mkdirs()) {
      throw new IOException("Could not create output directory: " + outdir);
    }
    Path schemaFile = new Path(outdir.getCanonicalPath(), SCHEMA_FILENAME);
    Path parseTreeFile = new Path(outdir.getCanonicalPath(), PARSER_FILENAME);    
    Path jsonDataFile = null;
    Path avroDataFile = null;
    if (emitAvro) {
      jsonDataFile = new Path(outdir.getCanonicalPath(), JSONDATA_FILENAME);    
      avroDataFile = new Path(outdir.getCanonicalPath(), DATA_FILENAME);
    }

    LearnStructure ls = new LearnStructure();
    ls.inferRecordFormat(localFS, f, localFS, schemaFile, parseTreeFile, jsonDataFile, avroDataFile, true, -1);
  }
}