/***********************************************************************************************************************
 *
 * Copyright (C) 2010 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/

package eu.stratosphere.addons.hbase.example;

import java.nio.charset.Charset;

import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;

import eu.stratosphere.addons.hbase.TableInputFormat;
import eu.stratosphere.addons.hbase.common.HBaseKey;
import eu.stratosphere.addons.hbase.common.HBaseResult;
import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.GenericDataSource;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;

/**
 * Example program that reads the HBase table {@code twitter} and writes, for every row, the row key
 * and the columns {@code meta:user}, {@code meta:timestamp} and {@code text:tweet} to a file sink as
 * one space-separated line per row.
 */
public class HBaseReadExample implements Program, ProgramDescription {

	/**
	 * Table input format that scans the three example columns and maps each HBase result to a
	 * four-field record: (row key, user, timestamp, tweet).
	 */
	public static class MyTableInputFormat extends TableInputFormat {

		private static final long serialVersionUID = 1L;

		/** Explicit charset for the column names so they do not depend on the platform default. */
		private static final Charset UTF_8 = Charset.forName("UTF-8");

		private static final byte[] META_FAMILY = "meta".getBytes(UTF_8);

		private static final byte[] USER_COLUMN = "user".getBytes(UTF_8);

		private static final byte[] TIMESTAMP_COLUMN = "timestamp".getBytes(UTF_8);

		private static final byte[] TEXT_FAMILY = "text".getBytes(UTF_8);

		private static final byte[] TWEET_COLUMN = "tweet".getBytes(UTF_8);

		/** Stand-in for a cell that is absent from a row. */
		private static final byte[] NO_BYTES = new byte[0];

		// Value holders that are refilled and handed to the record on every call of
		// mapResultToRecord, instead of creating new StringValue objects per row.
		StringValue row_string = new StringValue();
		StringValue user_string = new StringValue();
		StringValue timestamp_string = new StringValue();
		StringValue tweet_string = new StringValue();

		public MyTableInputFormat() {
			super();
		}

		/**
		 * Delegates to the parent implementation; kept as the hook where a custom table
		 * could be created.
		 *
		 * @param parameters the configuration passed to the input format
		 * @return the table created by the parent class
		 */
		@Override
		protected HTable createTable(Configuration parameters) {
			return super.createTable(parameters);
		}

		/**
		 * Builds a scan restricted to the three columns this example reads.
		 *
		 * @param parameters the configuration passed to the input format (not used here)
		 * @return a scan over {@code meta:user}, {@code meta:timestamp} and {@code text:tweet}
		 */
		@Override
		protected Scan createScanner(Configuration parameters) {
			Scan scan = new Scan();
			scan.addColumn(META_FAMILY, USER_COLUMN);
			scan.addColumn(META_FAMILY, TIMESTAMP_COLUMN);
			scan.addColumn(TEXT_FAMILY, TWEET_COLUMN);
			return scan;
		}

		/**
		 * Copies the row key and the three scanned columns of one HBase result into fields 0-3
		 * of the record. A column that has no cell in the row is written as an empty string.
		 *
		 * @param record the record to fill
		 * @param key the HBase key of the row (not used here)
		 * @param result wrapper around the HBase result for the row
		 */
		@Override
		public void mapResultToRecord(Record record, HBaseKey key, HBaseResult result) {
			Result res = result.getResult();
			record.setField(0, toStringValue(row_string, res.getRow()));
			record.setField(1, toStringValue(user_string, res.getValue(META_FAMILY, USER_COLUMN)));
			record.setField(2, toStringValue(timestamp_string, res.getValue(META_FAMILY, TIMESTAMP_COLUMN)));
			record.setField(3, toStringValue(tweet_string, res.getValue(TEXT_FAMILY, TWEET_COLUMN)));
		}

		/**
		 * Fills the given holder from raw cell bytes and returns it.
		 *
		 * @param string the holder to overwrite
		 * @param bytes the cell content, or {@code null} if the row has no such cell
		 * @return the same holder that was passed in
		 */
		private StringValue toStringValue(StringValue string, byte[] bytes) {
			// Result.getValue returns null for a column without a cell; treat that as empty
			// rather than failing on bytes.length.
			final byte[] content = (bytes == null) ? NO_BYTES : bytes;
			// NOTE(review): setValueAscii presumably treats every byte as one character, so
			// non-ASCII cell content would not be decoded as UTF-8 - confirm the stored encoding.
			string.setValueAscii(content, 0, content.length);
			return string;
		}
	}

	/**
	 * Assembles the plan: an HBase source on table {@code twitter} feeding a CSV file sink.
	 *
	 * @param args optional arguments: {@code args[0]} is the default parallelism (1 if absent),
	 *        {@code args[1]} is the output path (empty string if absent)
	 * @return the plan that dumps the table to the output path
	 */
	@Override
	public Plan getPlan(String... args) {
		// parse job parameters
		int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
		String output = (args.length > 1 ? args[1] : "");

		GenericDataSource<TableInputFormat> source =
				new GenericDataSource<TableInputFormat>(new MyTableInputFormat(), "HBase Input");
		source.setParameter(TableInputFormat.INPUT_TABLE, "twitter");
		source.setParameter(TableInputFormat.CONFIG_LOCATION, "/etc/hbase/conf/hbase-site.xml");

		FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, source, "HBase String dump");
		CsvOutputFormat.configureRecordFormat(out)
			.recordDelimiter('\n')
			.fieldDelimiter(' ')
			.field(StringValue.class, 0)
			.field(StringValue.class, 1)
			.field(StringValue.class, 2)
			.field(StringValue.class, 3);

		Plan plan = new Plan(out, "HBase access Example");
		plan.setDefaultParallelism(numSubTasks);
		return plan;
	}

	/**
	 * Returns the usage string; it lists the arguments in the order {@link #getPlan(String...)}
	 * reads them.
	 *
	 * @return a one-line description of the program parameters
	 */
	@Override
	public String getDescription() {
		return "Parameters: [numSubTasks] [output]";
	}
}