package de.l3s.common;
import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import com.google.common.collect.Lists;
import au.com.bytecode.opencsv.CSVReader;
/**
 * Command-line importer that loads tab-separated {@code .vtime} time-series
 * files into an HBase table.
 *
 * <p>Each file becomes one HBase row: the row key is derived from the file
 * name, every CSV column header (except the first) is used as a column
 * family, the first CSV column of each data row is used as the column
 * qualifier, and the remaining cells are the stored values.
 */
public class WikiEventsHBaseImport {

    /** HBase client configuration; populated by {@link #init()}. */
    Configuration conf;

    /** HBase admin handle; created by {@link #init()}, closed by {@link #main(String[])}. */
    HBaseAdmin admin;

    /**
     * Builds the HBase configuration (ZooKeeper quorum, client port, master
     * host) and opens the {@link HBaseAdmin} handle.
     *
     * @throws MasterNotRunningException if the HBase master cannot be reached
     * @throws ZooKeeperConnectionException if ZooKeeper cannot be reached
     */
    public void init() throws MasterNotRunningException, ZooKeeperConnectionException {
        conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "master.hadoop,node01.hadoop,node02.hadoop");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hbase.master", "master.hadoop");
        admin = new HBaseAdmin(conf);
    }

    /**
     * Opens a client handle for the given table using the configuration set
     * up by {@link #init()}. Despite its name this method only constructs an
     * {@link HTable}; it issues no table-creation request through
     * {@link #admin}.
     *
     * @param tableName name of the table, e.g. {@code WikiEvent}
     * @return a handle for {@code tableName}; the caller is responsible for closing it
     * @throws IOException if the handle cannot be created
     */
    public HTable createTable(String tableName) throws IOException {
        return new HTable(conf, tableName);
    }

    /**
     * Reads a tab-separated classpath resource from start to end. The parsed
     * rows are not retained or returned (the method is {@code void}), so it
     * effectively only checks that the resource exists and can be read.
     *
     * @param path classpath resource path, resolved relative to this class
     * @throws FileNotFoundException if no resource exists at {@code path}
     * @throws IOException if reading the resource fails
     */
    public void readCSV(String path) throws IOException {
        java.io.InputStream in = WikiEventsHBaseImport.class.getResourceAsStream(path);
        if (in == null) {
            // getResourceAsStream signals "not found" with null; fail with a clear message
            // instead of a NullPointerException from the reader constructor.
            throw new FileNotFoundException("Classpath resource not found: " + path);
        }
        CSVReader reader = new CSVReader(new InputStreamReader(in), '\t');
        try {
            while (reader.readNext() != null) {
                // rows are consumed only; see method documentation
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Writes the content of one tab-separated time-series file into HBase as
     * a single row.
     *
     * <p>Resulting schema:
     * <pre>
     * "event": {            // row key: file name without ".vtime", spaces replaced by "_"
     *   "lang": {           // column family: CSV column header
     *     "date1": value,   // column qualifier: first cell of a CSV row
     *     "date2": value
     *   }
     * }
     * </pre>
     *
     * <p>The first CSV line is treated as the header. If several data rows
     * share the same first cell, the last one read wins. Rows shorter than
     * the header contribute only the cells they actually have. Nothing is
     * written when the file is empty, contains no data cells, or cannot be
     * read completely; read and write failures are reported on standard
     * error rather than thrown.
     *
     * @param table HBase table to write to
     * @param csvFile tab-separated input file
     * @throws FileNotFoundException if {@code csvFile} cannot be opened
     */
    public void putTimeSeriesDataToHBase(HTable table, File csvFile) throws FileNotFoundException {
        // get the event name (row key) from the file name
        String eventName = csvFile.getName().replace(".vtime", "").replace(" ", "_");
        System.out.println(csvFile.getAbsolutePath());

        // key: date (first cell), value: complete CSV row
        HashMap<String, String[]> dataMap = new HashMap<String, String[]>();
        String[] titles = null;
        CSVReader reader = new CSVReader(new FileReader(csvFile), '\t');
        try {
            // first line holds the column titles
            titles = reader.readNext();
            String[] entry;
            while ((entry = reader.readNext()) != null) {
                if (entry.length == 0) {
                    continue;
                }
                dataMap.put(entry[0], entry);
            }
        } catch (IOException e) {
            // Do not store a partially read time series.
            System.err.println("Failed to read " + csvFile.getAbsolutePath() + "; nothing written");
            e.printStackTrace();
            return;
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        if (titles == null) {
            System.err.println("Skipping empty file " + csvFile.getAbsolutePath());
            return;
        }

        // Convert the column family names once instead of once per row.
        byte[][] families = new byte[titles.length][];
        for (int idx = 1; idx < titles.length; idx++) {
            families[idx] = Bytes.toBytes(titles[idx]);
        }

        // eventName is the row key
        Put put = new Put(Bytes.toBytes(eventName));
        int cellCount = 0;
        for (java.util.Map.Entry<String, String[]> row : dataMap.entrySet()) {
            byte[] qualifier = Bytes.toBytes(row.getKey());
            String[] values = row.getValue();
            // A row may be shorter than the header (e.g. a blank line); use only existing cells.
            int columns = Math.min(titles.length, values.length);
            for (int idx = 1; idx < columns; idx++) {
                // column family: lang, column qualifier: date
                put.add(families[idx], qualifier, Bytes.toBytes(values[idx]));
                cellCount++;
            }
        }

        if (cellCount == 0) {
            // A Put without any column carries no data; do not send it.
            System.err.println("No data cells in " + csvFile.getAbsolutePath() + "; nothing written");
            return;
        }

        try {
            table.put(put);
        } catch (IOException e) {
            System.err.println("Failed to write row '" + eventName + "' to HBase");
            e.printStackTrace();
        }
    }

    /**
     * Entry point: imports every {@code .vtime} file found below the
     * directory given as the first argument into the {@code WikiEvents}
     * table. When several files share the same name, only one of them is
     * imported.
     *
     * @param args {@code args[0]} is the input directory
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: WikiEventsHBaseImport <input-directory>");
            return;
        }
        File inputDir = new File(args[0]);
        if (!inputDir.isDirectory()) {
            System.err.println("Not a directory: " + inputDir.getAbsolutePath());
            return;
        }

        WikiEventsHBaseImport we = new WikiEventsHBaseImport();
        HTable htable = null;
        try {
            try {
                we.init();
                htable = we.createTable("WikiEvents");
            } catch (MasterNotRunningException e) {
                e.printStackTrace();
            } catch (ZooKeeperConnectionException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
            if (htable == null) {
                // Without a table handle every put would fail; stop here.
                System.err.println("Could not open HBase table 'WikiEvents'; aborting import");
                return;
            }

            // recursively collect all files in the directory tree, one per distinct file name
            Set<String> seenNames = new HashSet<String>();
            Set<File> uniqueFiles = new HashSet<File>();
            for (File f : we.listFileTree(inputDir)) {
                if (seenNames.add(f.getName())) {
                    uniqueFiles.add(f);
                }
            }

            // dump data
            for (File f : uniqueFiles) {
                try {
                    we.putTimeSeriesDataToHBase(htable, f);
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            // Release client resources even when the import stops early.
            if (htable != null) {
                try {
                    htable.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (we.admin != null) {
                try {
                    we.admin.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Recursively collects all regular files accepted by {@link #filter}
     * below the given directory.
     *
     * @param dir directory to start from
     * @return the matching files; empty if {@code dir} is {@code null}, is
     *         not a directory, or cannot be listed
     */
    public Set<File> listFileTree(File dir) {
        Set<File> fileTree = new HashSet<File>();
        // File.listFiles returns null (not an empty array) for non-directories and I/O errors.
        File[] children = (dir == null) ? null : dir.listFiles(filter);
        if (children == null) {
            return fileTree;
        }
        for (File child : children) {
            if (child.isFile()) {
                fileTree.add(child);
            } else {
                fileTree.addAll(listFileTree(child));
            }
        }
        return fileTree;
    }

    /** Accepts directories (so the traversal can descend) and files ending in {@code .vtime}. */
    final FileFilter filter = new FileFilter() {
        @Override
        public boolean accept(File file) {
            return file.isDirectory() || file.getName().endsWith(".vtime");
        }
    };

    /**
     * Runs a scan restricted to one column and returns all matching rows.
     *
     * <p>Note that the column restriction is added to the {@link Scan}
     * instance passed in, so the caller's object is modified. An
     * {@link IOException} during scanning is printed and the rows collected
     * up to that point are returned.
     *
     * @param s scan to execute; receives the additional column restriction
     * @param family column family to scan
     * @param qualifier column qualifier to scan
     * @param table table to scan
     * @return the rows produced by the scanner, possibly empty
     */
    public List<Result> scanHBase(Scan s, String family, String qualifier, HTable table) {
        List<Result> rows = Lists.newArrayList();
        s.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
        try {
            ResultScanner scanner = table.getScanner(s);
            try {
                for (Result rr = scanner.next(); rr != null; rr = scanner.next()) {
                    rows.add(rr);
                }
            } finally {
                // always release the scanner
                scanner.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return rows;
    }
}