package edu.nd.nina.io;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import edu.jhu.nlp.wikipedia.PageCallbackHandler;
import edu.jhu.nlp.wikipedia.WikiPage;
import edu.jhu.nlp.wikipedia.WikiXMLParser;
import edu.jhu.nlp.wikipedia.WikiXMLParserFactory;
/**
 * Command-line loader that parses a Wikipedia XML dump and writes one HBase
 * row per page into the {@code wikipedia} table.
 *
 * <p>Row layout (row key = trimmed page title):
 * <ul>
 *   <li>family {@code p}: qualifiers {@code t} (title), {@code id},
 *       {@code text}, {@code wt} (wiki text), and the boolean flags
 *       {@code isDis}, {@code isRed}, {@code isSpec}, {@code isStub};</li>
 *   <li>family {@code c}: one column per category, qualifier and value both
 *       set to the trimmed category string;</li>
 *   <li>family {@code ol}: one column per link, qualifier and value both set
 *       to the trimmed link string.</li>
 * </ul>
 */
public class WikidumpToHbase {

    /** ZooKeeper quorum used when none is given on the command line. */
    private static final String DEFAULT_ZOOKEEPER_QUORUM = "dmserv3.cs.illinois.edu";

    /** Dump file used when none is given on the command line. */
    private static final String DEFAULT_DUMP_PATH =
            "C:\\Users\\weninger\\Downloads\\enwiki-latest-pages-articles.xml.bz2";

    private static final String TABLE_NAME = "wikipedia";
    private static final String PAGE_FAMILY = "p";
    private static final String CATEGORY_FAMILY = "c";
    private static final String OUT_LINK_FAMILY = "ol";

    /**
     * Creates the target table if needed, then streams every page of the dump
     * into it.
     *
     * @param args optional; {@code args[0]} overrides the dump file path and
     *             {@code args[1]} overrides the ZooKeeper quorum. With no
     *             arguments the built-in defaults are used.
     */
    public static void main(String[] args) {
        final String dumpPath =
                (args != null && args.length > 0) ? args[0] : DEFAULT_DUMP_PATH;
        final String quorum =
                (args != null && args.length > 1) ? args[1] : DEFAULT_ZOOKEEPER_QUORUM;

        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", quorum);

        try {
            createTable(config, TABLE_NAME,
                    new String[] {PAGE_FAMILY, CATEGORY_FAMILY, OUT_LINK_FAMILY});

            final HTable table = new HTable(config, TABLE_NAME);
            try {
                // Auto-flush is off, so puts are buffered on the client; the
                // close() in the finally block is what releases that buffer
                // (see the HTable API docs), hence it must run on every path.
                table.setAutoFlush(false);

                WikiXMLParser parser = WikiXMLParserFactory.getSAXParser(dumpPath);
                parser.setPageCallback(new PageCallbackHandler() {
                    /** Number of pages handed to this callback so far. */
                    int i = 0;

                    public void process(WikiPage page) {
                        String title = page.getTitle();
                        title = title.trim();
                        System.out.println(i + ": " + title);

                        if (title.length() == 0) {
                            // NOTE(review): an empty row key is presumably
                            // rejected by HBase - confirm for the deployed
                            // version. Skipping keeps one odd page from
                            // aborting the whole import.
                            System.err.println("Skipping page " + i + ": empty title");
                        } else {
                            try {
                                table.put(toPut(title, page));
                            } catch (IOException e) {
                                // Best effort: report the failing page and
                                // carry on with the rest of the dump.
                                System.err.println("Failed to write page '" + title + "'");
                                e.printStackTrace();
                            }
                        }
                        i++;
                    }
                });
                parser.parse();
            } finally {
                table.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the row for a single page.
     *
     * @param title the already-trimmed page title, used as row key and as the
     *              value of {@code p:t}
     * @param page  the parsed page supplying all remaining column values
     * @return a {@code Put} holding the page, category and link columns
     */
    private static Put toPut(String title, WikiPage page) {
        final byte[] pageFamily = Bytes.toBytes(PAGE_FAMILY);
        final byte[] categoryFamily = Bytes.toBytes(CATEGORY_FAMILY);
        final byte[] outLinkFamily = Bytes.toBytes(OUT_LINK_FAMILY);

        Put put = new Put(Bytes.toBytes(title));
        put.add(pageFamily, Bytes.toBytes("t"), Bytes.toBytes(title));
        put.add(pageFamily, Bytes.toBytes("id"), Bytes.toBytes(page.getID()));
        put.add(pageFamily, Bytes.toBytes("text"), Bytes.toBytes(page.getText()));
        put.add(pageFamily, Bytes.toBytes("wt"), Bytes.toBytes(page.getWikiText()));
        put.add(pageFamily, Bytes.toBytes("isDis"),
                Bytes.toBytes(page.isDisambiguationPage()));
        put.add(pageFamily, Bytes.toBytes("isRed"), Bytes.toBytes(page.isRedirect()));
        put.add(pageFamily, Bytes.toBytes("isSpec"), Bytes.toBytes(page.isSpecialPage()));
        put.add(pageFamily, Bytes.toBytes("isStub"), Bytes.toBytes(page.isStub()));

        // Categories and links are stored with the name as both qualifier
        // and value, so a row's column set doubles as the membership list.
        for (String category : page.getCategories()) {
            byte[] name = Bytes.toBytes(category.trim());
            put.add(categoryFamily, name, name);
        }
        for (String link : page.getLinks()) {
            byte[] name = Bytes.toBytes(link.trim());
            put.add(outLinkFamily, name, name);
        }
        return put;
    }

    /**
     * Creates a table with the given column families unless a table of that
     * name already exists, in which case only a message is printed.
     *
     * @param conf      HBase configuration used to open the admin connection
     * @param tableName name of the table to create
     * @param familys   names of the column families to add to a new table
     * @throws Exception if the admin connection, the existence check or the
     *                   table creation fails
     */
    public static void createTable(Configuration conf, String tableName, String[] familys)
            throws Exception {
        HBaseAdmin admin = new HBaseAdmin(conf);
        try {
            if (admin.tableExists(tableName)) {
                System.out.println("table already exists!");
            } else {
                HTableDescriptor tableDesc = new HTableDescriptor(tableName);
                for (String family : familys) {
                    tableDesc.addFamily(new HColumnDescriptor(family));
                }
                admin.createTable(tableDesc);
                System.out.println("create table " + tableName + " ok.");
            }
        } finally {
            // Release the admin connection even when the check or create fails.
            admin.close();
        }
    }
}