package edu.nd.nina.io;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.NavigableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import edu.nd.nina.DirectedGraph;
import edu.nd.nina.graph.DefaultDirectedGraph;
import edu.nd.nina.graph.DefaultEdge;
import edu.nd.nina.types.Instance;
public class WikiHBaseToCatGraph {
public static Instance loadCategoryFeatureGraphFromWikiHbase(
String category,
DirectedGraph<String, DefaultEdge> graph, int depth) {
Configuration config = HBaseConfiguration.create();
config.set("hbase.zookeeper.quorum", "dmserv3.cs.illinois.edu");
HashMap<String, Integer> tempMap = new HashMap<String, Integer>();
Instance root = null;
HTable testTable = null;
try {
testTable = new HTable(config, "wikipedia");
} catch (IOException e2) {
// TODO Auto-generated catch block
e2.printStackTrace();
}
DirectedGraph<String, DefaultEdge> dag = new DefaultDirectedGraph<String, DefaultEdge>(DefaultEdge.class);
recur(category, testTable, dag, "", depth);
List<Get> gets = new ArrayList<Get>();
int i=0;
for(String v : dag.vertexSet()){
if(v.startsWith("Category:")) continue;
tempMap.put(v, i++);
Get g = new Get(Bytes.toBytes(v));
g.addFamily(Bytes.toBytes("ol"));
g.addColumn(Bytes.toBytes("p"), Bytes.toBytes("text"));
gets.add(g);
graph.addVertex(v);
}
PrintWriter pw;
try {
pw = new PrintWriter("./data/hdtm/" + category.replaceAll(":", "_") + ".dot");
for(DefaultEdge e : dag.edgeSet()){
pw.println( dag.getEdgeSource(e).replaceAll(" ", "_") + "->" + dag.getEdgeTarget(e).replaceAll(" ", "_"));
}
pw.close();
} catch (FileNotFoundException e2) {
// TODO Auto-generated catch block
e2.printStackTrace();
}
try {
pw = new PrintWriter("./data/hdtm/" + category.replaceAll(":", "_") + ".txt");
StringBuilder sb = new StringBuilder();
for (Result r : testTable.get(gets)) {
String str = Bytes.toString(r.getRow());
System.out.println(str.replaceAll(" ", "_"));
String text = Bytes.toString(r.getColumn(Bytes.toBytes("p"), Bytes.toBytes("text")).get(0).getValue());
text = text.replaceAll("\\\n", "");
text = text.replaceAll("\\{(.*?)\\}", "");
pw.println(tempMap.get(str) + " " + str.replaceAll(" ", "_") + " " + text.replaceAll("[^a-zA-Z0-9 _.]+", "").trim());
NavigableMap<byte[], byte[]> ols = r.getFamilyMap(Bytes.toBytes("ol"));
for(byte[] ol : ols.values()){
if(tempMap.containsKey(Bytes.toString(ol))){
sb.append(tempMap.get(str) + " -> " + tempMap.get(Bytes.toString(ol)) + System.getProperty("line.separator"));
}
}
}
pw.println();
pw.println(sb);
pw.close();
testTable.close();
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
return root;
}
private static void recur(String cat, HTable testTable, DirectedGraph<String, DefaultEdge> dag, String tab, int maxdepth){
if(tab.length() > maxdepth) return;
Scan scan = new Scan();
scan.addColumn(Bytes.toBytes("c"), Bytes.toBytes(cat.replaceAll("Category:", "")));
dag.addVertex(cat);
ResultScanner rs = null;
try {
rs = testTable.getScanner(scan);
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
try {
int i=0;
for (Result r = rs.next(); r != null; r = rs.next()) {
String str = Bytes.toString(r.getRow());
dag.addVertex(str);
dag.addEdge(cat, str);
System.out.println(tab + str);
if (str.startsWith("Category:")) {
recur(str, testTable, dag, tab + "\t", maxdepth);
}
i++;
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static void main(String[] args) {
String[] cats = {
"Sports", "Technology" };
for (String cat : cats) {
DirectedGraph<String, DefaultEdge> dag = new DefaultDirectedGraph<String, DefaultEdge>(
DefaultEdge.class);
loadCategoryFeatureGraphFromWikiHbase("Category:" + cat, dag, 2);
}
}
}