package guang.crawler.connector; import guang.crawler.commons.DataField; import guang.crawler.commons.DataFields; import guang.crawler.commons.WebURL; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map.Entry; import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MasterNotRunningException; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.ZooKeeperConnectionException; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HConnection; import org.apache.hadoop.hbase.client.HConnectionManager; import org.apache.hadoop.hbase.client.HTableInterface; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.util.Bytes; /** * 连接HBase中存储爬取的数据的连接器 * * @author sun * */ public class WebDataTableConnector { /** * 每个采集点都有自己独立的表,该常量是表的名称的前缀. */ private static final String TABLE_PREFIX = "site-"; /** * HBase的配置信息 */ private Configuration hbaseConfig; /** * 用来对HBase进行增删改查操作的管理器 */ private HBaseAdmin hbaseAdmin; /** * HBase连接 */ private HConnection hConnection; /** * 缓存的数据表,为了防止重复打开关闭. */ private HashMap<String, HTableInterface> webDataTables; /** * 存储主要数据的簇 */ public final static String FAMILY_MAIN_DATA = "MAIN"; /** * 对主要数据进行支撑的簇 */ public final static String FAMILY_SUPPORT_DATA = "SUPPORT"; /** * 是否打开是对HBase的连接 */ private boolean opened = false; /** * 缓冲区大小 */ private final static long bufferSize = 1024; /** * HBase需要的Zookeeper的连接字符串 */ private String zookeeperQuorum; /** * 创建一个连接器 * * @param zookeeperQuorum * HBase所需的Zookeeper连接器的地址. */ public WebDataTableConnector(final String zookeeperQuorum) { this.zookeeperQuorum = zookeeperQuorum; this.webDataTables = new HashMap<String, HTableInterface>(); } /** * 向HBase中插入一系列的域 * * @param webUrl * @param dataFields * @throws IOException */ public void addDataFields(final WebURL webUrl, final DataFields dataFields) throws IOException { if (!this.opened) { throw new IOException("data base should be opened first."); } String tableName = WebDataTableConnector.TABLE_PREFIX + webUrl.getSiteId(); HTableInterface webDataTable = this.webDataTables.get(tableName); if (webDataTable == null) { if (!this.tableExists(tableName)) { webDataTable = this.createTable(tableName); } else { webDataTable = this.loadTable(tableName); } if (webDataTable != null) { this.webDataTables.put(tableName, webDataTable); } } HashMap<String, LinkedList<DataField>> fields = dataFields.getAllFileds(); Set<String> keys = fields.keySet(); for (String key : keys) { LinkedList<DataField> data = fields.get(key); if ((data == null) || (data.size() == 0)) { continue; } Put put = new Put(Bytes.toBytes(key));// 设置键值 for (DataField field : data) { put.add(Bytes.toBytes(field.getDataFamily()), Bytes.toBytes(field.getColumnName()), Bytes.toBytes(field.getData())); } webDataTable.put(put); } webDataTable.flushCommits(); } /** * 关闭连接和所有打开的表. * * @throws IOException */ public void close() throws IOException { if (!this.opened) { return; } Iterator<Entry<String, HTableInterface>> tables = this.webDataTables.entrySet() .iterator(); while (tables.hasNext()) { tables.next() .getValue() .close(); } if (this.hbaseAdmin != null) { this.hbaseAdmin.close(); } if (this.hConnection != null) { this.hConnection.close(); } this.opened = false; } /** * 创建一个HBase表. * * @param tableName * @return * @throws IOException */ public HTableInterface createTable(final String tableName) throws IOException { HTableDescriptor tableDesc = new HTableDescriptor( TableName.valueOf(tableName)); HColumnDescriptor dataFamily = new HColumnDescriptor( WebDataTableConnector.FAMILY_MAIN_DATA); dataFamily.setMaxVersions(1); dataFamily.setBlockCacheEnabled(false); tableDesc.addFamily(dataFamily); HColumnDescriptor supportDataFamily = new HColumnDescriptor( WebDataTableConnector.FAMILY_SUPPORT_DATA); supportDataFamily.setMaxVersions(1); supportDataFamily.setBlockCacheEnabled(false); tableDesc.addFamily(supportDataFamily); this.hbaseAdmin.createTable(tableDesc); HTableInterface webDataTable = this.hConnection.getTable(tableName); webDataTable.setAutoFlush(true, true); webDataTable.setWriteBufferSize(WebDataTableConnector.bufferSize); return webDataTable; } /** * 删除一个HBase表. * * @param tableName * @return * @throws IOException */ public boolean deleteTable(final String tableName) throws IOException { boolean disabled = this.hbaseAdmin.isTableDisabled(tableName); if (!disabled) { this.hbaseAdmin.disableTable(tableName); } this.hbaseAdmin.deleteTable(tableName); return true; } /** * 刷新缓冲的数据. * * @throws IOException */ public void flush() throws IOException { if (!this.opened) { return; } Iterator<Entry<String, HTableInterface>> tables = this.webDataTables.entrySet() .iterator(); while (tables.hasNext()) { tables.next() .getValue() .flushCommits(); } } /** * 获取具有某种类型名称的表 * * @return * @throws IOException */ public List<String> getAllTables(final String pattern) throws IOException { HTableDescriptor[] tableDescriptors = this.hbaseAdmin.listTables(pattern); if ((tableDescriptors == null) || (tableDescriptors.length == 0)) { return null; } ArrayList<String> result = new ArrayList<String>(); for (HTableDescriptor table : tableDescriptors) { String tableName = new String(table.getName()); result.add(tableName); } return result; } /** * 获取当前已经爬取了数据的站点的ID * * @return * @throws IOException */ public Long[] getAvailableSiteIds() throws IOException { HTableDescriptor[] tableDescriptors = this.hbaseAdmin.listTables("site-\\d*"); if ((tableDescriptors == null) || (tableDescriptors.length == 0)) { return null; } ArrayList<Long> result = new ArrayList<Long>(); for (HTableDescriptor table : tableDescriptors) { String tableName = new String(table.getName()); try { long siteId = Long.parseLong(tableName.substring(WebDataTableConnector.TABLE_PREFIX.length())); result.add(siteId); } catch (NumberFormatException e) { continue; } } Long[] resultArray = new Long[result.size()]; return result.toArray(resultArray); } /** * 该方法已经不用了,因为数据库表的结构发生了改变. * * @param tableName * @param docid * @return * @throws IOException */ @Deprecated public String[] getHtmlData(final String tableName, final int docid) throws IOException { if (!this.opened) { throw new IOException("data base should be opened first."); } HTableInterface webDataTable = this.webDataTables.get(tableName); if (webDataTable == null) { if (!this.tableExists(tableName)) { webDataTable = this.createTable(tableName); } else { webDataTable = this.loadTable(tableName); } if (webDataTable != null) { this.webDataTables.put(tableName, webDataTable); } } Get get = new Get(Bytes.toBytes(docid)); get.addFamily(Bytes.toBytes(WebDataTableConnector.FAMILY_MAIN_DATA)); Result result = webDataTable.get(get); return this.resultToHtmlData(result); } /** * 加载已经存在的表,如果表不存在,会发生异常 * * @param tableName * @return * @throws IOException */ public HTableInterface loadTable(final String tableName) throws IOException { return this.hConnection.getTable(tableName); } /** * 打开连接 * * @throws MasterNotRunningException * @throws ZooKeeperConnectionException * @throws IOException */ public void open() throws MasterNotRunningException, ZooKeeperConnectionException, IOException { if (this.opened) { return; } Configuration config = new Configuration(); config.set("hbase.zookeeper.quorum", this.zookeeperQuorum); this.hbaseConfig = HBaseConfiguration.create(config); this.hbaseAdmin = new HBaseAdmin(this.hbaseConfig); this.hConnection = HConnectionManager.createConnection(this.hbaseConfig); this.opened = true; } public String[] resultToHtmlData(final Result result) { if (result != null) { String[] data = new String[2]; byte[] urlData = result.getValue(Bytes.toBytes(WebDataTableConnector.FAMILY_MAIN_DATA), Bytes.toBytes("url")); if (urlData != null) { data[0] = Bytes.toString(urlData); } byte[] htmlData = result.getValue(Bytes.toBytes(WebDataTableConnector.FAMILY_MAIN_DATA), Bytes.toBytes("html")); if (htmlData != null) { data[1] = Bytes.toString(htmlData); } return data; } return null; } /** * read data according to the site id * * @param siteId * @return * @throws IOException */ public ResultScanner scanTable(final long siteId) throws IOException { HTableInterface iface = this.loadTable(WebDataTableConnector.TABLE_PREFIX + siteId); if (iface != null) { ResultScanner scanner = iface.getScanner("data".getBytes()); return scanner; } return null; } public boolean tableExists(final String tableName) throws IOException { return this.hbaseAdmin.tableExists(tableName); } }