package org.commoncrawl.service.crawler.filters;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.InetAddress;
import java.nio.charset.Charset;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.service.directory.BlockingClient;

/**
 * Helpers for staging crawler filter lists into the Hadoop DistributedCache
 * and for loading them back on the task side.
 */
public class Utils {

  private static final Log LOG = LogFactory.getLog(Utils.class);

  /** Session-scoped HDFS directory under which published lists are staged. */
  static Path getPathForSession(JobConf job) {
    return new Path(job.get("mapred.temp.dir", ".") + "/list-cache-" + getJobCacheSessionId(job));
  }

  static void setJobCacheSessionId(JobConf conf, long sessionId) {
    conf.setLong("list.cache.session.id", sessionId);
  }

  static long getJobCacheSessionId(JobConf conf) {
    return conf.getLong("list.cache.session.id", 0);
  }

  /** Establishes the cache session id (once per job) and creates its staging directory. */
  public static void initializeCacheSession(JobConf job, long sessionId) throws IOException {
    if (job.getLong("list.cache.session.id", -1) == -1) {
      setJobCacheSessionId(job, sessionId);
      FileSystem fs = FileSystem.get(job);
      LOG.info("Initialize Cache Session. Path is:" + getPathForSession(job));
      fs.mkdirs(getPathForSession(job));
    } else {
      LOG.info("Initialize Cache Session - Session Already Initialized Previously");
    }
  }
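  /*
   * Driver-side usage sketch. This class does not mandate how the session id
   * is chosen; the timestamp-based id below is illustrative only:
   *
   *   JobConf job = new JobConf();
   *   Utils.initializeCacheSession(job, System.currentTimeMillis());
   *   // the staging directory now exists at getPathForSession(job)
   */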
  /** Writes the raw list bytes to the session staging directory and registers them with the DistributedCache. */
  public static void publishListToCache(JobConf job, byte[] streamData, String itemPath) throws IOException {
    FileSystem fs = FileSystem.get(job);
    Path fileSystemPath;
    if (itemPath.startsWith("/")) {
      fileSystemPath = new Path(getPathForSession(job), itemPath.substring(1));
    } else {
      fileSystemPath = new Path(getPathForSession(job), itemPath);
    }
    LOG.info("Publishing Filter at:" + itemPath + " to hdfs location:" + fileSystemPath);
    FSDataOutputStream outputStream = fs.create(fileSystemPath);
    try {
      outputStream.write(streamData);
    } finally {
      outputStream.close();
    }
    LOG.info("publishListToCache calling addCacheFile with path:" + fileSystemPath);
    DistributedCache.addCacheFile(fileSystemPath.toUri(), job);
  }

  /** Populates the filter from a line-oriented stream, skipping '#' comment lines. */
  public static void loadFilterFromStream(InputStream inputStream, Filter filterObject) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
    String filterItemLine;
    while ((filterItemLine = reader.readLine()) != null) {
      if (!filterItemLine.startsWith("#")) {
        filterObject.loadFilterItem(filterItemLine);
      }
    }
  }

  /** Loads previously published list data from the task-local DistributedCache copy into the filter. */
  public static void loadFilterFromCache(JobConf job, String cacheFilePath, Filter filterObject) throws IOException {
    LOG.info("Loading filter data from:" + cacheFilePath);
    byte data[] = Utils.loadListFromCache(job, cacheFilePath);
    // wrap the loaded bytes in a stream and parse them
    ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
    loadFilterFromStream(inputStream, filterObject);
  }

  public static void loadFilterFromMasterFile(JobConf job, Filter filterObject, String filterFilePath)
      throws IOException {
    ByteArrayInputStream inputStream =
        new ByteArrayInputStream(buildConsolidatedStreamFromMasterStream(job, filterFilePath));
    loadFilterFromStream(inputStream, filterObject);
  }

  public static void loadFilterFromPath(InetAddress directoryServer, Filter filterObject, String filterFilePath,
      boolean hasMasterFile) throws IOException {
    ByteArrayInputStream inputStream;
    if (hasMasterFile) {
      inputStream =
          new ByteArrayInputStream(buildConsolidatedStreamFromMasterStream(directoryServer, filterFilePath));
    } else {
      inputStream = new ByteArrayInputStream(BlockingClient.loadDataFromPath(directoryServer, filterFilePath));
    }
    loadFilterFromStream(inputStream, filterObject);
  }

  public static byte[] buildConsolidatedStreamFromMasterStream(JobConf job, String masterFilePath)
      throws IOException {
    String directoryServiceIp = job.get(CrawlEnvironment.DIRECTORY_SERVICE_ADDRESS_PROPERTY, "10.0.20.21");
    return buildConsolidatedStreamFromMasterStream(InetAddress.getByName(directoryServiceIp), masterFilePath);
  }
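  /*
   * Sketch of the master-file format implied by the reader loop below: lines
   * starting with '#' are comments, and every other line names a filter file
   * (resolved against the directory service) to inline. The paths shown are
   * illustrative only:
   *
   *   # consolidated block lists
   *   /filters/domain-blocklist.txt
   *   /filters/ip-blocklist.txt
   */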
  /**
   * Resolves a master file (a list of filter-file paths) against the directory
   * service and concatenates the referenced files into a single UTF-8 buffer.
   */
  public static byte[] buildConsolidatedStreamFromMasterStream(InetAddress directoryServiceServer,
      String masterFilePath) throws IOException {
    // create an output stream to accumulate the consolidated filter data
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    BufferedWriter charWriter = new BufferedWriter(new OutputStreamWriter(outputStream, Charset.forName("UTF-8")));
    LOG.info("Loading master file for filter at:" + masterFilePath + " ServerAddress:" + directoryServiceServer);
    // load primary path via directory service client
    BufferedReader reader = BlockingClient.createReaderFromPath(directoryServiceServer, masterFilePath);
    try {
      // inline each stream referenced by the master stream
      String line;
      while ((line = reader.readLine()) != null) {
        if (!line.startsWith("#")) {
          LOG.info("Loading referenced Filter file at:" + line);
          BufferedReader innerReader = BlockingClient.createReaderFromPath(directoryServiceServer, line);
          try {
            String innerLine;
            while ((innerLine = innerReader.readLine()) != null) {
              charWriter.write(innerLine);
              charWriter.write('\n');
            }
          } finally {
            innerReader.close();
          }
        }
      }
    } finally {
      reader.close();
    }
    charWriter.flush();
    return outputStream.toByteArray();
  }

  /** Fetches the filter data (single file or consolidated master file) and publishes it to the cache. */
  public static void publishFilterToCache(JobConf job, Filter filterObject, String filterFilePath,
      boolean isMasterFile) throws IOException {
    String directoryServiceIp = job.get(CrawlEnvironment.DIRECTORY_SERVICE_ADDRESS_PROPERTY, "10.0.20.21");
    byte dataBuffer[];
    if (isMasterFile) {
      LOG.info("Loading Master File at:" + filterFilePath + " Server:" + directoryServiceIp);
      dataBuffer = buildConsolidatedStreamFromMasterStream(job, filterFilePath);
    } else {
      LOG.info("Loading Single Filter File at:" + filterFilePath + " Server:" + directoryServiceIp);
      dataBuffer = BlockingClient.loadDataFromPath(InetAddress.getByName(directoryServiceIp), filterFilePath);
    }
    LOG.info("Publishing Filter at:" + filterFilePath + " to Cache");
    publishListToCache(job, dataBuffer, filterFilePath);
  }

  public static byte[] loadListFromCache(JobConf job, String itemPath) throws IOException {
    // normalize the item path to match the way the distributed cache stores paths
    Path localPath = new Path(itemPath);
    itemPath = localPath.toString();
    Path paths[] = DistributedCache.getLocalCacheFiles(job);
    LOG.info("loadListFromCache returned path count:" + paths.length);
    for (Path path : paths) {
      LOG.info("loadListFromCache - trying to match current path:" + path + " to target:" + itemPath);
      if (path.toString().endsWith(itemPath)) {
        FileStatus fileStatus = FileSystem.getLocal(job).getFileStatus(path);
        LOG.info("match found! Loading Data");
        if (fileStatus != null && fileStatus.getLen() != 0) {
          byte data[] = new byte[(int) fileStatus.getLen()];
          FSDataInputStream dataInput = FileSystem.getLocal(job).open(path);
          try {
            // readFully (rather than a single read) guarantees the buffer is filled
            dataInput.readFully(data);
            return data;
          } finally {
            dataInput.close();
          }
        } else {
          LOG.error("Unable to load file at:" + itemPath);
          throw new IOException("Unable to load File at:" + itemPath);
        }
      }
    }
    LOG.error("Unable to locate target item:" + itemPath);
    throw new IOException("Unable to locate File:" + itemPath + " in local cache!");
  }

  /** Deletes the session staging directory and everything published under it. */
  public static void destroyCache(JobConf job) throws IOException {
    if (getJobCacheSessionId(job) != 0) {
      FileSystem fs = FileSystem.get(job);
      fs.delete(getPathForSession(job), true);
    }
  }
}
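/*
 * End-to-end usage sketch pairing the publish and load halves. "SomeFilter"
 * is a hypothetical Filter subclass and the filter path is illustrative:
 *
 *   // driver: pull the list from the directory service and stage it
 *   JobConf job = new JobConf();
 *   Utils.initializeCacheSession(job, System.currentTimeMillis());
 *   Utils.publishFilterToCache(job, new SomeFilter(), "/filters/domains.txt", false);
 *
 *   // task: rehydrate the filter from the local DistributedCache copy
 *   SomeFilter filter = new SomeFilter();
 *   Utils.loadFilterFromCache(job, "/filters/domains.txt", filter);
 */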