/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.admin.scheduling;

import java.util.Arrays;
import java.util.Locale;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.FileStatus;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.admin.GuiConfigUtil;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.indexer.Indexer;
import org.quartz.JobDataMap;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.quartz.StatefulJob;

/**
 * Quartz job that runs one crawl cycle: generate a segment, fetch it, update
 * the crawl db, update the link db and index the segment.
 *
 * <p>Each step drops <code>*.running</code> marker files into the directories
 * it works on; all such markers are removed again when the job ends, whether
 * it succeeded or failed.</p>
 *
 * <p>The job reads the following entries from the {@link JobDataMap}:
 * <code>crawldb</code>, <code>linkdb</code>, <code>segments</code> and
 * <code>configuration</code>, each cast to {@link PathSerializable}.</p>
 */
public class AdminCrawl implements StatefulJob {

  private static final Log LOG = LogFactory.getLog(AdminCrawl.class);

  /** Accepts every path whose last name component ends in "running", ignoring case. */
  private static class RunningPathFilter implements PathFilter {
    public boolean accept(Path path) {
      // Fixed locale: the default-locale variant mis-maps 'I' under e.g. a
      // Turkish locale, which would make the suffix comparison fail.
      String name = path.getName().toLowerCase(Locale.ENGLISH);
      return name.endsWith("running");
    }
  }

  /**
   * Runs the crawl cycle and afterwards removes the <code>*.running</code>
   * marker files.
   *
   * @param context Quartz execution context carrying the job data map
   * @throws JobExecutionException if any crawl step fails (the original
   *         exception is attached as cause), or if the crawl succeeded but
   *         removing the marker files failed
   */
  public void execute(JobExecutionContext context) throws JobExecutionException {
    JobDataMap jobDataMap = context.getJobDetail().getJobDataMap();
    Path crawldbFile = (PathSerializable) jobDataMap.get("crawldb");
    Path linkdbFile = (PathSerializable) jobDataMap.get("linkdb");
    Path segments = (PathSerializable) jobDataMap.get("segments");
    Path configurationFile = (PathSerializable) jobDataMap.get("configuration");

    Path segment = null;
    Configuration configuration = null;
    boolean succeeded = false;
    try {
      configuration = GuiConfigUtil.loadNewConfiguration(configurationFile);
      segment = generateSegment(crawldbFile, segments, configuration);
      fetchSegment(segment, configuration);
      updateCrawldb(crawldbFile, segment, configuration);
      updateLinkDb(linkdbFile, new Path[] { segment }, configuration);
      index(new Path(segment, "index"), crawldbFile, linkdbFile,
          new Path[] { segment }, configuration);
      succeeded = true;
    } catch (Exception e) {
      // Keep the cause so the scheduler log shows the real stack trace.
      throw new JobExecutionException(e.getMessage(), e);
    } finally {
      try {
        removeRunningFiles(crawldbFile, linkdbFile, segments, segment, configuration);
      } catch (IOException e) {
        if (succeeded) {
          throw new JobExecutionException(e.getMessage(), e);
        }
        // The crawl already failed; do not let the cleanup problem replace
        // the primary exception that is currently propagating.
        LOG.warn("Could not remove running marker files after failed crawl", e);
      }
    }
  }

  /**
   * Deletes the <code>*.running</code> marker files from every directory the
   * job may have touched. Tolerates a job that aborted early: nothing is done
   * when no configuration was loaded, and the segment directories are skipped
   * when no segment was generated.
   */
  private void removeRunningFiles(Path crawldbFile, Path linkdbFile, Path segments,
      Path segment, Configuration configuration) throws IOException {
    if (configuration == null) {
      // Loading the configuration failed, so no step ran and no file system
      // can be resolved.
      return;
    }
    FileSystem fileSystem = FileSystem.get(configuration);
    RunningPathFilter filter = new RunningPathFilter();
    deleteRunningFiles(crawldbFile, filter, fileSystem);
    deleteRunningFiles(linkdbFile, filter, fileSystem);
    deleteRunningFiles(segments, filter, fileSystem);
    if (segment != null) {
      deleteRunningFiles(segment, filter, fileSystem);
      deleteRunningFiles(new Path(segment, "index"), filter, fileSystem);
    }
  }

  /**
   * Deletes the entries of <code>dir</code> accepted by <code>filter</code>.
   * Does nothing if <code>dir</code> is null or does not exist.
   */
  private void deleteRunningFiles(Path dir, PathFilter filter, FileSystem fileSystem)
      throws IOException {
    if (dir == null || !fileSystem.exists(dir)) {
      return;
    }
    deleteFiles(fileSystem.listStatus(dir, filter), fileSystem);
  }

  /** Deletes every given path; a null array is treated as empty. */
  private void deleteFiles(Path[] files, FileSystem fileSystem) throws IOException {
    if (files == null) {
      return;
    }
    for (int i = 0; i < files.length; i++) {
      fileSystem.delete(files[i]);
    }
  }

  /** Deletes the path of every given status; a null array is treated as empty. */
  private void deleteFiles(FileStatus[] statuses, FileSystem fileSystem) throws IOException {
    if (statuses == null) {
      return;
    }
    for (int i = 0; i < statuses.length; i++) {
      fileSystem.delete(statuses[i].getPath());
    }
  }

  /**
   * Creates <code>index.running</code> markers in each segment, the link db
   * and the crawl db, then runs the {@link Indexer}.
   *
   * @param index target path handed to the indexer
   * @param crawldbFile crawl db path
   * @param linkdbFile link db path
   * @param segments segments to index
   * @param configuration configuration used for the file system and indexer
   * @throws IOException if a marker cannot be created or indexing fails
   */
  private void index(Path index, Path crawldbFile, Path linkdbFile, Path[] segments,
      Configuration configuration) throws IOException {
    FileSystem fileSystem = FileSystem.get(configuration);
    // create running files in segments
    for (int i = 0; i < segments.length; i++) {
      fileSystem.createNewFile(new Path(segments[i], "index.running"));
    }
    // create running files in linkdb
    fileSystem.createNewFile(new Path(linkdbFile, "index.running"));
    // create running files in crawldb
    fileSystem.createNewFile(new Path(crawldbFile, "index.running"));

    Indexer indexer = new Indexer(configuration);
    indexer.index(index, crawldbFile, linkdbFile, Arrays.asList(segments));
  }

  /**
   * Creates <code>linkdb.running</code> markers in each segment and the link
   * db, inverts the links, then writes an <code>invert.done</code> marker
   * into each segment.
   *
   * @throws IOException if a marker cannot be created or the inversion fails
   */
  private void updateLinkDb(Path linkdbFile, Path[] segments, Configuration configuration)
      throws IOException {
    FileSystem fileSystem = FileSystem.get(configuration);
    Path running = new Path(linkdbFile, "linkdb.running");
    // create lock files in segments
    for (int i = 0; i < segments.length; i++) {
      fileSystem.createNewFile(new Path(segments[i], "linkdb.running"));
    }
    // create lockfile in linkdb
    fileSystem.createNewFile(running);

    LinkDb linkDb = new LinkDb(configuration);
    // NOTE(review): the three flags are presumably normalize, filter and
    // force -- confirm against the LinkDb.invert signature in use.
    linkDb.invert(linkdbFile, segments, true, true, false);

    for (int i = 0; i < segments.length; i++) {
      fileSystem.createNewFile(new Path(segments[i], "invert.done"));
    }
  }

  /**
   * Creates <code>crawldb.running</code> markers in the segment and the crawl
   * db, then updates the crawl db from that single segment.
   *
   * @throws IOException if a marker cannot be created or the update fails
   */
  private void updateCrawldb(Path crawldbFile, Path segment, Configuration configuration)
      throws IOException {
    FileSystem fileSystem = FileSystem.get(configuration);
    Path runningSegment = new Path(segment, "crawldb.running");
    Path runningDB = new Path(crawldbFile, "crawldb.running");
    fileSystem.createNewFile(runningSegment);
    fileSystem.createNewFile(runningDB);

    CrawlDb crawlDb = new CrawlDb(configuration);
    Path[] segments = new Path[] { segment };
    // NOTE(review): the two flags are presumably normalize and filter --
    // confirm against the CrawlDb.update signature in use.
    crawlDb.update(crawldbFile, segments, true, true);
  }

  /**
   * Creates a <code>fetch.running</code> marker in the segment, fetches it
   * with <code>fetcher.threads.fetch</code> threads (default 10) and writes
   * <code>fetch.done</code>. Also writes <code>parse.done</code> when
   * <code>fetcher.parse</code> is true (the default used here).
   *
   * @throws IOException if a marker cannot be created or fetching fails
   */
  private void fetchSegment(Path segment, Configuration configuration) throws IOException {
    FileSystem fileSystem = FileSystem.get(configuration);
    Path running = new Path(segment, "fetch.running");
    fileSystem.createNewFile(running);

    Fetcher fetcher = new Fetcher(configuration);
    fetcher.fetch(segment, configuration.getInt("fetcher.threads.fetch", 10), true);

    fileSystem.createNewFile(new Path(segment, "fetch.done"));
    if (configuration.getBoolean("fetcher.parse", true)) {
      fileSystem.createNewFile(new Path(segment, "parse.done"));
    }
  }

  /**
   * Creates <code>generate.running</code> markers in the segments directory
   * and the crawl db, then asks the {@link Generator} for a new segment.
   *
   * @return the path returned by the generator
   * @throws IOException if a marker cannot be created or generation fails
   */
  private Path generateSegment(Path crawldbFile, Path segments, Configuration configuration)
      throws IOException {
    FileSystem system = FileSystem.get(configuration);
    Path runningGenerateSegment = new Path(segments, "generate.running");
    Path runningGenerateDB = new Path(crawldbFile, "generate.running");
    system.createNewFile(runningGenerateSegment);
    system.createNewFile(runningGenerateDB);

    Generator generator = new Generator(configuration);
    long topN = Long.MAX_VALUE;
    // NOTE(review): -1 is presumably "number of fetch lists: use default" --
    // confirm against the Generator.generate signature in use.
    Path segment = generator.generate(crawldbFile, segments, -1, topN,
        System.currentTimeMillis());
    return segment;
  }
}