/*
* Copyright (c) 2012, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.analyzer;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.TreeSet;
import java.util.Iterator;
import java.util.Hashtable;
import java.util.ArrayList;
import java.text.SimpleDateFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.conf.Configuration;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.almworks.sqlite4java.SQLiteException;
/***********************************************************
* FSCrawler crawls a filesystem and stuffs the results into
* an FSAnalyzer's store.
*
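* Typical usage (a minimal sketch; paths are illustrative and exception
* handling is omitted):
* <pre>{@code
* FSAnalyzer analyzer = new FSAnalyzer(new File("/tmp/store"), new File("/tmp/schemas"));
* FSCrawler crawler = new FSCrawler(analyzer);
* crawler.blockingCrawl(new URI("file:///data/to/crawl"));
* analyzer.close();
* }</pre>
*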
* @author "Michael Cafarella" <mjc@cloudera.com>
***********************************************************/
public class FSCrawler {
final static int INFINITE_CRAWL_DEPTH = -1;
private static final Log LOG = LogFactory.getLog(FSCrawler.class);
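// Timestamp format for file metadata. Note that SimpleDateFormat is not
// thread-safe, so callers must synchronize when using this shared instance.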
static SimpleDateFormat fileDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
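// Per-crawl worker threads and runtime status, keyed by crawlid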
Hashtable<Long, Thread> pendingCrawls = new Hashtable<Long, Thread>();
Hashtable<Long, CrawlRuntimeStatus> crawlStatusInfo = new Hashtable<Long, CrawlRuntimeStatus>();
FSAnalyzer analyzer;
FileSystem fs;
/**
* An FSCrawler needs an FSAnalyzer, which stores the results of its crawls.
*/
public FSCrawler(FSAnalyzer analyzer) {
this.analyzer = analyzer;
this.fs = null;
}
/**
* <code>recursiveCrawlBuildList</code> traverses a region of the filesystem
* and builds the work lists for a crawl. This code:
* a) Navigates the directory hierarchy, descending up to <code>subdirDepth</code>
* levels (a negative depth, such as INFINITE_CRAWL_DEPTH, means no limit)
* b) Adds every file it finds to <code>todoFileList</code>
* c) Adds every directory it enters to <code>todoDirList</code>
* The crawl thread later invokes addSingleFile() on each collected path.
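*
* A sketch of a typical call, mirroring the crawl thread's own usage:
* <pre>{@code
* List<Path> todoFiles = new ArrayList<Path>();
* List<Path> todoDirs = new ArrayList<Path>();
* recursiveCrawlBuildList(fs, startDir, INFINITE_CRAWL_DEPTH, crawlid, todoFiles, todoDirs);
* }</pre>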
*/
protected void recursiveCrawlBuildList(FileSystem fs, Path p, int subdirDepth, long crawlId, List<Path> todoFileList, List<Path> todoDirList) throws IOException {
FileStatus fstatus = fs.getFileStatus(p);
if (! fstatus.isDir()) {
todoFileList.add(p);
} else {
if (subdirDepth != 0) {
todoDirList.add(p);
for (FileStatus subfilestatus: fs.listStatus(p)) {
Path subfile = subfilestatus.getPath();
try {
recursiveCrawlBuildList(fs, subfile, subdirDepth-1, crawlId, todoFileList, todoDirList);
} catch (IOException iex) {
LOG.warn("Could not crawl " + subfile + "; skipping it", iex);
}
}
}
}
}
/**
* <code>getStartNonblockingCrawl</code> starts a crawl of the given filesystem
* in a background thread. It returns immediately and does not wait for the
* crawl to complete.
* It returns true if the crawl was created or is already ongoing, and false
* if no crawl is running and a new one cannot be started.
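*
* A polling caller might look like this (a sketch; the URI is illustrative):
* <pre>{@code
* URI target = new URI("file:///data/to/crawl");
* if (crawler.getStartNonblockingCrawl(target)) {
* CrawlRuntimeStatus status = crawler.isCrawlOngoing(target);
* // status is null once the crawl thread has finished
* }
* }</pre>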
*/
public synchronized boolean getStartNonblockingCrawl(final URI fsURI) {
try {
final int subdirDepth = INFINITE_CRAWL_DEPTH;
long fsId = analyzer.getCreateFilesystem(fsURI, true);
if (fsId < 0) {
return false;
}
LOG.info("Grabbing filesystem: " + fsURI);
final FileSystem fs = FileSystem.get(fsURI, new Configuration());
final Path startDir = fs.makeQualified(new Path(fsURI.getPath()));
final long crawlid = analyzer.getCreatePendingCrawl(fsId, true);
Thread pendingThread = pendingCrawls.get(crawlid);
if (pendingThread == null) {
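// Spawn a background worker that builds the todo lists, analyzes each
// path, and deregisters itself when the crawl completes or is killed.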
Thread t = new Thread() {
public void run() {
try {
synchronized (pendingCrawls) {
pendingCrawls.put(crawlid, this);
}
synchronized (crawlStatusInfo) {
crawlStatusInfo.put(crawlid, new CrawlRuntimeStatus("Initializing crawl"));
}
// Build the file and dir-level todo lists
List<Path> todoFileList = new ArrayList<Path>();
List<Path> todoDirList = new ArrayList<Path>();
recursiveCrawlBuildList(fs, startDir, subdirDepth, crawlid, todoFileList, todoDirList);
// Filter out files this crawl has already processed
TreeSet<String> observedFilenames = new TreeSet<String>();
for (Path p: analyzer.getFilesForCrawl(crawlid)) {
observedFilenames.add(p.toString());
}
for (Iterator<Path> it = todoFileList.iterator(); it.hasNext(); ) {
Path p = it.next();
if (observedFilenames.contains(p.toString())) {
it.remove();
}
}
// Filter out dirs this crawl has already processed
TreeSet<String> observedDirnames = new TreeSet<String>();
for (Path p: analyzer.getDirsForCrawl(crawlid)) {
observedDirnames.add(p.toString());
}
for (Iterator<Path> it = todoDirList.iterator(); it.hasNext(); ) {
Path p = it.next();
if (observedDirnames.contains(p.toString())) {
it.remove();
}
}
synchronized (crawlStatusInfo) {
CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
cstatus.setMessage("Processing files");
cstatus.setNumToProcess(todoFileList.size());
cstatus.setNumDone(0);
}
int numDone = 0;
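// Register the directories first, then process the individual files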
for (Path p: todoDirList) {
try {
analyzer.addSingleFile(fs, p, crawlid);
} catch (IOException iex) {
LOG.warn("Could not add directory " + p, iex);
}
}
for (Path p: todoFileList) {
synchronized (crawlStatusInfo) {
CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
cstatus.setMessage("Processing file " + p.toString());
}
try {
analyzer.addSingleFile(fs, p, crawlid);
} catch (Exception iex) {
LOG.warn("Could not process file " + p, iex);
}
numDone++;
synchronized (crawlStatusInfo) {
CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
cstatus.setNumDone(numDone);
if (cstatus.shouldFinish()) {
break;
}
}
}
} catch (IOException iex) {
LOG.error("Crawl " + crawlid + " failed", iex);
} finally {
try {
synchronized (pendingCrawls) {
pendingCrawls.remove(crawlid);
analyzer.completeCrawl(crawlid);
}
} catch (SQLiteException sle) {
LOG.error("Could not mark crawl " + crawlid + " as complete", sle);
}
}
}
};
t.start();
}
return true;
} catch (Exception iex) {
LOG.error("Could not start crawl of " + fsURI, iex);
}
return false;
}
/**
* Return the runtime status of an ongoing (running) crawl of the given
* filesystem, or null if no crawl is currently underway.
*/
public CrawlRuntimeStatus isCrawlOngoing(URI fsURI) {
long fsId = analyzer.getCreateFilesystem(fsURI, false);
if (fsId < 0) {
return null;
}
synchronized (pendingCrawls) {
final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
Thread pendingThread = pendingCrawls.get(crawlid);
if (pendingThread != null && pendingThread.isAlive()) {
synchronized (crawlStatusInfo) {
return crawlStatusInfo.get(crawlid);
}
}
return null;
}
}
/**
* <code>waitForOngoingCrawl</code> blocks until the crawl of the given
* filesystem is complete. If there is an ongoing crawl that completes, it
* returns true; if there was no ongoing crawl, it returns false immediately.
* If <code>shouldKill</code> is set, the crawl is asked to finish early
* before we wait on its thread.
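*
* For example (a sketch), to ask a crawl to stop and wait for its thread:
* <pre>{@code
* waitForOngoingCrawl(fsURI, true);
* }</pre>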
*/
protected boolean waitForOngoingCrawl(URI fsURI, boolean shouldKill) {
long fsId = analyzer.getCreateFilesystem(fsURI, false);
if (fsId < 0) {
return false;
}
synchronized (pendingCrawls) {
final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
if (crawlid < 0) {
return false;
}
if (shouldKill) {
synchronized (crawlStatusInfo) {
CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
cstatus.setShouldFinish(true);
}
}
Thread pendingThread = pendingCrawls.get(crawlid);
if (pendingThread != null) {
try {
pendingThread.join();
} catch (InterruptedException iex) {
// Preserve the interrupt for callers further up the stack
Thread.currentThread().interrupt();
}
}
return true;
}
}
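/**
* <code>killOngoingCrawl</code> asks any running crawl of the given
* filesystem to finish as soon as possible. It returns immediately; use
* waitForOngoingCrawl() to block until the crawl thread actually exits.
*/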
public void killOngoingCrawl(URI fsURI) {
long fsId = analyzer.getCreateFilesystem(fsURI, false);
if (fsId >= 0) {
synchronized (pendingCrawls) {
final long crawlid = analyzer.getCreatePendingCrawl(fsId, false);
synchronized (crawlStatusInfo) {
CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
if (cstatus != null) {
cstatus.setShouldFinish(true);
}
}
}
}
}
/**
* <code>blockingCrawl</code> kicks off a crawl of the filesystem rooted at
* the given URI and blocks until it completes. Returns true if a crawl was
* started or was already ongoing, false otherwise.
*/
public boolean blockingCrawl(URI fsURI) throws IOException, SQLiteException {
boolean crawlStarted = getStartNonblockingCrawl(fsURI);
if (crawlStarted) {
waitForOngoingCrawl(fsURI, false);
}
return crawlStarted;
}
////////////////////////////////////////
// Main()
////////////////////////////////////////
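// Example invocation (paths are illustrative):
// java com.cloudera.recordbreaker.analyzer.FSCrawler /tmp/store /tmp/schemas --crawl /data/to/crawl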
public static void main(String argv[]) throws Exception {
if (argv.length < 3 || ("--crawl".equals(argv[2]) && argv.length < 4)) {
System.err.println("Usage: FSCrawler <metadataStoreDir> <schemaDbDir> (--crawl <dir> | --test)");
return;
}
int i = 0;
File metadataStoreDir = new File(argv[i++]).getCanonicalFile();
File schemadbdir = new File(argv[i++]).getCanonicalFile();
String op = argv[i++];
FSAnalyzer fsa = new FSAnalyzer(metadataStoreDir, schemadbdir);
try {
if ("--crawl".equals(op)) {
File crawlTarget = new File(argv[i++]).getCanonicalFile();
System.err.println("About to crawl " + crawlTarget);
FSCrawler crawler = new FSCrawler(fsa);
crawler.blockingCrawl(new URI("file://" + crawlTarget));
} else if ("--test".equals(op)) {
List<SchemaSummary> summaryList = fsa.getSchemaSummaries();
System.err.println("Schema summary list has " + summaryList.size() + " entries");
}
} finally {
fsa.close();
}
}
}