/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.admin.management;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.ParseOutputFormat;
public class FileUtil {
private static Log LOG = LogFactory.getLog(FileUtil.class);
private static class RunningPathFilter implements PathFilter {
public boolean accept(Path file) {
String name = file.getName().toLowerCase();
return name.endsWith("running");
}
}
private static class DirectoryPathFilter implements PathFilter {
private FileSystem fFileSystem;
public DirectoryPathFilter(FileSystem fileSystem) {
this.fFileSystem = fileSystem;
}
public boolean accept(Path file) {
boolean ret = false;
try {
ret = this.fFileSystem.isDirectory(file);
} catch (IOException e) {
LOG.warn(e.toString());
}
return ret;
}
}
public static long size(Path folder, Configuration configuration)
throws IOException {
FileSystem fileSystem = FileSystem.get(configuration);
// Path[] files = fileSystem.listPaths(folder);
FileStatus[] filestatuses = fileSystem.listStatus(folder);
int len = filestatuses.length;
Path[] files = new Path[len];
for (int i=0; i < len; i++) {
files[i] = filestatuses[i].getPath();
}
long size = 0;
for (int i = 0; files != null && i < files.length; i++) {
Path file = files[i];
if (fileSystem.isDirectory(file)) {
size = size + size(file, configuration);
}
size = size + fileSystem.getLength(file);
}
return size + fileSystem.getLength(folder);
}
/**
* @return true if fetch.done exists
*/
public static boolean isFetched(Path segment, Configuration configuration)
throws IOException {
//return exists(configuration, segment, "fetch.done");
FileSystem fs = FileSystem.get(configuration);
return fs.exists(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
}
/**
* @return true if invert.done exists
*/
public static boolean isInverted(Path segment, Configuration configuration)
throws IOException {
return exists(configuration, segment, "invert.done");
//FileSystem fs = FileSystem.get(configuration);
//return fs.exists(new Path(segment, CrawlDatum.));
}
/**
* @return true if parse.done exists
*/
public static boolean isParsed(Path segment, Configuration configuration)
throws IOException {
//return exists(configuration, segment, "parse.done");
FileSystem fs = FileSystem.get(configuration);
return fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
}
/**
* @return true if parse.done exists
*/
public static boolean isIndexed(Path segment, Configuration configuration)
throws IOException {
FileSystem system = FileSystem.get(configuration);
// Path[] files = system.listPaths(new Path(segment, "index"));
FileStatus[] filestatuses = system.listStatus(new Path(segment, "index"));
int len = filestatuses.length;
Path[] files = new Path[len];
for (int i=0; i < len; i++) {
files[i] = filestatuses[i].getPath();
}
boolean ret = false;
for (int i = 0; i < files.length; i++) {
//e.g. file = part-00000
Path file = files[i];
if(system.isDirectory(file) && file.getName().startsWith("part-")) {
ret = exists(configuration, file, "index.done");
if(!ret) {
break;
}
}
}
return ret;
}
/**
* @return true if parse.done exists
*/
public static boolean isInjected(Path instanceFolder,
Configuration configuration) throws IOException {
Path crawlDir = new Path(configuration.get("crawl.dir"));
return exists(configuration, crawlDir, "crawldb");
}
/**
* @return true if search.done exists
*/
public static boolean isReadyToSearch(Path segment,
Configuration configuration) throws IOException {
return exists(configuration, segment, "search.done");
}
/**
* @return true if fileName in folder exists
*/
private static boolean exists(Configuration configuration, Path folder,
String fileName) throws IOException {
FileSystem fileSystem = FileSystem.get(configuration);
return fileSystem.exists(new Path(folder, fileName));
}
/**
* @return true if parse.done exists
*/
public static List<String> getRunningFiles(Path folder, Configuration configuration)
throws IOException {
FileSystem fileSystem = FileSystem.get(configuration);
// Path[] files = fileSystem.listPaths(folder, new RunningPathFilter());
FileStatus[] filestatuses = fileSystem.listStatus(folder, new RunningPathFilter());
int len = filestatuses.length;
Path[] files = new Path[len];
for (int i=0; i < len; i++) {
files[i] = filestatuses[i].getPath();
}
List<String> list = new LinkedList<String>();
for (int i = 0; i < files.length; i++) {
Path file = files[i];
list.add(file.getName());
}
return list;
}
/**
* @return folders in this folder
*/
public static Path[] listFolders(Path folder,
Configuration configuration)
throws IOException {
FileSystem system = FileSystem.get(configuration);
// return system.listPaths(folder, new DirectoryPathFilter(system));
FileStatus[] filestatuses = system.listStatus(folder, new DirectoryPathFilter(system));
int len = filestatuses.length;
Path[] files = new Path[len];
for (int i=0; i < len; i++) {
files[i] = filestatuses[i].getPath();
}
return files;
}
}