package focusedCrawler.target.repository; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Iterator; import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.compress.utils.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.core.JsonGenerationException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.cbor.CBORFactory; import focusedCrawler.target.model.Page; import focusedCrawler.target.model.TargetModelCbor; import focusedCrawler.target.model.TargetModelJson; /** * A target repository that stores pages in the file system. Files are organized per domain, * one folder for each domain. Supports serialization of data different formats: * - Plain HTML (may lose some data, about headers) * - JSON * - CBOR * * @author aeciosantos * */ public class FileSystemTargetRepository implements TargetRepository { public enum DataFormat { HTML, JSON, CBOR } private static final Logger logger = LoggerFactory.getLogger(FileSystemTargetRepository.class); private static final ObjectMapper jsonMapper = new ObjectMapper(); private static final ObjectMapper cborMapper = new ObjectMapper(new CBORFactory()); private final Path directory; private final DataFormat dataFormat; private final boolean hashFilename; private final boolean compressData; public FileSystemTargetRepository(String directory, DataFormat dataFormat, boolean hashFilename) { this(Paths.get(directory), dataFormat, hashFilename, false); } public FileSystemTargetRepository(String directory, DataFormat dataFormat, boolean hashFilename, boolean compressData) { this(Paths.get(directory), dataFormat, hashFilename, compressData); } public FileSystemTargetRepository(Path directory, DataFormat dataFormat, boolean hashFilename, boolean compressData) { File fileDir = directory.toFile(); if(!fileDir.exists()) { fileDir.mkdirs(); } this.directory = directory; this.dataFormat = dataFormat; this.hashFilename = hashFilename; this.compressData = compressData; } @Override public void close() {} public boolean insert(Page target) { try { String id = target.getURL().toString(); URL url = new URL(id); Path hostPath = getHostPath(url); File hostDirectory = hostPath.toFile(); if (!hostDirectory.exists()) { hostDirectory.mkdirs(); } Path filePath = getFilePath(id, hostPath); try(OutputStream fileStream = new PrintStream(filePath.toFile())) { if(compressData) { try(OutputStream gzipStream = new DeflaterOutputStream(fileStream)) { serializeData(target, url, gzipStream); } } else { serializeData(target, url, fileStream); } } } catch (IOException e) { logger.error("Failed to store object in repository.", e); } return false; } private void serializeData(Page target, URL url, OutputStream fileStream) throws IOException, JsonGenerationException, JsonMappingException { switch(dataFormat) { case HTML: { fileStream.write(target.getContent()); break; } case JSON: { TargetModelJson targetModel = new TargetModelJson(target); jsonMapper.writeValue(fileStream, targetModel); break; } case CBOR: { TargetModelCbor targetModel = new TargetModelCbor("", "", url, target.getContentAsString()); cborMapper.writeValue(fileStream, targetModel); break; } } } public boolean exists(String urlString) { try { Path hostPath = getHostPath(urlString); File hostDirectory = hostPath.toFile(); if (!hostDirectory.exists()) { return false; } Path filePath = getFilePath(urlString, hostPath); if (filePath.toFile().exists()) { return true; } } catch (UnsupportedEncodingException | MalformedURLException e) { return false; } return false; } private Path getHostPath(URL url) throws MalformedURLException, UnsupportedEncodingException { String host = url.getHost(); Path hostPath = directory.resolve(URLEncoder.encode(host, "UTF-8")); return hostPath; } private Path getHostPath(String url) throws MalformedURLException, UnsupportedEncodingException { return getHostPath(new URL(url)); } private Path getFilePath(String url, Path hostPath) throws UnsupportedEncodingException { Path filePath; if(hashFilename) { String filenameEncoded = DigestUtils.sha256Hex(url); filePath = hostPath.resolve(filenameEncoded); } else { filePath = hostPath.resolve(URLEncoder.encode(url, "UTF-8")); } return filePath; } public <T> T get(String url) { try { Path hostPath = getHostPath(url); Path filePath = getFilePath(url, hostPath); return readFile(filePath); } catch (IOException e) { return null; } } private <T> T readFile(Path filePath) throws IOException, FileNotFoundException { if (!Files.exists(filePath)) { return null; } try (InputStream fileStream = new FileInputStream(filePath.toFile())) { if(compressData) { try(InputStream gzipStream = new InflaterInputStream(fileStream)) { return unserializeData(gzipStream); } } else { return unserializeData(fileStream); } } } @SuppressWarnings("unchecked") private <T> T unserializeData(InputStream inputStream) { T object = null; try { if (dataFormat.equals(DataFormat.CBOR)) { object = (T) cborMapper.readValue(inputStream, TargetModelCbor.class); } else if (dataFormat.equals(DataFormat.JSON)) { object = (T) jsonMapper.readValue(inputStream, TargetModelJson.class); } else if (dataFormat.equals(DataFormat.HTML)) { byte[] fileData = IOUtils.toByteArray(inputStream); object = (T) new String(fileData); } } catch (IOException e) { throw new RuntimeException("Failed to unserialize object.", e); } return object; } public <T> FileContentIterator<T> iterator() { return new FileContentIterator<T>(new FilesIterator(directory)); } public FilesIterator filesIterator() { return new FilesIterator(directory); }; public class FileContentIterator<T> implements Iterator<T>, Closeable { private FilesIterator fileIterator; public FileContentIterator(FilesIterator fileIterator) { this.fileIterator = fileIterator; } @Override public boolean hasNext() { return fileIterator.hasNext(); } @Override public T next() { if(!fileIterator.hasNext()) { return null; } Path filePath = fileIterator.next(); try { return readFile(filePath); } catch (Exception e) { String f = filePath == null ? null : filePath.toString(); throw new IllegalStateException("Failed to read file: "+f, e); } } @Override public void remove() { throw new UnsupportedOperationException("remove"); } @Override public void close() { fileIterator.close(); } } public class FilesIterator implements Iterator<Path>, Closeable { private Path next; private Iterator<Path> fileIt; private Iterator<Path> hostIt; private DirectoryStream<Path> hostsStream; private DirectoryStream<Path> filesStream; public FilesIterator(Path directory) { try { hostsStream = Files.newDirectoryStream(directory); hostIt = hostsStream.iterator(); if(hostIt.hasNext()) { filesStream = Files.newDirectoryStream(hostIt.next()); fileIt = filesStream.iterator(); } } catch (IOException e) { throw new IllegalArgumentException("Failed to open target repository folder: "+directory, e); } this.next = readNext(); } private Path readNext() { if(fileIt != null && !fileIt.hasNext()) { IOUtils.closeQuietly(filesStream); // no more files on this folder, close it. if(!hostIt.hasNext()) { IOUtils.closeQuietly(hostsStream); return null; // no more file and folders available } // iterate over next folder available Path hostPath = null; try { hostPath = hostIt.next(); filesStream = Files.newDirectoryStream(hostPath); fileIt = filesStream.iterator(); } catch (IOException e) { String f = hostPath == null ? null : hostPath.toString(); throw new IllegalArgumentException("Failed to open host folder: "+f, e); } } if(fileIt != null && fileIt.hasNext()) { return fileIt.next(); } return null; } @Override public boolean hasNext() { return this.next != null; } @Override public Path next() { if(this.next == null) { return null; } else { Path returnValue = this.next; this.next = readNext(); return returnValue; } } @Override public void remove() { throw new UnsupportedOperationException("remove"); } @Override public void close() { IOUtils.closeQuietly(filesStream); IOUtils.closeQuietly(hostsStream); } } }