package focusedCrawler.target.repository; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintStream; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Iterator; import java.util.zip.DeflaterOutputStream; import java.util.zip.InflaterInputStream; import org.apache.commons.compress.utils.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.io.CountingOutputStream; import focusedCrawler.target.model.Page; import focusedCrawler.target.model.TargetModelJson; /** * A target repository that stores pages in compressed (DEFLATE) text files containing one JSON * object per line. Files have a maximum size, and additional files are created when the size limit * is reached. * * @author aeciosantos * */ public class FilesTargetRepository implements TargetRepository { private static final long DEFAULT_MAX_FILE_SIZE = 256*1024*1024; private static final Logger logger = LoggerFactory.getLogger(FilesTargetRepository.class); private static final ObjectMapper jsonMapper = new ObjectMapper(); private final Path directory; private final long maxFileSize; private DeflaterOutputStream currentFile; private CountingOutputStream bytesCounter; public FilesTargetRepository(String directory) { this(Paths.get(directory), DEFAULT_MAX_FILE_SIZE); } public FilesTargetRepository(String directory, long maxFileSize) { this(Paths.get(directory), maxFileSize); } public FilesTargetRepository(Path directory, long maxFileSize) { this.maxFileSize = maxFileSize; if (!Files.exists(directory)) { directory.toFile().mkdirs(); } this.directory = directory; } public boolean insert(Page target) { return insert(new TargetModelJson(target)); } public boolean insert(TargetModelJson target) { try { ByteArrayOutputStream baos = new ByteArrayOutputStream(); jsonMapper.writeValue(baos, target); baos.write("\n".getBytes()); synchronized (this) { DeflaterOutputStream currentFile = getCurrentFile(baos); baos.writeTo(currentFile); currentFile.flush(); } return true; } catch (IOException e) { logger.error("Failed to store object in repository.", e); return false; } } private DeflaterOutputStream getCurrentFile(ByteArrayOutputStream baos) throws IOException { if(this.currentFile == null) { openNewFile(); } else { if(bytesCounter.getCount() + baos.size() > maxFileSize) { openNewFile(); } } return currentFile; } private synchronized void openNewFile() throws IOException { if(currentFile != null) { // flush and automatically closes file try(OutputStream out = this.currentFile) { out.flush(); } } long timestamp = System.currentTimeMillis(); long count = 0; Path filePath; do { String file = String.format("crawl_data-%d-%d.deflate", timestamp, count++); filePath = directory.resolve(file); } while (Files.exists(filePath)); OutputStream fileStream = new PrintStream(filePath.toFile()); this.bytesCounter = new CountingOutputStream(fileStream); this.currentFile = new DeflaterOutputStream(this.bytesCounter, true); } public void close() { IOUtils.closeQuietly(currentFile); } public RepositoryIterator iterator() { return new RepositoryIterator(new JsonLinesIterator(directory)); } public JsonLinesIterator jsonLinesIterator() { return new JsonLinesIterator(directory); }; public class RepositoryIterator implements Iterator<TargetModelJson>, Closeable { private JsonLinesIterator jsonLinesIterator; public RepositoryIterator(JsonLinesIterator fileIterator) { this.jsonLinesIterator = fileIterator; } @Override public boolean hasNext() { return jsonLinesIterator.hasNext(); } @Override public TargetModelJson next() { if(!jsonLinesIterator.hasNext()) { return null; } String jsonLine = jsonLinesIterator.next(); try { return jsonMapper.readValue(jsonLine, TargetModelJson.class); } catch (Exception e) { String json = jsonLine == null ? null : jsonLine.toString(); throw new IllegalStateException("Failed to unserialize json: "+json, e); } } @Override public void remove() { throw new UnsupportedOperationException("remove"); } @Override public void close() { jsonLinesIterator.close(); } } public class JsonLinesIterator implements Iterator<String>, Closeable { private String next; private Iterator<Path> filesIt; private DirectoryStream<Path> filesStream; private BufferedReader linesReader; public JsonLinesIterator(Path directory) { try { filesStream = Files.newDirectoryStream(directory); filesIt = filesStream.iterator(); if(filesIt.hasNext()) { linesReader = openFile(filesIt.next()); } } catch (IOException e) { throw new IllegalArgumentException( "Failed to open target repository folder: "+directory, e); } this.next = readNext(); } private BufferedReader openFile(Path filePath) throws FileNotFoundException { return new BufferedReader(new InputStreamReader( new InflaterInputStream(new FileInputStream(filePath.toFile())))); } private String readNext() { String nextLine = null; if(linesReader != null) { try { nextLine = linesReader.readLine(); } catch (IOException e) { nextLine = null; } if(nextLine == null) { // end of file reached IOUtils.closeQuietly(linesReader); if(!filesIt.hasNext()) { IOUtils.closeQuietly(filesStream); return null; // no more file and lines available } // read next file available Path filePath = null; try { filePath = filesIt.next(); linesReader = openFile(filePath); nextLine = linesReader.readLine(); } catch (IOException e) { String f = filePath == null ? null : filePath.toString(); throw new IllegalStateException("Failed to open file: "+f, e); } } } return nextLine; } @Override public boolean hasNext() { return this.next != null; } @Override public String next() { if(this.next == null) { return null; } else { String returnValue = this.next; this.next = readNext(); return returnValue; } } @Override public void remove() { throw new UnsupportedOperationException("remove"); } @Override public void close() { IOUtils.closeQuietly(linesReader); IOUtils.closeQuietly(filesStream); } } }