/****************************************************************************** * Copyright (c) 2010 Basis Technology Corp. * * Basis Technology Corp. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.basistech.readability; import java.io.File; import java.io.FileOutputStream; import java.io.FilenameFilter; import java.io.IOException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /* * At the moment this class takes every .html file from the flat directory ./src/test/resources/htmlInput/, * runs Readability on it, and writes the extracted title and article text to a .txt file named after the page. * The output location is derived from OUTPUT_PATH (currently "target"), not ./src/test/resources/textOutput/. * * In the future, a nice thing to do might be to abstract this so that it can process just about anything you * throw in there; that is a matter of using the appropriate PageReaders. Also, having the directories be * hard-coded might be a problem in the future. 
*/ public final class ReadabilityDriver { //the logger private static final Logger LOG = LoggerFactory.getLogger(ReadabilityDriver.class); //the paths private static final String INPUT_PATH = "./src/test/resources/htmlInput/"; private static final String OUTPUT_PATH = "target"; //private constructor private ReadabilityDriver() { } public static void main(String[] args) throws IOException { //input directory file File inputDir = new File(INPUT_PATH); //create the FilePageReader for Readability FilePageReader reader = new FilePageReader(); reader.setBaseDirectory(inputDir); //instantiate Readability and set reader Readability readability = new Readability(); readability.setPageReader(reader); readability.setReadAllPages(false); reader.setCharsetDetector(new TikaCharsetDetector()); //instantiate a file array File[] htmlFiles; //get all html files in directory if (inputDir.exists()) { htmlFiles = inputDir.listFiles(new FilenameFilter() { public boolean accept(File dir, String name) { return name.matches(".*\\.html$"); } }); } else { htmlFiles = new File[0]; } //iterate over the files and run Readability on them for (File page : htmlFiles) { //get the page path String path = page.getPath(); //process the page try { LOG.info("processing page: " + path); readability.processDocument(path); } catch (PageReadException e) { LOG.error("PageReadError while processing: " + path); e.printStackTrace(); continue; } //write the output, forcing a sentence break between title and body with \u2029. String title = readability.getTitle().trim() + "\u2029"; String content = readability.getArticleText(); String returnText = OUTPUT_PATH + page.getName().replaceAll("html$", "txt"); FileOutputStream fos = new FileOutputStream(returnText); fos.write((title + System.getProperty("line.separator") + content).getBytes("UTF8")); fos.flush(); fos.close(); } } }