ReadabilityDriver.java example

Explorer

Java-readability-master
- src
  - main
    - java
      - com
        basistech
        readability
        AbstractPageReader.java
        FilePageReader.java
        HtmlPage.java
        HttpPageReader.java
        NekoJsoupParser.java
        OffsetRange.java
        PageCharsetDetector.java
        PageInfo.java
        PageLinkInfo.java
        PageReadException.java
        PageReader.java
        Patterns.java
        Readability.java
        ReadabilityDriver.java
        TikaCharsetDetector.java
        XmlDataMap.java

/******************************************************************************
 * Copyright (c) 2010 Basis Technology Corp.
 * 
 * Basis Technology Corp. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.basistech.readability;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * At the moment this class takes all HTML files from the flat directory ./src/test/resources/htmlInput/
 * and writes the extracted text of each one to a .txt file at the location named by OUTPUT_PATH.
 * 
 * In the future, it would be nice to abstract this so that it can process just about any input you
 * throw at it; that is a matter of using the appropriate PageReaders.  Having the directories
 * hard-coded might also become a problem.
 */

/**
 * Command-line driver that runs {@link Readability} over every {@code .html} file found directly
 * in {@code INPUT_PATH} and writes the extracted title and article text of each page to a
 * same-named {@code .txt} file inside the {@code OUTPUT_PATH} directory.
 */
public final class ReadabilityDriver {

    private static final Logger LOG = LoggerFactory.getLogger(ReadabilityDriver.class);

    /** Flat directory that is scanned (non-recursively) for {@code .html} input files. */
    private static final String INPUT_PATH = "./src/test/resources/htmlInput/";

    /** Directory that receives the generated {@code .txt} files. */
    private static final String OUTPUT_PATH = "target";

    /** Not instantiable: this class only hosts {@link #main(String[])}. */
    private ReadabilityDriver() { }

    /**
     * Processes every HTML file in the input directory and writes one text file per page.
     * Pages that fail with a {@link PageReadException} are logged and skipped.
     *
     * @param args ignored
     * @throws IOException if the output directory cannot be created or an output file cannot be
     *         written
     */
    public static void main(String[] args) throws IOException {

        File inputDir = new File(INPUT_PATH);
        File outputDir = new File(OUTPUT_PATH);

        FilePageReader reader = new FilePageReader();
        reader.setBaseDirectory(inputDir);
        reader.setCharsetDetector(new TikaCharsetDetector());

        Readability readability = new Readability();
        readability.setPageReader(reader);
        readability.setReadAllPages(false);

        // listFiles returns null when the path is missing, is not a directory, or cannot be read;
        // treat all of those as "nothing to process" instead of failing in the loop below.
        File[] htmlFiles = inputDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.endsWith(".html");
            }
        });
        if (htmlFiles == null) {
            LOG.warn("input directory not found or not readable: {}", inputDir.getPath());
            htmlFiles = new File[0];
        }

        for (File page : htmlFiles) {

            String path = page.getPath();

            try {
                LOG.info("processing page: {}", path);
                readability.processDocument(path);
            } catch (PageReadException e) {
                // Route the stack trace through the logger rather than stderr, then move on.
                LOG.error("PageReadError while processing: " + path, e);
                continue;
            }

            // Force a sentence break between title and body with \u2029.
            String title = readability.getTitle().trim() + "\u2029";
            String content = readability.getArticleText();
            String text = title + System.getProperty("line.separator") + content;

            // Resolve the file inside the output directory; plain string concatenation of
            // OUTPUT_PATH and the name would produce "target<name>.txt" in the working directory.
            File outputFile = new File(outputDir, page.getName().replaceAll("html$", "txt"));
            writeUtf8(outputFile, text);
        }
    }

    /**
     * Writes {@code text} to {@code target} as UTF-8, creating the parent directory if needed and
     * closing the stream even when the write fails.
     */
    private static void writeUtf8(File target, String text) throws IOException {
        File parent = target.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("could not create output directory: " + parent.getPath());
        }
        FileOutputStream fos = new FileOutputStream(target);
        try {
            fos.write(text.getBytes("UTF8"));
            fos.flush();
        } finally {
            fos.close();
        }
    }
}