StandardContentHandler.java example

Explorer
catma-core-master
- src
  - de
    - catma
- test
  - de
    - catma
      - document
        source
        contenthandler
        Doc2TxtExtractor.java
        UnixToDosTest.java
/*   
 *   CATMA Computer Aided Text Markup and Analysis
 *   
 *   Copyright (C) 2009  University Of Hamburg
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */ 

package de.catma.document.source.contenthandler;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;

/**
 * The standard content handler which handles plain text files.
 *
 * @author marco.petris@web.de
 * @see de.catma.document.source.TechInfoSet
 */
public class StandardContentHandler extends AbstractSourceContentHandler {

	/** Size of the byte and char buffers used while copying and decoding. */
	private static final int BUFFER_SIZE = 65536;

	/**
	 * Matches every code point that is NOT one of: tab, line feed, carriage
	 * return, U+0020-U+D7FF, U+E000-U+FFFD or U+10000-U+10FFFF (these are the
	 * ranges of the XML 1.0 <code>Char</code> production).
	 * <p>
	 * The supplementary range has to be written with the code point escape
	 * <code>\x{h...h}</code>: <code>&#92;uXXXX</code> consumes exactly four hex
	 * digits, so <code>&#92;u10000-&#92;u10FFFF</code> does not describe that
	 * range and causes every valid character above U+FFFF to be replaced.
	 */
	private static final String INVALID_CHARACTER_REGEX =
		"[^\\x09\\x0A\\x0D\\x20-\\uD7FF\\uE000-\\uFFFD\\x{10000}-\\x{10FFFF}]";

	/**
	 * Reads the given stream to its end, decodes the bytes with the charset
	 * taken from the tech info set of the source document info and stores the
	 * resulting text via {@link #setContent(String)}. Characters outside the
	 * ranges accepted by {@link #INVALID_CHARACTER_REGEX} are replaced by
	 * <code>?</code>.
	 * <p>
	 * The given stream is not closed by this method.
	 *
	 * @param is the stream to read the plain text from
	 * @throws IOException if reading or decoding fails
	 * @see de.catma.document.source.contenthandler.SourceContentHandler#load(java.io.InputStream)
	 */
	public void load(InputStream is) throws IOException {

		Charset charset =
			getSourceDocumentInfo().getTechInfoSet().getCharset();

		byte[] rawBytes = drainToByteArray(is);
		String decodedText = decodeToString(rawBytes, charset);

		// some texts seem to include non valid unicode characters
		// and this causes problems when converting text to HTML
		// for GUI delivery and during indexing
		setContent(decodedText.replaceAll(INVALID_CHARACTER_REGEX, "?"));
	}

	/**
	 * Copies everything that can be read from the given stream into a byte array.
	 *
	 * @param is the stream to read until its end
	 * @return all bytes read from the stream
	 * @throws IOException if reading fails
	 */
	private static byte[] drainToByteArray(InputStream is) throws IOException {
		ByteArrayOutputStream collected = new ByteArrayOutputStream();
		BufferedInputStream buffered = new BufferedInputStream(is);
		byte[] chunk = new byte[BUFFER_SIZE];
		int bytesRead;
		while ((bytesRead = buffered.read(chunk)) != -1) {
			collected.write(chunk, 0, bytesRead);
		}
		return collected.toByteArray();
	}

	/**
	 * Decodes the given bytes to text using the given charset.
	 *
	 * @param bytes the complete raw content
	 * @param charset the charset used for decoding
	 * @return the decoded text
	 * @throws IOException if decoding fails
	 */
	private static String decodeToString(byte[] bytes, Charset charset)
			throws IOException {
		ByteArrayInputStream rawStream = new ByteArrayInputStream(bytes);

		// When hasBOM reports a byte order mark the bytes are routed through
		// a BOMFilterInputStream (presumably so that the mark does not end up
		// in the text - see BOMFilterInputStream for the exact behaviour).
		InputStream decodableStream;
		if (BOMFilterInputStream.hasBOM(bytes)) {
			decodableStream = new BOMFilterInputStream(rawStream, charset);
		}
		else {
			decodableStream = rawStream;
		}

		BufferedReader reader = new BufferedReader(
				new InputStreamReader(decodableStream, charset));

		StringBuilder text = new StringBuilder();
		char[] chunk = new char[BUFFER_SIZE];
		int charsRead;
		while ((charsRead = reader.read(chunk)) != -1) {
			text.append(chunk, 0, charsRead);
		}
		return text.toString();
	}

	/**
	 * Opens the URI stored in the tech info set of the source document info,
	 * loads the content from it via {@link #load(InputStream)} and closes the
	 * opened stream afterwards, whether loading succeeded or not.
	 *
	 * @throws IOException if the stream cannot be opened, read or closed
	 * @see de.catma.document.source.contenthandler.SourceContentHandler#load()
	 */
	public void load() throws IOException {
		BufferedInputStream bis = null;
		try {
			bis = new BufferedInputStream(
					getSourceDocumentInfo().getTechInfoSet().getURI().toURL().openStream());

			load(bis);
		}
		finally {
			if (bis != null) {
				bis.close();
			}
		}
	}

	/**
	 * Debugging aid: prints up to <code>byteCount</code> leading bytes of the
	 * given file to standard out, one lower case hex value per line. Printing
	 * stops early when the end of the file is reached. Any failure is printed
	 * as a stack trace and otherwise ignored.
	 *
	 * @param file the file to inspect
	 * @param byteCount the maximum number of bytes to print
	 */
	@SuppressWarnings("unused")
	private void showBytes( File file, int byteCount ) {
		FileInputStream fis = null;
		try {
			fis = new FileInputStream( file );
			for( int i=0; i<byteCount; i++ ) {
				int value = fis.read();
				if( value == -1 ) {
					// end of file: there is nothing left to show
					break;
				}
				System.out.printf( "%1$x\n", value );
			}
		}
		catch( Exception exc ) {
			exc.printStackTrace();
		}
		finally {
			if( fis != null ) {
				try {
					fis.close();
				} catch( IOException ignored) {
					// best effort close of a debugging stream
				}
			}
		}
	}
}