NGramProfileImpl.java example

Explorer
morphadorner-opensource-master
- src
package de.spieleck.app.cngram ;

/*	Please see the license information in the header below. */

/*
 NGramJ - n-gram based text classification
 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU Lesser General Public License as published
 by the Free Software Foundation; either version 2.1 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU Lesser General Public License
 along with this program (lesser.txt); if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

import java.io.BufferedInputStream ;
import java.io.BufferedReader ;
import java.io.IOException ;
import java.io.InputStream ;
import java.io.InputStreamReader ;
import java.io.OutputStream ;
import java.util.Date ;
import java.util.Collections ;
import java.util.HashMap ;
import java.util.Iterator ;
import java.util.Arrays ;
import java.util.Set ;
import edu.northwestern.at.utils.UnicodeReader;

/**
 * Actual implementation of a NGramProfile.
 *
 * A profile maps character n-grams to occurrence counts. It can be built
 * from raw text ({@link #analyze(CharSequence)},
 * {@link #createProfile(String, InputStream, String)}) or read back from a
 * previously saved profile ({@link #load(InputStream)}).
 *
 * Instances are mutable and perform no locking of their own.
 *
 * @author frank nestel
 * @author $Author: nestefan $
 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
 */
@SuppressWarnings("unchecked")
public class NGramProfileImpl
    implements NGramProfile
{

    /** Separator char placed before and after every analyzed word. */
    public static final char SEPARATOR = '_' ;

    /** Default min length of ngram. */
    public static final int DEFAULT_MIN_NGRAM_LENGTH = 1 ;

    /** Default max length of ngram. */
    public static final int DEFAULT_MAX_NGRAM_LENGTH = 5 ;

    /** Name of this profile. */
    private String name ;

    /** Cache for {@link #getSorted()}; null whenever it must be rebuilt. */
    private volatile NGram[] sorted = null ;

    /** Cache for {@link #get(CharSequence)}; null whenever it must be rebuilt. */
    private volatile NGram[] ordered = null ;

    /** Minimum length for an ngram. */
    private final int minNGramLength ;

    /** Maximum length for an ngram. */
    private final int maxNGramLength ;

    /** Norm factor for this profile: the sum of all ngram counts. */
    private int normalization = 0 ;

    /** Map from char sequence to its ngram entry. */
    private final HashMap<CharSequence , NGram> ngrams =
        new HashMap<CharSequence , NGram>() ;

    /** Optional whitelist consulted before a new ngram is added; may be null. */
    private Set restricted = null ;

    /**
     * Create a new ngram profile with default lengths.
     *
     * @param name Name of profile
     */
    public NGramProfileImpl( String name )
    {
        this( name , DEFAULT_MIN_NGRAM_LENGTH , DEFAULT_MAX_NGRAM_LENGTH ) ;
    }

    /**
     * Create a new ngram profile
     *
     * @param name Name of profile
     * @param minlen min length of ngram sequences
     * @param maxlen max length of ngram sequences
     */
    public NGramProfileImpl( String name , int minlen , int maxlen )
    {
        this.maxNGramLength = maxlen ;
        this.minNGramLength = minlen ;
        setName( name ) ;
    }

    /**
     * Restrict which new ngrams may enter the profile while analyzing text.
     *
     * When set, an ngram that is not yet part of the profile is only added
     * if <code>restricted.contains( ngram )</code> is true for the freshly
     * created ngram object. Ngrams read by {@link #load(InputStream)} are
     * not checked against this set.
     *
     * @param restricted set of admissible ngrams, or null for no restriction
     */
    public void setRestricted( Set restricted )
    {
        this.restricted = restricted ;
    }

    /**
     * Analyze a piece of text
     *
     * Every character is lower-cased; runs of letters form words and any
     * non-letter terminates the current word. Each word is framed by
     * {@link #SEPARATOR} on both sides before its ngrams are counted.
     *
     * @param text the text to be analyzed
     */
    public void analyze( CharSequence text )
    {
        StringBuilder word = new StringBuilder( 30 ).append( SEPARATOR ) ;

        for ( int i = 0 ; i < text.length() ; i++ )
        {
            char c = Character.toLowerCase( text.charAt( i ) ) ;

            if ( Character.isLetter( c ) )
            {
                word.append( c ) ;
            }
            else
            {
                addAnalyze( word ) ;
                // Keep only the leading separator for the next word.
                word.setLength( 1 ) ;
            }
        }
        addAnalyze( word ) ;
    }

    /**
     * Close the given word with a trailing separator and count its ngrams.
     * Does nothing if the buffer holds only the leading separator.
     *
     * @param word buffer holding the leading separator plus collected letters
     */
    private void addAnalyze( StringBuilder word )
    {
        if ( word.length() > 1 )
        {
            word.append( SEPARATOR ) ;
            addNGrams( word ) ;
        }
    }

    /**
     * Remove all ngrams and reset the normalization and the cached views.
     */
    public void clear()
    {
        ngrams.clear() ;
        normalization = 0 ;
        ordered = sorted = null ;
    }

    /**
     * @return the number of distinct ngrams stored in this profile
     */
    public int getCount()
    {
        return ngrams.size() ;
    }

    /**
     * @return the sum of the counts of all ngrams added to this profile
     */
    public int getNormalization()
    {
        return normalization ;
    }

    /**
     * Add ngrams from a single word to this profile
     *
     * Ngram lengths run from the configured minimum to the configured
     * maximum, but always stay strictly below the length of the word.
     *
     * @param word the word (including any separators) to split into ngrams
     */
    public void addNGrams( CharSequence word )
    {
        for ( int i = minNGramLength ; i <= maxNGramLength &&
            i < word.length() ; i++ )
        {
            addNGrams( word , i ) ;
        }
    }

    /**
     * Count every ngram of exactly the given length contained in the word.
     *
     * @param word the word to split
     * @param n sequence length
     */
    private void addNGrams( CharSequence word , int n )
    {
        for ( int i = 0 , end = word.length() - n ; i <= end ; i++ )
        {
            // Normalise the key to an immutable String: sub-sequences of
            // arbitrary CharSequence implementations need not have
            // content-based equals()/hashCode() and may be views on mutable
            // data, which would break the map lookup.
            CharSequence cs = word.subSequence( i , i + n ).toString() ;
            NGram nge = ngrams.get( cs ) ;

            if ( nge == null )
            {
                nge = new NGramImpl( cs ) ;
                if ( restricted != null && !restricted.contains( nge ) )
                {
                    continue ;
                }
                ngrams.put( cs , nge ) ;
                ordered = null ; // A new element invalidates the ordered access
            }
            nge.inc() ;
            normalization++ ;
            sorted = null ; // Any count change invalidates the sorted access
        }
    }

    /**
     * Iterate over all ngrams in their natural order (as established by
     * <code>Arrays.sort</code> on the ngram objects).
     *
     * @return iterator over the sorted ngrams
     */
    public Iterator getSorted()
    {
        // Work on a local array and publish it only once it is fully sorted,
        // so the volatile field never exposes a half-sorted array and a
        // concurrent invalidation cannot null it out under our feet.
        NGram[] snapshot = sorted ;

        if ( snapshot == null )
        {
            snapshot = (NGram[])ngrams.values().toArray( NO_NGRAM ) ;
            Arrays.sort( snapshot ) ;
            sorted = snapshot ;
        }
        return Arrays.asList( snapshot ).iterator() ;
    }

    /**
     * Look up the ngram for a character sequence.
     *
     * The lookup is a binary search using CHAR_SEQ_COMPARATOR over an array
     * sorted with the same comparator.
     *
     * @param seq the character sequence to look up
     * @return the matching ngram, or null if the profile has none
     */
    public NGram get( CharSequence seq )
    {
        // Same publish-after-sort discipline as in getSorted().
        NGram[] snapshot = ordered ;

        if ( snapshot == null )
        {
            snapshot = (NGram[])ngrams.values().toArray( NO_NGRAM ) ;
            Arrays.sort( snapshot , CHAR_SEQ_COMPARATOR ) ;
            ordered = snapshot ;
        }
        int i = Arrays.binarySearch( snapshot , seq , CHAR_SEQ_COMPARATOR ) ;

        if ( i < 0 )
        {
            return null ;
        }
        return snapshot[ i ] ;
    }

    /**
     * Return ngramprofile as text
     *
     * @return a header line with the profile name followed by one
     *         "ngram count" line per ngram, in sorted order
     */
    @Override
    public String toString()
    {
        StringBuilder s = new StringBuilder( 2000 ) ;

        Iterator i = getSorted() ;

        s.append( "NGramProfile: " ).append( name ).append( '\n' ) ;
        while ( i.hasNext() )
        {
            NGram entry = (NGram)i.next() ;

            s.append( entry ).append( ' ' ).append( entry.getCount() ).append(
                '\n' ) ;
        }
        return s.toString() ;
    }

    /**
     * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
     *
     * Any previous content of this profile (ngrams, normalization, cached
     * views) is discarded first. The input is processed line by line:
     * <ul>
     * <li>lines shorter than two characters (after trimming) are skipped,</li>
     * <li>a line starting with '-' registers its second character as an
     *     "eliminator": later ngrams containing it are dropped,</li>
     * <li>a line starting with FINISHREAD_STR ends reading,</li>
     * <li>other lines starting with '#' are comments,</li>
     * <li>every remaining line must be "sequence count", separated by a
     *     space; '_' in the sequence is read as a space.</li>
     * </ul>
     * The stream is not closed by this method.
     *
     * @param is stream to read the profile from
     * @throws IOException if reading fails or a data line is malformed
     *         (no space separated count, or a count that is not an integer)
     */
    public void load( InputStream is )
        throws IOException
    {
        BufferedReader reader =
            new BufferedReader( new UnicodeReader( is , "UTF-8" ) ) ;
        String line ;

        // Reset everything, not only the map: otherwise normalization would
        // accumulate across loads and the cached arrays would still describe
        // the previous content.
        clear() ;
        int storeCount = -1 ;
        StringBuilder eliminators = new StringBuilder() ; // XXX ad hoc correction of reference

        while ( ( line = reader.readLine() ) != null )
        {
            line = line.trim() ;
            if ( line.length() < 2 )
            {
                continue ;
            }
            // # starts a comment line
            // - starts a correction line
            if ( line.charAt( 0 ) == '-' )
            {
                eliminators.append( line.charAt( 1 ) ) ;
            }
            else if ( line.startsWith( FINISHREAD_STR ) )
            {
                break ;
            }
            else if ( line.charAt( 0 ) != '#' )
            {
                int spacepos = line.indexOf( ' ' ) ;

                if ( spacepos < 0 )
                {
                    throw new IOException(
                        "Malformed ngram profile line (no count): " + line ) ;
                }
                String ngramsequence =
                    line.substring( 0 , spacepos ).trim().replace( '_' , ' ' ) ;

                if ( " ".equals( ngramsequence ) )
                {
                    // A lone separator is too poor a 1-gram to be useful,
                    // so it is thrown away.
                    continue ;
                }
                int count = parseCount( line , spacepos ) ;

                if ( line.startsWith( NORMALIZATION_STR ) )
                {
                    storeCount = count ;
                }
                else if ( ngramsequence.length() >= minNGramLength &&
                    ngramsequence.length() <= maxNGramLength &&
                    !containsAny( ngramsequence , eliminators ) )
                {
                    NGram en = new NGramImpl( ngramsequence , count ) ;

                    ngrams.put( ngramsequence , en ) ;
                    normalization += count ;
                }
            }
        }
        if ( storeCount != -1 )
        {
            if ( storeCount != normalization )
            {
                System.err.println(
                    " WARNING " + storeCount + " != " + normalization ) ;
            }
            // XXX Which one is better :-) normalization = storeCount;
        }
    }

    /**
     * Parse the count that follows the first space of a profile line.
     *
     * @param line the complete (trimmed) profile line
     * @param spacepos index of the first space in the line
     * @return the parsed count
     * @throws IOException if the text after the space is not an integer
     */
    private static int parseCount( String line , int spacepos )
        throws IOException
    {
        try
        {
            return Integer.parseInt( line.substring( spacepos + 1 ).trim() ) ;
        }
        catch ( NumberFormatException e )
        {
            IOException ioe = new IOException(
                "Malformed ngram profile line (bad count): " + line ) ;

            ioe.initCause( e ) ;
            throw ioe ;
        }
    }

    /**
     * Check whether a text contains at least one of the given characters.
     *
     * @param text the text to inspect
     * @param chars the characters to look for
     * @return true if any character of chars occurs in text
     */
    private static boolean containsAny( String text , CharSequence chars )
    {
        for ( int i = 0 ; i < chars.length() ; i++ )
        {
            if ( text.indexOf( chars.charAt( i ) ) >= 0 )
            {
                return true ;
            }
        }
        return false ;
    }

    /**
     * Create a new Language profile from (preferably quite large) text file
     *
     * The stream is read line by line and every line is passed to
     * {@link #analyze(CharSequence)} of a profile created with the default
     * ngram lengths. The stream is not closed by this method.
     *
     * @param name name of profile
     * @param is stream providing the text
     * @param encoding encoding of stream
     * @return the newly built profile
     * @throws IOException if reading from the stream fails
     */
    public static NGramProfileImpl createProfile( String name , InputStream is ,
        String encoding )
        throws IOException
    {
        NGramProfileImpl newProfile = new NGramProfileImpl( name ) ;
        BufferedReader reader =
            new BufferedReader( new UnicodeReader( is , encoding ) ) ;
        String line ;

        while ( ( line = reader.readLine() ) != null )
        {
            newProfile.analyze( line ) ;
        }
        return newProfile ;
    }

    /**
     * Writes NGramProfile content into OutputStream, content is outputted with
     * UTF-8 encoding
     *
     * The output consists of a comment line with the generation date, the
     * normalization line and one "ngram count" line per ngram in sorted
     * order. The stream is flushed but not closed.
     *
     * @param os Stream to output to
     * @throws IOException if writing to the stream fails
     */
    public void save( OutputStream os )
        throws IOException
    {
        Iterator i = getSorted() ;

        // Every line uses UTF-8; relying on the platform default charset for
        // the header lines would make the file encoding inconsistent.
        os.write(
            ( "# NgramProfile generated at " + new Date() + " for Language Identification\n" ).getBytes( "UTF-8" ) ) ;
        os.write(
            ( NORMALIZATION_STR + " " + normalization + "\n" ).getBytes( "UTF-8" ) ) ;
        while ( i.hasNext() )
        {
            NGram e = (NGram)i.next() ;
            String line = e + " " + e.getCount() + "\n" ;

            os.write( line.getBytes( "UTF-8" ) ) ;
        }
        os.flush() ;
    }

    /**
     * @return Returns the name.
     */
    public String getName()
    {
        return name ;
    }

    /**
     * @param name
     *          The name to set.
     */
    public void setName( String name )
    {
        this.name = name ;
    }
}