package de.spieleck.app.cngram;

/*	Please see the license information in the header below. */

/*
NGramJ - n-gram based text classification
Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this program (lesser.txt); if not, write to the
Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA.
*/

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.Date;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Arrays;
import java.util.Set;

import edu.northwestern.at.utils.UnicodeReader;

/**
 * Actual implementation of a NGramProfile
 *
 * Methods are provided to build new NGramProfiles profiles.
 *
 * <p>The class is not synchronized. The two cache arrays are volatile and
 * are only ever published in their fully sorted state, but concurrent
 * modification of a profile still requires external locking.
 *
 * @author frank nestel
 * @author $Author: nestefan $
 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
 */
@SuppressWarnings("unchecked")
public class NGramProfileImpl implements NGramProfile {

    /** separator char */
    public static final char SEPARATOR = '_';

    /** default min length of ngram. */
    public static final int DEFAULT_MIN_NGRAM_LENGTH = 1;

    /** default max length of ngram */
    public static final int DEFAULT_MAX_NGRAM_LENGTH = 5;

    /** Encoding used for both reading and writing profile files. */
    private static final String PROFILE_ENCODING = "UTF-8";

    /** Name this Profile. */
    private String name;

    /** Cache of all ngrams in their natural order; null when stale. */
    private volatile NGram[] sorted = null;

    /** Cache of all ngrams ordered by CHAR_SEQ_COMPARATOR; null when stale. */
    private volatile NGram[] ordered = null;

    /** Minimum length for an ngram. */
    private final int minNGramLength;

    /** Maximum length for an ngram. */
    private final int maxNGramLength;

    /** Norm factor for this profile (sum of all counted occurrences). */
    private int normalization = 0;

    /** Map from char sequence to the ngram that counts it. */
    private final HashMap<CharSequence, NGram> ngrams =
        new HashMap<CharSequence, NGram>();

    /** Optional whitelist; when set, only ngrams contained in it are added. */
    private Set restricted = null;

    /**
     * Create a new ngram profile with default lengths.
     *
     * @param name Name of profile
     */
    public NGramProfileImpl(String name) {
        this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
    }

    /**
     * Create a new ngram profile
     *
     * @param name Name of profile
     * @param minlen min length of ngram sequences
     * @param maxlen max length of ngram sequences
     */
    public NGramProfileImpl(String name, int minlen, int maxlen) {
        this.maxNGramLength = maxlen;
        this.minNGramLength = minlen;
        setName(name);
    }

    /**
     * Restrict the ngrams that analysis may add to this profile.
     *
     * @param restricted set of admissible ngrams, or null for no restriction
     */
    public void setRestricted(Set restricted) {
        this.restricted = restricted;
    }

    /**
     * Analyze a piece of text
     *
     * The text is lower-cased and split into words at every non-letter
     * character; each word is wrapped in {@link #SEPARATOR} characters
     * before its ngrams are counted.
     *
     * @param text the text to be analyzed
     */
    public void analyze(CharSequence text) {
        StringBuilder word = new StringBuilder(30).append(SEPARATOR);
        for (int i = 0; i < text.length(); i++) {
            char c = Character.toLowerCase(text.charAt(i));
            if (Character.isLetter(c)) {
                word.append(c);
            } else {
                addAnalyze(word);
                // Keep only the leading separator for the next word.
                word.setLength(1);
            }
        }
        addAnalyze(word);
    }

    /**
     * Close a pending word with a trailing separator and count its ngrams.
     * A buffer holding only the leading separator is ignored.
     */
    private void addAnalyze(StringBuilder word) {
        if (word.length() > 1) {
            word.append(SEPARATOR);
            addNGrams(word);
        }
    }

    /**
     * Remove all ngrams and reset the normalization and both caches.
     */
    public void clear() {
        ngrams.clear();
        normalization = 0;
        ordered = sorted = null;
    }

    /**
     * @return number of distinct ngrams in this profile
     */
    public int getCount() {
        return ngrams.size();
    }

    /**
     * @return the norm factor of this profile
     */
    public int getNormalization() {
        return normalization;
    }

    /**
     * Add ngrams from a single word to this profile
     *
     * Only lengths between the configured minimum and maximum that are
     * strictly shorter than the word itself are considered.
     *
     * @param word
     */
    public void addNGrams(CharSequence word) {
        for (int n = minNGramLength; n <= maxNGramLength && n < word.length(); n++) {
            addNGrams(word, n);
        }
    }

    /**
     * Count every substring of length n of the given word.
     *
     * @param word
     * @param n sequence length
     */
    private void addNGrams(CharSequence word, int n) {
        for (int i = 0, end = word.length() - n; i <= end; i++) {
            // CharSequence does not define equals/hashCode and subSequence
            // may return a view on mutable storage, so take an immutable
            // String snapshot before using it as a map key.
            CharSequence cs = word.subSequence(i, i + n).toString();
            NGram nge = ngrams.get(cs);
            if (nge == null) {
                nge = new NGramImpl(cs);
                if (restricted != null && !restricted.contains(nge)) {
                    continue;
                }
                ngrams.put(cs, nge);
                ordered = null; // A new element invalidates the ordered access
            }
            nge.inc();
            normalization++;
            sorted = null; // A changed count invalidates the sorted access
        }
    }

    /**
     * Iterate over all ngrams in their natural order.
     *
     * @return iterator over the (cached) sorted ngrams
     */
    public Iterator getSorted() {
        NGram[] snapshot = sorted;
        if (snapshot == null) {
            snapshot = (NGram[]) ngrams.values().toArray(NO_NGRAM);
            Arrays.sort(snapshot);
            // Publish only after sorting so the cache is never seen half-sorted.
            sorted = snapshot;
        }
        return Arrays.asList(snapshot).iterator();
    }

    /**
     * Look up the ngram for a char sequence.
     *
     * @param seq sequence to look for
     * @return the matching ngram, or null if this profile has none
     */
    public NGram get(CharSequence seq) {
        NGram[] snapshot = ordered;
        if (snapshot == null) {
            snapshot = (NGram[]) ngrams.values().toArray(NO_NGRAM);
            Arrays.sort(snapshot, CHAR_SEQ_COMPARATOR);
            // Publish only after sorting so the cache is never seen half-sorted.
            ordered = snapshot;
        }
        int pos = Arrays.binarySearch(snapshot, seq, CHAR_SEQ_COMPARATOR);
        return pos < 0 ? null : snapshot[pos];
    }

    /**
     * Return ngramprofile as text
     *
     * @return ngramprofile as text
     */
    @Override
    public String toString() {
        StringBuilder s = new StringBuilder(2000);
        s.append("NGramProfile: ").append(name).append('\n');
        for (Iterator i = getSorted(); i.hasNext();) {
            NGram entry = (NGram) i.next();
            s.append(entry).append(' ').append(entry.getCount()).append('\n');
        }
        return s.toString();
    }

    /**
     * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
     *
     * Any previous content of this profile, including its normalization
     * and cached orderings, is discarded first. Lines shorter than two
     * characters and lines starting with '#' are skipped, a line starting
     * with '-' registers its second character as an eliminator (ngrams
     * containing it are dropped), and a line starting with FINISHREAD_STR
     * ends reading. All other lines must have the form
     * "sequence count". The stream is not closed.
     *
     * @param is stream to read the profile from
     * @throws IOException if reading from the stream fails
     * @throws StringIndexOutOfBoundsException if a data line has no space
     * @throws NumberFormatException if the count of a data line is not an int
     */
    public void load(InputStream is) throws IOException {
        BufferedReader reader =
            new BufferedReader(new UnicodeReader(is, PROFILE_ENCODING));
        // Reset counts and caches too, not just the map; otherwise a reload
        // would add onto the old normalization and serve stale lookups.
        clear();
        int storeCount = -1;
        StringBuilder eliminators = new StringBuilder(); // XXX ad hoc correction of reference
        String line;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.length() < 2) {
                continue;
            }
            char first = line.charAt(0);
            if (first == '-') {
                // - starts a correction line
                eliminators.append(line.charAt(1));
            } else if (line.startsWith(FINISHREAD_STR)) {
                break;
            } else if (first != '#') {
                // # starts a comment line; everything else is data
                int spacepos = line.indexOf(' ');
                String ngramsequence =
                    line.substring(0, spacepos).trim().replace('_', ' ');
                if (" ".equals(ngramsequence)) {
                    // A lone space as a 1-gram is thrown away.
                    continue;
                }
                int count = Integer.parseInt(line.substring(spacepos + 1).trim());
                if (line.startsWith(NORMALIZATION_STR)) {
                    storeCount = count;
                } else if (ngramsequence.length() >= minNGramLength
                        && ngramsequence.length() <= maxNGramLength
                        && !containsAnyOf(ngramsequence, eliminators)) {
                    ngrams.put(ngramsequence, new NGramImpl(ngramsequence, count));
                    normalization += count;
                }
            }
        }
        if (storeCount != -1) {
            if (storeCount != normalization) {
                System.err.println(" WARNING " + storeCount + " != " + normalization);
            }
            // XXX Which one is better :-)
            normalization = storeCount;
        }
    }

    /** Tell whether the sequence contains at least one of the given chars. */
    private static boolean containsAnyOf(String sequence, CharSequence chars) {
        for (int i = 0; i < chars.length(); i++) {
            if (sequence.indexOf(chars.charAt(i)) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Create a new Language profile from (preferably quite large) text file
     *
     * The profile uses the default ngram lengths. The stream is not closed.
     *
     * @param name name of profile
     * @param is stream to read the text from
     * @param encoding encoding of stream
     * @return the profile built from every line of the stream
     * @throws IOException if reading from the stream fails
     */
    public static NGramProfileImpl createProfile(String name, InputStream is,
            String encoding) throws IOException {
        NGramProfileImpl newProfile = new NGramProfileImpl(name);
        BufferedReader reader =
            new BufferedReader(new UnicodeReader(is, encoding));
        String line;
        while ((line = reader.readLine()) != null) {
            newProfile.analyze(line);
        }
        return newProfile;
    }

    /**
     * Writes NGramProfile content into OutputStream, content is outputted with
     * UTF-8 encoding
     *
     * A comment line and the normalization line are written first, followed
     * by one "ngram count" line per ngram in sorted order. The stream is
     * flushed but not closed.
     *
     * @param os Stream to output to
     * @throws IOException
     */
    public void save(OutputStream os) throws IOException {
        Iterator i = getSorted();
        // Header lines use the same explicit encoding as the ngram lines so
        // the file does not depend on the platform default charset.
        os.write(("# NgramProfile generated at " + new Date()
            + " for Language Identification\n").getBytes(PROFILE_ENCODING));
        os.write((NORMALIZATION_STR + " " + normalization + "\n")
            .getBytes(PROFILE_ENCODING));
        while (i.hasNext()) {
            NGram e = (NGram) i.next();
            String line = e + " " + e.getCount() + "\n";
            os.write(line.getBytes(PROFILE_ENCODING));
        }
        os.flush();
    }

    /**
     * @return Returns the name.
     */
    public String getName() {
        return name;
    }

    /**
     * @param name
     *            The name to set.
     */
    public void setName(String name) {
        this.name = name;
    }
}