package de.spieleck.app.cngram ;
/* Please see the license information in the header below. */
/*
NGramJ - n-gram based text classification
Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program (lesser.txt); if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
import java.io.File ;
import java.io.FileInputStream ;
import java.io.FileReader ;
import java.io.BufferedInputStream ;
import java.io.BufferedReader ;
import java.io.Reader ;
import java.io.FileOutputStream ;
import java.io.InputStreamReader ;
import java.io.IOException ;
import java.io.PrintStream ;
import java.text.DecimalFormat ;
import edu.northwestern.at.utils.UnicodeReader;
/**
* Command-line interface that runs an n-gram analysis over submitted text;
* the results can be used for automatic language identification.
*
* @author Frank S. Nestel
* @author $Author: nestefan $
* @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
*/
public class RunNGram
{
    /** Command code: build a profile from a text file and save it. */
    public static final int CREATE = 1 ;
    /** Command code: print the metric difference between two text files. */
    public static final int SIMILARITY = 2 ;
    /** Command code: print the metric difference between a stored profile and a text file. */
    public static final int SCORE = 3 ;
    /** Command code: rank a text file against the known profiles using a given metric. */
    public static final int LANG = 4 ;
    /** Command code: print information about the known profiles. */
    public static final int TEST = 5 ;
    /** Command code: rank a text file using the ranker of the known profiles. */
    public static final int LANG2 = 6 ;
    /** Command code: like {@link #LANG2}, but the ranking is run a second time. */
    public static final int LANG2B = 7 ;
    /** Command code: rank every file named in a list file. */
    public static final int CHECK = 8 ;
    /** Command code: print the metric difference between two stored profiles. */
    public static final int PROFILES = 9 ;

    /** Format used for the scores in rank output. */
    public final static DecimalFormat DF = new DecimalFormat( "0.000" ) ;
    /** Format used for differences and score ratios. */
    public final static DecimalFormat DFE = new DecimalFormat( "0.0E0" ) ;

    /** Encoding used when none is supplied. */
    private static final String DEFAULT_ENCODING = "iso-8859-1" ;

    /**
     * Prints the command line synopsis and terminates the JVM with
     * exit code 42.
     *
     * @param out stream the synopsis is written to
     */
    private static void usage( PrintStream out )
    {
        out.println( "Usage: RunNGram commandset" ) ;
        out.println(
            "      [-create profilename(out) textfile [encoding]]" ) ;
        out.println(
            "   or [-similarity metricName textfile1 textfile2 [encoding]]" ) ;
        out.println(
            "   or [-score metricName profile-name textfile [encoding]]" ) ;
        out.println( "   or [-lang metricName textfile [encoding]]" ) ;
        out.println( "   or [-test ]" ) ;
        out.println( "   or [-lang2 textfile [encoding]]" ) ;
        out.println( "   or [-lang2b textfile [encoding]]" ) ;
        out.println( "   or [-check textlistFile]" ) ;
        out.println( "   or [-profiles metricName profile1 profile2]" ) ;
        System.exit( 42 ) ;
    }

    /**
     * Parses the command line and executes each command it contains, in
     * order. Every command consumes its operands and, except for
     * <code>-test</code>, an optional trailing encoding; when the next
     * argument is missing or starts with '-', the default encoding
     * "iso-8859-1" is used instead.
     *
     * @param args the command set, see {@link #usage(PrintStream)}
     * @throws Exception if a metric class cannot be instantiated or
     *         a file cannot be read or written
     */
    public static void main( String args[] )
        throws Exception
    {
        int command = 0 ;
        if ( args.length == 0 )
        {
            usage( System.out ) ;
        }
        for ( int i = 0 ; i < args.length ; i++ )
        {
            String profilename = "" ;
            String profilename2 = "" ;
            String textfile = "" ;
            String filename2 = "" ;
            String metricName = null ;
            NGramMetric metric = null ;
            String encoding ;
            String option = args[ i ] ;

            if ( "-c".equals( option ) || "-create".equals( option ) )
            {
                requireOperands( args , i , 2 ) ;
                command = CREATE ;
                profilename = args[ ++i ] ;
                textfile = args[ ++i ] ;
            }
            else if ( "-i".equals( option ) || "-similarity".equals( option ) )
            {
                requireOperands( args , i , 3 ) ;
                command = SIMILARITY ;
                metricName = args[ ++i ] ;
                metric = loadMetric( metricName ) ;
                textfile = args[ ++i ] ;
                filename2 = args[ ++i ] ;
            }
            else if ( "-s".equals( option ) || "-score".equals( option ) )
            {
                requireOperands( args , i , 3 ) ;
                command = SCORE ;
                metricName = args[ ++i ] ;
                metric = loadMetric( metricName ) ;
                profilename = args[ ++i ] ;
                textfile = args[ ++i ] ;
            }
            else if ( "-p".equals( option ) || "-profiles".equals( option ) )
            {
                requireOperands( args , i , 3 ) ;
                command = PROFILES ;
                metricName = args[ ++i ] ;
                metric = loadMetric( metricName ) ;
                profilename = args[ ++i ] ;
                profilename2 = args[ ++i ] ;
            }
            else if ( "-l".equals( option ) || "-lang".equals( option ) )
            {
                requireOperands( args , i , 2 ) ;
                command = LANG ;
                metricName = args[ ++i ] ;
                metric = loadMetric( metricName ) ;
                textfile = args[ ++i ] ;
            }
            else if ( "-l2".equals( option ) || "-lang2".equals( option ) )
            {
                requireOperands( args , i , 1 ) ;
                command = LANG2 ;
                textfile = args[ ++i ] ;
            }
            else if ( "-l2b".equals( option ) || "-lang2b".equals( option ) )
            {
                requireOperands( args , i , 1 ) ;
                command = LANG2B ;
                textfile = args[ ++i ] ;
            }
            else if ( "-x".equals( option ) || "-check".equals( option ) )
            {
                requireOperands( args , i , 1 ) ;
                command = CHECK ;
                textfile = args[ ++i ] ;
            }
            else if ( "-t".equals( option ) || "-test".equals( option ) )
            {
                command = TEST ;
            }
            else
            {
                usage( System.err ) ;
            }

            // The optional encoding is the NEXT argument, so that is the one
            // that must be inspected; an argument starting with '-' begins
            // the following command and must not be consumed here.
            // -test never takes an encoding.
            if ( command != TEST && i + 1 < args.length &&
                !args[ i + 1 ].startsWith( "-" ) )
            {
                encoding = args[ ++i ] ;
            }
            else
            {
                encoding = DEFAULT_ENCODING ;
            }

            if ( command == TEST )
            {
                NGramProfiles npi = new NGramProfiles() ;
                npi.info() ;
            }
            else if ( command == LANG2 || command == LANG2B )
            {
                long t1 = System.currentTimeMillis() ;
                NGramProfiles nps = new NGramProfiles() ;
                NGramProfiles.Ranker ranker = nps.getRanker() ;
                accountFile( ranker , textfile , encoding ) ;
                NGramProfiles.RankResult res = ranker.getRankResult() ;
                long t2 = System.currentTimeMillis() ;
                printRankResult( "speed" , res , t2 - t1 ) ;
                if ( command == LANG2B )
                {
                    // Second pass over the same file with a reset ranker.
                    t1 = t2 ;
                    ranker.reset() ;
                    accountFile( ranker , textfile , encoding ) ;
                    res = ranker.getRankResult() ;
                    t2 = System.currentTimeMillis() ;
                    printRankResult( "speed" , res , t2 - t1 ) ;
                }
            }
            else if ( command == CHECK )
            {
                NGramProfiles npi = new NGramProfiles() ;
                NGramProfiles.Ranker ranker = npi.getRanker() ;
                BufferedReader br =
                    new BufferedReader( new FileReader( new File( textfile ) ) ) ;
                try
                {
                    String line ;
                    while ( ( line = br.readLine() ) != null )
                    {
                        line = line.trim() ;
                        // Skip blank lines and '#' comment lines.
                        if ( line.length() == 0 || line.charAt( 0 ) == '#' )
                        {
                            continue ;
                        }
                        // Each entry has the form "file;encoding".
                        String[] ss = line.split( ";" ) ;
                        if ( ss.length == 0 )
                        {
                            // The line consisted of separators only.
                            continue ;
                        }
                        String listedFile = ss[ 0 ] ;
                        String listedEncoding =
                            ss.length > 1 ? ss[ 1 ] : DEFAULT_ENCODING ;
                        long t1 = System.currentTimeMillis() ;
                        ranker.reset() ;
                        accountFile( ranker , listedFile , listedEncoding ) ;
                        long t2 = System.currentTimeMillis() ;
                        NGramProfiles.RankResult res = ranker.getRankResult() ;
                        printRankResult( listedFile , res , t2 - t1 ) ;
                    }
                }
                finally
                {
                    br.close() ;
                }
            }
            else if ( command == PROFILES )
            {
                NGramProfileImpl comp1 =
                    loadProfile( profilename , new File( profilename ) ) ;
                NGramProfileImpl comp2 =
                    loadProfile( profilename2 , new File( profilename2 ) ) ;
                System.out.println(
                    "diff(" + profilename + ":" + profilename2 + ")=" +
                    DFE.format( metric.diff( comp1 , comp2 ) ) ) ;
            }
            else
            {
                // The remaining commands all start from a profile of textfile.
                long t1 = System.currentTimeMillis() ;
                NGramProfileImpl newProf = create( textfile , encoding ) ;
                long t2 = System.currentTimeMillis() ;
                switch ( command )
                {
                    case CREATE :
                    {
                        String fname =
                            profilename + "." +
                            NGramProfile.NGRAM_PROFILE_EXTENSION ;
                        FileOutputStream fos =
                            new FileOutputStream( new File( fname ) ) ;
                        try
                        {
                            newProf.save( fos ) ;
                        }
                        finally
                        {
                            fos.close() ;
                        }
                        System.out.println(
                            "new profile '" + fname + "' was created." ) ;
                        break ;
                    }
                    case SIMILARITY :
                    {
                        NGramProfile newProf2 = create( filename2 , encoding ) ;
                        System.out.println(
                            "Difference is " +
                            DFE.format( metric.diff( newProf , newProf2 ) ) ) ;
                        break ;
                    }
                    case SCORE :
                    {
                        File stored =
                            new File(
                                profilename + "." +
                                NGramProfile.NGRAM_PROFILE_EXTENSION ) ;
                        NGramProfileImpl compare =
                            loadProfile( profilename , stored ) ;
                        System.out.println(
                            "Score (" + profilename + ") is " +
                            DFE.format( metric.diff( compare , newProf ) ) ) ;
                        break ;
                    }
                    case LANG :
                    {
                        NGramProfiles nps = new NGramProfiles() ;
                        // dt1 is the time spent building the profile; the
                        // timer is restarted to measure the ranking alone.
                        long dt1 = t2 - t1 ;
                        t1 = System.currentTimeMillis() ;
                        NGramProfiles.RankResult res =
                            nps.rank( metric , newProf ) ;
                        t2 = System.currentTimeMillis() ;
                        // Label the output with the unqualified metric class name.
                        int ppos = metricName.lastIndexOf( "." ) ;
                        printRankResult(
                            metricName.substring( ppos + 1 ) + "(" + dt1 + ")" ,
                            res , t2 - t1 ) ;
                        break ;
                    }
                }
            }
        }
    }

    /**
     * Prints the usage text and exits when fewer than <code>count</code>
     * arguments follow position <code>i</code>.
     *
     * @param args  the command line
     * @param i     index of the option currently being parsed
     * @param count number of operands the option requires
     */
    private static void requireOperands( String[] args , int i , int count )
    {
        if ( i + count >= args.length )
        {
            usage( System.err ) ;
        }
    }

    /**
     * Instantiates the metric class with the given fully qualified name
     * through its no-argument constructor.
     *
     * @param metricName fully qualified class name of the metric
     * @return a new instance of that class
     * @throws Exception if the class cannot be found or instantiated
     */
    private static NGramMetric loadMetric( String metricName )
        throws Exception
    {
        return (NGramMetric)Class.forName( metricName ).newInstance() ;
    }

    /**
     * Opens the text file with {@link #createReader(String,String)}, feeds
     * it to the ranker and closes the reader again.
     * NOTE(review): assumes <code>account</code> has finished reading when
     * it returns - confirm against the Ranker implementation.
     *
     * @param ranker   ranker that accounts the text
     * @param textfile path of the text file
     * @param encoding encoding passed on to the reader
     * @throws Exception if the file cannot be opened or read
     */
    private static void accountFile( NGramProfiles.Ranker ranker ,
        String textfile , String encoding )
        throws Exception
    {
        Reader reader = createReader( textfile , encoding ) ;
        try
        {
            ranker.account( reader ) ;
        }
        finally
        {
            reader.close() ;
        }
    }

    /**
     * Creates a profile with the given name and loads its content from
     * the given file; the file is closed afterwards.
     *
     * @param name name handed to the new profile
     * @param file file the profile is loaded from
     * @return the loaded profile
     * @throws Exception if the file cannot be opened or loaded
     */
    private static NGramProfileImpl loadProfile( String name , File file )
        throws Exception
    {
        FileInputStream fis = new FileInputStream( file ) ;
        try
        {
            NGramProfileImpl profile = new NGramProfileImpl( name ) ;
            profile.load( fis ) ;
            return profile ;
        }
        finally
        {
            fis.close() ;
        }
    }

    /**
     * Opens a text file as a {@link UnicodeReader} on top of a buffered
     * file stream. The caller is responsible for closing the reader.
     *
     * @param textfile path of the file to read
     * @param encoding encoding handed to the UnicodeReader
     * @return a reader over the file
     * @throws IOException if the file cannot be opened
     */
    public static Reader createReader( String textfile , String encoding )
        throws IOException
    {
        return new UnicodeReader(
            new BufferedInputStream( new FileInputStream( textfile ) ) ,
            encoding ) ;
    }

    /**
     * Prints one line summarising a rank result: the names and scores at
     * positions 0, 1, 2 and -1, the ratios of the scores at positions 1
     * and -1 to the score at position 0, and the elapsed time.
     * NOTE(review): position -1 presumably addresses the last entry -
     * confirm against RankResult.
     *
     * @param msg label printed in front of the result
     * @param res rank result to print
     * @param dt  elapsed time, printed as <code>dt=</code>
     */
    public static void printRankResult( String msg , NGramProfiles.RankResult res ,
        long dt )
    {
        System.out.println(
            msg + ": " + res.getName( 0 ) + ":" +
            DF.format( res.getScore( 0 ) ) + " " + res.getName( 1 ) + ":" +
            DF.format( res.getScore( 1 ) ) + " " + res.getName( 2 ) + ":" +
            DF.format( res.getScore( 2 ) ) + " .. " + res.getName( -1 ) + ":" +
            DF.format( res.getScore( -1 ) ) + " |" +
            DFE.format( res.getScore( 1 ) / res.getScore( 0 ) ) + " |" +
            DFE.format( res.getScore( -1 ) / res.getScore( 0 ) ) + " dt=" + dt ) ;
    }

    /**
     * Builds a profile from the content of a text file. The file is
     * closed before returning, also when profile creation fails.
     *
     * @param textfile path of the file; also used as the profile name
     * @param encoding encoding handed to the profile creation
     * @return the profile created from the file
     * @throws IOException if the file cannot be opened or read
     */
    public static NGramProfileImpl create( String textfile , String encoding )
        throws IOException
    {
        FileInputStream fis = new FileInputStream( new File( textfile ) ) ;
        try
        {
            return NGramProfileImpl.createProfile( textfile , fis , encoding ) ;
        }
        finally
        {
            fis.close() ;
        }
    }
}