/*
Copyright (C) 2010 by
*
* Cam-Tu Nguyen
* ncamtu@ecei.tohoku.ac.jp or ncamtu@gmail.com
*
* Xuan-Hieu Phan
* pxhieu@gmail.com
*
* College of Technology, Vietnamese University, Hanoi
* Graduate School of Information Sciences, Tohoku University
*
* JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published
* by the Free Software Foundation; either version 2 of the License,
* or (at your option) any later version.
*
* JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JVnTextPro-v.2.0; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
package jvnsensegmenter;
import jmaxent.*;
import java.util.*;
import java.io.*;
// TODO: Auto-generated Javadoc
/**
 * Splits text into sentences, writing one sentence per line.
 *
 * <p>Candidate boundary positions are produced by
 * {@code FeatureGenerator.doFeatureGen} and each candidate is labelled by a
 * {@code Classification} model loaded from a model directory. Candidates
 * labelled {@link #positiveLabel} are treated as sentence ends.
 *
 * <p>Typical use: call {@link #init(String)} once, then call
 * {@link #senSegment(String)} or {@link #senSegment(String, List)} for each
 * piece of text. The class can also be run from the command line, see
 * {@link #main(String[])}.
 */
public class JVnSenSegmenter {

    /** Label value that marks a candidate position as a sentence boundary. */
    public static String positiveLabel = "y";

    /** Classifier that labels candidate positions; {@code null} until {@link #init(String)} is called. */
    public Classification classifier = null;

    /** Feature generator instance created by {@link #init(String)}. */
    public FeatureGenerator feaGen = null;

    /**
     * Loads the classification model and prepares this segmenter for use.
     *
     * @param modelDir the directory passed to the {@code Classification} constructor
     * @return {@code true} if the classifier was created and initialised,
     *         {@code false} if any exception occurred (the message is printed
     *         to standard output)
     */
    public boolean init(String modelDir) {
        try {
            classifier = new Classification(modelDir);
            feaGen = new FeatureGenerator();
            classifier.init();
            return true;
        } catch (Exception e) {
            System.out.println("Error while initilizing classifier: " + e.getMessage());
            return false;
        }
    }

    /**
     * Segments the given text into sentences separated by {@code '\n'}.
     *
     * <p>Runs of tabs, spaces and newlines are first collapsed to a single
     * character. If no candidate boundary is found the normalised text is
     * returned with a trailing newline appended.
     *
     * @param text the text to segment
     * @return the text with a space inserted before, and a newline inserted
     *         after, every boundary mark labelled {@link #positiveLabel}
     * @throws IllegalStateException if candidates were found but
     *         {@link #init(String)} has not successfully created the classifier
     */
    public String senSegment(String text) {
        // The repeated group keeps only its last iteration, so each whitespace
        // run collapses to the final character of that run.
        text = text.replaceAll("([\t \n])+", "$1");

        // doFeatureGen fills markList with the positions of candidate marks.
        List markList = new ArrayList();
        List data = FeatureGenerator.doFeatureGen(new HashMap(), text, markList, false);
        if (markList.isEmpty()) {
            return text + "\n";
        }

        if (classifier == null) {
            throw new IllegalStateException("JVnSenSegmenter.init(modelDir) must be called before senSegment");
        }
        List labels = classifier.classify(data);

        // Single pass over the marks: copy the text between marks unchanged and
        // decorate each mark according to its label. A builder avoids the
        // quadratic cost of repeated String concatenation.
        StringBuilder result = new StringBuilder(text.length() + 2 * markList.size());
        int copyFrom = 0;
        for (int i = 0; i < markList.size(); ++i) {
            int curPos = ((Integer) markList.get(i)).intValue();
            result.append(text, copyFrom, curPos);
            if (((String) labels.get(i)).equals(positiveLabel)) {
                result.append(' ').append(text.charAt(curPos)).append('\n');
            } else {
                result.append(text.charAt(curPos));
            }
            copyFrom = curPos + 1;
        }
        result.append(text, copyFrom, text.length());

        // Tidy up artefacts of the insertion above: leading spaces on new
        // lines, doubled newlines, and an ellipsis split as ".. .".
        String segmented = result.toString();
        segmented = segmented.replaceAll("\n ", "\n");
        segmented = segmented.replaceAll("\n\n", "\n");
        segmented = segmented.replaceAll("\\.\\. \\.", "...");
        return segmented;
    }

    /**
     * Segments the given text and stores each sentence as a separate element.
     *
     * @param text the text to segment
     * @param senList the list that receives the sentences; it is cleared
     *        first, and empty lines are not added
     */
    public void senSegment(String text, List senList) {
        senList.clear();
        StringTokenizer sentences = new StringTokenizer(senSegment(text), "\n");
        while (sentences.hasMoreTokens()) {
            senList.add(sentences.nextToken());
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Expects exactly four arguments:
     * {@code -modeldir <dir> -inputfile <file>} or
     * {@code -modeldir <dir> -inputdir <dir>}. The first argument is not
     * inspected; the second is used as the model directory. With
     * {@code -inputdir}, only files whose names end in {@code .txt} are
     * processed. Output for each input file is written next to it with a
     * {@code .sent} suffix.
     *
     * @param args the command-line arguments
     */
    public static void main(String args[]) {
        if (args.length != 4) {
            displayHelp();
            System.exit(1);
        }
        try {
            JVnSenSegmenter senSegmenter = new JVnSenSegmenter();
            if (!senSegmenter.init(args[1])) {
                // init has already printed the cause; there is no usable model
                // so processing any file would only fail later.
                System.exit(1);
            }

            String option = args[2];
            if (option.equalsIgnoreCase("-inputfile")) {
                senSegmentFile(args[3], args[3] + ".sent", senSegmenter);
            } else if (option.equalsIgnoreCase("-inputdir")) {
                // Segment only files whose names end with .txt
                File inputDir = new File(args[3]);
                File[] children = inputDir.listFiles(new FilenameFilter() {
                    public boolean accept(File dir, String name) {
                        return name.endsWith(".txt");
                    }
                });
                // listFiles returns null when the path is not a directory or
                // cannot be read.
                if (children == null) {
                    System.out.println("Cannot list input directory " + inputDir);
                    System.exit(1);
                }
                for (int i = 0; i < children.length; ++i) {
                    System.out.println("Segmenting sentences in " + children[i]);
                    senSegmentFile(children[i].getPath(), children[i].getPath() + ".sent", senSegmenter);
                }
            } else {
                displayHelp();
            }
        } catch (Exception e) {
            // Print the exception itself: getMessage() alone may be null.
            System.out.println("Error: " + e);
            return;
        }
    }

    /**
     * Segments one UTF-8 file line by line and writes the result as UTF-8.
     *
     * <p>Lines starting with {@code '#'} are copied unchanged, blank lines are
     * kept as blank lines, and every other line is segmented and followed by
     * an empty line. The accumulated output is trimmed before being written.
     * Failures are reported on standard output and do not propagate.
     *
     * @param infile path of the file to read
     * @param outfile path of the file to write
     * @param senSegmenter the initialised segmenter to use
     */
    private static void senSegmentFile(String infile, String outfile, JVnSenSegmenter senSegmenter) {
        BufferedReader in = null;
        BufferedWriter out = null;
        try {
            in = new BufferedReader(new InputStreamReader(
                    new FileInputStream(infile), "UTF-8"));
            out = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(outfile), "UTF-8"));

            StringBuilder text = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                if (line.equals("")) {
                    // blank line
                    text.append("\n");
                } else if (line.charAt(0) == '#') {
                    // comment line: copied through without segmentation
                    text.append(line).append("\n");
                } else {
                    text.append(senSegmenter.senSegment(line).trim()).append("\n\n");
                }
            }

            out.write(text.toString().trim());
            out.newLine();
            // Close explicitly on the success path so a failed flush is
            // reported by the catch below instead of being ignored.
            out.close();
        } catch (Exception e) {
            System.out.println("Error in sensegment file " + infile + ": " + e);
        } finally {
            closeQuietly(in);
            closeQuietly(out);
        }
    }

    /**
     * Closes a resource, ignoring {@code null} and any {@link IOException}.
     *
     * @param resource the resource to close, may be {@code null}
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Only reached while cleaning up; the primary failure, if any,
            // has already been reported by the caller.
        }
    }

    /**
     * Prints command-line usage to standard output.
     */
    public static void displayHelp() {
        System.out.println("Usage:");
        System.out.println("\tCase 1: JVnSenSegmenter -modeldir <model directory> -inputfile <input data file>");
        System.out.println("\tCase 2: JVnSenSegmenter -modeldir <model directory> -inputdir <input data directory>");
        System.out.println("Where:");
        System.out.println("\t<model directory> is the directory contain the model and option files");
        System.out.println("\t<input data file> is the file containing input text that need to");
        System.out.println("\thave sentences segmented (each sentence on a line)");
        System.out.println("\t<input data directory> is the directory containing multiple input .txt files");
        System.out.println();
    }
}