VietnameseMaxentTagger.java example

Explorer
vnTagger-master
- src
  - vn
    - hus
      - nlp
        tagger
        IConstants.java
        TaggerOptions.java
        TaggerTester.java
        TaggerTrainer.java
        VietnameseMaxentTagger.java
        VietnameseMaxentTaggerProvider.java
        io
        IOutputer.java
        PlainOutputer.java
        XMLOutputer.java
        test
        VietnameseTaggerTest.java
        util
        ClosedTagDistribution.java
        CollectionNc.java
        CorpusSplitter.java
        TagDistribution.java
        TreeToTaggedSentence.java
package vn.hus.nlp.tagger;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import vn.hus.nlp.tagger.io.IOutputer;
import vn.hus.nlp.tagger.io.PlainOutputer;
import vn.hus.nlp.tagger.io.XMLOutputer;
import vn.hus.nlp.tokenizer.VietTokenizer;
import vn.hus.nlp.utils.UTF8FileUtility;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.tagger.maxent.TaggerConfig;

/**
 * @author LE HONG Phuong, phuonglh@gmail.com
 * <p>
 * Apr 9, 2009, 5:50:08 PM
 * <p>
 * The tagger for the Vietnamese language.
 * <p>
 * Updated: 04/2010.
 * 
 */
public class VietnameseMaxentTagger {
	
	/**
	 * The underlying tokenizer
	 */
	private static VietTokenizer tokenizer = null;

	/**
	 * The maxent tagger.
	 */
	private MaxentTagger tagger;
	
	
	/**
	 * Initializes the tagger using the default model resource.
	 */
	public VietnameseMaxentTagger() {
		tagger = VietnameseMaxentTaggerProvider.getInstance();
	}
	
	/**
	 * Initializes the tagger using a model file.
	 * @param modelFile a tagger model.
	 */
	public VietnameseMaxentTagger(String modelFile) {
		tagger = VietnameseMaxentTaggerProvider.getInstance(modelFile);
	}
	
	/**
	 * Tags a list of words. Each word in the list may be a normal Vietnamese word with spaces separating 
	 * syllables.  
	 * @param words a list of words.
	 * @return a list of tagged words
	 * @throws Exception 
	 */
	public List<WordTag> tagList(List<String> words) throws Exception {
		List<WordTag> tokens = new  ArrayList<WordTag>();
		// replace the space characters of the word in the word list by 
		// underscores chars and build a string containing all the words
		StringBuffer buffer = new StringBuffer(words.size()*5);
		for (String word : words) {
			buffer.append(word.replace(' ', '_'));
			buffer.append(" ");
		}
		// tag the tokenized string 
		// changed from version 2.0 of Stanford Maxent Tagger
		String taggedString = MaxentTagger.tagTokenizedString(buffer.toString());

		// split the tagged string using the word/tag delimiter
		String[] pairs = taggedString.split("\\s+");
		String word, tag;
		for (String pair : pairs) {
			String[] wt = pair.split(IConstants.DELIM);
			if (wt.length == 2) {
				word = wt[0];
				// recover the space character if user don't want underscores
				if (!TaggerOptions.UNDERSCORE) {
					word = wt[0].replaceAll("_", " ");
				}
				tag = wt[1];
			} else if (wt.length > 2) {
				// the case of date with / separator, for example 20/10/1980/N
				// the word is 20/10/1980
				word = wt[0];
				for (int j = 1; j < wt.length - 1; j++) {
					word += IConstants.DELIM + wt[j];
				}
				// the tag is the last part
				tag = wt[wt.length-1]; 
			} else { // wt.length < 2
				word = "";
				tag = "";
				System.err.println("There is an error.");
			}
			tokens.add(new WordTag(word, tag));
		}
		return tokens;
	}
	
	/**
	 * Tags a text.
	 * @param text a text to tag
	 * @return a list of word/tag pairs.
	 */
	public List<WordTag> tagText2(String text) {
		try {
			// tokenizer the reader
			String tokenizedString = null;
			if (TaggerOptions.SKIP_TOKENIZATION)
				tokenizedString = text;
			else
				tokenizedString = getTokenizer().segment(text);
			
			String[] arr = tokenizedString.split("\\s+");
			List<String> words = new ArrayList<String>(Arrays.asList(arr));
			return tagList(words);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}
	
	/**
	 * Tags a text
	 * @param text
	 * @return an array of tags
	 */
	public String[] tagText3(String text) {
		String[] arr = text.split("\\s+");
		List<String> words = new ArrayList<String>(Arrays.asList(arr));
		List<WordTag> list;
		try {
			list = tagList(words);
			List<String> tags = new ArrayList<String>();
			for (WordTag wt : list) {
				tags.add(wt.tag());
			}
			return tags.toArray(new String[list.size()]);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}
	
	/**
	 * Tags a text.
	 * @param text a text to tag.
	 * @see #tagText2(String)
	 * @return a string
	 */
	public String tagText(String text) {
		StringBuffer result = new StringBuffer(1024);
		List<WordTag> list = tagText2(text);
		for (WordTag wordTag : list) {
			result.append(wordTag.word());
			result.append(IConstants.DELIM);
			result.append(wordTag.tag());
			result.append(" ");
		}
		return result.toString().trim();
	}
	/**
	 * Tags a text file and write the result to an output file. All files are written in UTF-8 encoding.
	 * @param inputFile input file
	 * @param outputFile output file.
	 * @param outputer an outputer 
	 */
	public void tagFile(String inputFile, String outputFile, IOutputer outputer) {
		// get all lines of the input file
		String[] lines = UTF8FileUtility.getLines(inputFile);
		// create output file
		UTF8FileUtility.createWriter(outputFile);
		if (outputer instanceof XMLOutputer) {
			UTF8FileUtility.write("<doc>\n");
		}
		for (String line : lines) {
			List<WordTag> list = tagText2(line);
			UTF8FileUtility.write(outputer.output(list));
		}
		if (outputer instanceof XMLOutputer) {
			UTF8FileUtility.write("</doc>");
		}
		UTF8FileUtility.closeWriter();
	}
	
	/**
	 * Tags a text file and write the result to an output file using a plain output format.
	 * @param inputFile input file
	 * @param outputFile output file.
	 */
	public void tagFile(String inputFile, String outputFile) {
		if (TaggerOptions.PLAIN_TEXT_FORMAT) {
			tagFile(inputFile, outputFile, new PlainOutputer());
		} else {
			tagFile(inputFile, outputFile, new XMLOutputer());
		}
	}

	/**
	 * Gets the Vietnamese tokenizer.
	 * @return the tokenizer
	 */
	public static VietTokenizer getTokenizer() {
		if (tokenizer == null) {
			tokenizer = new VietTokenizer();
		}
		return tokenizer;
	}
	
	/**
	 * @return the tagger in use
	 */
	public MaxentTagger getTagger() {
		return tagger;
	}
	
	/**
	 * Tests a file.
	 * @param filename a file to test. This file contains words which are correctly 
	 * tagged by human annotators. 
	 */
	public void testFile(String filename) {
		TaggerConfig config;
		// create an array of arguments
		String[] arguments = {"-model", IConstants.DEFAULT_MODEL_FILE, "-testFile", filename};
		config = new TaggerConfig(arguments);
		if (config.getMode() == TaggerConfig.Mode.TEST) {
			try {
				// build the tagger
				MaxentTagger tagger = new MaxentTagger(config.getModel(), config);
				// test the file 
				tagger.runTestPublic(config);
			} catch (Exception e) {
				e.printStackTrace();
			} 
			
		}
		
	}
	/**
	 * Main entry of the package.
	 * @param args
	 */
	public static void main(String[] args) {
		//  
		Options options = new Options();
		// create boolean options
		Option underscoreOpt = new Option("u", "Use underscore character for separating syllables of words");
		options.addOption(underscoreOpt);
		//
		Option plainTextFormatOpt = new Option("p", "Use plain text format for saving tagging results.");
		options.addOption(plainTextFormatOpt);
		
		Option skipTokenizationOpt = new Option("st", "Skip tokenization (input must be already tokenized)");
		options.addOption(skipTokenizationOpt);
		
		// create obligatory input/output options
		Option inpOpt = new Option("i", true, "Input filename");
		options.addOption(inpOpt);
		
		Option outOpt = new Option("o", true, "Output filename");
		options.addOption(outOpt);
		
		// create a test option
		Option testOpt = new Option("t", true, "Test filename");
		options.addOption(testOpt);
		
		// a help formatter
		HelpFormatter formatter = new HelpFormatter();;
		
		if (args.length < 1) {
			// automatically generate the help statement
			formatter.printHelp( "VietnameseMaxentTagger", options );
			System.exit(1);
		}
		
		CommandLineParser commandLineParser = new PosixParser();
		try {
			CommandLine commandLine = commandLineParser.parse(options, args);
			
			String testFile = commandLine.getOptionValue("t");
			if (testFile == null) {
				// we are in tag mode
				if (commandLine.hasOption("u")) {
					TaggerOptions.UNDERSCORE = true;
				}
				
				if (commandLine.hasOption("p")) {
					TaggerOptions.PLAIN_TEXT_FORMAT = true;
				}
				
				if (commandLine.hasOption("st")) {
					TaggerOptions.SKIP_TOKENIZATION = true;
				}
				
				String inputFile = commandLine.getOptionValue("i");
				if (inputFile == null) {
					System.err.println("Input filename is required.");
					formatter.printHelp( "VietnameseMaxentTagger", options );
					System.exit(1);
				}
				String outputFile = commandLine.getOptionValue("o");
				if (outputFile == null) {
					System.err.println("Output filename is required.");
					formatter.printHelp( "VietnameseMaxentTagger", options );
					System.exit(1);
				}
				// create and run the tagger
				VietnameseMaxentTagger tagger = new VietnameseMaxentTagger();
				System.out.println("Tagging the file. Please wait...");
				tagger.tagFile(inputFile, outputFile);
			} else {
				// we are in test mode
				// create and run the tagger
				VietnameseMaxentTagger tagger = new VietnameseMaxentTagger();
				System.out.println("Testing the file. Please wait...");
				tagger.testFile(testFile);
			}
			System.err.println("Done.");
		} catch (ParseException exp) {
			System.err.println( "Parsing failed.  Reason: " + exp.getMessage());
		}
		
	}
	
}