package vn.hus.nlp.tagger; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import vn.hus.nlp.tagger.io.IOutputer; import vn.hus.nlp.tagger.io.PlainOutputer; import vn.hus.nlp.tagger.io.XMLOutputer; import vn.hus.nlp.tokenizer.VietTokenizer; import vn.hus.nlp.utils.UTF8FileUtility; import edu.stanford.nlp.ling.WordTag; import edu.stanford.nlp.tagger.maxent.MaxentTagger; import edu.stanford.nlp.tagger.maxent.TaggerConfig; /** * @author LE HONG Phuong, phuonglh@gmail.com * <p> * Apr 9, 2009, 5:50:08 PM * <p> * The tagger for the Vietnamese language. * <p> * Updated: 04/2010. * */ public class VietnameseMaxentTagger { /** * The underlying tokenizer */ private static VietTokenizer tokenizer = null; /** * The maxent tagger. */ private MaxentTagger tagger; /** * Initializes the tagger using the default model resource. */ public VietnameseMaxentTagger() { tagger = VietnameseMaxentTaggerProvider.getInstance(); } /** * Initializes the tagger using a model file. * @param modelFile a tagger model. */ public VietnameseMaxentTagger(String modelFile) { tagger = VietnameseMaxentTaggerProvider.getInstance(modelFile); } /** * Tags a list of words. Each word in the list may be a normal Vietnamese word with spaces separating * syllables. * @param words a list of words. * @return a list of tagged words * @throws Exception */ public List<WordTag> tagList(List<String> words) throws Exception { List<WordTag> tokens = new ArrayList<WordTag>(); // replace the space characters of the word in the word list by // underscores chars and build a string containing all the words StringBuffer buffer = new StringBuffer(words.size()*5); for (String word : words) { buffer.append(word.replace(' ', '_')); buffer.append(" "); } // tag the tokenized string // changed from version 2.0 of Stanford Maxent Tagger String taggedString = MaxentTagger.tagTokenizedString(buffer.toString()); // split the tagged string using the word/tag delimiter String[] pairs = taggedString.split("\\s+"); String word, tag; for (String pair : pairs) { String[] wt = pair.split(IConstants.DELIM); if (wt.length == 2) { word = wt[0]; // recover the space character if user don't want underscores if (!TaggerOptions.UNDERSCORE) { word = wt[0].replaceAll("_", " "); } tag = wt[1]; } else if (wt.length > 2) { // the case of date with / separator, for example 20/10/1980/N // the word is 20/10/1980 word = wt[0]; for (int j = 1; j < wt.length - 1; j++) { word += IConstants.DELIM + wt[j]; } // the tag is the last part tag = wt[wt.length-1]; } else { // wt.length < 2 word = ""; tag = ""; System.err.println("There is an error."); } tokens.add(new WordTag(word, tag)); } return tokens; } /** * Tags a text. * @param text a text to tag * @return a list of word/tag pairs. */ public List<WordTag> tagText2(String text) { try { // tokenizer the reader String tokenizedString = null; if (TaggerOptions.SKIP_TOKENIZATION) tokenizedString = text; else tokenizedString = getTokenizer().segment(text); String[] arr = tokenizedString.split("\\s+"); List<String> words = new ArrayList<String>(Arrays.asList(arr)); return tagList(words); } catch (Exception e) { e.printStackTrace(); } return null; } /** * Tags a text * @param text * @return an array of tags */ public String[] tagText3(String text) { String[] arr = text.split("\\s+"); List<String> words = new ArrayList<String>(Arrays.asList(arr)); List<WordTag> list; try { list = tagList(words); List<String> tags = new ArrayList<String>(); for (WordTag wt : list) { tags.add(wt.tag()); } return tags.toArray(new String[list.size()]); } catch (Exception e) { e.printStackTrace(); } return null; } /** * Tags a text. * @param text a text to tag. * @see #tagText2(String) * @return a string */ public String tagText(String text) { StringBuffer result = new StringBuffer(1024); List<WordTag> list = tagText2(text); for (WordTag wordTag : list) { result.append(wordTag.word()); result.append(IConstants.DELIM); result.append(wordTag.tag()); result.append(" "); } return result.toString().trim(); } /** * Tags a text file and write the result to an output file. All files are written in UTF-8 encoding. * @param inputFile input file * @param outputFile output file. * @param outputer an outputer */ public void tagFile(String inputFile, String outputFile, IOutputer outputer) { // get all lines of the input file String[] lines = UTF8FileUtility.getLines(inputFile); // create output file UTF8FileUtility.createWriter(outputFile); if (outputer instanceof XMLOutputer) { UTF8FileUtility.write("<doc>\n"); } for (String line : lines) { List<WordTag> list = tagText2(line); UTF8FileUtility.write(outputer.output(list)); } if (outputer instanceof XMLOutputer) { UTF8FileUtility.write("</doc>"); } UTF8FileUtility.closeWriter(); } /** * Tags a text file and write the result to an output file using a plain output format. * @param inputFile input file * @param outputFile output file. */ public void tagFile(String inputFile, String outputFile) { if (TaggerOptions.PLAIN_TEXT_FORMAT) { tagFile(inputFile, outputFile, new PlainOutputer()); } else { tagFile(inputFile, outputFile, new XMLOutputer()); } } /** * Gets the Vietnamese tokenizer. * @return the tokenizer */ public static VietTokenizer getTokenizer() { if (tokenizer == null) { tokenizer = new VietTokenizer(); } return tokenizer; } /** * @return the tagger in use */ public MaxentTagger getTagger() { return tagger; } /** * Tests a file. * @param filename a file to test. This file contains words which are correctly * tagged by human annotators. */ public void testFile(String filename) { TaggerConfig config; // create an array of arguments String[] arguments = {"-model", IConstants.DEFAULT_MODEL_FILE, "-testFile", filename}; config = new TaggerConfig(arguments); if (config.getMode() == TaggerConfig.Mode.TEST) { try { // build the tagger MaxentTagger tagger = new MaxentTagger(config.getModel(), config); // test the file tagger.runTestPublic(config); } catch (Exception e) { e.printStackTrace(); } } } /** * Main entry of the package. * @param args */ public static void main(String[] args) { // Options options = new Options(); // create boolean options Option underscoreOpt = new Option("u", "Use underscore character for separating syllables of words"); options.addOption(underscoreOpt); // Option plainTextFormatOpt = new Option("p", "Use plain text format for saving tagging results."); options.addOption(plainTextFormatOpt); Option skipTokenizationOpt = new Option("st", "Skip tokenization (input must be already tokenized)"); options.addOption(skipTokenizationOpt); // create obligatory input/output options Option inpOpt = new Option("i", true, "Input filename"); options.addOption(inpOpt); Option outOpt = new Option("o", true, "Output filename"); options.addOption(outOpt); // create a test option Option testOpt = new Option("t", true, "Test filename"); options.addOption(testOpt); // a help formatter HelpFormatter formatter = new HelpFormatter();; if (args.length < 1) { // automatically generate the help statement formatter.printHelp( "VietnameseMaxentTagger", options ); System.exit(1); } CommandLineParser commandLineParser = new PosixParser(); try { CommandLine commandLine = commandLineParser.parse(options, args); String testFile = commandLine.getOptionValue("t"); if (testFile == null) { // we are in tag mode if (commandLine.hasOption("u")) { TaggerOptions.UNDERSCORE = true; } if (commandLine.hasOption("p")) { TaggerOptions.PLAIN_TEXT_FORMAT = true; } if (commandLine.hasOption("st")) { TaggerOptions.SKIP_TOKENIZATION = true; } String inputFile = commandLine.getOptionValue("i"); if (inputFile == null) { System.err.println("Input filename is required."); formatter.printHelp( "VietnameseMaxentTagger", options ); System.exit(1); } String outputFile = commandLine.getOptionValue("o"); if (outputFile == null) { System.err.println("Output filename is required."); formatter.printHelp( "VietnameseMaxentTagger", options ); System.exit(1); } // create and run the tagger VietnameseMaxentTagger tagger = new VietnameseMaxentTagger(); System.out.println("Tagging the file. Please wait..."); tagger.tagFile(inputFile, outputFile); } else { // we are in test mode // create and run the tagger VietnameseMaxentTagger tagger = new VietnameseMaxentTagger(); System.out.println("Testing the file. Please wait..."); tagger.testFile(testFile); } System.err.println("Done."); } catch (ParseException exp) { System.err.println( "Parsing failed. Reason: " + exp.getMessage()); } } }