package edu.stanford.nlp.international.spanish.pipeline; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.stats.TwoDimensionalCounter; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeReader; import edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer; import edu.stanford.nlp.trees.international.spanish.SpanishTreebankLanguagePack; import edu.stanford.nlp.trees.international.spanish.SpanishXMLTreeReaderFactory; import edu.stanford.nlp.util.StringUtils; import java.io.*; import java.util.*; /** * A utility to build unigram part-of-speech tagging data from XML * corpus files from the AnCora corpus. * * The constructed tagger is used to tag the constituent tokens of * multi-word expressions, which have no tags in the AnCora corpus. * * For invocation options, run the program with no arguments. * * @author Jon Gauthier */ public class AnCoraPOSStats { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(AnCoraPOSStats.class); private final TwoDimensionalCounter<String, String> unigramTagger; private static final String ANCORA_ENCODING = "ISO8859_1"; private List<File> fileList; private String outputPath; public AnCoraPOSStats(List<File> fileList, String outputPath) { this.fileList = fileList; this.outputPath = outputPath; unigramTagger = new TwoDimensionalCounter<>(); } public void process() throws IOException { SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory(); Tree t; for (File file : fileList) { Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), ANCORA_ENCODING)); TreeReader tr = trf.newTreeReader(in); // Tree reading will implicitly perform tree normalization for us while ((t = tr.readTree()) != null) { // Update tagger with this tree List<CoreLabel> yield = t.taggedLabeledYield(); for (CoreLabel leafLabel : yield) { if (leafLabel.tag().equals(SpanishTreeNormalizer.MW_TAG)) continue; unigramTagger.incrementCount(leafLabel.word(), leafLabel.tag()); } } } } public TwoDimensionalCounter<String, String> getUnigramTagger() { return unigramTagger; } private static final String usage = String.format("Usage: java %s -o <output_path> file(s)%n%n", AnCoraPOSStats.class.getName()); private static final Map<String, Integer> argOptionDefs = new HashMap<>(); static { argOptionDefs.put("o", 1); } public static void main(String[] args) throws IOException { if (args.length < 1) { log.info(usage); System.exit(1); } Properties options = StringUtils.argsToProperties(args, argOptionDefs); String outputPath = options.getProperty("o"); if (outputPath == null) throw new IllegalArgumentException("-o argument (output path for built tagger) is required"); String[] remainingArgs = options.getProperty("").split(" "); List<File> fileList = new ArrayList<>(); for (String arg : remainingArgs) fileList.add(new File(arg)); AnCoraPOSStats stats = new AnCoraPOSStats(fileList, outputPath); stats.process(); ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(outputPath)); TwoDimensionalCounter<String, String> tagger = stats.getUnigramTagger(); oos.writeObject(tagger); System.out.printf("Wrote tagger to %s%n", outputPath); } }