/** * Copyright (c) 2014, the LESK-WSD-DSM AUTHORS. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the University of Bari nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 * */ package di.uniba.it.exec; import di.uniba.it.wsd.OldXMLTextReader; import di.uniba.it.wsd.PlainTextReader; import di.uniba.it.wsd.RevisedLesk; import di.uniba.it.wsd.Utils; import di.uniba.it.wsd.XMLTextReader; import di.uniba.it.wsd.data.SynsetOut; import di.uniba.it.wsd.data.TextReader; import di.uniba.it.wsd.data.Token; import di.uniba.it.wsd.dsm.DataVectorStore; import di.uniba.it.wsd.dsm.LuceneVectorStore; import di.uniba.it.wsd.dsm.VectorStore; import di.uniba.it.wsd.SenseFreqAPI; import it.uniroma1.lcl.jlt.util.Language; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.util.List; import java.util.Properties; import java.util.logging.Level; import java.util.logging.Logger; /** * * @author Pierpaolo Basile pierpaolo.basile@gmail.com */ public class RunWSD { private static final String OUTFORMAT_PLAIN = "plain"; private static final String OUTFORMAT_TASK = "task"; /** * * README.md file contains information about main arguments * @param args the command line arguments */ public static void main(String[] args) { try { Logger.getLogger(RunWSD.class.getName()).log(Level.INFO, "Init..."); TextReader reader; Properties props = Utils.parseCmd(args); if (!props.containsKey("-i")) { System.exit(1); } int mode = TextReader.SENTENCE_MODE; String modeS = props.getProperty("-cm"); if (modeS != null) { switch (modeS) { case "sent": mode = TextReader.SENTENCE_MODE; break; case "doc": mode = TextReader.DOC_MODE; break; case "text": mode = TextReader.TEXT_MODE; break; default: Logger.getLogger(RunWSD.class.getName()).log(Level.WARNING, "Mode {0} not valid...using default SENTENCE_MODE", modeS); break; } } String fileType = props.getProperty("-f"); if (fileType != null && fileType.equals("xml")) { reader = new XMLTextReader(new File(props.getProperty("-i")), mode); } else if (fileType != null && fileType.equals("oldxml")) { reader = new OldXMLTextReader(new File(props.getProperty("-i")), mode); } else { reader = new PlainTextReader(new File(props.getProperty("-i")), mode); } reader.openTextReader(); String langS = props.getProperty("-lang"); Language language = Language.EN; if (langS != null) { switch (langS) { case "en": language = Language.EN; break; case "es": language = Language.ES; break; case "de": language = Language.DE; break; case "it": language = Language.IT; break; case "fr": language = Language.FR; break; default: Logger.getLogger(RunWSD.class.getName()).log(Level.WARNING, "Language {0} not valid...using default language {1}", new Object[]{langS, language}); break; } } String dsmFilename = props.getProperty("-dsm"); RevisedLesk wsd; if (dsmFilename == null) { wsd = new RevisedLesk(language); wsd.setStemming(true); } else { String dsmType = props.getProperty("-dsmType"); VectorStore dsm = null; if (dsmType != null && dsmType.equals("java")) { dsm = new DataVectorStore(); } else { dsm = new LuceneVectorStore(); } dsm.init(new File(dsmFilename)); wsd = new RevisedLesk(language, dsm); wsd.setStemming(false); } String linComW = props.getProperty("-lc"); if (linComW != null) { String[] split = linComW.split(":"); if (split.length == 2) { try { wsd.setWeightWsd(Double.parseDouble(split[0])); wsd.setWeightSd(Double.parseDouble(split[1])); } catch (NumberFormatException nfex) { throw new Exception("Not valid weight", nfex); } } else { throw new Exception("Not valid weight"); } } String sdType = props.getProperty("-sdType"); if (sdType != null) { switch (sdType) { case "prob": wsd.setSdType(RevisedLesk.SD_PROB); break; case "prob_cross": wsd.setSdType(RevisedLesk.SD_PROB_CROSS); break; case "occ": wsd.setSdType(RevisedLesk.SD_OCC); break; default: throw new Exception("Not valid synset distribution type: " + sdType); } } String wikiType = props.getProperty("-wikiType"); if (wikiType != null) { switch (wikiType) { case "lev": wsd.setWikiType(RevisedLesk.WIKI_LEV); break; case "uni": wsd.setWikiType(RevisedLesk.WIKI_UNI); break; default: throw new Exception("Not valid wiki score type: " + sdType); } } String senseFreqDir = props.getProperty("-sc"); if (senseFreqDir != null) { SenseFreqAPI sfapi = new SenseFreqAPI(new File(senseFreqDir + "/sense.freq"), new File(senseFreqDir + "/sense.occ")); sfapi.init(); wsd.setSenseFreq(sfapi); } String projDepth = props.getProperty("-depth"); if (projDepth != null) { wsd.setMaxDepth(Integer.parseInt(projDepth)); } String scoreGloss = props.getProperty("-sg"); if (scoreGloss != null) { wsd.setScoreGloss(Boolean.parseBoolean(scoreGloss)); } String sf = props.getProperty("-sf"); if (sf == null) { wsd.setOutType(RevisedLesk.OUT_BABELNET); } else if (sf.equals("wn")) { wsd.setOutType(RevisedLesk.OUT_WORDNET); } else if (sf.equals("bn")) { wsd.setOutType(RevisedLesk.OUT_BABELNET); } else { throw new Exception("Synset format not valid: " + sf); } String cs = props.getProperty("-c"); if (cs != null) { if (cs.equalsIgnoreCase("max")) { wsd.setContextSize(Integer.MAX_VALUE); } else { wsd.setContextSize(Integer.parseInt(cs)); } } else { Logger.getLogger(RunWSD.class.getName()).log(Level.WARNING, "Context size {0} not valid...using default context size=5", cs); wsd.setContextSize(5); } String outFormat = OUTFORMAT_PLAIN; String of = props.getProperty("-of"); if (of != null) { if (outFormat.equals(OUTFORMAT_PLAIN) || outFormat.equals(OUTFORMAT_TASK)) { outFormat = of; } else { throw new Exception("Output format not valid: " + of); } } String stemming = props.getProperty("-stem"); if (stemming != null) { wsd.setStemming(Boolean.parseBoolean(stemming)); } else { wsd.setStemming(false); } String outFile = props.getProperty("-o"); BufferedWriter writer = null; if (outFile != null) { writer = new BufferedWriter(new FileWriter(outFile)); } wsd.init(); Logger.getLogger(RunWSD.class.getName()).log(Level.INFO, "Starting disambiguation..."); List<Token> tokens = reader.getTokenList(); while (tokens != null) { wsd.disambiguate(tokens); if (writer == null) { for (Token token : tokens) { System.out.println(token.print()); } System.out.println(); } else { switch (outFormat) { case OUTFORMAT_PLAIN: for (Token token : tokens) { writer.append(token.print()); writer.newLine(); } writer.newLine(); writer.flush(); break; case OUTFORMAT_TASK: for (Token token : tokens) { String id = token.getId(); if (id != null && token.isToDisambiguate() && !token.getSynsetList().isEmpty()) { writer.append(id.substring(0, id.indexOf("."))).append(" "); writer.append(id).append(" "); List<SynsetOut> out = token.getSynsetList(); if (out.size() == 1) { writer.append(out.get(0).getSynset()).append(" "); } else { double variance = wsd.getVariance(out); if (out.get(out.size() - 1).getScore() - out.get(out.size() - 2).getScore() <= variance) { writer.append(out.get(out.size() - 1).getSynset()).append(" ").append(out.get(out.size() - 2).getSynset()).append(" "); } else { writer.append(out.get(out.size() - 1).getSynset()).append(" "); } } writer.append("!! lemma=" + token.getLemma()).append("#").append(token.getPos().name()); writer.newLine(); } } //write sentence to file writer.flush(); break; } } tokens = reader.getTokenList(); } reader.closeTextReader(); if (writer != null) { writer.close(); } System.out.println(wsd.getExecStats()); wsd.close(); } catch (Exception ex) { Logger.getLogger(RunWSD.class.getName()).log(Level.SEVERE, null, ex); } } }