package io.lumify.opennlpDictionary; import com.altamiracorp.bigtable.model.ModelSession; import io.lumify.core.cmdline.CommandLineBase; import io.lumify.core.model.ontology.Concept; import io.lumify.core.model.ontology.OntologyRepository; import io.lumify.core.user.User; import io.lumify.opennlpDictionary.model.DictionaryEntryRepository; import com.google.inject.Inject; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.io.FilenameUtils; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URLDecoder; import static com.google.common.base.Preconditions.checkNotNull; public class DictionaryImporter extends CommandLineBase { private ModelSession modelSession; private DictionaryEntryRepository dictionaryEntryRepository; private String directory; private String extension; public static void main(String[] args) throws Exception { int res = new DictionaryImporter().run(args); if (res != 0) { System.exit(res); } } @Override protected void processOptions(CommandLine cmd) throws Exception { super.processOptions(cmd); this.directory = cmd.getOptionValue("directory"); this.extension = cmd.getOptionValue("extension") == null ? "dict" : cmd.getOptionValue("extension"); } @Override protected Options getOptions() { Options options = super.getOptions(); options.addOption( OptionBuilder .withLongOpt("directory") .withDescription("The directory to search for dictionary files") .isRequired() .hasArg(true) .withArgName("dir") .create() ); options.addOption( OptionBuilder .withLongOpt("extension") .withDescription("Extension of dictionary files (default: dict)") .hasArg(true) .withArgName("extension") .create() ); return options; } @Override protected int run(CommandLine cmd) throws Exception { User user = getUser(); FileSystem fs = getFileSystem(); Path dictionaryPath = new Path(directory); FileStatus[] files = fs.listStatus(dictionaryPath, new DictionaryPathFilter(this.extension)); for (FileStatus fileStatus : files) { LOGGER.info("Importing dictionary file: " + fileStatus.getPath().toString()); String conceptName = FilenameUtils.getBaseName(fileStatus.getPath().toString()); conceptName = URLDecoder.decode(conceptName, "UTF-8"); Concept concept = getOntologyRepository().getConceptByIRI(conceptName); checkNotNull(concept, "Could not find concept with name " + conceptName); writeFile(fs.open(fileStatus.getPath()), conceptName, user); } modelSession.close(); return 0; } protected void writeFile(InputStream in, String concept, User user) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(in)); String line; while ((line = br.readLine()) != null) { dictionaryEntryRepository.saveNew(line, concept, user); } in.close(); } @Inject public void setModelSession(ModelSession modelSession) { this.modelSession = modelSession; } @Inject public void setDictionaryEntryRepository(DictionaryEntryRepository dictionaryEntryRepository) { this.dictionaryEntryRepository = dictionaryEntryRepository; } public static class DictionaryPathFilter implements PathFilter { private String extension; public DictionaryPathFilter(String extension) { this.extension = extension; } @Override public boolean accept(Path path) { return FilenameUtils.getExtension(path.toString()).equals(extension); } } }