/**
 *  Copyright 2014 Diego Ceccarelli
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
*/

package it.cnr.isti.hpc.erd.cli;

import it.cnr.isti.hpc.cli.AbstractCommandLineInterface;
import it.cnr.isti.hpc.erd.WikipediaLabelToFreebaseRecord;
import it.cnr.isti.hpc.io.reader.RecordReader;
import it.cnr.isti.hpc.log.ProgressLogger;
import it.cnr.isti.hpc.mapdb.MapDB;

import java.io.File;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command line tool that reads {@link WikipediaLabelToFreebaseRecord}s from
 * an input file and stores them in a MapDB database file named
 * {@code mapdb} inside the directory given by {@code -dbdir}.
 *
 * <p>
 * Two collections are written:
 * <ul>
 * <li>{@code index}: clean Wikipedia label -&gt; Freebase id</li>
 * <li>{@code label}: Freebase id -&gt; label</li>
 * </ul>
 *
 * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
 *
 *         Created on Mar 15, 2014
 */
public class IndexWikipediaLabelToFreebaseIdCLI extends
		AbstractCommandLineInterface {

	private static final Logger logger = LoggerFactory
			.getLogger(IndexWikipediaLabelToFreebaseIdCLI.class);

	/** Names of the command line parameters handed to the superclass. */
	private static final String[] PARAMS = new String[] { "input", "dbdir" };

	/** Usage string handed to the superclass. */
	private static final String USAGE = "java -cp $jar it.cnr.isti.hpc.erd.cli.IndexWikipediaLabelToFreebaseIdCLI -input entity.tsv -dbdir index directory";

	/**
	 * Creates the command line interface for the given arguments.
	 *
	 * @param args
	 *            the raw command line arguments, forwarded to the superclass
	 *            together with the parameter names and the usage string
	 */
	public IndexWikipediaLabelToFreebaseIdCLI(String[] args) {
		super(args, PARAMS, USAGE);
	}

	/**
	 * Indexes every record of the input file into the {@code index} and
	 * {@code label} collections, commits the database and closes it.
	 *
	 * @param args
	 *            the command line arguments ({@code -input} and
	 *            {@code -dbdir})
	 * @throws IllegalStateException
	 *             if the database directory does not exist and cannot be
	 *             created
	 */
	public static void main(String[] args) {
		IndexWikipediaLabelToFreebaseIdCLI cli = new IndexWikipediaLabelToFreebaseIdCLI(
				args);
		RecordReader<WikipediaLabelToFreebaseRecord> reader = new RecordReader<WikipediaLabelToFreebaseRecord>(
				cli.getInput(), new WikipediaLabelToFreebaseRecord.Parser());

		File dbDir = new File(cli.getParam("dbdir"));
		// Fail fast with a clear message instead of letting the database
		// open fail later on a missing (or non-directory) path.
		if (!dbDir.isDirectory() && !dbDir.mkdirs()) {
			throw new IllegalStateException(
					"cannot create database directory "
							+ dbDir.getAbsolutePath());
		}
		File dbfile = new File(dbDir, "mapdb");

		// NOTE(review): the meaning of the boolean flag is defined by the
		// project's MapDB wrapper, not visible here - confirm before changing.
		MapDB db = new MapDB(dbfile, false);
		try {
			// NOTE(review): presumably reports progress every 100000 records.
			ProgressLogger pl = new ProgressLogger("indexed {} records", 100000);
			Map<String, String> map = db.getCollection("index");
			Map<String, String> labels = db.getCollection("label");
			for (WikipediaLabelToFreebaseRecord record : reader) {
				map.put(record.getCleanWikipediaLabel(), record.getFreebaseId());
				labels.put(record.getFreebaseId(), record.getLabel());
				pl.up();
			}
			// Commit only after all records were read successfully.
			db.commit();
		} finally {
			// Always release the database handle, even when indexing fails.
			db.close();
		}
		logger.info("file indexed, index in {}", cli.getParam("dbdir"));
	}
}