/** * */ package com.maalaang.omtwitter.tools; import java.io.FileInputStream; import java.io.InputStreamReader; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.regex.Pattern; import com.maalaang.omtwitter.io.CollectionTextReader; import com.maalaang.omtwitter.io.CollectionTextWriter; import com.maalaang.omtwitter.ontology.DBPediaConstant; import com.maalaang.omtwitter.ontology.DBPediaDomainOntologyStat; /** * note: heap size -Xmx4000 * @author Sangwon Park * */ public class BuildDBPediaDomainOntologyStat { /** * @param args */ public static void main(String[] args) { try { DBPediaDomainOntologyStat stat = new DBPediaDomainOntologyStat(); Properties prop = new Properties(); prop.load(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")); Set<String> resources = CollectionTextReader.readSetString(prop.getProperty("resource.expanded.file")); Set<String> stopwords = CollectionTextReader.readSetString(prop.getProperty("stopword.set.file")); // properties Pattern propertyFilterPattern = Pattern.compile(prop.getProperty("property.filter.pattern")); Map<String,Integer> propertyFreqDomainMap = stat.propertyFreq(prop.getProperty("infobox.property.file"), resources, DBPediaConstant.DBPEDIA_INFOBOX_PROP_URI_PREFIX, propertyFilterPattern, false, prop.getProperty("property.freq.domain.file")); CollectionTextWriter.writeMapStringInteger(propertyFreqDomainMap, prop.getProperty("property.freq.domain.file"), true); Map<String,Integer> propertyFreqEntireMap = stat.propertyFreq(prop.getProperty("infobox.property.file"), null, DBPediaConstant.DBPEDIA_INFOBOX_PROP_URI_PREFIX, propertyFilterPattern, false, prop.getProperty("property.freq.entire.file")); CollectionTextWriter.writeMapStringInteger(propertyFreqEntireMap, prop.getProperty("property.freq.entire.file"), true); Map<String,Double> pisMap = stat.propertyImportanceScore(propertyFreqDomainMap, propertyFreqEntireMap, true); CollectionTextWriter.writeMapStringDouble(pisMap, prop.getProperty("property.importance.score.file"), true); Set<String> delegateProperties = stat.delegateProperties(pisMap, 0.1); CollectionTextWriter.writeSetString(delegateProperties, prop.getProperty("property.delegate.file"), true); // words Map<String,Integer> wordFreqDomainMap = stat.infoboxValueWordFreq(prop.getProperty("infobox.property.file"), stopwords, resources, prop.getProperty("word.freq.domain.file")); CollectionTextWriter.writeMapStringInteger(wordFreqDomainMap, prop.getProperty("word.freq.domain.file"), true); Map<String,Integer> wordFreqEntireMap = stat.infoboxValueWordFreq(prop.getProperty("infobox.property.file"), stopwords, null, prop.getProperty("word.freq.entire.file")); CollectionTextWriter.writeMapStringInteger(wordFreqEntireMap, prop.getProperty("word.freq.entire.file"), true); Map<String,Double> wrsMap = stat.infoboxValueWordRelevanceScore(wordFreqDomainMap, wordFreqEntireMap, true); CollectionTextWriter.writeMapStringDouble(wrsMap, prop.getProperty("word.relevance.score.file"), true); // value & properties Map<String,Integer> valuePropertyFreqMap = stat.infoboxValuePropertyFreq(prop.getProperty("domain.ontology.file"), prop.getProperty("value.property.freq.file"), delegateProperties, stopwords, Integer.parseInt(prop.getProperty("value.token.min")), Integer.parseInt(prop.getProperty("value.token.max"))); CollectionTextWriter.writeMapStringInteger(valuePropertyFreqMap, prop.getProperty("value.property.freq.file"), false); stat.infoboxValueToPropertyMap(prop.getProperty("value.property.freq.file"), prop.getProperty("value.to.property.map.file"), delegateProperties, false); } catch (Exception e) { e.printStackTrace(); } } }