/** * */ package com.maalaang.omtwitter.ontology; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.StringReader; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.RDFNode; import com.hp.hpl.jena.rdf.model.ResIterator; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.Statement; import com.hp.hpl.jena.rdf.model.StmtIterator; import com.maalaang.omtwitter.io.CollectionTextWriter; import com.maalaang.omtwitter.text.InfoboxValueTokenizer; import com.maalaang.omtwitter.text.WordPattern; /** * @author Sangwon Park * */ public class DBPediaDomainOntologyStat { private final int STMT_SEGMENT_SIZE = 2000000; private final int INFOBOX_VALUE_WORD_LEN_MAX = 20; private final int INFOBOX_VALUE_WORD_LEN_MIN = 2; private final int INFOBOX_PROP_NAME_LEN_MIN = 2; private final String STMT_LANG = "en"; private Logger logger = null; public DBPediaDomainOntologyStat() { logger = Logger.getLogger(getClass()); } /** * Note: Words are extracted from the literals of infobox properties by InfoboxValueTokenizer.tokenizeToWord() * @param infoboxPropertiesFile * @param stopwords * @param resources * @param infoboxValueUnigramFreqMapFile * @return * @throws IOException */ public Map<String,Integer> infoboxValueWordFreq(String infoboxPropertiesFile, Set<String> stopwords, Set<String> resources, String infoboxValueUnigramFreqMapFile) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(infoboxPropertiesFile), "UTF-8")); String line = null; int segmentCnt = 0; long tokenCnt = 0; long literalCnt = 0; long statementCnt = 0; boolean doProcessing = true; while (doProcessing) { Model model = ModelFactory.createDefaultModel(); HashMap<String,Integer> map = new HashMap<String,Integer>(); int remainedStmtNum = STMT_SEGMENT_SIZE; while ((line = br.readLine()) != null && remainedStmtNum > 0) { try { statementCnt++; model.read(new StringReader(line), null, "N-TRIPLE"); remainedStmtNum--; } catch (Exception e) { logger.info("exception on reading (skipped) - " + line); } } if (line == null) { doProcessing = false; } StmtIterator iter = model.listStatements(); while (iter.hasNext()) { Statement stmt = iter.next(); if (resources != null && !resources.contains(stmt.getSubject().getURI())) { continue; } RDFNode obj = stmt.getObject(); if (obj.isLiteral()) { literalCnt++; String[] tokens = InfoboxValueTokenizer.tokenizeToWord(stmt.getString().toLowerCase()); for (String t : tokens) { if (t.length() > INFOBOX_VALUE_WORD_LEN_MAX || t.length() < INFOBOX_VALUE_WORD_LEN_MIN) { continue; } if (stopwords.contains(t)) { continue; } Integer value = map.get(t); if (value == null) { map.put(t, 1); } else { map.put(t, ++value); } tokenCnt++; } } } String tmpFileName = String.format("%s.%03d", infoboxValueUnigramFreqMapFile, segmentCnt++); CollectionTextWriter.writeMapStringInteger(map, tmpFileName, false); logger.info("write temp file - " + tmpFileName); } br.close(); Map<String,Integer> map = freqMapFileMerge(infoboxValueUnigramFreqMapFile, segmentCnt, true); logger.info("total " + statementCnt + " statements processed"); logger.info("total " + tokenCnt + " tokens observed"); logger.info("total " + literalCnt + " literals observed"); return map; } private Map<String,Integer> freqMapFileMerge(String infoboxValueUnigramFreqMapFile, int segmentNumber, boolean deleteTmpFiles) throws IOException { String line = null; HashMap<String,Integer> map = new HashMap<String,Integer>(); for (int i = 0; i < segmentNumber; i++) { String tmpFileName = String.format("%s.%03d", infoboxValueUnigramFreqMapFile, i); File tmpFile = new File(tmpFileName); logger.info("merge data in " + tmpFileName); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(tmpFile), "UTF-8")); while ((line = br.readLine()) != null) { String[] tokens = line.split("\t"); Integer value = map.get(tokens[0]); if (value == null) { map.put(tokens[0], Integer.parseInt(tokens[1])); } else { map.put(tokens[0], value + Integer.parseInt(tokens[1])); } } br.close(); if (deleteTmpFiles) { logger.info("delete temp file - " + tmpFileName); tmpFile.delete(); } } return map; } public Map<String,Double> infoboxValueWordRelevanceScore(Map<String,Integer> wordFreqDomainMap, Map<String,Integer> wordFreqEntireMap, boolean normalize) { Map<String,Double> map = new HashMap<String,Double>(); Collection<Integer> values = null; int freqSumDomain = 0; int freqSumEntire = 0; double maxValue = 0.0; values = wordFreqDomainMap.values(); for (Integer v : values) { freqSumDomain += v; } values = wordFreqEntireMap.values(); for (Integer v : values) { freqSumEntire += v; } Set<Entry<String,Integer>> wordFreqDomainSet = wordFreqDomainMap.entrySet(); for (Entry<String,Integer> e : wordFreqDomainSet) { String key = e.getKey(); Integer o1 = e.getValue(); Integer o2 = wordFreqEntireMap.get(key); if (o2 == null) { logger.warn("'" + key + "' doesn't exist in the infobox value map " + wordFreqEntireMap); } else if (o2 < o1) { logger.warn("the infobox value maps are not consistent about '" + key + "'"); } double value = ((double) o1 / (double) freqSumDomain) * Math.log((double)(freqSumEntire - freqSumDomain) / (double)(o2 - o1 + 1)); if (value > maxValue) { maxValue = value; } map.put(key, value); } if (normalize) { Set<String> keys = map.keySet(); for (String key : keys) { map.put(key, map.get(key) / maxValue); } } return map; } public Map<String,Integer> propertyFreq(String infoboxPropertiesFile, Set<String> resources, String prefix, Pattern filterPattern, boolean useLocalname, String propertyFreqMapFile) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(infoboxPropertiesFile), "UTF-8")); String line = null; int segmentCnt = 0; long statementCnt = 0; boolean doProcessing = true; HashSet<String> propSet = new HashSet<String>(); while (doProcessing) { Model model = ModelFactory.createDefaultModel(); HashMap<String,Integer> map = new HashMap<String,Integer>(); int remainedStmtNum = STMT_SEGMENT_SIZE; while ((line = br.readLine()) != null && remainedStmtNum > 0) { try { statementCnt++; model.read(new StringReader(line), null, "N-TRIPLE"); remainedStmtNum--; } catch (Exception e) { logger.info("exception on reading (skipped) - " + line); } } if (line == null) { doProcessing = false; } ResIterator resIt = model.listSubjects(); while (resIt.hasNext()) { Resource res = resIt.next(); if (resources != null && !resources.contains(res.getURI())) { continue; } StmtIterator stmtIt = res.listProperties(); while (stmtIt.hasNext()) { Statement stmt = stmtIt.next(); Property p = stmt.getPredicate(); String pUri = p.getURI(); String pLocalName = p.getLocalName(); if (pLocalName.length() < INFOBOX_PROP_NAME_LEN_MIN || ((prefix != null) && !pUri.startsWith(prefix))) { continue; } String pName = null; if (useLocalname) { pName = pLocalName; } else { pName = pUri; } Matcher matcher = filterPattern.matcher(pName.toLowerCase()); if (!matcher.find() && pName.length() > 1) { if (!propSet.contains(pName)) { Integer value = map.get(pName); if (value == null) { map.put(pName, 1); } else { map.put(pName, ++value); } propSet.add(pName); } } } propSet.clear(); } String tmpFileName = String.format("%s.%03d", propertyFreqMapFile, segmentCnt++); CollectionTextWriter.writeMapStringInteger(map, tmpFileName, false); logger.info("write temp file - " + tmpFileName); } br.close(); Map<String,Integer> map = freqMapFileMerge(propertyFreqMapFile, segmentCnt, true); logger.info("total " + statementCnt + " statements processed"); return map; } public Map<String,Double> propertyImportanceScore(Map<String,Integer> propertyFreqDomainMap, Map<String,Integer> propertyFreqEntireMap, boolean normalize) { Map<String,Double> map = new HashMap<String,Double>(); Collection<Integer> values = null; int freqSumDomain = 0; int freqSumEntire = 0; double maxValue = 0.0; values = propertyFreqDomainMap.values(); for (Integer v : values) { freqSumDomain += v; } values = propertyFreqEntireMap.values(); for (Integer v : values) { freqSumEntire += v; } Set<Entry<String,Integer>> propertyFreqDomainSet = propertyFreqDomainMap.entrySet(); for (Entry<String,Integer> e : propertyFreqDomainSet) { String key = e.getKey(); Integer o1 = e.getValue(); Integer o2 = propertyFreqEntireMap.get(key); if (o2 == null) { logger.warn("'" + key + "' doesn't exist in the infobox value map " + propertyFreqEntireMap); } else if (o2 < o1) { logger.warn("the infobox value maps are not consistent about '" + key + "'"); } double value = ((double) o1 / (double) freqSumDomain) * Math.log((double)(freqSumEntire - freqSumDomain) / (double)(o2 - o1 + 1)); if (value > maxValue) { maxValue = value; } map.put(key, value); } if (normalize) { Set<String> keys = map.keySet(); for (String key : keys) { map.put(key, map.get(key) / maxValue); } } return map; } public Set<String> delegateProperties(Map<String,Double> propertyFreqMap, double paramDelegateProperty) throws IOException { ArrayList<Entry<String,Double>> entryList = new ArrayList<Entry<String,Double>>(propertyFreqMap.entrySet()); Collections.sort(entryList, new Comparator<Entry<String,Double>>() { public int compare(Entry<String, Double> o1, Entry<String, Double> o2) { if (o2.getValue() - o1.getValue() >= 0) { return 1; } else { return -1; } } }); int limit = (int)(entryList.size() * paramDelegateProperty); HashSet<String> set = new HashSet<String>(limit); for (int i = 0; i < limit; i++) { set.add(entryList.get(i).getKey()); } return set; } public Map<String,Integer> infoboxValuePropertyFreq(String ontologyFile, String propertyValueFreqFile, Set<String> delegateProperties, Set<String> stopwords, int minToken, int maxToken) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(ontologyFile), "UTF-8")); String line = null; int segmentCnt = 0; long literalCnt = 0; long statementCnt = 0; boolean doProcessing = true; while (doProcessing) { Model model = ModelFactory.createDefaultModel(); HashMap<String,Integer> map = new HashMap<String,Integer>(); int remainedStmtNum = STMT_SEGMENT_SIZE; while ((line = br.readLine()) != null && remainedStmtNum > 0) { try { statementCnt++; model.read(new StringReader(line), null, "N-TRIPLE"); remainedStmtNum--; } catch (Exception e) { logger.info("exception on reading (skipped) - " + line); } } if (line == null) { doProcessing = false; } StmtIterator iter = model.listStatements(); while (iter.hasNext()) { Statement stmt = iter.next(); if (delegateProperties != null && !delegateProperties.contains(stmt.getPredicate().getURI())) { continue; } RDFNode obj = stmt.getObject(); if (obj.isLiteral()) { String lang = stmt.getLanguage(); String propertyName = stmt.getPredicate().getLocalName(); literalCnt++; if (STMT_LANG.equals(lang) || lang.length() == 0) { String[] valueTokens = InfoboxValueTokenizer.tokenizeToValues(stmt.getString().toLowerCase()); for (String s : valueTokens) { s = s.trim(); if (s.length() < 3) continue; if (stopwords.contains(s)) continue; boolean isNumber = true; try { Double.parseDouble(s); } catch (NumberFormatException e) { isNumber = false; } if (isNumber) continue; String[] tokens = s.split(" "); if (tokens.length > maxToken || tokens.length < minToken) continue; WordPattern.normalize(tokens); StringBuilder sb = null; for (String t : tokens) { if (sb == null) { sb = new StringBuilder(); sb.append(t); } else { sb.append(' '); sb.append(t); } } String normValue = sb.toString(); if (normValue.matches("^\\p{Punct}+")) continue; String key = propertyName + ";" + normValue; Integer value = map.get(key); if (value == null) { map.put(key, 1); } else { map.put(key, ++value); } } } } } String tmpFileName = String.format("%s.%03d", propertyValueFreqFile, segmentCnt++); CollectionTextWriter.writeMapStringInteger(map, tmpFileName, false); logger.info("write temp file - " + tmpFileName); } br.close(); Map<String,Integer> map = freqMapFileMerge(propertyValueFreqFile, segmentCnt, true); logger.info("total " + statementCnt + " statements processed"); logger.info("total " + literalCnt + " literals observed"); return map; } public void infoboxValueToPropertyMap(String valuePropertyFreqMapFile, String valueToPropertyMapFile, Set<String> delegateProperties, boolean sort) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(valuePropertyFreqMapFile), "UTF-8")); String line = null; HashMap<String, Object[]> map = new HashMap<String, Object[]>(); Object[] propFreq = null; while ((line = br.readLine()) != null) { String[] tokens = line.trim().split("[\t;]+"); if (tokens.length != 3) { continue; } if (!delegateProperties.contains(DBPediaConstant.DBPEDIA_INFOBOX_PROP_URI_PREFIX + tokens[0])) { continue; } propFreq = map.get(tokens[1]); if (propFreq == null) { propFreq = new Object[2]; propFreq[0] = tokens[0]; propFreq[1] = Integer.parseInt(tokens[2]); map.put(tokens[1], propFreq); } else { Integer num = Integer.parseInt(tokens[2]); if (!((String)propFreq[0]).equals("name") && num > (Integer)propFreq[1]) { propFreq[0] = tokens[0]; propFreq[1] = num; } } } br.close(); for (String p : delegateProperties) { String propName = p.substring(p.lastIndexOf('/') + 1); propFreq = map.get(propName); if (propFreq == null) { propFreq = new Object[2]; propFreq[0] = propName; propFreq[1] = 1; map.put(propName, propFreq); } else { propFreq[0] = propName; propFreq[1] = 1; } } BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(valueToPropertyMapFile), "UTF-8")); Set<Entry<String,Object[]>> set = map.entrySet(); if (sort) { ArrayList<Entry<String,Object[]>> list = new ArrayList<Entry<String,Object[]>>(set); Collections.sort(list, new Comparator<Entry<String,Object[]>>() { public int compare(Entry<String, Object[]> o1, Entry<String, Object[]> o2) { return o1.getKey().compareTo(o2.getKey()); } }); for (Entry<String,Object[]> e : list) { bw.write(e.getKey()); bw.write('\t'); bw.write((String)e.getValue()[0]); bw.write('\n'); } } else { for (Entry<String,Object[]> e : set) { bw.write(e.getKey()); bw.write('\t'); bw.write((String)e.getValue()[0]); bw.write('\n'); } } bw.close(); } }