/**
*
*/
package com.maalaang.omtwitter.corpus;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.ResIterator;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
/**
* @author Sangwon Park
*
*/
public class TwitterQueryGenerator {

    /** URI of the rdfs:label property whose literal values are turned into queries. */
    private static final String RDFS_LABEL_URI = "http://www.w3.org/2000/01/rdf-schema#label";

    /** N-grams shorter than this many characters are discarded. */
    private static final int MIN_NGRAM_LENGTH = 5;

    // Patterns are compiled once instead of on every String.matches/replaceAll call.
    private static final Pattern BRACKETS_AND_COMMAS = Pattern.compile("\\(|\\)|,");
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    private static final Pattern NON_WORD_ONLY = Pattern.compile("[\\W]+");
    private static final Pattern DIGITS_OR_NON_WORD_ONLY = Pattern.compile("[0-9\\W]+");
    private static final Pattern TRAILING_DOTS = Pattern.compile("\\.+$");
    private static final Pattern PUNCT_OR_WHITESPACE = Pattern.compile("[\\p{Punct}\\s]+");

    /**
     * Builds search queries from the rdfs:label literals of every subject in the given model.
     *
     * <p>Each label in the requested language is lower-cased, stripped of parentheses and
     * commas, and split on whitespace. For every start position a candidate n-gram is formed
     * from that token through the end of the label (punctuation-only tokens are left out).
     * Candidates that are stopwords, consist only of digits/non-word characters, or are
     * shorter than five characters are dropped. Each surviving n-gram yields two keys: the
     * n-gram wrapped in double quotes, and a hashtag ({@code #} followed by the n-gram with
     * punctuation and whitespace removed). Both keys map to the set of subject URIs whose
     * label produced them.
     *
     * <p>The model is closed before this method returns, including when it fails.
     *
     * @param domainOntologyModel model to read labels from; closed by this method
     * @param stopwords n-grams to exclude; {@code null} is treated as "no stopwords"
     * @param lang language tag a label must carry to be used; if {@code null}, no label
     *        matches and an empty map is returned
     * @param minToken for labels with more than one token, the minimum number of
     *        whitespace-separated tokens (punctuation-only tokens included) that must remain
     *        from the start position to the end of the label; single-token labels are
     *        considered regardless of this value
     * @return map from query string (quoted phrase or hashtag) to the URIs of the subjects
     *         whose labels produced it; never {@code null}
     * @throws IOException not thrown by the code in this method; kept in the signature so
     *         existing callers keep compiling
     */
    public static Map<String,Set<String>> generateQueries(Model domainOntologyModel, Set<String> stopwords, String lang, int minToken) throws IOException {
        Map<String,Set<String>> map = new HashMap<String,Set<String>>();
        try {
            Property labelProperty = domainOntologyModel.getProperty(RDFS_LABEL_URI);
            ResIterator resIt = domainOntologyModel.listSubjects();
            while (resIt.hasNext()) {
                Resource res = resIt.next();
                String uri = res.getURI();
                StmtIterator stmtIter = res.listProperties(labelProperty);
                while (stmtIter.hasNext()) {
                    Statement stmt = stmtIter.next();
                    if (lang == null || !lang.equals(stmt.getLanguage())) {
                        continue;
                    }
                    addQueriesForLabel(stmt.getLiteral().getString(), uri, stopwords, minToken, map);
                }
            }
        } finally {
            // Closing in finally so a failure while reading labels does not leave the model open.
            domainOntologyModel.close();
        }
        return map;
    }

    /** Derives the quoted-phrase and hashtag queries for one label and records them under the given URI. */
    private static void addQueriesForLabel(String rawLabel, String uri, Set<String> stopwords, int minToken, Map<String,Set<String>> map) {
        // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
        String lowered = rawLabel.toLowerCase(Locale.ROOT);
        // Trim so that leading whitespace does not produce an empty first token.
        String label = BRACKETS_AND_COMMAS.matcher(lowered).replaceAll("").trim();
        if (label.length() == 0) {
            return;
        }
        String[] tokens = WHITESPACE.split(label);
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length != 1 && tokens.length - i < minToken) {
                break;
            }
            // An n-gram must start at a word-bearing token; previously a punctuation-only
            // start token left the n-gram null (NullPointerException or a "null ..." key).
            if (NON_WORD_ONLY.matcher(tokens[i]).matches()) {
                continue;
            }
            String ngram = joinWordTokens(tokens, i);
            if (stopwords != null && stopwords.contains(ngram)) {
                continue;
            }
            if (DIGITS_OR_NON_WORD_ONLY.matcher(ngram).matches()) {
                continue;
            }
            if (ngram.length() < MIN_NGRAM_LENGTH) {
                continue;
            }
            ngram = TRAILING_DOTS.matcher(ngram).replaceFirst("").replace("-", "");
            addUri(map, "\"" + ngram + "\"", uri);
            String hashtagBody = PUNCT_OR_WHITESPACE.matcher(ngram).replaceAll("");
            // An n-gram made only of underscores would otherwise yield a bare "#" key.
            if (hashtagBody.length() > 0) {
                addUri(map, "#" + hashtagBody, uri);
            }
        }
    }

    /** Joins tokens[start..] with single spaces, leaving out punctuation-only tokens; tokens[start] must be word-bearing. */
    private static String joinWordTokens(String[] tokens, int start) {
        StringBuilder ngram = new StringBuilder(tokens[start]);
        for (int j = start + 1; j < tokens.length; j++) {
            if (NON_WORD_ONLY.matcher(tokens[j]).matches()) {
                continue;
            }
            ngram.append(' ').append(tokens[j]);
        }
        return ngram.toString();
    }

    /** Adds uri to the set stored under key, creating the set on first use. */
    private static void addUri(Map<String,Set<String>> map, String key, String uri) {
        Set<String> uris = map.get(key);
        if (uris == null) {
            uris = new HashSet<String>();
            map.put(key, uris);
        }
        uris.add(uri);
    }
}