package org.wikipedia.miner.util;

import gnu.trove.set.hash.TIntHashSet;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.wikipedia.miner.comparison.ArticleComparer;
import org.wikipedia.miner.comparison.ArticleComparer.DataDependency;
import org.wikipedia.miner.db.WDatabase.CachePriority;
import org.wikipedia.miner.db.WDatabase.DatabaseType;
import org.wikipedia.miner.util.text.TextProcessor;
import org.xml.sax.SAXException;

/**
 * Configuration for a single Wikipedia instance: where its database and data
 * directories live, which databases should be cached in memory, and the
 * thresholds, stopwords, and trained models used by the annotation services.
 */
public class WikipediaConfiguration {

    // XML element names recognised by initFromXml; unrecognised elements
    // resolve to 'unknown' and are logged and ignored.
    private enum ParamName {
        langCode, databaseDirectory, dataDirectory, defaultTextProcessor,
        minLinksIn, minSenseProbability, minLinkProbability, articlesOfInterest,
        databaseToCache, stopwordFile, articleComparisonDependency,
        articleComparisonModel, labelDisambiguationModel, labelComparisonModel,
        comparisonSnippetModel, topicDisambiguationModel, linkDetectionModel,
        tokenModel, sentenceModel, unknown
    }

    private String langCode;
    private File dbDirectory;
    private File dataDirectory;

    private TextProcessor defaultTextProcessor = null;

    private HashMap<DatabaseType, CachePriority> databasesToCache = new HashMap<DatabaseType, CachePriority>();
    private HashSet<String> stopwords = new HashSet<String>();

    private EnumSet<DataDependency> articleComparisonDependencies;

    private File articleComparisonModel;
    private File labelDisambiguationModel;
    private File labelComparisonModel;
    private File comparisonSnippetModel;
    private File topicDisambiguationModel;
    private File linkDetectionModel;

    private Tokenizer tokenizer;
    private SentenceDetector sentenceDetector;

    private int minLinksIn = 0;
    private float minLinkProbability = 0;
    private float minSenseProbability = 0;

    private TIntHashSet articlesOfInterest;

    public WikipediaConfiguration(Element xml) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        initFromXml(xml);
    }

    public WikipediaConfiguration(File configFile) throws ParserConfigurationException, SAXException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        DocumentBuilder db = dbf.newDocumentBuilder();
        Document doc = db.parse(configFile);
        doc.getDocumentElement().normalize();

        initFromXml(doc.getDocumentElement());
    }

    public WikipediaConfiguration(String langCode, File dbDirectory) {
        this.langCode = langCode;
        this.dbDirectory = dbDirectory;
    }

    public String getLangCode() {
        return langCode;
    }

    public File getDatabaseDirectory() {
        return dbDirectory;
    }

    public File getDataDirectory() {
        return dataDirectory;
    }

    public void setDataDirectory(File f) {
        dataDirectory = f;
    }
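    /*
     * Usage sketch (file paths are hypothetical): a configuration can be
     * loaded from an XML file, or assembled programmatically from a language
     * code and a database directory.
     *
     *   WikipediaConfiguration conf =
     *       new WikipediaConfiguration(new File("configs/en.xml"));
     *
     *   WikipediaConfiguration conf2 =
     *       new WikipediaConfiguration("en", new File("/data/wikipedia/db"));
     *   conf2.setMinSenseProbability(0.02f);
     */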
    public void setDefaultTextProcessor(TextProcessor tp) {
        defaultTextProcessor = tp;
    }

    public TextProcessor getDefaultTextProcessor() {
        return defaultTextProcessor;
    }

    public void addDatabaseToCache(DatabaseType type) {
        databasesToCache.put(type, CachePriority.space);
    }

    public void addDatabaseToCache(DatabaseType type, CachePriority priority) {
        Logger.getLogger(WikipediaConfiguration.class).info("Will cache " + type + " for " + priority);
        databasesToCache.put(type, priority);
    }

    public void clearDatabasesToCache() {
        databasesToCache.clear();
    }

    public Set<DatabaseType> getDatabasesToCache() {
        return databasesToCache.keySet();
    }

    public CachePriority getCachePriority(DatabaseType databaseType) {
        return databasesToCache.get(databaseType);
    }

    public int getMinLinksIn() {
        return minLinksIn;
    }

    public void setMinLinksIn(int minLinksIn) {
        this.minLinksIn = minLinksIn;
    }

    public float getMinLinkProbability() {
        return minLinkProbability;
    }

    public void setMinLinkProbability(float minLinkProbability) {
        this.minLinkProbability = minLinkProbability;
    }

    public float getMinSenseProbability() {
        return minSenseProbability;
    }

    public void setMinSenseProbability(float minSenseProbability) {
        this.minSenseProbability = minSenseProbability;
    }

    public TIntHashSet getArticlesOfInterest() {
        return articlesOfInterest;
    }

    public void setArticlesOfInterest(TIntHashSet articlesOfInterest) {
        this.articlesOfInterest = articlesOfInterest;
    }

    public boolean isStopword(String stopword) {
        return stopwords.contains(stopword.trim());
    }

    public void setStopwords(HashSet<String> stopwords) {
        this.stopwords = stopwords;
    }

    // Reads one stopword per line.
    public void setStopwords(File stopwordFile) throws IOException {
        stopwords = new HashSet<String>();

        BufferedReader input = new BufferedReader(new FileReader(stopwordFile));
        try {
            String line;
            while ((line = input.readLine()) != null)
                stopwords.add(line.trim());
        } finally {
            input.close();
        }
    }
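    /*
     * Example (a sketch): cache the link databases in memory before opening
     * the Wikipedia instance. The one-argument overload defaults to
     * CachePriority.space.
     *
     *   conf.addDatabaseToCache(DatabaseType.pageLinksIn);
     *   conf.addDatabaseToCache(DatabaseType.pageLinkCounts);
     */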
    public EnumSet<DataDependency> getArticleComparisonDependancies() {
        return articleComparisonDependencies;
    }

    public void setArticleComparisonDependancies(EnumSet<DataDependency> dependencies) {
        articleComparisonDependencies = dependencies;
    }

    public File getArticleComparisonModel() {
        return articleComparisonModel;
    }

    public void setArticleComparisonModel(File model) {
        articleComparisonModel = model;
    }

    public File getLabelDisambiguationModel() {
        return labelDisambiguationModel;
    }

    public void setLabelDisambiguationModel(File model) {
        labelDisambiguationModel = model;
    }

    public File getLabelComparisonModel() {
        return labelComparisonModel;
    }

    public void setLabelComparisonModel(File model) {
        labelComparisonModel = model;
    }

    public File getComparisonSnippetModel() {
        return comparisonSnippetModel;
    }

    public void setComparisonSnippetModel(File model) {
        comparisonSnippetModel = model;
    }

    public File getLinkDetectionModel() {
        return linkDetectionModel;
    }

    public void setLinkDetectionModel(File model) {
        linkDetectionModel = model;
    }

    public File getTopicDisambiguationModel() {
        return topicDisambiguationModel;
    }

    public void setTopicDisambiguationModel(File model) {
        topicDisambiguationModel = model;
    }

    // Falls back to OpenNLP's SimpleTokenizer when no model has been set.
    public Tokenizer getTokenizer() {
        if (tokenizer == null)
            tokenizer = SimpleTokenizer.INSTANCE;

        return tokenizer;
    }

    public void setTokenizer(Tokenizer t) {
        tokenizer = t;
    }

    public void setTokenizer(File modelFile) throws IOException {
        InputStream modelStream = new FileInputStream(modelFile);
        try {
            tokenizer = new TokenizerME(new TokenizerModel(modelStream));
        } finally {
            modelStream.close();
        }
    }

    public SentenceDetector getSentenceDetector() {
        return sentenceDetector;
    }

    public void setSentenceDetector(SentenceDetector sd) {
        sentenceDetector = sd;
    }

    public void setSentenceDetector(File modelFile) throws IOException {
        InputStream modelStream = new FileInputStream(modelFile);
        try {
            sentenceDetector = new SentenceDetectorME(new SentenceModel(modelStream));
        } finally {
            modelStream.close();
        }
    }

    // Recommends the relatedness dependencies that can be served from the
    // configured caches, defaulting to pageLinksIn if neither link direction
    // is cached.
    public EnumSet<DataDependency> getReccommendedRelatednessDependancies() {

        ArrayList<DataDependency> dependencies = new ArrayList<DataDependency>();
        boolean valid = false;

        if (databasesToCache.containsKey(DatabaseType.pageLinksIn)) {
            dependencies.add(DataDependency.pageLinksIn);
            valid = true;
        }

        if (databasesToCache.containsKey(DatabaseType.pageLinksOut)) {
            dependencies.add(DataDependency.pageLinksOut);
            valid = true;
        }

        if (databasesToCache.containsKey(DatabaseType.pageLinkCounts))
            dependencies.add(DataDependency.linkCounts);

        if (!valid)
            dependencies.add(DataDependency.pageLinksIn);

        return EnumSet.copyOf(dependencies);
    }
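    /*
     * Example (a sketch; the model file names are hypothetical): attach
     * trained OpenNLP models rather than relying on the SimpleTokenizer
     * fallback.
     *
     *   conf.setTokenizer(new File("models/en-token.bin"));
     *   conf.setSentenceDetector(new File("models/en-sent.bin"));
     */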
    @SuppressWarnings("rawtypes")
    private void initFromXml(Element xml) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {

        ArrayList<ArticleComparer.DataDependency> artCompDependencies = new ArrayList<ArticleComparer.DataDependency>();

        NodeList children = xml.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {

            Node xmlChild = children.item(i);
            if (xmlChild.getNodeType() != Node.ELEMENT_NODE)
                continue;

            Element xmlParam = (Element) xmlChild;

            String paramName = xmlParam.getNodeName();
            String paramValue = getParamValue(xmlParam);

            if (paramValue == null)
                continue;

            switch (resolveParamName(paramName)) {
            case langCode:
                this.langCode = paramValue;
                break;
            case databaseDirectory:
                this.dbDirectory = new File(paramValue);
                break;
            case dataDirectory:
                this.dataDirectory = new File(paramValue);
                break;
            case defaultTextProcessor:
                Class tpClass = Class.forName(paramValue);
                this.defaultTextProcessor = (TextProcessor) tpClass.newInstance();
                break;
            case minLinksIn:
                this.minLinksIn = Integer.valueOf(paramValue);
                break;
            case minSenseProbability:
                this.minSenseProbability = Float.valueOf(paramValue);
                break;
            case minLinkProbability:
                this.minLinkProbability = Float.valueOf(paramValue);
                break;
            case articlesOfInterest:
                this.articlesOfInterest = gatherArticles(new File(paramValue));
                break;
            case databaseToCache:
                if (xmlParam.hasAttribute("priority"))
                    addDatabaseToCache(DatabaseType.valueOf(paramValue), CachePriority.valueOf(xmlParam.getAttribute("priority")));
                else
                    addDatabaseToCache(DatabaseType.valueOf(paramValue));
                break;
            case stopwordFile:
                this.setStopwords(new File(paramValue));
                break;
            case articleComparisonDependency:
                artCompDependencies.add(ArticleComparer.DataDependency.valueOf(paramValue));
                break;
            case articleComparisonModel:
                articleComparisonModel = new File(paramValue);
                break;
            case labelDisambiguationModel:
                labelDisambiguationModel = new File(paramValue);
                break;
            case labelComparisonModel:
                labelComparisonModel = new File(paramValue);
                break;
            case comparisonSnippetModel:
                comparisonSnippetModel = new File(paramValue);
                break;
            case topicDisambiguationModel:
                topicDisambiguationModel = new File(paramValue);
                break;
            case linkDetectionModel:
                this.linkDetectionModel = new File(paramValue);
                break;
            case tokenModel:
                this.setTokenizer(new File(paramValue));
                break;
            case sentenceModel:
                this.setSentenceDetector(new File(paramValue));
                break;
            default:
                Logger.getLogger(WikipediaConfiguration.class).warn("Ignoring unknown parameter: '" + paramName + "'");
            }
        }

        if (!artCompDependencies.isEmpty())
            articleComparisonDependencies = EnumSet.copyOf(artCompDependencies);

        //TODO: throw fit if mandatory params (langCode, dbDirectory) are missing.
    }

    // Returns the trimmed text content of the element, or null if it has none.
    private String getParamValue(Element xmlParam) {

        Node nodeContent = xmlParam.getChildNodes().item(0);

        if (nodeContent == null)
            return null;

        if (nodeContent.getNodeType() != Node.TEXT_NODE)
            return null;

        String content = nodeContent.getTextContent().trim();

        if (content.length() == 0)
            return null;

        return content;
    }

    private ParamName resolveParamName(String name) {
        try {
            return ParamName.valueOf(name.trim());
        } catch (Exception e) {
            return ParamName.unknown;
        }
    }

    // Expects a tab-separated file whose first column is an article id.
    private TIntHashSet gatherArticles(File file) throws NumberFormatException, IOException {

        TIntHashSet artIds = new TIntHashSet();

        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] values = line.split("\t");
                artIds.add(Integer.parseInt(values[0].trim()));
            }
        } finally {
            reader.close();
        }

        return artIds;
    }
}
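/*
 * A sketch of the XML that initFromXml expects. The root element name is not
 * checked, and all paths and values below are illustrative assumptions. Child
 * element names must match the ParamName constants above, and databaseToCache
 * takes an optional "priority" attribute naming a CachePriority constant:
 *
 *   <wikipediaConfiguration>
 *     <langCode>en</langCode>
 *     <databaseDirectory>/data/wikipedia/db</databaseDirectory>
 *     <dataDirectory>/data/wikipedia/data</dataDirectory>
 *     <minSenseProbability>0.02</minSenseProbability>
 *     <databaseToCache priority="space">pageLinksIn</databaseToCache>
 *     <stopwordFile>/data/wikipedia/stopwords.txt</stopwordFile>
 *   </wikipediaConfiguration>
 */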