/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.topicmaps.classify;

import java.util.List;
import java.util.ArrayList;
import java.util.regex.Pattern;

/**
 * INTERNAL: A term analyzer that recognizes certain kinds of terms by
 * matching their preferred name against regular expressions, and
 * rescales the score of every term that matches. Two rules are
 * registered at present: one for email addresses and one for
 * <tt>http://</tt> URLs. Both multiply the term's score by 0.002,
 * i.e. they score such terms down dramatically.
 */
public class RegexpTermAnalyzer implements TermAnalyzerIF {

  /** The rules tried, in registration order, against every term. */
  private List<Rule> rules;

  /**
   * Creates the analyzer and registers the built-in email-address and
   * http-URL rules.
   */
  public RegexpTermAnalyzer() {
    List<Rule> builtin = new ArrayList<Rule>();
    builtin.add(new Rule("email address",
                         "[-A-Za-z.0-9]+@([-A-Za-z.0-9]+\\.)+[A-Za-z]+",
                         0.002d));
    builtin.add(new Rule("http URL",
                         "http://[-.A-Za-z?+&=0-9#/]+",
                         0.002d));
    this.rules = builtin;
  }

  /**
   * Does nothing; this analyzer needs no set-up before terms are
   * analyzed.
   *
   * @param tdb the term database about to be analyzed (unused)
   */
  public void startAnalysis(TermDatabase tdb) {
  }

  /**
   * Tries every registered rule against the term. Each rule that
   * matches multiplies the term's score by that rule's factor and
   * passes along a reason string naming the rule.
   *
   * @param term the term to test and, on a match, rescore
   */
  public void analyzeTerm(Term term) {
    for (Rule candidate : rules) {
      if (!candidate.matches(term)) {
        continue;
      }
      String reason = "matched " + candidate.getName() + " rule";
      term.multiplyScore(candidate.getFactor(), reason);
    }
  }

  /**
   * Does nothing; this analyzer has no work to do after the terms have
   * been analyzed.
   */
  public void endAnalysis() {
  }

  // --- Internal

  /**
   * A named regular expression paired with the factor by which the
   * score of a matching term is multiplied.
   */
  static class Rule {
    private final String name;
    private final Pattern pattern;
    private final double factor;

    /**
     * @param name    label for the rule, used in the reason string
     * @param pattern regular expression source; compiled once here
     * @param factor  multiplier applied to the score of matching terms
     */
    public Rule(String name, String pattern, double factor) {
      this.name = name;
      this.factor = factor;
      this.pattern = Pattern.compile(pattern);
    }

    /**
     * Tests the term's preferred name against this rule's pattern. The
     * pattern has to match the entire name, not merely a part of it.
     *
     * @param term the term whose preferred name is tested
     * @return true if the whole preferred name matches the pattern
     */
    public boolean matches(Term term) {
      String preferred = term.getPreferredName();
      return pattern.matcher(preferred).matches();
    }

    /** @return the label given to this rule */
    public String getName() {
      return name;
    }

    /** @return the multiplier applied to the score of matching terms */
    public double getFactor() {
      return factor;
    }
  }
}