// Copyright 2014 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.tokenize.rules; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.regex.Pattern; public class EnglishRuleProvider extends RuleProvider { @Override public Collection<Rule> getTokRules() { List<Rule> rules = new LinkedList<Rule>(); rules.add(new Rule(Pattern.compile("(``)") ,"\"")); rules.add(new Rule(Pattern.compile("(`)") ,"'")); rules.add(new Rule(Pattern.compile("('')") ,"\"")); rules.add(new Rule(Pattern.compile("(--)") ,"-")); rules.add(new Rule(Pattern.compile("(\\.\\.\\.)") ,"…")); rules.add(new Rule(Pattern.compile("(\\. \\.)$") ,".")); return rules; } @Override public Collection<Rule> getUnTokRules() { List<Rule> rules = new LinkedList<Rule>(); rules.add(new Rule(Pattern.compile("(\\()") ,"-LRB-")); rules.add(new Rule(Pattern.compile("(\\))") ,"-RRB-")); rules.add(new Rule(Pattern.compile("(\\{)") ,"-LCB-")); rules.add(new Rule(Pattern.compile("(\\})") ,"-RCB-")); rules.add(new Rule(Pattern.compile("(\\[)") ,"-LSB-")); rules.add(new Rule(Pattern.compile("(\\])") ,"-RSB-")); rules.add(new Rule(Pattern.compile("(¼)") ,"1/4")); rules.add(new Rule(Pattern.compile("(\\. ?\\.)$") ,".")); return rules; } }