/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.tools.checker.checkers;

import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.StringList;

import org.cogroo.entities.Mistake;
import org.cogroo.entities.Sentence;
import org.cogroo.tools.checker.AbstractTypedChecker;
import org.cogroo.tools.checker.JavaRuleDefinition;
import org.cogroo.tools.checker.RuleDefinition;
import org.cogroo.tools.checker.rules.model.Example;

/**
 * Checker that looks for spacing mistakes around words and punctuation by
 * applying a fixed set of regular expressions to the sentence text.
 * <p>
 * Four rules are registered: extra spaces between words, extra space before a
 * closing symbol, extra space after an opening symbol, and missing space after
 * a punctuation symbol. For the last rule, matches that belong to an e-mail, a
 * number, a URL, initials or a dictionary entry are not reported.
 */
public class SpaceChecker extends AbstractTypedChecker {

  private static final String ID_PREFIX = "space:";

  private static final String[] SUGGESTION_ONE_SPACE = { " " };
  private static final String[] SUGGESTION_NO_SPACE = { "" };

  private static final String CATEGORY = "Erros mecânicos";
  private static final String GROUP = "Uso do espaço";

  // Two or more whitespace characters between words, e.g. "abc  abc"
  static final String EXTRA_BETWEEN_WORDS_ID = ID_PREFIX
      + "EXTRA_BETWEEN_WORDS";
  private static final Pattern EXTRA_BETWEEN_WORDS = Pattern
      .compile("\\S(\\s{2,})\\S");

  // Whitespace before a closing symbol, e.g. "abc . abc"
  static final String EXTRA_BEFORE_RIGHT_PUNCT_ID = ID_PREFIX
      + "EXTRA_BEFORE_RIGHT_PUNCT";
  private static final Pattern EXTRA_BEFORE_RIGHT_PUNCT = Pattern
      .compile("\\S(\\s{1,})[.?!;,:”)}\\]]");

  // Whitespace after an opening symbol, e.g. "abc ( abc"
  static final String EXTRA_AFTER_LEFT_PUNCT_ID = ID_PREFIX
      + "EXTRA_AFTER_LEFT_PUNCT";
  private static final Pattern EXTRA_AFTER_LEFT_PUNCT = Pattern
      .compile("[\\[{(](\\s{1,})");

  // No whitespace after punctuation, e.g. "abc...abc"
  static final String MISSING_SPACE_AFTER_PUNCT_ID = ID_PREFIX
      + "MISSING_SPACE_AFTER_PUNCT";
  private static final Pattern MISSING_SPACE_AFTER_PUNCT = Pattern
      .compile("([.”?!;,:)}\\]]+)[^\\s.”'\",;:!?)}\\]]");

  // e-mail%10_+A@linux.ime.usp.br
  private static final Pattern EMAIL = Pattern
      .compile("([\\S]+@([\\w]+\\.)+[\\w]+)");

  // R$ 4,00 || 1.000.000 || Chapter 1.2.4
  private static final Pattern NUMBER = Pattern.compile("((\\d+[,.])+\\d+)");

  // Ele, J.B.C. morreu em 154 a.C.!
  private static final Pattern INITIALS = Pattern
      .compile("((\\p{L}\\.){2,})(?!\\p{L})");

  // A word that starts with "http" or "www"
  private static final Pattern URL = Pattern.compile("(^(http|www)[\\S]+)");

  /** Dictionary consulted to recognize known abbreviations. */
  private final Dictionary dic;

  /**
   * Creates the checker and registers the definitions of its four rules.
   *
   * @param dic
   *          dictionary used to decide whether a word ending in a period is a
   *          known abbreviation
   */
  public SpaceChecker(Dictionary dic) {
    this.dic = dic;

    add(
        createRuleDefinition(
            EXTRA_BETWEEN_WORDS_ID,
            EXTRA_BETWEEN_WORDS,
            "Excesso de espaços entre as palavras. Entre palavras deve haver apenas um espaço.",
            "Excesso de espaços entre as palavras.",
            // The incorrect example must contain the doubled space the rule
            // is about; otherwise it is identical to the corrected text.
            createExample("Este  programa é bom.", "Este programa é bom.")))
        .add(createRuleDefinition(
            EXTRA_BEFORE_RIGHT_PUNCT_ID,
            EXTRA_BEFORE_RIGHT_PUNCT,
            "Excesso de espaço antes de símbolo. O símbolo deve ser mantido junto à palavra que o precede.",
            "Excesso de espaço antes de símbolo.",
            createExample("Este programa é bom .", "Este programa é bom.")))
        .add(createRuleDefinition(
            EXTRA_AFTER_LEFT_PUNCT_ID,
            EXTRA_AFTER_LEFT_PUNCT,
            "Excesso de espaço depois de símbolo. O símbolo deve ser mantido junto à palavra que o sucede.",
            "Excesso de espaço depois de símbolo.",
            createExample("Este programa é ( era) bom.",
                "Este programa é (era) bom.")))
        .add(createRuleDefinition(
            MISSING_SPACE_AFTER_PUNCT_ID,
            MISSING_SPACE_AFTER_PUNCT,
            "Falta espaço entre símbolo e palavra à direita. Deve haver um espaço entre o símbolo e a palavra que o sucede.",
            "Falta espaço entre símbolo e palavra à direita.",
            createExample(
                "Este programa é era bom.Mas agora é melhor.",
                "Este programa é era bom. Mas agora é melhor.")));
  }

  /**
   * Builds a rule definition whose description quotes the regular expression
   * and whose example list holds the single given example.
   *
   * @param id
   *          rule identifier
   * @param regex
   *          pattern the rule applies
   * @param message
   *          long message of the rule
   * @param shortMessage
   *          short message of the rule
   * @param example
   *          example illustrating the rule
   * @return the new rule definition
   */
  private RuleDefinition createRuleDefinition(String id, Pattern regex,
      String message, String shortMessage, Example example) {
    String description = "Aplica a expressão regular " + regex.pattern()
        + " na sentença.";
    List<Example> examples = new LinkedList<Example>();
    examples.add(example);
    return new JavaRuleDefinition(id, CATEGORY, GROUP, description, message,
        shortMessage, examples);
  }

  /**
   * @return the prefix shared by every rule identifier of this checker
   */
  public String getIdPrefix() {
    return ID_PREFIX;
  }

  /**
   * Applies every enabled spacing rule to the sentence.
   *
   * @param sentence
   *          the sentence to check; its span start is added to every match
   *          position
   * @return the mistakes found, possibly empty
   */
  public List<Mistake> check(Sentence sentence) {
    String text = sentence.getSentence();
    List<Mistake> mistakes = new LinkedList<Mistake>();
    int offset = sentence.getSpan().getStart();

    collectMistakes(EXTRA_BETWEEN_WORDS_ID, EXTRA_BETWEEN_WORDS,
        SUGGESTION_ONE_SPACE, text, offset, mistakes);
    collectMistakes(EXTRA_BEFORE_RIGHT_PUNCT_ID, EXTRA_BEFORE_RIGHT_PUNCT,
        SUGGESTION_NO_SPACE, text, offset, mistakes);
    collectMistakes(EXTRA_AFTER_LEFT_PUNCT_ID, EXTRA_AFTER_LEFT_PUNCT,
        SUGGESTION_NO_SPACE, text, offset, mistakes);

    if (isCheckRule(MISSING_SPACE_AFTER_PUNCT_ID)) {
      Matcher m = MISSING_SPACE_AFTER_PUNCT.matcher(text);
      while (m.find()) {
        String error = getsSupposedError(text, m.start(1));
        boolean initials = getsSupposedAbbreviation(text, m.start(1));
        // Punctuation inside e-mails, numbers, URLs, initials and known
        // abbreviations legitimately has no space after it.
        if (!isEmail(error) && !isNumber(error) && !isURL(error)
            && !initials) {
          int start = m.start(1) + offset;
          int end = m.end(1) + offset;
          String[] suggestion = { m.group(1) + " " };
          mistakes.add(createMistake(MISSING_SPACE_AFTER_PUNCT_ID,
              suggestion, start, end, text));
        }
      }
    }

    return mistakes;
  }

  /**
   * Adds one mistake per match of the pattern's first group, provided the
   * rule is enabled.
   */
  private void collectMistakes(String ruleId, Pattern pattern,
      String[] suggestions, String text, int offset, List<Mistake> mistakes) {
    if (!isCheckRule(ruleId)) {
      return;
    }
    Matcher m = pattern.matcher(text);
    while (m.find()) {
      mistakes.add(createMistake(ruleId, suggestions, m.start(1) + offset,
          m.end(1) + offset, text));
    }
  }

  /** Tells whether the character is an opening quote or bracket. */
  private boolean isOpenBracket(char c) {
    switch (c) {
    case '“':
    case '(':
    case '[':
    case '{':
      return true;
    default:
      return false;
    }
  }

  /** Tells whether the character is a closing symbol or punctuation mark. */
  private boolean isEnding(char c) {
    switch (c) {
    case '”':
    case ')':
    case ']':
    case '}':
    case '.':
    case ',':
    case ';':
    case '!':
    case '?':
      return true;
    default:
      return false;
    }
  }

  /** Tells whether the character closes the word being extracted. */
  private boolean isWordTerminator(char c) {
    switch (c) {
    case ' ':
    case '!':
    case '?':
    case ',':
    case ';':
    case ')':
    case ']':
    case '}':
    case '”':
    case '\n':
      return true;
    default:
      return false;
    }
  }

  /**
   * Walks backwards from the position until a whitespace or an opening
   * bracket is found and returns the index right after it (0 when none is
   * found).
   */
  private int findWordStart(String text, int position) {
    int i = position;
    while (i >= 0 && !Character.isWhitespace(text.charAt(i))
        && !isOpenBracket(text.charAt(i))) {
      i--;
    }
    return i + 1;
  }

  /**
   * Analyze a sentence and gets the word which contains the position of the
   * error in the parameter
   *
   * @param text
   *          the entire sentence to be analyzed
   * @param position
   *          where in the sentence the supposed error was found
   * @return the word which contains the supposed error
   */
  private String getsSupposedError(String text, int position) {
    int wordStart = findWordStart(text, position);

    for (int i = position + 1; i < text.length(); i++) {
      char c = text.charAt(i);
      if (isWordTerminator(c)) {
        return text.substring(wordStart, i);
      }
      if (c == '.') {
        // A period ends the word when followed by whitespace or when it is
        // the last character of the text; otherwise it is part of the word.
        // The end-of-text test also avoids reading past the end of the text.
        boolean lastChar = i + 1 == text.length();
        if (lastChar || Character.isWhitespace(text.charAt(i + 1))) {
          return text.substring(wordStart, i);
        }
      }
    }

    // No terminator found: the word runs to the end of the text. Returning
    // only the word keeps the e-mail/number filters from matching unrelated
    // tokens elsewhere in the sentence.
    return text.substring(wordStart);
  }

  /**
   * Analyze a sentence and gets the word which contains the position of the
   * error in the parameter and tells if it is an initial or if the
   * abbreviation dictionary contains it or not.
   * <p>
   * The candidate runs from the start of the word up to and including the
   * first period, after the position, that is followed by whitespace, by an
   * ending symbol, or by the end of the text. Note that the forward scan does
   * not stop at whitespace.
   *
   * @param text
   *          the entire sentence to be analyzed
   * @param position
   *          where in the sentence the supposed error was found
   * @return true if the error is actually an initial, and false if not.
   */
  private boolean getsSupposedAbbreviation(String text, int position) {
    int wordStart = findWordStart(text, position);
    String word = null;

    for (int i = position + 1; i < text.length() && word == null; i++) {
      if (text.charAt(i) != '.') {
        continue;
      }
      // A period that closes the text counts as the end of the candidate, so
      // an abbreviation at the very end of the sentence is still recognized.
      boolean lastChar = i + 1 == text.length();
      if (lastChar || Character.isWhitespace(text.charAt(i + 1))
          || isEnding(text.charAt(i + 1))) {
        word = text.substring(wordStart, i + 1);
      }
    }

    if (word == null) {
      return false;
    }
    return INITIALS.matcher(word).find()
        || this.dic.contains(new StringList(word));
  }

  /**
   * Verifies whether the String is or isn't an e-mail.
   *
   * @param email
   *          supposed e-mail in the sentence
   * @return true if the e-mail pattern is found in the String, otherwise
   *         returns false.
   */
  private boolean isEmail(String email) {
    return EMAIL.matcher(email).find();
  }

  /**
   * Verifies whether the String is or isn't some sort of number combination
   *
   * @param number
   *          String to be verified whether it is a number or not.
   * @return true if the number pattern is found in the String and false
   *         otherwise.
   */
  private boolean isNumber(String number) {
    return NUMBER.matcher(number).find();
  }

  /**
   * Verifies whether the String starts like a URL.
   *
   * @param url
   *          String to be verified
   * @return true if the String starts with "http" or "www" followed by at
   *         least one non-whitespace character.
   */
  private boolean isURL(String url) {
    return URL.matcher(url).find();
  }

  /**
   * @return the priority value of this checker, fixed at 2000
   */
  public int getPriority() {
    return 2000;
  }
}