/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.utilities; import java.util.ArrayList; import java.util.List; import org.erasmusmc.collections.Pair; public class ShortFormLongFormMatcher { private int lastEOS; public List<Pair<String, String>> extractSFLFmatches(String text) { List<Pair<String, String>> result = new ArrayList<Pair<String, String>>(); // Detect parenthesis int startIndex = -1; lastEOS = 0; for (int i = 0; i < text.length(); i++) { char ch = text.charAt(i); if (ch == '.' | ch == ',' | ch == '!' | ch == '?' | (int) ch == 10) { startIndex = -1; lastEOS = i; } else if (ch == '(') { startIndex = i + 1; } else if (ch == ')') { if (startIndex != -1) { Pair<String, String> pair = processParenthesizedString(text, startIndex, i); if (pair != null) result.add(pair); startIndex = -1; } } } return result; } private Pair<String, String> processParenthesizedString(String text, int startIndex, int endIndex) { String string = text.substring(startIndex, endIndex); String longForm; String shortForm; boolean shortFormKnownSize; if (isLongForm(string)) { longForm = string; shortForm = findSFbefore(text, startIndex); shortFormKnownSize = false; } else { shortForm = string; longForm = text.substring(Math.max(lastEOS, startIndex - maxLFlength(shortForm) - 1), startIndex - 1); shortFormKnownSize = true; } if (shortForm.length() > 1 && Character.isLetter(shortForm.charAt(0)) && !shortForm.contains("=") && !StringUtilities.isRomanNumeral(shortForm)) { return findLongForm(longForm, shortForm, shortFormKnownSize); } return null; } private static boolean evenParenthesis(String string) { int open = 0; int close = 0; for (int i = 0; i < string.length(); i++) { char ch = string.charAt(i); if (ch == '(') open++; if (ch == ')') close++; } return open == close; } public static Pair<String, String> findLongForm(String longForm, String shortForm, boolean shortFormSizeIsKnown) { String longFormLC = longForm.toLowerCase(); String shortFormLC = shortForm.toLowerCase(); int lfIndex = longFormLC.length(); int lastLetterSFIndex = -1; int lastMatchedLFIndex = -1; boolean nonConsecutiveMatch = false; for (int sfIndex = shortFormLC.length() - 1; sfIndex >= 0; sfIndex--) { char ch = shortFormLC.charAt(sfIndex); boolean matched = false; boolean unmatchable = false; if (!Character.isLetterOrDigit(ch)) { // Character is neither digit nor // letter unmatchable = true; } else if (Character.isDigit(ch)) { // Character is digit // Attempt to find number: for (int numberIndex = lfIndex - 1; numberIndex >= 0; numberIndex--) { if (longFormLC.charAt(numberIndex) == ch) { lfIndex = numberIndex; matched = true; break; } } // If number not found: try looking for roman numeral: if (!matched && lfIndex > 0) { String romanNumeral = convertToNumeral(ch); int romanNumeralIndex = longFormLC.substring(0, lfIndex - 1).indexOf(romanNumeral); if (romanNumeralIndex != -1) { lfIndex = romanNumeralIndex; matched = true; } } } else { // Character is letter // Attempt to find letter: for (int letterIndex = lfIndex - 1; letterIndex >= 0; letterIndex--) { if (longFormLC.charAt(letterIndex) == ch) { if (sfIndex != 0 || letterIndex == 0 || !Character.isLetterOrDigit(longFormLC.charAt(letterIndex - 1))) { lfIndex = letterIndex; matched = true; lastLetterSFIndex = sfIndex; break; } } } } if (matched) { if (lastMatchedLFIndex != -1 && lastMatchedLFIndex > lfIndex + 1) nonConsecutiveMatch = true; lastMatchedLFIndex = lfIndex; } else if (!unmatchable) { if (shortFormSizeIsKnown || !nonConsecutiveMatch) return null; else // Could be that we have the complete shortform aligned. Do some // checks: // If last matched SF letter also occurres at begin of LF, place pointer // there: if (lastLetterSFIndex != -1 && longFormLC.charAt(0) == shortFormLC.charAt(lastLetterSFIndex)) lfIndex = 0; // Check if we have the complete long-form, and the last aligned SF // letter is preceded by a word delimiter: if (lfIndex == 0 && lastLetterSFIndex > sfIndex + 1) return checkSFLF(longForm, shortForm.substring(lastLetterSFIndex)); else return null; } } // All letters matched, return pair: if (nonConsecutiveMatch) return checkSFLF(longForm.substring(lfIndex), shortForm); else return null; } private static Pair<String, String> checkSFLF(String longForm, String shortForm) { // If LF contains parenthesis, these should be matching. if (evenParenthesis(longForm) && !longForm.toLowerCase().startsWith(shortForm.toLowerCase())) return new Pair<String, String>(shortForm, longForm); else return null; } private static String convertToNumeral(char ch) { switch (ch) { case '1': return "i"; case '2': return "ii"; case '3': return "iii"; case '4': return "iv"; case '5': return "v"; case '6': return "vi"; case '7': return "vii"; case '8': return "viii"; case '9': return "ix"; } return "0"; } private static String findSFbefore(String text, int index) { int delimiters = 0; int minIndex = Math.max(0, index - 10); int startIndex = minIndex; for (int i = index - 1; i >= minIndex; i--) { if (!Character.isLetterOrDigit(text.charAt(i))) { delimiters++; if (delimiters == 2) { startIndex = i + 1; break; } } } return text.substring(startIndex, index - 1); } private static boolean isLongForm(String string) { return (string.split(" ").length > 2 || string.length() > 7); } private static int maxLFlength(String shortForm) { int matchable = 0; for (int i = 0; i < shortForm.length(); i++) { if (Character.isLetterOrDigit(shortForm.charAt(i))) { matchable++; } } return matchable * 15; } }