/*
* Copyright (c) 2017 Cisco Systems, Inc. and others. All rights reserved.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License v1.0 which accompanies this distribution,
* and is available at http://www.eclipse.org/legal/epl-v10.html
*/
package org.opendaylight.yangtools.yang.model.util;
import com.google.common.collect.ImmutableSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Utilities for converting YANG XSD regexes into Java-compatible regexes.
 *
 * <p>
 * Two adjustments are performed: the anchor characters {@code ^} and {@code $}, which carry no special meaning
 * in XSD regular expressions, are escaped, and {@code \p{IsBlockName}} constructs naming a Unicode block are
 * rewritten to the {@code \p{InBlockName}} form understood by {@link Pattern}.
 */
public final class RegexUtils {
    private static final Logger LOG = LoggerFactory.getLogger(RegexUtils.class);

    /** Captures the text between the first pair of curly braces of a string. */
    private static final Pattern BETWEEN_CURLY_BRACES_PATTERN = Pattern.compile("\\{(.+?)\\}");

    /** Names of Unicode blocks which are eligible for the {@code Is} to {@code In} rewrite. */
    private static final Set<String> JAVA_UNICODE_BLOCKS = ImmutableSet.<String>builder()
            .add("AegeanNumbers")
            .add("AlchemicalSymbols")
            .add("AlphabeticPresentationForms")
            .add("AncientGreekMusicalNotation")
            .add("AncientGreekNumbers")
            .add("AncientSymbols")
            .add("Arabic")
            .add("ArabicPresentationForms-A")
            .add("ArabicPresentationForms-B")
            .add("ArabicSupplement")
            .add("Armenian")
            .add("Arrows")
            .add("Avestan")
            .add("Balinese")
            .add("Bamum")
            .add("BamumSupplement")
            .add("BasicLatin")
            .add("Batak")
            .add("Bengali")
            .add("BlockElements")
            .add("Bopomofo")
            .add("BopomofoExtended")
            .add("BoxDrawing")
            .add("Brahmi")
            .add("BraillePatterns")
            .add("Buginese")
            .add("Buhid")
            .add("ByzantineMusicalSymbols")
            .add("Carian")
            .add("Cham")
            .add("Cherokee")
            .add("CJKCompatibility")
            .add("CJKCompatibilityForms")
            .add("CJKCompatibilityIdeographs")
            .add("CJKCompatibilityIdeographsSupplement")
            .add("CJKRadicalsSupplement")
            .add("CJKStrokes")
            .add("CJKSymbolsandPunctuation")
            .add("CJKUnifiedIdeographs")
            .add("CJKUnifiedIdeographsExtensionA")
            .add("CJKUnifiedIdeographsExtensionB")
            .add("CJKUnifiedIdeographsExtensionC")
            .add("CJKUnifiedIdeographsExtensionD")
            .add("CombiningDiacriticalMarks")
            .add("CombiningDiacriticalMarksSupplement")
            .add("CombiningHalfMarks")
            .add("CombiningDiacriticalMarksforSymbols")
            .add("CommonIndicNumberForms")
            .add("ControlPictures")
            .add("Coptic")
            .add("CountingRodNumerals")
            .add("Cuneiform")
            .add("CuneiformNumbersandPunctuation")
            .add("CurrencySymbols")
            .add("CypriotSyllabary")
            .add("Cyrillic")
            .add("CyrillicExtended-A")
            .add("CyrillicExtended-B")
            .add("CyrillicSupplementary")
            .add("Deseret")
            .add("Devanagari")
            .add("DevanagariExtended")
            .add("Dingbats")
            .add("DominoTiles")
            .add("EgyptianHieroglyphs")
            .add("Emoticons")
            .add("EnclosedAlphanumericSupplement")
            .add("EnclosedAlphanumerics")
            .add("EnclosedCJKLettersandMonths")
            .add("EnclosedIdeographicSupplement")
            .add("Ethiopic")
            .add("EthiopicExtended")
            .add("EthiopicExtended-A")
            .add("EthiopicSupplement")
            .add("GeneralPunctuation")
            .add("GeometricShapes")
            .add("Georgian")
            .add("GeorgianSupplement")
            .add("Glagolitic")
            .add("Gothic")
            .add("GreekandCoptic")
            .add("GreekExtended")
            .add("Gujarati")
            .add("Gurmukhi")
            .add("HalfwidthandFullwidthForms")
            .add("HangulCompatibilityJamo")
            .add("HangulJamo")
            .add("HangulJamoExtended-A")
            .add("HangulJamoExtended-B")
            .add("HangulSyllables")
            .add("Hanunoo")
            .add("Hebrew")
            .add("HighPrivateUseSurrogates")
            .add("HighSurrogates")
            .add("Hiragana")
            .add("IdeographicDescriptionCharacters")
            .add("ImperialAramaic")
            .add("InscriptionalPahlavi")
            .add("InscriptionalParthian")
            .add("IPAExtensions")
            .add("Javanese")
            .add("Kaithi")
            .add("KanaSupplement")
            .add("Kanbun")
            // The spelling with a space is retained for backward compatibility; the space-less spelling
            // is what a \p{IsKangxiRadicals} construct actually reports.
            .add("Kangxi Radicals")
            .add("KangxiRadicals")
            .add("Kannada")
            .add("Katakana")
            .add("KatakanaPhoneticExtensions")
            .add("KayahLi")
            .add("Kharoshthi")
            .add("Khmer")
            .add("KhmerSymbols")
            .add("Lao")
            .add("Latin-1Supplement")
            .add("LatinExtended-A")
            .add("LatinExtendedAdditional")
            .add("LatinExtended-B")
            .add("LatinExtended-C")
            .add("LatinExtended-D")
            .add("Lepcha")
            .add("LetterlikeSymbols")
            .add("Limbu")
            .add("LinearBIdeograms")
            .add("LinearBSyllabary")
            .add("Lisu")
            .add("LowSurrogates")
            .add("Lycian")
            .add("Lydian")
            .add("MahjongTiles")
            .add("Malayalam")
            .add("Mandaic")
            .add("MathematicalAlphanumericSymbols")
            .add("MathematicalOperators")
            .add("MeeteiMayek")
            .add("MiscellaneousMathematicalSymbols-A")
            .add("MiscellaneousMathematicalSymbols-B")
            .add("MiscellaneousSymbols")
            .add("MiscellaneousSymbolsandArrows")
            .add("MiscellaneousSymbolsAndPictographs")
            .add("MiscellaneousTechnical")
            .add("ModifierToneLetters")
            .add("Mongolian")
            .add("MusicalSymbols")
            .add("Myanmar")
            .add("MyanmarExtended-A")
            .add("NewTaiLue")
            .add("NKo")
            .add("NumberForms")
            .add("Ogham")
            .add("OlChiki")
            .add("OldItalic")
            .add("OldPersian")
            .add("OldSouthArabian")
            .add("OldTurkic")
            .add("OpticalCharacterRecognition")
            .add("Oriya")
            .add("Osmanya")
            .add("Phags-pa")
            .add("PhaistosDisc")
            .add("Phoenician")
            .add("PhoneticExtensions")
            .add("PhoneticExtensionsSupplement")
            .add("PlayingCards")
            .add("PrivateUseArea")
            .add("Rejang")
            .add("RumiNumeralSymbols")
            .add("Runic")
            .add("Samaritan")
            .add("Saurashtra")
            .add("Shavian")
            .add("Sinhala")
            .add("SmallFormVariants")
            .add("SpacingModifierLetters")
            .add("Specials")
            .add("Sundanese")
            .add("SuperscriptsandSubscripts")
            .add("SupplementalArrows-A")
            .add("SupplementalArrows-B")
            .add("SupplementalMathematicalOperators")
            .add("SupplementalPunctuation")
            .add("SupplementaryPrivateUseArea-A")
            .add("SupplementaryPrivateUseArea-B")
            .add("SylotiNagri")
            .add("Syriac")
            .add("Tagalog")
            .add("Tagbanwa")
            .add("Tags")
            .add("TaiLe")
            .add("TaiTham")
            .add("TaiViet")
            .add("TaiXuanJingSymbols")
            .add("Tamil")
            .add("Telugu")
            .add("Thaana")
            .add("Thai")
            .add("Tibetan")
            .add("Tifinagh")
            .add("TransportAndMapSymbols")
            .add("Ugaritic")
            .add("UnifiedCanadianAboriginalSyllabics")
            .add("UnifiedCanadianAboriginalSyllabicsExtended")
            .add("Vai")
            .add("VariationSelectors")
            .add("VariationSelectorsSupplement")
            .add("VedicExtensions")
            .add("VerticalForms")
            .add("YiRadicals")
            .add("YiSyllables")
            .add("YijingHexagramSymbols")
            .build();

    /** Upper bound on the number of compile-and-fix passes performed on a single pattern. */
    private static final int UNICODE_SCRIPT_FIX_COUNTER = 30;

    /** Error description used by Java 7/8 for an unresolvable {@code \p{IsName}}. */
    private static final String UNKNOWN_SCRIPT_DESCRIPTION = "Unknown character script name";

    /** Error description used by newer JDKs (Java 9 and later) for the same condition. */
    private static final String UNKNOWN_PROPERTY_DESCRIPTION = "Unknown character property name";

    private RegexUtils() {
        throw new UnsupportedOperationException("Utility class should not be instantiated.");
    }

    /**
     * Converts XSD regex to Java-compatible regex. The result is always wrapped in {@code ^} and {@code $},
     * because XSD patterns implicitly match the whole input.
     *
     * @param xsdRegex XSD regex pattern as it is defined in a YANG source
     * @return Java-compatible regex
     * @throws NullPointerException if {@code xsdRegex} is null
     */
    public static String getJavaRegexFromXSD(final String xsdRegex) {
        return "^" + fixUnicodeScriptPattern(escapeChars(xsdRegex)) + '$';
    }

    /*
     * As both '^' and '$' are special anchor characters in java regular
     * expressions which are implicitly present in XSD regular expressions,
     * we need to escape them in case they are not defined as part of
     * character ranges i.e. inside regular square brackets.
     *
     * A '^' or '$' which is already preceded by an unescaped backslash is left
     * alone: adding another backslash would turn the sequence into an escaped
     * backslash followed by a live anchor.
     */
    private static String escapeChars(final String regex) {
        final StringBuilder result = new StringBuilder(regex.length());
        // Nesting depth of unescaped '[' ... ']' character classes
        int bracket = 0;
        // True when the previous character was a backslash which itself was not escaped
        boolean escape = false;
        for (int i = 0; i < regex.length(); i++) {
            final char ch = regex.charAt(i);
            switch (ch) {
                case '[':
                    if (!escape) {
                        bracket++;
                    }
                    escape = false;
                    break;
                case ']':
                    // Never drop below zero: a stray ']' must not disable escaping for the rest of the pattern
                    if (!escape && bracket > 0) {
                        bracket--;
                    }
                    escape = false;
                    break;
                case '\\':
                    escape = !escape;
                    break;
                case '^':
                case '$':
                    if (bracket == 0 && !escape) {
                        result.append('\\');
                    }
                    escape = false;
                    break;
                default:
                    escape = false;
            }
            result.append(ch);
        }
        return result.toString();
    }

    /**
     * Repeatedly compiles the pattern and, as long as compilation fails because of an unknown
     * {@code \p{IsName}} property naming a known Unicode block, rewrites the offending construct.
     *
     * @param rawPattern pattern to check
     * @return the pattern once it compiles, or the last attempted version if it cannot be fixed
     */
    private static String fixUnicodeScriptPattern(final String rawPattern) {
        String pattern = rawPattern;
        for (int i = 0; i < UNICODE_SCRIPT_FIX_COUNTER; i++) {
            try {
                Pattern.compile(pattern);
                return pattern;
            } catch (final PatternSyntaxException ex) {
                LOG.debug("Invalid regex pattern syntax in: {}", pattern, ex);
                // Use the description rather than the message: the message embeds the pattern text itself
                final String description = ex.getDescription();
                if (!isUnknownPropertyError(description)) {
                    return pattern;
                }

                final String fixed = fixUnknownScripts(description, pattern);
                if (fixed.equals(pattern)) {
                    // No progress was made, further passes would fail in exactly the same way
                    break;
                }
                pattern = fixed;
            }
        }
        LOG.warn("Regex pattern could not be fixed: {}", pattern);
        return pattern;
    }

    /** Checks whether an error description reports an unresolvable character script/property name. */
    private static boolean isUnknownPropertyError(final String description) {
        return description != null && (description.contains(UNKNOWN_SCRIPT_DESCRIPTION)
                || description.contains(UNKNOWN_PROPERTY_DESCRIPTION));
    }

    /**
     * Rewrites the first {@code \p{IsName}} (or {@code \P{IsName}}) occurrence of the name reported in the
     * error description to {@code \p{InName}}, provided the name is a known Unicode block.
     *
     * @param exMessage error description containing the offending name in curly braces
     * @param rawPattern pattern which failed to compile
     * @return fixed pattern, or {@code rawPattern} unchanged if nothing could be fixed
     */
    private static String fixUnknownScripts(final String exMessage, final String rawPattern) {
        final Matcher matcher = BETWEEN_CURLY_BRACES_PATTERN.matcher(exMessage);
        if (!matcher.find()) {
            return rawPattern;
        }

        final String blockName = stripReportingPrefix(matcher.group(1));
        if (!JAVA_UNICODE_BLOCKS.contains(blockName)) {
            return rawPattern;
        }

        final int idx = indexOfIsPrefix(rawPattern, blockName);
        if (idx < 0) {
            // The name was not written in the \p{IsName} form (e.g. \p{script=Name}), nothing to rewrite
            return rawPattern;
        }
        return new StringBuilder(rawPattern).replace(idx, idx + 2, "In").toString();
    }

    /**
     * Normalizes the name reported by the regex compiler. Java 7/8 report the bare name, whereas newer JDKs
     * prefix it with {@code In/Is} or {@code Is}.
     */
    private static String stripReportingPrefix(final String reported) {
        if (reported.startsWith("In/Is")) {
            return reported.substring(5);
        }
        if (reported.startsWith("Is")) {
            return reported.substring(2);
        }
        return reported;
    }

    /**
     * Finds the index of the {@code Is} prefix inside the first {@code \p{IsName}} or {@code \P{IsName}}
     * construct for the given name.
     *
     * @return index of the {@code Is} prefix, or -1 if there is no such construct
     */
    private static int indexOfIsPrefix(final String pattern, final String blockName) {
        final String needle = "{Is" + blockName + '}';
        int brace = pattern.indexOf(needle);
        while (brace >= 0) {
            if (brace >= 2 && pattern.charAt(brace - 2) == '\\') {
                final char selector = pattern.charAt(brace - 1);
                if (selector == 'p' || selector == 'P') {
                    return brace + 1;
                }
            }
            brace = pattern.indexOf(needle, brace + 1);
        }
        return -1;
    }
}