// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataprofiler.core.migration.impl;
import java.util.Date;
import org.eclipse.core.resources.IFolder;
import org.talend.cwm.helper.TaggedValueHelper;
import org.talend.dataprofiler.core.migration.AbstractWorksapceUpdateTask;
import org.talend.dataquality.domain.pattern.Pattern;
import org.talend.dataquality.domain.pattern.RegularExpression;
import org.talend.dataquality.helpers.BooleanExpressionHelper;
import org.talend.dataquality.helpers.MetadataHelper;
import org.talend.dq.analysis.parameters.PatternParameter;
import org.talend.dq.pattern.PatternBuilder;
import org.talend.dq.writer.impl.ElementWriterFactory;
import org.talend.resource.ResourceManager;
public class CreatePatternsMigratorTask extends AbstractWorksapceUpdateTask {
private final String SQLLanguage = "SQL"; //$NON-NLS-1$
private final String PATH_ADDRESS = "address"; //$NON-NLS-1$
private final String PATH_CUSTOMER = "customer"; //$NON-NLS-1$
private final String PATH_NUMBER = "number"; //$NON-NLS-1$
private final String PATH_MONEY = "currency"; //$NON-NLS-1$
private final String PATH_DATE = "date"; //$NON-NLS-1$
private final String PATH_INTERNET = "internet"; //$NON-NLS-1$
private PatternParameter parameter = null;
public Date getOrder() {
return createDate(2015, 8, 13);
}
public MigrationTaskType getMigrationTaskType() {
return MigrationTaskType.FILE;
}
@Override
protected boolean doExecute() throws Exception {
parameter = new PatternParameter();
// FR Insee Code
IFolder folder = ResourceManager.getPatternRegexFolder().getFolder(PATH_ADDRESS);
if (folder.exists()) {
if (!folder.getFile("FR_Insee_Code_0.1.pattern").exists()) {
Pattern pattern = newPattern("FR Insee Code", SQLLanguage, "'^(F-|FRA?(-| ))?((2[A|B])|[0-9]{2})[0-9]{3}$'"); //$NON-NLS-1$ //$NON-NLS-2$
if (pattern != null) {
setTagValue(pattern, " FRA-2A235 | F-2B128 | FRA 2B356", //$NON-NLS-1$
"French Insee code of cities with Corsica and colonies"); //$NON-NLS-1$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
}
// SEDOL
folder = ResourceManager.getPatternRegexFolder().getFolder(PATH_CUSTOMER);
if (folder.exists()) {
if (!folder.getFile("SEDOL_0.1.pattern").exists()) {
Pattern pattern = newPattern("SEDOL", SQLLanguage, "'^([B-Db-dF-Hf-hJ-Nj-nP-Tp-tV-Xv-xYyZz0-9]{6}[0-9])$'"); //$NON-NLS-1$ //$NON-NLS-2$
if (pattern != null) {
setTagValue(pattern, "B01HL06 | 4155586", "Stock Exchange Daily Official List "); //$NON-NLS-1$ //$NON-NLS-2$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
}
// IPV6 MAC Address
folder = ResourceManager.getPatternRegexFolder().getFolder(PATH_INTERNET);
if (folder.exists()) {
if (!folder.getFile("IPv6_Address_0.1.pattern").exists()) {
Pattern pattern = newPattern(
"IPv6 Address", //$NON-NLS-1$
SQLLanguage,
"'^((([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}:[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){5}:([0-9A-Fa-f]{1,4}:)?[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){4}:([0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){3}:([0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){2}:([0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}((\\b((25[0-5])|(1\\d{2})|(2[0-4]\\d)|(\\d{1,2}))\\b)\\.){3}(\\b((25[0-5])|(1\\d{2})|(2[0-4]\\d)|(\\d{1,2}))\\b))|(([0-9A-Fa-f]{1,4}:){0,5}:((\\b((25[0-5])|(1\\d{2})|(2[0-4]\\d)|(\\d{1,2}))\\b)\\.){3}(\\b((25[0-5])|(1\\d{2})|(2[0-4]\\d)|(\\d{1,2}))\\b))|(::([0-9A-Fa-f]{1,4}:){0,5}((\\b((25[0-5])|(1\\d{2})|(2[0-4]\\d)|(\\d{1,2}))\\b)\\.){3}(\\b((25[0-5])|(1\\d{2})|(2[0-4]\\d)|(\\d{1,2}))\\b))|([0-9A-Fa-f]{1,4}::([0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})|(::([0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){1,7}:))$'"); //$NON-NLS-1$
if (pattern != null) {
setTagValue(pattern, "Check if it is a IPv6 address", "IPv6 address"); //$NON-NLS-1$ //$NON-NLS-2$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
if (!folder.getFile("MAC_Address_0.1.pattern").exists()) {
Pattern pattern = newPattern(
"MAC Address", SQLLanguage, "'^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$'"); //$NON-NLS-1$ //$NON-NLS-2$
if (pattern != null) {
setTagValue(pattern, "A4:4E:31:B9:C5:B4", "Match MAC Address"); //$NON-NLS-1$ //$NON-NLS-2$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
}
// GPS Coordinate ,ISBN 13,UK SSN
folder = ResourceManager.getPatternRegexFolder().getFolder(PATH_NUMBER);
if (folder.exists()) {
if (!folder.getFile("GPS_Coordinate_0.1.pattern").exists()) {
Pattern pattern = newPattern("GPS Coordinate", SQLLanguage, //$NON-NLS-1$
"'^([0-9]{1,3}[\\.][0-9]*)[, ]+-?([0-9]{1,3}[\\.][0-9]*)$'"); //$NON-NLS-1$
if (pattern != null) {
setTagValue(pattern, "40.7127837,-74.00594130000002", "Google Maps style GPS Decimal format"); //$NON-NLS-1$ //$NON-NLS-2$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
if (!folder.getFile("UK_SSN_0.1.pattern").exists()) {
Pattern pattern = newPattern("UK SSN", SQLLanguage, //$NON-NLS-1$
"'^[A-CEGHJ-PR-TW-Z]{1}[A-CEGHJ-NPR-TW-Z]{1}([0-9]{6}|( [0-9]{2}){3} )[A-DFM]{0,1}$'"); //$NON-NLS-1$
if (pattern != null) {
setTagValue(pattern, "AB123456C | AB 12 34 56 C", //$NON-NLS-1$
"National identification number, national identity number, or national insurance number generally called an NI Number (NINO)"); //$NON-NLS-1$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
}
// EN Amount Money ,FR Amount Money
folder = ResourceManager.getPatternRegexFolder().getFolder(PATH_MONEY);
if (!folder.exists()) {
folder.create(true, true, null);
}
if (folder.exists()) {
if (!folder.getFile("EN_Amount_Money_0.1.pattern").exists()) {
Pattern pattern = newPattern("EN Amount Money", SQLLanguage, //$NON-NLS-1$
"'^((US|CA)?\\$|\\£|\\€|\\¥)(([1-9][0-9]{0,2}(\\,[0-9]{3})*)|([1-9][0-9]*)|(0))(\\.[0-9]{2}|k|M|G|T)?$'"); //$NON-NLS-1$
if (pattern != null) {
RegularExpression regularExpr = BooleanExpressionHelper
.createRegularExpression("MySQL", //$NON-NLS-1$
"'^((US|CA)?\\\\$|\\£|\\€|\\¥)(([1-9][0-9]{0,2}(\\,[0-9]{3})*)|([1-9][0-9]*)|(0))(\\.[0-9]{2}|k|M|G|T)?$'"); //$NON-NLS-1$
regularExpr.setExpressionType("REGEXP"); //$NON-NLS-1$
pattern.getComponents().add(regularExpr);
setTagValue(pattern, "$3,000 || CA$3000", "Amount of money in English format"); //$NON-NLS-1$ //$NON-NLS-2$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
if (!folder.getFile("FR_Amount_Money_0.1.pattern").exists()) {
Pattern pattern = newPattern("FR Amount Money", SQLLanguage, //$NON-NLS-1$
"'^(([1-9][0-9]{0,2}( [0-9]{3})*)|([1-9][0-9]*)|0)((,[0-9]{2} | (k|M|G|T))?| )(\\$( (US|CA))?|\\£|\\€|\\¥)$'"); //$NON-NLS-1$
if (pattern != null) {
setTagValue(pattern, "3000 € | 35 k€ | 35 054 T€", "Amount of money in French format"); //$NON-NLS-1$ //$NON-NLS-2$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
}
// EN_Month_Abbrev,EN_Month
folder = ResourceManager.getPatternRegexFolder().getFolder(PATH_DATE);
if (folder.exists()) {
if (!folder.getFile("EN_Month_Abbrev_0.1.pattern").exists()) {
Pattern pattern = newPattern(
"EN_Month_Abbrev", SQLLanguage, "'^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)$'"); //$NON-NLS-1$ //$NON-NLS-2$
if (pattern != null) {
setTagValue(pattern, "Jan | Feb ", "Month English abbreviation"); //$NON-NLS-1$ //$NON-NLS-2$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
if (!folder.getFile("EN_Month_0.1.pattern").exists()) {
Pattern pattern = newPattern("EN_Month", SQLLanguage, //$NON-NLS-1$
"'^(January|June|July|February|March|May|April|August|September|October|November|December)$'"); //$NON-NLS-1$
if (pattern != null) {
setTagValue(pattern, "January | February ", "Month in English"); //$NON-NLS-1$ //$NON-NLS-2$
ElementWriterFactory.getInstance().createPatternWriter().create(pattern, folder);
}
}
}
return true;
}
private Pattern newPattern(String name, String lang, String express) {
PatternBuilder patternBuilder = new PatternBuilder();
boolean patternInitialized = patternBuilder.initializePattern(name);
if (patternInitialized) {
Pattern pattern = patternBuilder.getPattern();
RegularExpression regularExpr = BooleanExpressionHelper.createRegularExpression(lang, express);
regularExpr.setExpressionType("REGEXP"); //$NON-NLS-1$
pattern.getComponents().add(regularExpr);
return pattern;
}
return null;
}
private void setTagValue(Pattern pattern, String purpose, String decription) {
TaggedValueHelper.setTaggedValue(pattern, TaggedValueHelper.DESCRIPTION, decription);
TaggedValueHelper.setTaggedValue(pattern, TaggedValueHelper.PURPOSE, purpose);
TaggedValueHelper.setTaggedValue(pattern, TaggedValueHelper.VALID_STATUS, String.valueOf(true));
if (parameter == null) {
parameter = new PatternParameter();
}
MetadataHelper.setDevStatus(pattern, parameter.getStatus());
MetadataHelper.setAuthor(pattern, parameter.getAuthor());
MetadataHelper.setVersion(parameter.getVersion(), pattern);
}
}