// ============================================================================ // // Copyright (C) 2006-2015 Talend Inc. - www.talend.com // // This source code is available under agreement available at // %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt // // You should have received a copy of the agreement // along with this program; if not, write to Talend SA // 9 rue Pages 92150 Suresnes, France // // ============================================================================ package org.talend.dataquality.semantic.index.utils; import org.talend.dataquality.semantic.index.utils.optimizer.AirportOptimizer; import org.talend.dataquality.semantic.index.utils.optimizer.CategoryOptimizer; import org.talend.dataquality.semantic.index.utils.optimizer.FrCommuneOptimizer; import org.talend.dataquality.semantic.index.utils.optimizer.UsCountyOptimizer; public enum DictionaryGenerationSpec { /** * the categories defined in Keyword index */ ADDRESS_LINE(GenerationType.KEYWORD, "street_type_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0, 1, 2, 3, 4, 5 }), FULL_NAME( GenerationType.KEYWORD, "civility_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0, 1, 2, 3, 4, 5 }), /** * the categories defined in Data Dictionary index */ ANIMAL(GenerationType.DICTIONARY, "animal_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0, 1, 2, 3, 4 }), ANSWER(GenerationType.DICTIONARY, "answer.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0, 1 }), AIRPORT( GenerationType.DICTIONARY, "airport-name-wiki.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0 }, new AirportOptimizer()), AIRPORT_CODE( GenerationType.DICTIONARY, "airport-code-wiki.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0 }), CITY( GenerationType.DICTIONARY, "city_cleaned_without_pinyin.csv", new CsvReaderConfig(CsvConstants.COMMA, false), new int[] {}), // CITY_COMPLEMENTED("city_complemented.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 2 }, null, // "CITY"), CIVILITY( GenerationType.DICTIONARY, "civility_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0, 1, 2, 3, 4, 5 }), CONTINENT( GenerationType.DICTIONARY, "continent_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0, 1, 2, 3, 4, 5 }), CONTINENT_CODE( GenerationType.DICTIONARY, "continent_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 6 }), COUNTRY(GenerationType.DICTIONARY, "country-codes.csv", new CsvReaderConfig(CsvConstants.COMMA, true), new int[] { 0, 1 }), COUNTRY_CODE_ISO2( GenerationType.DICTIONARY, "country-codes.csv", new CsvReaderConfig(CsvConstants.COMMA, true), new int[] { 2 }), COUNTRY_CODE_ISO3( GenerationType.DICTIONARY, "country-codes.csv", new CsvReaderConfig(CsvConstants.COMMA, true), new int[] { 3 }), CURRENCY_NAME( GenerationType.DICTIONARY, "country-codes.csv", new CsvReaderConfig(CsvConstants.COMMA, true), new int[] { 17 }), CURRENCY_CODE( GenerationType.DICTIONARY, "country-codes.csv", new CsvReaderConfig(CsvConstants.COMMA, true), new int[] { 14 }), HR_DEPARTMENT( GenerationType.DICTIONARY, "hr_department_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0 }), FIRST_NAME( GenerationType.DICTIONARY, "firstname_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0 }), LAST_NAME(GenerationType.DICTIONARY, "lastname12k.csv", new CsvReaderConfig(CsvConstants.COMMA, true), new int[] { 0 }), GENDER( GenerationType.DICTIONARY, "gender_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0, 1, 2, 3, 4, 5 }), JOB_TITLE( GenerationType.DICTIONARY, "job_title_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0 }), MONTH( GenerationType.DICTIONARY, "months_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0, 1, 2, 3, 4, 5 }), STREET_TYPE( GenerationType.DICTIONARY, "street_type_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0, 1, 2, 3, 4, 5 }), WEEKDAY( GenerationType.DICTIONARY, "weekdays_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0, 1, 2, 3, 4, 5 }), MUSEUM( GenerationType.DICTIONARY, "wordnet_museums_yago2.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0 }), US_COUNTY( GenerationType.DICTIONARY, "us_counties.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0 }, new UsCountyOptimizer()), ORGANIZATION( GenerationType.DICTIONARY, "wordnet_organizations_yago2.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0 }), COMPANY( GenerationType.DICTIONARY, "wordnet_companies_yago2_optimized.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0 }), BEVERAGE( GenerationType.DICTIONARY, "wordnet_beverages_yago2.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0 }), MEASURE_UNIT( GenerationType.DICTIONARY, "units_of_measurement_cleaned.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 0 }), INDUSTRY( GenerationType.DICTIONARY, "industry_GICS_simplified.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 1 }), INDUSTRY_GROUP( GenerationType.DICTIONARY, "industry_group_GICS_simplified.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 1 }), SECTOR( GenerationType.DICTIONARY, "industry_sector_GICS_simplified.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 1 }), FR_COMMUNE( GenerationType.DICTIONARY, "fr_comsimp2015.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 10, 11 }, new FrCommuneOptimizer()), FR_DEPARTEMENT( GenerationType.DICTIONARY, "fr_depts2015.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 5 }), FR_REGION(GenerationType.DICTIONARY, "fr_reg2016.txt", new CsvReaderConfig(CsvConstants.TAB, true), new int[] { 4 }), FR_REGION_LEGACY( GenerationType.DICTIONARY, "fr_reg2015.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, false), new int[] { 4 }), LANGUAGE( GenerationType.DICTIONARY, "languages_code_name.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 2, 3, 4, 5 }), LANGUAGE_CODE_ISO2( GenerationType.DICTIONARY, "languages_code_name.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0 }), LANGUAGE_CODE_ISO3( GenerationType.DICTIONARY, "languages_code_name.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 1 }), CA_PROVINCE_TERRITORY( GenerationType.DICTIONARY, "ca_province_territory.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0 }), CA_PROVINCE_TERRITORY_CODE( GenerationType.DICTIONARY, "ca_province_territory.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 2 }), MX_ESTADO(GenerationType.DICTIONARY, "mx_estado.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 0 }), MX_ESTADO_CODE( GenerationType.DICTIONARY, "mx_estado.csv", new CsvReaderConfig(CsvConstants.SEMICOLON, true), new int[] { 2 }); private GenerationType generationType; private String sourceFile; private CsvReaderConfig csvConfig; private int[] columnsToIndex; private CategoryOptimizer optimizer; private String categoryName; private DictionaryGenerationSpec(GenerationType generationType, String sourceFile, CsvReaderConfig csvConfig, int[] columnsToIndex) { this(generationType, sourceFile, csvConfig, columnsToIndex, null, null); } private DictionaryGenerationSpec(GenerationType generationType, String sourceFile, CsvReaderConfig csvConfig, int[] columnsToIndex, CategoryOptimizer optimizer) { this(generationType, sourceFile, csvConfig, columnsToIndex, optimizer, null); } private DictionaryGenerationSpec(GenerationType generationType, String sourceFile, CsvReaderConfig csvConfig, int[] columnsToIndex, CategoryOptimizer optimizer, String categoryName) { this.generationType = generationType; this.sourceFile = sourceFile; this.csvConfig = csvConfig; this.columnsToIndex = columnsToIndex; this.optimizer = optimizer; if (categoryName == null) { this.categoryName = this.name(); } else { this.categoryName = categoryName; } } public GenerationType getGenerationType() { return generationType; } public String getSourceFile() { return sourceFile; } public CsvReaderConfig getCsvConfig() { return csvConfig; } public int[] getColumnsToIndex() { return columnsToIndex; } public void setColumnsToIndex(int[] columnsToIndex) { this.columnsToIndex = columnsToIndex; } public CategoryOptimizer getOptimizer() { return optimizer; } public String getCategoryName() { return categoryName; } } enum GenerationType { DICTIONARY, KEYWORD }