// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dq.indicators;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.eclipse.core.runtime.IPath;
import org.eclipse.core.runtime.Path;
import org.eclipse.emf.common.util.EMap;
import org.eclipse.jface.dialogs.MessageDialog;
import org.eclipse.swt.widgets.Display;
import org.eclipse.ui.PlatformUI;
import org.talend.core.IRepositoryContextService;
import org.talend.core.model.metadata.builder.connection.DelimitedFileConnection;
import org.talend.core.model.metadata.builder.connection.Escape;
import org.talend.core.model.metadata.builder.connection.MetadataColumn;
import org.talend.core.model.metadata.builder.connection.MetadataTable;
import org.talend.core.model.metadata.builder.database.JavaSqlFactory;
import org.talend.core.runtime.CoreRuntimePlugin;
import org.talend.cwm.helper.ColumnHelper;
import org.talend.cwm.management.i18n.Messages;
import org.talend.dataquality.PluginConstant;
import org.talend.dataquality.analysis.Analysis;
import org.talend.dataquality.analysis.AnalysisFactory;
import org.talend.dataquality.analysis.AnalyzedDataSet;
import org.talend.dataquality.indicators.DuplicateCountIndicator;
import org.talend.dataquality.indicators.Indicator;
import org.talend.dataquality.indicators.RowCountIndicator;
import org.talend.dataquality.indicators.UniqueCountIndicator;
import org.talend.dataquality.indicators.mapdb.MapDBUtils;
import org.talend.dq.helper.AnalysisExecutorHelper;
import org.talend.dq.helper.FileUtils;
import org.talend.fileprocess.FileInputDelimited;
import org.talend.utils.sql.TalendTypeConvert;
import org.talend.utils.sugars.ReturnCode;
import orgomg.cwm.objectmodel.core.ModelElement;

import com.talend.csv.CSVReader;

/**
 * Indicator evaluator that reads the rows of a delimited file connection and feeds every analyzed column value to
 * the indicators registered for that column.
 * <p>
 * Files whose escape type is {@link Escape#CSV} are parsed with a {@link CSVReader}; all other files are parsed with
 * a {@link FileInputDelimited}.
 */
public class DelimitedFileIndicatorEvaluator extends IndicatorEvaluator {

    private static final Logger log = Logger.getLogger(DelimitedFileIndicatorEvaluator.class);

    protected DelimitedFileConnection delimitedFileconnection = null;

    /**
     * Creates an evaluator for the given analysis.
     *
     * @param analysis the analysis whose indicators are evaluated
     */
    public DelimitedFileIndicatorEvaluator(Analysis analysis) {
        super(analysis);
        this.analysis = analysis;
    }

    /**
     * Reads the delimited file behind the connection and evaluates the indicators on each row.
     * <p>
     * The returned code is only set to "not ok" when the file cannot be found. Rows that cannot be mapped to the
     * analyzed columns cause a warning dialog to be opened asynchronously, but do not change the returned code.
     *
     * @param sqlStatement not used by this implementation
     * @return a failed code carrying the "cannot find file" message when the file does not exist, an ok code
     * otherwise
     */
    @Override
    protected ReturnCode executeSqlQuery(String sqlStatement) {
        ReturnCode returnCode = new ReturnCode(true);
        if (delimitedFileconnection == null) {
            delimitedFileconnection = (DelimitedFileConnection) analysis.getContext().getConnection();
        }
        if (delimitedFileconnection.isContextMode()) {
            IRepositoryContextService service = CoreRuntimePlugin.getInstance().getRepositoryContextService();
            delimitedFileconnection = (DelimitedFileConnection) service.cloneOriginalValueConnection(delimitedFileconnection);
        }
        String path = JavaSqlFactory.getURL(delimitedFileconnection);
        IPath iPath = new Path(path);
        File file = iPath.toFile();
        if (!file.exists()) {
            returnCode.setReturnCode(Messages.getString("DelimitedFileIndicatorEvaluator.CanNotFindFile"), false); //$NON-NLS-1$
            return returnCode;
        }

        List<ModelElement> analysisElementList = this.analysis.getContext().getAnalysedElements();
        EMap<Indicator, AnalyzedDataSet> indicToRowMap = analysis.getResults().getIndicToRowMap();
        indicToRowMap.clear();

        // Take the column list of the first analyzed column whose owning table exposes a non-empty column list.
        List<MetadataColumn> columnElementList = new ArrayList<MetadataColumn>();
        for (ModelElement element : analysisElementList) {
            MetadataTable mTable = ColumnHelper.getColumnOwnerAsMetadataTable((MetadataColumn) element);
            if (mTable != null) {
                columnElementList = mTable.getColumns();
            }
            if (!columnElementList.isEmpty()) {
                break;
            }
        }

        ReturnCode readDataReturnCode;
        if (Escape.CSV.equals(delimitedFileconnection.getEscapeType())) {
            // use CsvReader to parse.
            readDataReturnCode = useCsvReader(file, analysisElementList, columnElementList, indicToRowMap);
        } else {
            readDataReturnCode = useDelimitedReader(analysisElementList, columnElementList, indicToRowMap);
        }

        // handle error message
        if (!readDataReturnCode.isOk()) {
            Display.getDefault().asyncExec(new Runnable() {

                public void run() {
                    MessageDialog.openWarning(PlatformUI.getWorkbench().getActiveWorkbenchWindow().getShell(),
                            Messages.getString("DelimitedFileIndicatorEvaluator.badlyForm.Title"), //$NON-NLS-1$
                            Messages.getString("DelimitedFileIndicatorEvaluator.badlyForm.Message")); //$NON-NLS-1$
                }
            });
        }

        // Added yyin 20120608 TDQ-3589
        for (MetadataColumn col : columnElementList) {
            List<Indicator> indicators = getIndicators(col.getLabel());
            for (Indicator indicator : indicators) {
                if (indicator instanceof DuplicateCountIndicator) {
                    AnalyzedDataSet analyzedDataSet = indicToRowMap.get(indicator);
                    if (analyzedDataSet == null) {
                        analyzedDataSet = AnalysisFactory.eINSTANCE.createAnalyzedDataSet();
                        indicToRowMap.put(indicator, analyzedDataSet);
                        analyzedDataSet.setDataCount(analysis.getParameters().getMaxNumberRows());
                        analyzedDataSet.setRecordSize(0);
                    }
                    addResultToIndicatorToRowMap(indicator, indicToRowMap);
                }
            }
        } // ~
        return returnCode;
    }

    /**
     * Parses the file with a {@link FileInputDelimited} and hands every record to
     * {@link #handleByARow(String[], long, List, List, EMap)}.
     * <p>
     * Every record is processed even after an earlier record was reported as malformed; the returned code only
     * remembers whether at least one record was malformed. An {@link IOException} is logged and ends the reading.
     *
     * @param analysisElementList the analyzed columns
     * @param columnElementList the columns of the table owning the analyzed columns
     * @param indicToRowMap the map receiving the drill-down rows per indicator
     * @return an ok code when every processed record could be mapped to the analyzed columns
     */
    private ReturnCode useDelimitedReader(List<ModelElement> analysisElementList, List<MetadataColumn> columnElementList,
            EMap<Indicator, AnalyzedDataSet> indicToRowMap) {
        // use TOSDelimitedReader in FileInputDelimited to parse.
        ReturnCode returnCode = new ReturnCode(true);
        try {
            FileInputDelimited fileInputDelimited = createFileInputDelimited();
            try {
                long currentRow = JavaSqlFactory.getHeadValue(delimitedFileconnection);
                while (fileInputDelimited.nextRecord()) {
                    if (!continueRun()) {
                        break;
                    }
                    currentRow++;
                    int columsCount = fileInputDelimited.getColumnsCountOfCurrentRow();
                    String[] rowValues = new String[columsCount];
                    for (int i = 0; i < columsCount; i++) {
                        rowValues[i] = fileInputDelimited.get(i);
                    }
                    // Always evaluate the row: folding the call into "isOk() && handleByARow(...)" would skip every
                    // row that follows the first malformed one.
                    boolean rowOk = handleByARow(rowValues, currentRow, analysisElementList, columnElementList,
                            indicToRowMap).isOk();
                    if (!rowOk) {
                        returnCode.setOk(false);
                    }
                }
            } finally {
                // release the reader even when reading or row handling fails
                fileInputDelimited.close();
            }
        } catch (IOException e) {
            log.error(e, e);
        }
        return returnCode;
    }

    /**
     * Creates the reader used by the non-CSV parsing path. Protected so that it can be overridden.
     *
     * @return the reader created by {@link AnalysisExecutorHelper} for the current connection
     * @throws IOException if the reader cannot be created
     */
    protected FileInputDelimited createFileInputDelimited() throws IOException {
        return AnalysisExecutorHelper.createFileInputDelimited(delimitedFileconnection);
    }

    /**
     * Gets the final result from a duplicate indicator and sets it into indicToRowMap. Added yyin 20120608 TDQ-3589.
     * <p>
     * For each duplicate value that has an entry in the indicator's duplicate map, the mapped values are appended as
     * one row to the indicator's data set, until the data set holds the analysis' maximum number of rows.
     *
     * @param indicator the indicator, which must be a {@link DuplicateCountIndicator}
     * @param indicToRowMap the map receiving the drill-down rows per indicator
     */
    private void addResultToIndicatorToRowMap(Indicator indicator, EMap<Indicator, AnalyzedDataSet> indicToRowMap) {
        DuplicateCountIndicator duplicateIndicator = (DuplicateCountIndicator) indicator;
        Map<Object, List<Object>> dupMap = duplicateIndicator.getDuplicateMap();
        Set<Object> duplicateValues = duplicateIndicator.getDuplicateValues();
        int maxNumberRows = analysis.getParameters().getMaxNumberRows();
        Iterator<Object> iterator = duplicateValues.iterator();
        while (iterator.hasNext()) {
            Object key = iterator.next();
            List<Object> valueList = dupMap.get(key);
            if (valueList == null) {
                continue;
            }
            List<Object[]> valueObjectList = initDataSet(indicator, indicToRowMap, key);
            if (valueObjectList.size() >= maxNumberRows) {
                break;
            }
            // MOD zshen insert all of the column values of the row into the indicator.
            valueObjectList.add(valueList.toArray());
        }
    }

    /**
     * Parses the file with a {@link CSVReader} and hands every record to
     * {@link #handleByARow(String[], long, List, List, EMap)}.
     * <p>
     * The header records are skipped first. Reading stops when the run is cancelled or when the limit returned by
     * {@link #getCsvReaderLimit()} (unless it is -1) is exceeded. Every record is processed even after an earlier
     * record was reported as malformed. An {@link IOException} is logged and ends the reading.
     *
     * @param file the file to read
     * @param analysisElementList the analyzed columns
     * @param columnElementList the columns of the table owning the analyzed columns
     * @param indicToRowMap the map receiving the drill-down rows per indicator
     * @return an ok code when every processed record could be mapped to the analyzed columns
     */
    private ReturnCode useCsvReader(File file, List<ModelElement> analysisElementList,
            List<MetadataColumn> columnElementList, EMap<Indicator, AnalyzedDataSet> indicToRowMap) {
        ReturnCode returnCode = new ReturnCode(true);
        int limitValue = getCsvReaderLimit();
        int headValue = JavaSqlFactory.getHeadValue(delimitedFileconnection);
        CSVReader csvReader = null;
        try {
            csvReader = FileUtils.createCsvReader(file, delimitedFileconnection);
            FileUtils.initializeCsvReader(delimitedFileconnection, csvReader);
            for (int i = 0; i < headValue && csvReader.readNext(); i++) {
                // do nothing, just ignore the header part
            }
            long currentRecord = 0;
            while (csvReader.readNext()) {
                currentRecord++;
                if (!continueRun() || limitValue != -1 && currentRecord > limitValue) {
                    break;
                }
                String[] rowValues = csvReader.getValues();
                // Always evaluate the row: folding the call into "isOk() && handleByARow(...)" would skip every row
                // that follows the first malformed one.
                boolean rowOk = handleByARow(rowValues, currentRecord, analysisElementList, columnElementList,
                        indicToRowMap).isOk();
                if (!rowOk) {
                    returnCode.setOk(false);
                }
            }
        } catch (IOException e) {
            log.error(e, e);
        } finally {
            if (csvReader != null) {
                try {
                    csvReader.close();
                } catch (IOException e) {
                    log.error(e, e);
                }
            }
        }
        return returnCode;
    }

    /**
     * Returns the maximum number of records read by the CSV parsing path. Protected so that it can be overridden.
     *
     * @return the limit value of the current connection; -1 disables the limit check in the CSV parsing path
     */
    protected int getCsvReaderLimit() {
        return JavaSqlFactory.getLimitValue(delimitedFileconnection);
    }

    /**
     * Nothing to close here: the file readers are closed by the methods that open them.
     *
     * @return always an ok code
     */
    @Override
    protected ReturnCode closeConnection() {
        return new ReturnCode(true);
    }

    /**
     * @return the delimited file connection currently used by this evaluator, possibly null
     */
    public DelimitedFileConnection getDelimitedFileconnection() {
        return this.delimitedFileconnection;
    }

    /**
     * @param delimitedFileconnection the delimited file connection to read from
     */
    public void setDelimitedFileconnection(DelimitedFileConnection delimitedFileconnection) {
        this.delimitedFileconnection = delimitedFileconnection;
    }

    /**
     * Checks that a connection has been set.
     *
     * @return a failed code carrying the "open null connection" message when no connection is set, an ok code
     * otherwise
     */
    @Override
    protected ReturnCode checkConnection() {
        if (delimitedFileconnection == null) {
            return new ReturnCode(Messages.getString("Evaluator.openNullConnection"), false); //$NON-NLS-1$
        }
        return new ReturnCode(true);
    }

    /**
     * Feeds one record of the file to the indicators of every analyzed column and, when the analysis stores data,
     * maintains the drill-down rows of each indicator.
     *
     * @param rowValues the raw values of the record
     * @param currentRow the record number, only used in the warning message
     * @param analysisElementList the analyzed columns
     * @param columnElementList the columns of the table owning the analyzed columns
     * @param indicToRowMap the map receiving the drill-down rows per indicator
     * @return a code that is not ok when at least one analyzed column has no usable position in the record
     */
    private ReturnCode handleByARow(String[] rowValues, long currentRow, List<ModelElement> analysisElementList,
            List<MetadataColumn> columnElementList, EMap<Indicator, AnalyzedDataSet> indicToRowMap) {
        ReturnCode returnCode = new ReturnCode(true);
        int maxNumberRows = analysis.getParameters().getMaxNumberRows();
        element: for (int i = 0; i < analysisElementList.size(); i++) {
            MetadataColumn mColumn = (MetadataColumn) analysisElementList.get(i);
            Integer position = ColumnHelper.getColumnIndex(mColumn);
            // warning with a file of badly form
            if (position == null || position < 0 || position >= rowValues.length) {
                log.warn(Messages.getString("DelimitedFileIndicatorEvaluator.incorrectData", //$NON-NLS-1$
                        mColumn.getLabel(), currentRow, delimitedFileconnection.getFilePath()));
                returnCode.setOk(false);
                continue;
            }
            Object object = TalendTypeConvert.convertToObject(mColumn.getTalendType(), rowValues[position],
                    mColumn.getPattern());
            List<Indicator> indicators = getIndicators(mColumn.getLabel());
            for (Indicator indicator : indicators) {
                if (!continueRun()) {
                    break element;
                }
                // bug 19036,to irregularly data,still compute for RowCountIndicator
                if (object == null && !(indicator instanceof RowCountIndicator)) {
                    continue element;
                }
                // Added yyin 20120608 TDQ-3589
                if (indicator instanceof DuplicateCountIndicator) {
                    ((DuplicateCountIndicator) indicator).handle(object, rowValues);
                } else { // ~
                    indicator.handle(object);
                }
                AnalyzedDataSet analyzedDataSet = indicToRowMap.get(indicator);
                if (analyzedDataSet == null) {
                    analyzedDataSet = AnalysisFactory.eINSTANCE.createAnalyzedDataSet();
                    indicToRowMap.put(indicator, analyzedDataSet);
                    analyzedDataSet.setDataCount(maxNumberRows);
                    analyzedDataSet.setRecordSize(0);
                }
                // TDQ-9413: fix the drill down for file connection get no values
                // see IndicatorEvaluator line 166, the logic is almost the same
                if (!analysis.getParameters().isStoreData()) {
                    continue;
                }
                if (indicator.mustStoreRow()) {
                    storeDrillDownRow(indicator, indicToRowMap, object, rowValues, maxNumberRows);
                } else if (indicator instanceof UniqueCountIndicator) {
                    // The value is no longer unique: drop the row stored for it earlier.
                    List<Object[]> storedRows = analysis.getResults().getIndicToRowMap().get(indicator).getData();
                    if (storedRows != null && !columnElementList.isEmpty()) {
                        int offsetting = columnElementList.indexOf(indicator.getAnalyzedElement());
                        removeFirstMatchingRow(storedRows, offsetting, object);
                    }
                }
            }
        }
        return returnCode;
    }

    /**
     * Stores the current record as a drill-down row of the indicator.
     * <p>
     * In MapDB mode the record (null cells replaced by {@link PluginConstant#NULL_STRING}) is handed to
     * {@link MapDBUtils}; otherwise a copy of the record is appended to the indicator's data set unless that data
     * set already holds maxNumberRows rows.
     *
     * @param indicator the indicator owning the drill-down data
     * @param indicToRowMap the map receiving the drill-down rows per indicator
     * @param object the converted value of the analyzed column
     * @param rowValues the raw values of the record
     * @param maxNumberRows the maximum number of rows kept per data set
     */
    private void storeDrillDownRow(Indicator indicator, EMap<Indicator, AnalyzedDataSet> indicToRowMap, Object object,
            String[] rowValues, int maxNumberRows) {
        List<Object[]> valueObjectList = initDataSet(indicator, indicToRowMap, object);
        if (indicator.isUsedMapDBMode()) {
            List<Object> inputRowList = new ArrayList<Object>(rowValues.length);
            for (String rowValue : rowValues) {
                inputRowList.add(rowValue == null ? PluginConstant.NULL_STRING : rowValue);
            }
            MapDBUtils.handleDrillDownData(object, inputRowList, indicator);
            return;
        }
        if (rowValues.length > 0 && valueObjectList.size() < maxNumberRows) {
            Object[] valueObject = new Object[rowValues.length];
            System.arraycopy(rowValues, 0, valueObject, 0, rowValues.length);
            valueObjectList.add(valueObject);
        }
    }

    /**
     * Removes the first row whose cell at columnOffset matches the given value. Rows that are null or too short for
     * the offset are skipped, and nothing is removed when the offset is negative.
     *
     * @param rows the stored drill-down rows, modified in place
     * @param columnOffset the index of the cell to compare in each row
     * @param value the value to look for
     * @return true if a row was removed
     */
    private static boolean removeFirstMatchingRow(List<Object[]> rows, int columnOffset, Object value) {
        if (columnOffset < 0) {
            return false;
        }
        Object[] match = null;
        for (Object[] row : rows) {
            if (row != null && columnOffset < row.length && matchesStoredValue(row[columnOffset], value)) {
                match = row;
                break;
            }
        }
        // remove outside of the loop so that the list is never modified while it is being iterated
        return match != null && rows.remove(match);
    }

    /**
     * Tells whether a stored cell represents the given value. Added yyin 20120611 TDQ5279: an Integer value also
     * matches a stored String holding the same number.
     *
     * @param stored the stored cell, may be null
     * @param value the value to compare with, may be null
     * @return true if both are non-null and considered the same value
     */
    private static boolean matchesStoredValue(Object stored, Object value) {
        if (stored == null || value == null) {
            return false;
        }
        if (value instanceof Integer && stored instanceof String) {
            try {
                if (value.equals(Integer.parseInt((String) stored))) {
                    return true;
                }
            } catch (NumberFormatException ignored) {
                // the stored text is not an integer; fall back to the plain equality check below
            }
        }
        return stored.equals(value);
    }
}