package com.cse10.duplicateDetector;

import com.cse10.entities.CrimeEntityGroup;
import com.google.common.base.Charsets;
import com.google.common.io.Files;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;

import org.apache.log4j.Logger;

/**
 * This class combines all the functionality of this module.
 * Created by Chamath on 1/19/2015.
 */
public class DuplicateDetectorUIHandler extends Observable implements Runnable {

    SimHashCalculator simHashCalculator;
    DataHandler dataHandler;
    HammingDistanceCalculator hammingDistanceCalculator;
    private Logger log;

    public DuplicateDetectorUIHandler() {
        simHashCalculator = new SimHashCalculator(new FullWordSegmenter());
        dataHandler = new DataHandler();
        hammingDistanceCalculator = new HammingDistanceCalculator();
        log = Logger.getLogger(this.getClass());
    }

    /**
     * Calculates the 64-bit simhash value of each document.
     *
     * @param documents map of crime_entity_group id to article content
     * @return map of crime_entity_group id to simhash value
     * @throws InterruptedException if the user stops the thread
     */
    private HashMap<Integer, Long> calculateSimHashValues(HashMap<Integer, String> documents) throws InterruptedException {
        HashMap<Integer, Long> documentSimHashes = new HashMap<>();
        if (documents != null) {
            int progress = 0;
            File articleHashValues = new File("DuplicateDetector/src/main/resources/hashValues.txt");
            // clear the file before writing; try-with-resources avoids the NPE
            // the old null-checked writer could hit when the file was missing
            try (PrintWriter writer = new PrintWriter(articleHashValues)) {
                writer.print("");
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
            checkInterruption();
            Iterator<Integer> iterator = documents.keySet().iterator();
            // simhash calculation accounts for the first 40% of the progress bar;
            // guard against a zero increment when there are fewer than 40 documents
            int increment = Math.max(1, documents.keySet().size() / 40);
            int value = 0;
            // calculate the simhash value of each document
            while (iterator.hasNext()) {
                int id = iterator.next();
                String document = documents.get(id);
                long docHash = simHashCalculator.getSimhash64Value(document);
                log.info(Thread.currentThread().getName() + " Duplicate Detector UI Handler-> Document=[" + document
                        + "] Hash=[" + docHash + " , " + Long.toBinaryString(docHash) + "]"
                        + " Bit Length of Hash: " + Long.toBinaryString(docHash).length() + " bits");
                try {
                    Files.append("Document=[" + document + "] Hash=[" + docHash + " , "
                            + Long.toBinaryString(docHash) + "]"
                            + " Bit Length of Hash: " + Long.toBinaryString(docHash).length() + " bits\n",
                            articleHashValues, Charsets.UTF_8);
                } catch (IOException e) {
                    e.printStackTrace();
                }
                documentSimHashes.put(id, docHash);
                // send progress updates to the GUI
                value++;
                if (value == increment) {
                    progress += 1;
                    notify(progress);
                    value = 0;
                }
                checkInterruption();
            }
        }
        notify(40);
        return documentSimHashes;
    }
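    /*
     * Illustrative sketch only: the real fingerprinting lives in
     * SimHashCalculator/FullWordSegmenter and may differ. This shows the
     * classic Charikar-style 64-bit simhash that getSimhash64Value is assumed
     * to implement: hash each token to 64 bits, let every bit position cast a
     * +1/-1 vote, and keep the bits whose vote is positive. The whitespace
     * tokenizer and FNV-1a hash below are assumptions made for the sketch.
     */
    private static long simhash64Sketch(String document) {
        int[] votes = new int[64];
        for (String token : document.split("\\s+")) {
            if (token.isEmpty()) {
                continue;
            }
            long h = fnv1a64Sketch(token);
            for (int bit = 0; bit < 64; bit++) {
                // vote +1 if the token's hash has this bit set, -1 otherwise
                votes[bit] += ((h >>> bit) & 1L) == 1L ? 1 : -1;
            }
        }
        long fingerprint = 0L;
        for (int bit = 0; bit < 64; bit++) {
            if (votes[bit] > 0) {
                fingerprint |= 1L << bit;
            }
        }
        return fingerprint;
    }

    // Simple 64-bit FNV-1a token hash; purely to make the sketch above self-contained.
    private static long fnv1a64Sketch(String token) {
        long h = 0xcbf29ce484222325L; // FNV offset basis
        for (int i = 0; i < token.length(); i++) {
            h ^= token.charAt(i);
            h *= 0x100000001b3L;      // FNV prime
        }
        return h;
    }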
    /**
     * Finds duplicate articles and labels each crime entity group in the
     * database as either "duplicate" or "unique".
     *
     * @throws InterruptedException if the user stops the thread
     */
    private void findDuplicates() throws InterruptedException {
        int progress = 0;
        // read the documents from the database; Integer is the id in
        // crime_entity_group and String is the corresponding content
        HashMap<Integer, String> documents = dataHandler.readArticlesFromDB();
        // all the calculated simhash values (Long) keyed by crime entity group id (Integer)
        HashMap<Integer, Long> documentSimHashes = calculateSimHashValues(documents);
        progress = 40;
        // stop if the user has interrupted the thread
        checkInterruption();
        File articleHammingDistances = new File("DuplicateDetector/src/main/resources/hammingDistances.txt");
        // clear the file before writing
        try (PrintWriter writer = new PrintWriter(articleHammingDistances)) {
            writer.print("");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        int currentDocumentId;
        Iterator<Integer> iterator = documents.keySet().iterator();
        // duplicate detection accounts for the remaining 60% of the progress bar;
        // guard against a zero increment when there are fewer than 60 documents
        int increment = Math.max(1, documents.keySet().size() / 60);
        int value = 0;
        // ids of all articles already detected as duplicates, so they can be skipped
        List<Integer> duplicateArticleIds = new ArrayList<>();
        // for each article, find its duplicates and record their ids
        while (iterator.hasNext()) {
            checkInterruption();
            currentDocumentId = iterator.next();
            // skip the current article if it has already been detected as a duplicate
            if (duplicateArticleIds.contains(currentDocumentId)) {
                continue;
            }
            String document = documents.get(currentDocumentId);
            long docHash = simHashCalculator.getSimhash64Value(document);
            // duplicates of this article found in this iteration; articles
            // with these ids will be marked as duplicates
            List<Integer> similarDocIds = new ArrayList<>();
            Map<Integer, Integer> docDistances = new HashMap<>();
            Iterator<Integer> iter = documentSimHashes.keySet().iterator();
            checkInterruption();
            // calculate the Hamming distance to every article and collect the close ones
            while (iter.hasNext()) {
                checkInterruption();
                int hashDocId = iter.next();
                Long hashValue = documentSimHashes.get(hashDocId);
                int distance = hammingDistanceCalculator.getHammingDistance(docHash, hashValue);
                // only exact fingerprint matches (distance 0) count as duplicates;
                // Hamming distance is never negative
                if (distance <= 0) {
                    similarDocIds.add(hashDocId);
                    duplicateArticleIds.add(hashDocId);
                    docDistances.put(hashDocId, distance);
                }
            }
            checkInterruption();
            // mark the duplicates in the database and log them to file
            if (!similarDocIds.isEmpty()) {
                log.info(Thread.currentThread().getName() + " Duplicate Detector UI Handler-> Documents similar as ["
                        + document + " " + currentDocumentId + "]:\n");
                try {
                    Files.append("Documents similar as [" + document + " " + currentDocumentId + "]:\n",
                            articleHammingDistances, Charsets.UTF_8);
                } catch (IOException e) {
                    e.printStackTrace();
                }
                for (int i : similarDocIds) {
                    // every article matches itself at distance 0; skip it
                    if (i == currentDocumentId) {
                        continue;
                    }
                    log.info(Thread.currentThread().getName() + " Duplicate Detector UI Handler-> ["
                            + i + "]\tDistance=[" + docDistances.get(i) + "]\n");
                    checkInterruption();
                    // mark the duplicate CrimeEntityGroup in the DB
                    CrimeEntityGroup duplicateGroup = dataHandler.fetchCrimeEntityGroup(i);
                    duplicateGroup.setLabel("duplicate");
                    dataHandler.updateCrimeEntityGroup(duplicateGroup);
                    try {
                        Files.append("[" + i + "]\tDistance=[" + docDistances.get(i) + "]\n",
                                articleHammingDistances, Charsets.UTF_8);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                // write the end-of-block marker once per group of duplicates
                // (previously it was appended after every single duplicate)
                try {
                    Files.append("End\n", articleHammingDistances, Charsets.UTF_8);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // update the current entity's label; this must be done after all
            // the duplicates are marked, so don't swap the order
            CrimeEntityGroup crimeEntityGroup = dataHandler.fetchCrimeEntityGroup(currentDocumentId);
            crimeEntityGroup.setLabel("unique");
            dataHandler.updateCrimeEntityGroup(crimeEntityGroup);
            // send progress updates to the GUI
            value++;
            if (value == increment) {
                progress += 1;
                if (progress > 100) progress = 100;
                notify(progress);
                value = 0;
            }
            // stop if the user has interrupted the thread
            checkInterruption();
        }
        // set progress to 100; the incremental updates may not land exactly on 100
        notify(100);
    }
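    /*
     * Illustrative sketch only: HammingDistanceCalculator presumably does
     * something equivalent to this. The Hamming distance between two 64-bit
     * simhash fingerprints is simply the population count of their XOR, which
     * is why distance 0 above means the fingerprints are identical.
     */
    private static int hammingDistanceSketch(long a, long b) {
        return Long.bitCount(a ^ b); // number of differing bit positions
    }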
    /**
     * Starts the duplicate detection process.
     */
    public void startDuplicateDetection() {
        try {
            findDuplicates();
        } catch (InterruptedException e) {
            // the user has stopped the process
            log.info(Thread.currentThread().getName() + " Duplicate Detector UI Handler-> STOPPED");
            dataHandler.closeDatabase();
        }
    }

    @Override
    public void run() {
        startDuplicateDetection();
    }

    /**
     * Helper function to handle interruption.
     */
    private void checkInterruption() throws InterruptedException {
        if (Thread.currentThread().isInterrupted()) {
            throw new InterruptedException();
        }
    }

    /**
     * Helper function to notify observers of the current progress.
     * Note: this overloads (does not override) Object#notify.
     *
     * @param progress percentage complete, 0-100
     */
    private void notify(int progress) throws InterruptedException {
        checkInterruption();
        setChanged();
        notifyObservers(progress);
    }
}
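/*
 * Hypothetical usage sketch, not part of the original module: shows how the
 * handler is typically driven as a Runnable Observable from a UI. The observer
 * lambda, thread name, and the interrupt-based cancellation are assumptions
 * for illustration; the real GUI wiring may differ.
 */
class DuplicateDetectorUsageSketch {

    public static void main(String[] args) throws InterruptedException {
        DuplicateDetectorUIHandler handler = new DuplicateDetectorUIHandler();
        // progress updates (0-100) arrive through Observable#notifyObservers
        handler.addObserver((observable, progress) ->
                System.out.println("progress: " + progress + "%"));
        Thread worker = new Thread(handler, "duplicate-detector");
        worker.start();
        // to cancel: worker.interrupt() - findDuplicates() polls the flag
        // via checkInterruption() and shuts down cleanly
        worker.join();
    }
}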