package com.github.martinprillard.shavadoop.slave; import java.util.List; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import com.github.martinprillard.shavadoop.util.Constant; import com.github.martinprillard.shavadoop.util.Util; /** * * @author martin prillard * */ public class SplitMappingThread extends Thread { private List<ConcurrentHashMap<String, AtomicInteger>> unsortedMaps; private List<String> chunk; private int nbWorker; public SplitMappingThread(List<ConcurrentHashMap<String, AtomicInteger>> _unsortedMaps, List<String> _chunk, int _nbWorker) { unsortedMaps = _unsortedMaps; chunk = _chunk; nbWorker = _nbWorker; } public void run() { for (String line : chunk) { line = cleanLine(line); if (!line.equals("") || !line.isEmpty()) { wordCount(nbWorker, line); } } } /** * Clean the line * * @param line * @return line clean */ private String cleanLine(String line) { String clean = line; clean = clean.trim(); // clean the non alpha numeric character or space clean = clean.replaceAll("[^a-zA-Z0-9\\s]", " "); // just one space beetween each words clean = clean.replaceAll("\\s+", " "); clean = clean.replaceAll("\\t+", " "); return clean; } /** * Count the occurence of each word in the sentence * * @param nbWorker * @param line * @return res */ private void wordCount(int nbWorker, String line) { // split the line word by word String words[] = line.split(Constant.SEP_WORD); for (int i = 0; i < words.length; i++) { String word = words[i]; // add counter value for this word int idNextWorker = getIdNextWorker(word, nbWorker); // increment atomically like the hadoop combiner increment(idNextWorker, word); } } /** * Return the id next worker from the key * * @param key * @param nbWorker * @return id next worker */ private int getIdNextWorker(String key, int nbWorker) { return Math.abs((int) (Util.simpleHash(key) % nbWorker)); } /** * Increment the value atomically * * @param idHashMap * @param key */ private void increment(int idHashMap, String key) { AtomicInteger value = unsortedMaps.get(idHashMap).get(key); if (value == null) { value = new AtomicInteger(0); AtomicInteger old = unsortedMaps.get(idHashMap).putIfAbsent(key, value); if (old != null) { value = old; } } value.incrementAndGet(); // increment the value atomically } }