package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Pipeline that implements paging: results that are {@link PagedModel}s are held back
 * until every page registered under the same page key has been processed, and are then
 * replaced by a single combined model.<br>
 * Do not use this feature when running a distributed crawler backed by redis.
 * (NOTE(review): presumably because the page bookkeeping lives only in this instance's
 * in-memory maps - confirm.)<br>
 * Bookkeeping for a page key is not released by this class once the key is complete.<br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-4 <br>
 * Time: 5:15 PM <br>
 */
public class PagedPipeline implements Pipeline {

    /**
     * Orders entries by page id: ids that parse as integers come first, in numeric order;
     * all other ids follow in lexicographic order. This is a total order, so sorting is
     * well-defined even when numeric and non-numeric ids are mixed, and it never relies on
     * integer subtraction (which can overflow).
     */
    private static final Comparator<Map.Entry<String, PagedModel>> PAGE_ORDER =
            new Comparator<Map.Entry<String, PagedModel>>() {
                @Override
                public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
                    Integer n1 = parsePageNumber(o1.getKey());
                    Integer n2 = parsePageNumber(o2.getKey());
                    if (n1 != null && n2 != null) {
                        int i1 = n1.intValue();
                        int i2 = n2.intValue();
                        return i1 < i2 ? -1 : (i1 == i2 ? 0 : 1);
                    }
                    if (n1 != null) {
                        return -1;
                    }
                    if (n2 != null) {
                        return 1;
                    }
                    return o1.getKey().compareTo(o2.getKey());
                }
            };

    /** pageKey -> page -> TRUE once that page has been processed, FALSE while it is only announced. */
    private final DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);

    /** pageKey -> page -> the model extracted for that page. */
    private final DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);

    /**
     * Guards the read-modify-write sequences on {@link #pageMap} and {@link #objectMap}.
     * The individual map operations may be thread-safe, but "mark if absent" and
     * "check all pages, then combine" are compound actions that must not interleave.
     */
    private final Object lock = new Object();

    /**
     * Walks every entry of {@code resultItems.getAll()} and applies paging to each
     * {@link PagedModel} value: the entry is removed while pages are still missing, or
     * replaced by the combined model once all pages are present. Other values are left alone.
     *
     * @param resultItems the extracted results; its backing map is modified in place
     * @param task        the task the results belong to (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Iterator<Map.Entry<String, Object>> iterator = resultItems.getAll().entrySet().iterator();
        while (iterator.hasNext()) {
            handleObject(iterator);
        }
    }

    /**
     * Consumes the next entry from {@code iterator} and, if its value is a {@link PagedModel},
     * records the page and either drops the entry (pages still outstanding) or overwrites its
     * value with the combination of all pages of the same page key.
     *
     * @param iterator iterator over the result entries, positioned before the entry to handle
     */
    private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
        Map.Entry<String, Object> objectEntry = iterator.next();
        Object o = objectEntry.getValue();
        if (!(o instanceof PagedModel)) {
            return;
        }
        PagedModel pagedModel = (PagedModel) o;
        String pageKey = pagedModel.getPageKey();
        synchronized (lock) {
            recordPage(pageKey, pagedModel);
            Map<String, Boolean> pageStates = pageMap.get(pageKey);
            if (pageStates == null) {
                // Defensive: recordPage has just stored an entry for this key.
                return;
            }
            if (pageStates.containsValue(Boolean.FALSE)) {
                // At least one announced page has not arrived yet: hold this result back.
                iterator.remove();
                return;
            }
            PagedModel combined = combinePages(pageKey);
            if (combined != null) {
                objectEntry.setValue(combined);
            }
        }
    }

    /**
     * Marks the model's own page as processed, registers every page it announces via
     * {@code getOtherPages()} as pending unless that page is already known, and stores the
     * model. Must be called while holding {@link #lock}.
     */
    private void recordPage(String pageKey, PagedModel pagedModel) {
        pageMap.put(pageKey, pagedModel.getPage(), Boolean.TRUE);
        if (pagedModel.getOtherPages() != null) {
            for (String otherPage : pagedModel.getOtherPages()) {
                // Never downgrade a page that was already seen (or already marked pending).
                if (pageMap.get(pageKey, otherPage) == null) {
                    pageMap.put(pageKey, otherPage, Boolean.FALSE);
                }
            }
        }
        objectMap.put(pageKey, pagedModel.getPage(), pagedModel);
    }

    /**
     * Combines all stored models of {@code pageKey} in {@link #PAGE_ORDER}, folding them left
     * to right with {@code PagedModel.combine}. Must be called while holding {@link #lock}.
     *
     * @return the combined model, or {@code null} if no model is stored for the key
     */
    private PagedModel combinePages(String pageKey) {
        List<Map.Entry<String, PagedModel>> entryList =
                new ArrayList<Map.Entry<String, PagedModel>>(objectMap.get(pageKey).entrySet());
        if (entryList.isEmpty()) {
            return null;
        }
        Collections.sort(entryList, PAGE_ORDER);
        PagedModel value = entryList.get(0).getValue();
        for (int i = 1; i < entryList.size(); i++) {
            value = value.combine(entryList.get(i).getValue());
        }
        return value;
    }

    /**
     * Parses a page id as a decimal integer.
     *
     * @return the parsed number, or {@code null} if {@code page} is not a valid integer
     */
    private static Integer parsePageNumber(String page) {
        try {
            return Integer.valueOf(page);
        } catch (NumberFormatException ignored) {
            // Non-numeric page ids are legal; they are ordered lexicographically instead.
            return null;
        }
    }
}