package com.brianway.webporter.collector.zhihu.processor;

import com.brianway.webporter.data.DataProcessor;
import com.brianway.webporter.data.HashSetDuplicateRemover;
import com.brianway.webporter.data.elasticsearch.Document;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Json;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * 从原始数据生成满足 Elasticsearch 格式的 json 数据.
 * Converts raw followee data files into Elasticsearch-ready {@link Document}s,
 * de-duplicating users by id across the lifetime of this processor instance.
 */
public class ZhihuFolloweeDataProcessor implements DataProcessor<File, Document> {

    /** Remembers user ids already emitted so each user produces at most one document. */
    private final HashSetDuplicateRemover<String> duplicateRemover = new HashSetDuplicateRemover<>();

    /**
     * Reads the followee JSON payload from {@code inItem} and turns every
     * not-yet-seen user entry into a {@link Document} keyed by the user's id.
     *
     * @param inItem raw data file containing a followee JSON payload
     * @return documents for the new (non-duplicate) users; an empty list when the
     *         file yields no parsable content — never {@code null}
     */
    @Override
    public List<Document> process(File inItem) {
        String raw = MemberURLTokenGenerator.readFollowees(inItem);
        if (StringUtils.isEmpty(raw)) {
            // Fix: previously returned null here; an empty list lets callers
            // iterate the result without a null check.
            return Collections.emptyList();
        }

        Json json = new Json(raw);
        // Parallel extraction: users.get(i) is paired with ids.get(i).
        // NOTE(review): this assumes every $.data[*] entry carries an "id" field,
        // otherwise the two lists drift out of alignment — TODO confirm upstream schema.
        List<String> users = json.jsonPath("$.data[*].[*]").all();
        List<String> ids = json.jsonPath("$.data[*].id").all();

        // Pre-size with the actual entry count instead of a magic constant.
        List<Document> documents = new ArrayList<>(ids.size());
        for (int i = 0; i < ids.size(); i++) {
            String id = ids.get(i);
            if (!duplicateRemover.isDuplicate(id)) {
                documents.add(new Document(id, users.get(i)));
            }
        }
        return documents;
    }
}