package diaoyu.test; import java.io.IOException; import java.util.Iterator; import com.aliyun.odps.data.Record; import com.aliyun.odps.data.TableInfo; import com.aliyun.odps.mapred.JobClient; import com.aliyun.odps.mapred.MapperBase; import com.aliyun.odps.mapred.ReducerBase; import com.aliyun.odps.mapred.conf.JobConf; import com.aliyun.odps.mapred.utils.InputUtils; import com.aliyun.odps.mapred.utils.OutputUtils; import com.aliyun.odps.mapred.utils.SchemaUtils; /** * 提取html中文 * */ public class Get_Html_Chinese { /** * 处理表,提取数据 */ public static class TokenizerMapper extends MapperBase { private Record url; private Record html; @Override public void setup(TaskContext context) throws IOException { url = context.createMapOutputKeyRecord(); html = context.createMapOutputValueRecord(); System.out.println("TaskID:" + context.getTaskID().toString()); } @Override public void map(long recordNum, Record record, TaskContext context) throws IOException { url.set(new Object[] { record.get(0).toString() });// 取url html.set(new Object[] { record.get(1).toString() });// 取html context.write(url, html); } } /** *获取html中文 **/ public static class Chinese extends ReducerBase { private Record Chinese; @Override public void setup(TaskContext context) throws IOException { Chinese = context.createOutputRecord(); } @Override public void reduce(Record key, Iterator<Record> values, TaskContext context) throws IOException { Chinese.set(0, key.get(0)); while (values.hasNext()) { Record val = values.next(); Chinese.set(1, getChinese((String)val.get(0))); } context.write(Chinese); } /** * 提取中文 * */ private String getChinese(String str){ String res=""; char[] ch = str.toCharArray(); for (int i = 0; i < ch.length; i++) { char c = ch[i]; if (isChinese(c)&&!isChinesePunctuation(c)) { res+=c; } } return res; } /** * 根据Unicode编码完美的判断中文汉字和符号 * */ private boolean isChinese(char c) { Character.UnicodeBlock ub = Character.UnicodeBlock.of(c); if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION) { return true; } return false; } /** * 根据Unicode编码完美的判断中文汉字和符号 * */ private boolean isChinesePunctuation(char c) { Character.UnicodeBlock ub = Character.UnicodeBlock.of(c); if (ub == Character.UnicodeBlock.GENERAL_PUNCTUATION || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS || ub == Character.UnicodeBlock.VERTICAL_FORMS) { return true; } else { return false; } } } public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: WordCount <in_table> <out_table>"); System.exit(2); } JobConf job = new JobConf(); job.setMapperClass(TokenizerMapper.class); job.setReducerClass(Chinese.class); job.setMapOutputKeySchema(SchemaUtils.fromString("url:string")); job.setMapOutputValueSchema(SchemaUtils.fromString("Chinese:string")); InputUtils.addTable(TableInfo.builder().tableName(args[0]).build(), job); OutputUtils.addTable(TableInfo.builder().tableName(args[1]).build(), job); JobClient.runJob(job); } }