package safe.fish;
import java.io.IOException;
import java.util.Iterator;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;
/**
 * Extracts the Chinese text from HTML pages stored in an ODPS table.
 * Input rows are (url, html); output rows are (url, extracted Chinese text).
 */
public class Get_Html_Chinese {
/**
 * Mapper: reads each input row and emits (url, html) as key/value records.
 */
public static class TokenizerMapper extends MapperBase {
    // Reusable single-column output records: key holds the url, value the html.
    private Record keyRecord;
    private Record valueRecord;

    @Override
    public void setup(TaskContext context) throws IOException {
        keyRecord = context.createMapOutputKeyRecord();
        valueRecord = context.createMapOutputValueRecord();
        System.out.println("TaskID:" + context.getTaskID().toString());
    }

    @Override
    public void map(long recordNum, Record record, TaskContext context) throws IOException {
        // Column 0 is the url, column 1 the raw html of the input row.
        keyRecord.set(0, record.get(0).toString());
        valueRecord.set(0, record.get(1).toString());
        context.write(keyRecord, valueRecord);
    }
}
/**
 * Reducer: strips each html value down to its Chinese characters and writes
 * one (url, chinese_text) row per key.
 */
public static class Chinese extends ReducerBase {
    // Reusable output record: column 0 = url, column 1 = extracted Chinese text.
    // Renamed from "Chinese" — the field previously shadowed the class name.
    private Record result;

    @Override
    public void setup(TaskContext context) throws IOException {
        result = context.createOutputRecord();
    }

    /**
     * Emits one output row per url key.
     *
     * NOTE(review): when several html values share one url, each iteration
     * overwrites column 1, so only the LAST value's extraction is written.
     * This matches the original behavior; confirm it is intended for
     * multi-valued keys.
     */
    @Override
    public void reduce(Record key, Iterator<Record> values, TaskContext context)
            throws IOException {
        result.set(0, key.get(0));
        while (values.hasNext()) {
            Record val = values.next();
            result.set(1, getChinese((String) val.get(0)));
        }
        context.write(result);
    }

    /**
     * Returns only the Chinese characters of {@code str}; CJK punctuation
     * and all non-CJK characters are dropped.
     */
    private String getChinese(String str) {
        // StringBuilder instead of String += : the old concatenation was
        // O(n^2) on large html pages.
        StringBuilder sb = new StringBuilder(str.length());
        for (char c : str.toCharArray()) {
            if (isChinese(c) && !isChinesePunctuation(c)) {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * True when {@code c} falls in a CJK-related Unicode block (including
     * CJK punctuation, which the caller filters back out).
     *
     * NOTE: a single {@code char} can never map to EXTENSION_B — those code
     * points lie above U+FFFF and arrive as surrogate pairs, which
     * {@link Character.UnicodeBlock#of(char)} classifies as SURROGATES.
     */
    private boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /** True when {@code c} is CJK/fullwidth punctuation. */
    private boolean isChinesePunctuation(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS
                || ub == Character.UnicodeBlock.VERTICAL_FORMS;
    }
}
/**
 * Job driver: wires up the mapper/reducer, intermediate schemas, and the
 * input/output tables, then runs the job synchronously.
 *
 * @param args args[0] = input table name, args[1] = output table name
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        // Fixed usage text: it previously said "WordCount" (copy-paste leftover).
        System.err.println("Usage: Get_Html_Chinese <in_table> <out_table>");
        System.exit(2);
    }
    JobConf job = new JobConf();
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(Chinese.class);
    // Intermediate key/value schemas must match what TokenizerMapper emits.
    job.setMapOutputKeySchema(SchemaUtils.fromString("url:string"));
    job.setMapOutputValueSchema(SchemaUtils.fromString("Chinese:string"));
    InputUtils.addTable(TableInfo.builder().tableName(args[0]).build(), job);
    OutputUtils.addTable(TableInfo.builder().tableName(args[1]).build(), job);
    JobClient.runJob(job);
}
}