
hive-4: A Brief Look at the MapReduce Behind HQL Statements


1.1 How Join Is Implemented

select u.name, o.orderid from order o join user u on o.uid = u.uid;

In the map output value, each row is tagged to mark which table it came from; in the reduce phase, the tag tells the reducer each row's origin. The MapReduce flow works like this:
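For example, with hypothetical rows (uid=1, orderid=1001) in order and (uid=1, name=tom) in user, map emits <1, "O1001"> and <1, "Utom">; the shuffle delivers both to the same reduce call, which matches them by tag and outputs (tom, 1001).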
The corresponding map-reduce code looks like the following.
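A minimal map-side sketch, assuming (hypothetically) that the order file holds lines of the form uid,orderid, the user file holds lines of the form uid,name, and that the source table can be recognized from the input file name:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

class JoinMapper extends Mapper<LongWritable, Text, Text, Text> {

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// decide which table this split belongs to from the input file name (assumed layout)
		String file = ((FileSplit) context.getInputSplit()).getPath().getName();
		String[] fields = value.toString().split(",");
		if (file.startsWith("order")) {
			// order row: join key uid, value tagged "O" + orderid
			context.write(new Text(fields[0]), new Text("O" + fields[1]));
		} else {
			// user row: join key uid, value tagged "U" + name
			context.write(new Text(fields[0]), new Text("U" + fields[1]));
		}
	}
}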
The reduce side:
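Every row sharing one uid reaches the same reduce call; the tag separates user rows from order rows, and their cross product is the join result. A matching reducer sketch, under the same assumptions:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

class JoinReducer extends Reducer<Text, Text, Text, Text> {

	@Override
	protected void reduce(Text uid, Iterable<Text> values, Context context) throws IOException, InterruptedException {
		List<String> names = new ArrayList<String>();
		List<String> orderids = new ArrayList<String>();
		for (Text v : values) {
			String s = v.toString();
			if (s.charAt(0) == 'U') {
				names.add(s.substring(1));    // user side
			} else {
				orderids.add(s.substring(1)); // order side
			}
		}
		// emit (name, orderid) for every user/order pair with this uid
		for (String name : names) {
			for (String orderid : orderids) {
				context.write(new Text(name), new Text(orderid));
			}
		}
	}
}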

1.2 How Group By Is Implemented


 

select rank, isonline, count(*) from city group by rank, isonline;

The GROUP BY columns are combined into the map output key; MapReduce's sorting then brings identical keys together, and in the reduce phase the LastKey is kept to tell where one key ends and the next begins.
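For example, with a hypothetical input file like the one below (chosen to match the result shown at the end), map is called once per line and emits <(rank, isonline), 1>:

A,1
A,1
A,1
B,0

The sort brings the three (A,1) keys together, so a single reduce call sums them to 3.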


The implementation code is as follows:

package mapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * MR code implementing the following Hive statement:
 * select rank, isonline, count(*) from city group by rank, isonline;
 * @author zm
 *
 */
public class GroupByApp {

	// 0 define the HDFS addresses this job operates on
	static final String FILE_ROOT = "hdfs://master:9000/";
	static final String INPUT_PATH = "hdfs://master:9000/files";
	static final String OUT_PATH = "hdfs://master:9000/out";
	
	public static void main(String[] args) throws Exception{
		
		Configuration conf = new Configuration();
		FileSystem fileSystem = FileSystem.get(new URI(FILE_ROOT),conf);
		Path outpath = new Path(OUT_PATH);
		if(fileSystem.exists(outpath)){
			fileSystem.delete(outpath, true);
		}
		
		// 0 create the job that does the work
		Job job = new Job(conf);
		job.setJarByClass(GroupByApp.class); // lets the cluster locate this class inside the job jar
		// 1.1 tell the job where the input lives: files are read from HDFS; each line
		// is parsed into one <k,v> pair, and map is called once per pair
		FileInputFormat.setInputPaths(job, INPUT_PATH);
		// specify how input files are parsed into key/value pairs; an InputFormat
		// implementation must be chosen when launching a MapReduce job
		job.setInputFormatClass(TextInputFormat.class);
		
		// 1.2 set the custom mapper class and its output key/value types
		job.setMapperClass(GroupMapper.class);
		job.setMapOutputKeyClass(GroupBy.class);
		job.setMapOutputValueClass(LongWritable.class);
		
		
		// 1.3 partitioning: one reduce task, so all keys land in a single partition
		job.setNumReduceTasks(1);
		
		// 1.4 TODO sorting and grouping: the defaults are used for now
		// 1.5 TODO combiner
		
		// 2.2 set the custom reducer class and the final output types
		job.setReducerClass(GroupReducer.class);
		job.setOutputKeyClass(GroupBy.class);
		job.setOutputValueClass(LongWritable.class);
		
		// 2.3 specify where the output is written
		FileOutputFormat.setOutputPath(job, outpath);
		job.setOutputFormatClass(TextOutputFormat.class);
		
		// run the job and wait for it to finish
		job.waitForCompletion(true);
		
	}
}

class GroupMapper extends Mapper<LongWritable, Text, GroupBy, LongWritable> {

	@Override
	protected void map(LongWritable key1, Text value1, Context context) throws IOException, InterruptedException {
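		// each input line is "rank,isonline" (e.g. "A,1"); emit <(rank, isonline), 1>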
		String v1 = value1.toString();
		String[] splits = v1.split(",");
		//GroupBy groupBy = new GroupBy(Long.parseLong(splits[0]),Long.parseLong(splits[1]));
		GroupBy groupBy = new GroupBy(splits[0],Long.parseLong(splits[1]));
		context.write(groupBy, new LongWritable(1));
	}
	
}

class GroupReducer extends Reducer<GroupBy, LongWritable, GroupBy, LongWritable>{

	@Override
	protected void reduce(GroupBy k2, Iterable<LongWritable> v2s, Context context) throws IOException, InterruptedException {
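		// all <(rank, isonline), 1> pairs with the same composite key arrive in one call; sum the 1s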
		long count = 0;
		System.out.println("reduce----> k2: " + k2.toString());
		for(LongWritable v2 : v2s){
			System.out.println(v2.toString());
			count += v2.get();
		}
		context.write(k2, new LongWritable(count));
	}
	
}

class GroupBy implements WritableComparable<GroupBy> {

	private String rank;
	private long isonline;
	
	public GroupBy(){}
	public  GroupBy(String rank,long isonline){
		this.rank = rank;
		this.isonline = isonline;
	}
	@Override
	public void write(DataOutput out) throws IOException {
		Text.writeString(out, this.rank); // use Text's helper to serialize the String field
		//out.writeLong(this.rank);
		out.writeLong(this.isonline);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.rank = Text.readString(in);
		//this.rank = in.readLong();
		this.isonline = in.readLong();
	}
	
	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		result = prime * result + (int) (isonline ^ (isonline >>> 32));
		result = prime * result + ((rank == null) ? 0 : rank.hashCode());
		return result;
	}
	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (obj == null)
			return false;
		if (getClass() != obj.getClass())
			return false;
		GroupBy other = (GroupBy) obj;
		if (isonline != other.isonline)
			return false;
		if (rank == null) {
			if (other.rank != null)
				return false;
		} else if (!rank.equals(other.rank))
			return false;
		return true;
	}
	@Override
	public String toString() {
		return this.rank + "\t" + this.isonline;
	}
	
	@Override
	public int compareTo(GroupBy other) {
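		// order by rank first, then isonline; the shuffle sorts map output keys with this
		// method, which is what brings identical (rank, isonline) combinations together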
		long result;
		result = this.rank.compareTo(other.rank);
		//result = this.rank - other.rank;
		if(result == 0){
			result = this.isonline - other.isonline;
		}
		return (int)result;
	}
	
}
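Note that GroupBy implements WritableComparable: the composite key is serialized across the shuffle, and compareTo drives the sort that groups identical (rank, isonline) combinations into one reduce call. Once the class is packaged into a jar (the jar name below is assumed for illustration), the job is launched in the usual way:

hadoop jar groupby.jar mapreduce.GroupByApp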

 

Result (columns: rank, isonline, count):

[root@master ~]# hadoop fs -text /out/part-r-00000
Warning: $HADOOP_HOME is deprecated.

A       1       3
B       0       1
