MapReduce求Top K的算法

lt200819

浏览: 182416 次
性别:
来自: 北京

最近访客更多访客>>

lizhiqiang

angelmm1234567

1988xuxuxu

jingkyks

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

hadoop

今天去百度面试，这么简单的题做法都有问题，悲哀啊，mark一下。

估计要和百度失之交臂了，悔恨。

其实当时已经有想法了，不过被面试官问了一句“放内存够大吗？”，一下子就打消了这个想法。愁啊。

算法如下，不知道对不对，回去再研究一下。

package com.bupt.mapreduce;

/* Hadoop and JDK imports used by the Top-K job. */

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

import java.util.TreeMap;

//利用MapReduce求最大值海量数据中的K个数

public class Top_k_new extends Configured implements Tool {

public static class MapClass extends Mapper<LongWritable, Text, NullWritable, Text> {

public static final int K = 100;

private TreeMap<Integer, Text> fatcats = new TreeMap<Integer, Text>();

public void map(LongWritable key, Text value, Context context)

throws IOException, InterruptedException {

String[] str = value.toString().split(",", -2);

int temp = Integer.parseInt(str[8]);

fatcats.put(temp, value);

if (fatcats.size() > K)

fatcats.remove(fatcats.firstKey())

}

@Override

protected void cleanup(Context context) throws IOException, InterruptedException {

for(Text text: fatcats.values()){

context.write(NullWritable.get(), text);

}

public static class Reduce extends Reducer<NullWritable, Text, NullWritable, Text> {

public static final int K = 100;

private TreeMap<Integer, Text> fatcats = new TreeMap<Integer, Text>();

public void reduce(NullWritable key, Iterable<Text> values, Context context)

throws IOException, InterruptedException {

for (Text val : values) {

String v[] = val.toString().split("\t");

Integer weight = Integer.parseInt(v[1]);

fatcats.put(weight, val);

if (fatcats.size() > K)

fatcats.remove(fatcats.firstKey());

}

for (Text text: fatcats.values())

context.write(NullWritable.get(), text);

}

public int run(String[] args) throws Exception {

Configuration conf = getConf();

Job job = new Job(conf, "TopKNum");

job.setJarByClass(Top_k_new.class);

FileInputFormat.setInputPaths(job, new Path(args[0]));

FileOutputFormat.setOutputPath(job, new Path(args[1]));

job.setMapperClass(MapClass.class);

// job.setCombinerClass(Reduce.class);

job.setReducerClass(Reduce.class);

job.setInputFormatClass(TextInputFormat.class);

job.setOutputFormatClass(TextOutputFormat.class);

job.setOutputKeyClass(NullWritable.class);

job.setOutputValueClass(Text.class);

System.exit(job.waitForCompletion(true) ? 0 : 1);

return 0;

}

public static void main(String[] args) throws Exception {

int res = ToolRunner.run(new Configuration(), new Top_k_new(), args);

System.exit(res);

}

分享到：

HashMap和TreeMap | Lucene 简介

2013-03-20 14:52
浏览 1391
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论