Topic
  • No replies
ExplosiveJoe
ExplosiveJoe
1 Post

Pinned topic Published wordcount application runs with wrong output

‏2014-08-31T11:27:42Z |

--------------------------------------------------------------------------------------------

Input:

Word count example for hadoop
Word count example for
Word count example
word

word
word count

--------------------------------------------------------------------------------------------

Wrong output:

0    Word count example for hadoop
30    Word count example for
53    Word count example
72    word
77    
78    word
83    word count

---------------------------------------------------------------------------------------------

//The map class
package lhz.mr.wordcount.example;

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordcountMapper extends Mapper<Object, Text, Text, IntWritable> {
    IntWritable one = new IntWritable(1);
        Text word = new Text();

        public void map(Object key, Text value, Context context)
                        throws IOException, InterruptedException {
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                        word.set(itr.nextToken());
                        context.write(word, one);
                }
        }
}
//The reduce class:
package lhz.mr.wordcount.example;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends
        Reducer<Text, IntWritable, Text, IntWritable> {
                IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                        throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                        sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
        }
}
//The driver class
package lhz.mr.wordcount.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Command-line entry point that configures and runs the word-count job.
 *
 * <p>Usage: {@code WordcountDriver [generic options] <input path> <output path>}
 *
 * <p>NOTE(review): output consisting of a byte offset followed by the unchanged
 * input line is what the default (identity) mapper and reducer produce. If that
 * is what you see, the job was probably launched without the settings made in
 * {@link #main(String[])} being applied -- confirm how the application is started.
 */
public class WordcountDriver {

    /**
     * Configures the job with {@link WordcountMapper} and {@link WordcountReducer},
     * runs it, and waits for it to finish.
     *
     * <p>Exits with status 2 when fewer than two program arguments remain after
     * generic-option parsing, and with status 1 when the job reports failure.
     * Returns normally when the job succeeds.
     *
     * @param args generic Hadoop options followed by the input path and the
     *             output path; any further arguments are ignored
     * @throws Exception if job setup, submission or monitoring fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // GenericOptionsParser consumes the generic options it recognises and
        // hands back the remaining, program-specific arguments.
        String[] programArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (programArgs.length < 2) {
            // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
            System.err.println("Usage: WordcountDriver <input path> <output path>");
            System.exit(2);
        }

        // NOTE(review): newer Hadoop releases deprecate this constructor in favour of
        // Job.getInstance(conf); kept so the class still compiles against older versions.
        Job job = new Job(conf);
        job.setJarByClass(WordcountDriver.class);
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(programArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(programArgs[1]));

        // Submit the job, wait for it to finish, and surface a failure through
        // the process exit status so callers and schedulers can detect it.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }

}

How can I make it produce the correct word counts?