===============
## WORD COUNT 
## mapper.py
==============

#!/usr/bin/env python

# import sys because we need to read and write data to STDIN and STDOUT
import sys

def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated word.

    Args:
        lines: iterable of text lines, e.g. ``sys.stdin``.

    Yields:
        ``(word, 1)`` tuples in input order. Blank or whitespace-only
        lines yield nothing.
    """
    for line in lines:
        # split() with no argument already ignores leading/trailing
        # whitespace and collapses runs of whitespace, so a separate
        # strip() is not needed.
        for word in line.split():
            yield word, 1


def main():
    """Read text from STDIN and write one ``word<TAB>1`` record per word.

    What is written to STDOUT here is the input of the Reduce step,
    i.e. the input for reducer.py.
    """
    for pair in map_words(sys.stdin):
        # print(...) with a single argument behaves identically on
        # Python 2 and Python 3, so the script runs under either one.
        print('%s\t%s' % pair)


if __name__ == '__main__':
    main()


==========
## reducer.py
===========

#!/usr/bin/env python

from operator import itemgetter
import sys

def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    Each input line is expected to look like ``word<TAB>count``, as
    written by mapper.py. Grouping only looks at *adjacent* records, so
    the input must already be sorted by word (Hadoop sorts map output by
    key before passing it to the reducer; locally, pipe through ``sort``).

    Malformed lines (no tab separator, or a count that is not an
    integer) are silently skipped and do not affect the running group.

    Args:
        lines: iterable of text lines, e.g. ``sys.stdin``.

    Yields:
        ``(word, total)`` tuples, one per run of identical words.
        Nothing is yielded for empty input.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # remove leading and trailing whitespace (including the newline)
        line = line.strip()
        # split on the first tab only; everything after it is the count
        word, sep, count = line.partition('\t')
        if not sep:
            # no tab at all (e.g. a blank line): not a valid record
            continue
        try:
            count = int(count)
        except ValueError:
            # count was not a number, so ignore/discard this line
            continue

        if word == current_word:
            current_count += count
            continue

        # The key changed: emit the finished group (if there is one)
        # and start accumulating the new word.
        if current_word is not None:
            yield current_word, current_count
        current_word = word
        current_count = count

    # Emit the last group. Testing against None (rather than comparing
    # with the last word read) keeps empty input from producing a bogus
    # "None" record and keeps a trailing malformed line from dropping
    # the final group.
    if current_word is not None:
        yield current_word, current_count


def main():
    """Read sorted ``word<TAB>count`` lines from STDIN and write totals."""
    for pair in reduce_counts(sys.stdin):
        # print(...) with a single argument behaves identically on
        # Python 2 and Python 3.
        print('%s\t%s' % pair)


if __name__ == '__main__':
    main()

======
## word.txt
======

The Apache Hadoop software library is a framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. It is designed to scale up from single servers to thousands of machines, each offering local computation and storage. Rather than rely on hardware to deliver high-availability, the library itself is designed to detect and handle failures at the application layer, so delivering a highly-available service on top of a cluster of computers, each of which may be prone to failures.

=======
## COMMAND
=======
# Work from the directory that holds mapper.py, reducer.py and word.txt.
cd /home/cloudera/wc
# Make the scripts executable so -mapper/-reducer can invoke them directly.
chmod 755 mapper.py reducer.py

# Local dry run: `sort` stands in for Hadoop's sort-by-key between map and reduce.
cat word.txt | python mapper.py | sort | python reducer.py

# Create the HDFS working directory; -p makes this a no-op if it already exists.
sudo -u hdfs hadoop fs -mkdir -p /wc

# Upload the input file.
sudo -u hdfs hadoop fs -put /home/cloudera/wc/word.txt /wc/word.txt

# Hand /wc over to the cloudera user so the job can create /wc/out.
sudo -u hdfs hadoop fs -chown cloudera:cloudera /wc

hadoop fs -ls /wc

STREAMING_JAR=/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.6.0-mr1-cdh5.4.2.jar

# Remove old output if exists
hadoop fs -rm -r -f /wc/out

# Run job (ship the mapper & reducer with -files; as a generic option it
# must come before the streaming options such as -mapper/-input)
hadoop jar "$STREAMING_JAR" \
  -files /home/cloudera/wc/mapper.py,/home/cloudera/wc/reducer.py \
  -mapper "mapper.py" \
  -reducer "reducer.py" \
  -input /wc/word.txt \
  -output /wc/out

  