#1287

mamthakulal
Participant

Q: Word Count

Step 1:

-- Load the input file; each line becomes one tuple with a single chararray field `wd`.
-- The path must be wrapped in plain ASCII single quotes; typographic quotes are a syntax error.
w = load '/words' as (wd:chararray);
dump w;
result:
(Hi how are you? I am Fine)
(Where are you? I am at Prwatech class)
(Are you learning Hadoop? Yes I am.)

Step 2:
-- Split every line into tokens and flatten the resulting bag so each token is its own tuple.
-- The field is named `word` so it matches the schema shown by `describe w_split`
-- and the `group w_split by word` statement in Step 3.
w_split = foreach w generate FLATTEN(TOKENIZE(wd)) as word;
dump w_split;
result:
(Hi)
(how)
(are)
(you?)
(I)
(am)
(Fine)
(Where)
(are)
(you?)
(I)
(am)
(at)
(Prwatech)
(class)
(Are)
(you)
(learning)
(Hadoop?)
(Yes)
(I)
(am.)

-- Print the schema of the w_split relation.
describe w_split;
w_split: {word: chararray}

-- Show, on sample data, how each relation leading up to w_split is produced.
illustrate w_split;
———————————————-
| w | wd: bytearray |
———————————————-
| | Are you learning Hadoop? Yes I am. |
———————————————-
———————————————-
| w | wd: chararray |
———————————————-
| | Are you learning Hadoop? Yes I am. |
———————————————-
———————————
| w_split | word: chararray |
———————————
| | Are |
| | you |
| | learning |
| | Hadoop? |
| | Yes |
| | I |
| | am. |
———————————

Step 3:

-- Collect identical words together: one output tuple per distinct word,
-- carrying a bag of all of its occurrences.
wrdgrp = GROUP w_split BY word;
DUMP wrdgrp;
result:

(I,{(I),(I),(I)})
(Hi,{(Hi)})
(am,{(am),(am)})
(at,{(at)})
(Are,{(Are)})
(Yes,{(Yes)})
(am.,{(am.)})
(are,{(are),(are)})
(how,{(how)})
(you,{(you)})
(Fine,{(Fine)})
(you?,{(you?),(you?)})
(Where,{(Where)})
(class,{(class)})
(Hadoop?,{(Hadoop?)})
(Prwatech,{(Prwatech)})
(learning,{(learning)})

-- Print the schema of the grouped relation (group key plus a bag of w_split tuples).
describe wrdgrp;
wrdgrp: {group: chararray,w_split: {word: chararray}}

-- Show, on sample data, how w, w_split and wrdgrp are derived step by step.
illustrate wrdgrp;
————————————————-
| w | wd: chararray |
————————————————-
| | Hi how are you? I am Fine |
| | Where are you? I am at Prwatech class |
| | Are you learning Hadoop? Yes I am. |
————————————————-
———————————
| w_split | word: chararray |
———————————
| | Hi |
| | how |
| | are |
| | you? |
| | I |
| | am |
| | Fine |
| | Where |
| | are |
| | you? |
| | I |
| | am |
| | at |
| | Prwatech |
| | class |
| | Are |
| | you |
| | learning |
| | Hadoop? |
| | Yes |
| | I |
| | am. |
———————————
——————————————————————-
| wrdgrp | group: chararray | w_split: bag({word: chararray}) |
——————————————————————-
| | Are | {(Are)} |
| | Fine | {(Fine)} |
| | Hadoop? | {(Hadoop?)} |
| | Hi | {(Hi)} |
| | I | {(I), (I), (I)} |
| | Prwatech | {(Prwatech)} |
| | Where | {(Where)} |
| | Yes | {(Yes)} |
| | am | {(am), (am)} |
| | am. | {(am.)} |
| | are | {(are), (are)} |
| | at | {(at)} |
| | class | {(class)} |
| | how | {(how)} |
| | learning | {(learning)} |
| | you | {(you)} |
| | you? | {(you?), (you?)} |
——————————————————————-

Step 4:

-- For each distinct word, emit the word (the group key) and the number of tuples in its bag.
wrdcount = FOREACH wrdgrp GENERATE group, COUNT(w_split);

DUMP wrdcount;
result:

(I,3)
(Hi,1)
(am,2)
(at,1)
(Are,1)
(Yes,1)
(am.,1)
(are,2)
(how,1)
(you,1)
(Fine,1)
(you?,2)
(Where,1)
(class,1)
(Hadoop?,1)
(Prwatech,1)
(learning,1)

Step 5:
-- Write the (word, count) tuples to the output directory.
-- The path must be wrapped in plain ASCII single quotes; typographic quotes are a syntax error.
store wrdcount into '/wordcount_pig';

Prwatech