Pig
Pig was developed by Yahoo and is faster than Hive. It uses MapReduce in the underlying layer for processing.
Loading a Text File :
customers = LOAD 'hdfs://localhost:9000/pig_data/customer.txt' USING PigStorage(',') AS (id:int, firstname:chararray, age:int, address:chararray, salary:int);
C = COGROUP A BY firstname, B BY firstname;
D = FOREACH C GENERATE FLATTEN((IsEmpty(A) ? null : A)), FLATTEN((IsEmpty(B) ? null : B));
B = FILTER A BY firstname is not null;
(joe,18,2.5)
(sam,,3.0)
(bob,,3.5)
X = group A by age;
dump X;
(18,{(joe,18,2.5)})
(,{(sam,,3.0),(bob,,3.5)})
A = LOAD 'data' USING MyStorage() AS (T: tuple(name:chararray, age: int));
B = FILTER A BY T == ('john', 25);
D = FOREACH B GENERATE T.name, [25#5.6], {(1, 5, 18)};
X = FILTER A BY (f1==8) OR (NOT (f2+f3 > f1));
G = GROUP A BY $0;
C = FOREACH G GENERATE COUNT(*);
A = load 'input' as (x, y, z);
B = foreach A generate x+y;
Relation3_name = JOIN Relation1_firstname BY key, Relation2_firstname BY key ;
outer_full = JOIN customer BY id FULL OUTER, orders BY customer_id;
Relation3_name = JOIN Relation2_firstname BY (key1, key2), Relation3_name BY (key1, key2);
C = JOIN A BY $0 FULL, B BY $0;
C = JOIN A by $0 LEFT OUTER, B BY $0;
X = LIMIT A 3;
Relation3_name = CROSS Relation1_firstname, Relation2_firstname;
grunt> Relation_name3 = UNION Relation_name1, Relation_name2;
student = UNION student1, student2;
SPLIT student_details into student_details1 if age<23, student_details2 if (22<age and age<25);
Dump student_details2;
Relation2_firstname = FILTER Relation1_firstname BY (condition);
Relation_name2 = DISTINCT Relation_name1;
Relation_name2 = FOREACH Relation_name1 GENERATE (required data);
foreach_data = FOREACH student_details GENERATE id,age,city;
Relation_name2 = ORDER Relation_name1 BY field_name (ASC|DESC);
limit_data = LIMIT student_details 4;
student_group_all = Group student_details All;
student_gpa_avg = foreach student_group_all Generate (student_details.firstname,student_details.gpa), AVG(student_details.gpa);
Describe student_details;
STORE student INTO 'hdfs://localhost:9000/pig_Output/' USING PigStorage (',');
tobag = FOREACH emp_data GENERATE TOBAG (id,name,age,city);
data_top = FOREACH emp_group {
top = TOP(2, 0, emp_data);
GENERATE top;
}
tomap = FOREACH emp_data GENERATE TOMAP(name, age);
grunt> startswith_data = FOREACH emp_data GENERATE (id,name), STARTSWITH (name,'Ro');
grunt> replace_data = FOREACH emp_data GENERATE (id,city),REPLACE(city,'Bhuwaneshwar','Bhuw');
A = LOAD 'WordcountInput.txt';
B = MAPREDUCE 'wordcount.jar' STORE A INTO 'inputDir' LOAD 'outputDir'
AS (word:chararray, count: int) `org.myorg.WordCount inputDir outputDir`;
SPLIT A INTO X IF f1<7, Y IF f2==5, Z IF (f3<6 OR f3>6);
STORE A INTO 'myoutput' USING PigStorage ('*');
A = LOAD 'data';
B = STREAM A THROUGH `stream.pl -n 5`;
A = LOAD 'data';
DEFINE mycmd `stream.pl -n 5`;
B = STREAM A THROUGH mycmd;
Pig was developed by Yahoo and is faster than Hive. It uses MapReduce in the underlying layer for processing.
Loading a Text File :
customers = LOAD 'hdfs://localhost:9000/pig_data/customer.txt' USING PigStorage(',') AS (id:int, firstname:chararray, age:int, address:chararray, salary:int);
C = COGROUP A BY firstname, B BY firstname;
D = FOREACH C GENERATE FLATTEN((IsEmpty(A) ? null : A)), FLATTEN((IsEmpty(B) ? null : B));
B = FILTER A BY firstname is not null;
(joe,18,2.5)
(sam,,3.0)
(bob,,3.5)
X = group A by age;
dump X;
(18,{(joe,18,2.5)})
(,{(sam,,3.0),(bob,,3.5)})
A = LOAD 'data' USING MyStorage() AS (T: tuple(name:chararray, age: int));
B = FILTER A BY T == ('john', 25);
D = FOREACH B GENERATE T.name, [25#5.6], {(1, 5, 18)};
X = FILTER A BY (f1==8) OR (NOT (f2+f3 > f1));
G = GROUP A BY $0;
C = FOREACH G GENERATE COUNT(*);
A = load 'input' as (x, y, z);
B = foreach A generate x+y;
Relation3_name = JOIN Relation1_firstname BY key, Relation2_firstname BY key ;
outer_full = JOIN customer BY id FULL OUTER, orders BY customer_id;
Relation3_name = JOIN Relation2_firstname BY (key1, key2), Relation3_name BY (key1, key2);
C = JOIN A BY $0 FULL, B BY $0;
C = JOIN A by $0 LEFT OUTER, B BY $0;
X = LIMIT A 3;
Relation3_name = CROSS Relation1_firstname, Relation2_firstname;
grunt> Relation_name3 = UNION Relation_name1, Relation_name2;
student = UNION student1, student2;
SPLIT student_details into student_details1 if age<23, student_details2 if (22<age and age<25);
Dump student_details2;
Relation2_firstname = FILTER Relation1_firstname BY (condition);
Relation_name2 = DISTINCT Relation_name1;
Relation_name2 = FOREACH Relation_name1 GENERATE (required data);
foreach_data = FOREACH student_details GENERATE id,age,city;
Relation_name2 = ORDER Relation_name1 BY field_name (ASC|DESC);
limit_data = LIMIT student_details 4;
student_group_all = Group student_details All;
student_gpa_avg = foreach student_group_all Generate (student_details.firstname,student_details.gpa), AVG(student_details.gpa);
Describe student_details;
STORE student INTO 'hdfs://localhost:9000/pig_Output/' USING PigStorage (',');
tobag = FOREACH emp_data GENERATE TOBAG (id,name,age,city);
data_top = FOREACH emp_group {
top = TOP(2, 0, emp_data);
GENERATE top;
}
tomap = FOREACH emp_data GENERATE TOMAP(name, age);
grunt> startswith_data = FOREACH emp_data GENERATE (id,name), STARTSWITH (name,'Ro');
grunt> replace_data = FOREACH emp_data GENERATE (id,city),REPLACE(city,'Bhuwaneshwar','Bhuw');
A = LOAD 'WordcountInput.txt';
B = MAPREDUCE 'wordcount.jar' STORE A INTO 'inputDir' LOAD 'outputDir'
AS (word:chararray, count: int) `org.myorg.WordCount inputDir outputDir`;
SPLIT A INTO X IF f1<7, Y IF f2==5, Z IF (f3<6 OR f3>6);
STORE A INTO 'myoutput' USING PigStorage ('*');
A = LOAD 'data';
B = STREAM A THROUGH `stream.pl -n 5`;
A = LOAD 'data';
DEFINE mycmd `stream.pl -n 5`;
B = STREAM A THROUGH mycmd;
0 Comments